# Machine Learning with Spark

---

### Data preparation and first insight

Import needed libraries

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_columns = 50

import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import lit, col
from pyspark.sql.types import DoubleType

Creation of a spark session and load the csv file into a dataframe

In [3]:
spk_sess = SparkSession \
    .builder \
    .appName("_Project_Spark_App") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spk_sess.read.csv("./file_with_datetime", header=True, sep=",");

df = df.select('FR10', 'FR21', 'FR22', 'time', 'year', 'month', 'week', 'day', 'hour')
df.show(20)

+--------------------+--------------------+-------------------+-------------------+----+-----+----+---+----+
|                FR10|                FR21|               FR22|               time|year|month|week|day|hour|
+--------------------+--------------------+-------------------+-------------------+----+-----+----+---+----+
|                 0.0|                 0.0|                0.0|1986-01-01 00:00:00|1986|    1|   1|  1|   0|
|                 0.0|                 0.0|                0.0|1986-01-01 01:00:00|1986|    1|   1|  1|   1|
|                 0.0|                 0.0|                0.0|1986-01-01 02:00:00|1986|    1|   1|  1|   2|
|                 0.0|                 0.0|                0.0|1986-01-01 03:00:00|1986|    1|   1|  1|   3|
|                 0.0|                 0.0|                0.0|1986-01-01 04:00:00|1986|    1|   1|  1|   4|
|                 0.0|                 0.0|                0.0|1986-01-01 05:00:00|1986|    1|   1|  1|   5|
|                 0

Shape of the dataframe

In [4]:
df.count(), len(df.columns)

(262968, 9)

All columns are strings, change stations' efficiencies to double float 

In [5]:
for c in df.columns:
    if c == 'time':
        df = df.withColumn(c, df[c].cast("string"))
    else:
        df = df.withColumn(c, df[c].cast(DoubleType()))
        
df.dtypes

[('FR10', 'double'),
 ('FR21', 'double'),
 ('FR22', 'double'),
 ('time', 'string'),
 ('year', 'double'),
 ('month', 'double'),
 ('week', 'double'),
 ('day', 'double'),
 ('hour', 'double')]

Show basic stats

In [6]:
df.select('FR10', 'FR21', 'FR22').describe().show()

+-------+-------------------+-------------------+-------------------+
|summary|               FR10|               FR21|               FR22|
+-------+-------------------+-------------------+-------------------+
|  count|             262968|             262968|             262968|
|   mean|0.13080652623132852|0.12925481366554123|  0.125993219212984|
| stddev|0.20840717745319268| 0.2052150311485998|0.20127526900909193|
|    min|                0.0|                0.0|                0.0|
|    max|            0.91125|            0.91662|            0.91613|
+-------+-------------------+-------------------+-------------------+



There isn't any Nan values

In [7]:
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

# drop na values if needed / not the case here
df = df.na.drop()

+----+----+----+----+----+-----+----+---+----+
|FR10|FR21|FR22|time|year|month|week|day|hour|
+----+----+----+----+----+-----+----+---+----+
|   0|   0|   0|   0|   0|    0|   0|  0|   0|
+----+----+----+----+----+-----+----+---+----+



[Handling missing values](https://fr.coursera.org/lecture/big-data-machine-learning/handling-missing-values-in-spark-Goh1z)

Correlations

In [15]:
temp_df = df.select('FR10', 'FR21', 'FR22')

# There is a correlation function in the ml subpackage pyspark.ml.stat. 
# However, it requires you to provide a column of type Vector. So you need to convert your columns 
# into a vector column first using the VectorAssembler and then apply the correlation:


from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# convert to vector column first
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=temp_df.columns, outputCol=vector_col)
temp_df_vector = assembler.transform(temp_df).select(vector_col)

# get correlation matrix
matrix = Correlation.corr(temp_df_vector, vector_col)

# the result as a numpy array
pd.DataFrame(matrix.collect()[0]["pearson({})".format(vector_col)].values)

Unnamed: 0,0
0,1.0
1,0.94775
2,0.964299
3,0.94775
4,1.0
5,0.939765
6,0.964299
7,0.939765
8,1.0


In [16]:
df.show()

+--------------------+--------------------+-------------------+-------------------+------+-----+----+---+----+
|                FR10|                FR21|               FR22|               time|  year|month|week|day|hour|
+--------------------+--------------------+-------------------+-------------------+------+-----+----+---+----+
|                 0.0|                 0.0|                0.0|1986-01-01 00:00:00|1986.0|  1.0| 1.0|1.0| 0.0|
|                 0.0|                 0.0|                0.0|1986-01-01 01:00:00|1986.0|  1.0| 1.0|1.0| 1.0|
|                 0.0|                 0.0|                0.0|1986-01-01 02:00:00|1986.0|  1.0| 1.0|1.0| 2.0|
|                 0.0|                 0.0|                0.0|1986-01-01 03:00:00|1986.0|  1.0| 1.0|1.0| 3.0|
|                 0.0|                 0.0|                0.0|1986-01-01 04:00:00|1986.0|  1.0| 1.0|1.0| 4.0|
|                 0.0|                 0.0|                0.0|1986-01-01 05:00:00|1986.0|  1.0| 1.0|1.0| 5.0|
|

---

### Training ML models

Decision Tree Regression

In [None]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils


# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
