In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_5'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_5


In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sb

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassificationModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import GBTClassificationModel

In [4]:
sc = SparkContext(master="local", appName="New Spark Context")
spark = SparkSession(sc)

Part 1: Chapter 6

In [None]:
df = spark.read.csv("Ecommerce_Customers.csv", header=True, inferSchema=True)
df.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [None]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [None]:
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [None]:
input_cols = df.columns[3:-1]
target_col = df.columns[-1]

assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
df_pre = assembler.transform(df)
final_df = df_pre.select('features', 'Yearly Amount Spent')

In [None]:
df_pre[['features']].show(5, False)

+----------------------------------------------------------------------------+
|features                                                                    |
+----------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]  |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424]|
|[34.30555662975554,13.717513665142507,36.72128267790313,3.120178782748092]  |
|[33.33067252364639,12.795188551078114,37.53665330059473,4.446308318351434]  |
+----------------------------------------------------------------------------+
only showing top 5 rows



In [None]:
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=42)
train_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                348|
|   mean| 499.19978013022717|
| stddev|  78.89099369192157|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [None]:
test_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                152|
|   mean|  499.5756292366865|
| stddev|  80.53861579586098|
|    min|   266.086340948469|
|    max|  744.2218671047146|
+-------+-------------------+



In [None]:
lr = LinearRegression(featuresCol='features', 
                      labelCol='Yearly Amount Spent',
                      predictionCol='Predict_Yearly_Amount_Spent'
                      )
model = lr.fit(train_df)

In [None]:
print('Coefficients: {} -  Intercept: {}'.format(model.coefficients, model.intercept))

Coefficients: [25.93215475417773,38.31861228860246,0.6756981758947298,61.36799727890117] -  Intercept: -1061.6126556938295


In [None]:
test_results = model.evaluate(test_df)
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| 0.5230324245964653|
|-17.069653184407684|
|  7.546772405139109|
|-20.438521626093575|
| 23.359389939390212|
+-------------------+
only showing top 5 rows



In [None]:
test_results = model.evaluate(test_df)

print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('MAE: {}'.format(test_results.meanAbsoluteError))
print('R2: {}'.format(test_results.r2))

RMSE: 9.850957450983932
MSE: 97.04136270109586
MAE: 7.933457771534967
R2: 0.9849403386897637


In [None]:
predictions = model.transform(test_df)
predictions.columns

['features', 'Yearly Amount Spent', 'Predict_Yearly_Amount_Spent']

In [None]:
predictions[['Yearly Amount Spent', 'Predict_Yearly_Amount_Spent']].show(5)

+-------------------+---------------------------+
|Yearly Amount Spent|Predict_Yearly_Amount_Spent|
+-------------------+---------------------------+
| 442.06441375806565|          441.5413813334692|
|   266.086340948469|          283.1559941328767|
|  494.6386097568927|          487.0918373517536|
|  486.9470538397658|         507.38557546585935|
|  591.7810894256675|          568.4216994862772|
+-------------------+---------------------------+
only showing top 5 rows



In [None]:
model.save('linear_model_ecommerce_customers')

In [None]:
from pyspark.ml.regression import LinearRegressionModel
model2 = LinearRegressionModel.load('linear_model_ecommerce_customers')

In [None]:
model2.evaluate(test_df).r2

0.9849403386897637

Part 2: Chapter 7

In [5]:
df = spark.read.csv("flights.csv", header=True, inferSchema=True)
df.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65|   NA|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [None]:
assembler = VectorAssembler(inputCols=['mile'], outputCol='features')
df_pre = assembler.transform(df)
final_df = df_pre.select('features', 'duration')

In [None]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)
train_df.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             40101|
|   mean|151.72883469240168|
| stddev| 87.01466029323022|
|    min|                30|
|    max|               560|
+-------+------------------+



In [None]:
test_df.describe().show()

+-------+-----------------+
|summary|         duration|
+-------+-----------------+
|  count|             9899|
|   mean|151.9156480452571|
| stddev|87.17240587329398|
|    min|               30|
|    max|              560|
+-------+-----------------+



In [None]:
lr = LinearRegression(featuresCol='features', 
                      labelCol='duration',
                      )
model = lr.fit(train_df)

In [None]:
print('Coefficients: {} -  Intercept: {}'.format(model.coefficients, model.intercept))

Coefficients: [0.12181053792915653] -  Intercept: 44.28863533713087


In [None]:
test_results = model.evaluate(test_df)

print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('MAE: {}'.format(test_results.meanAbsoluteError))
print('R2: {}'.format(test_results.r2))

RMSE: 17.4276664696533
MSE: 303.7235585774779
MAE: 13.299173461705045
R2: 0.960027225836811


Part 3: Pipeline

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

df = spark.read.csv("flights.csv", header=True, inferSchema=True)
df.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65|   NA|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [7]:
final_df = df.withColumn('km', round(df.mile * 1.60934, 0))
final_df = final_df.withColumn('label', (df.delay >= 15).cast('integer'))
final_df.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|1989.0|    0|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65|   NA| 415.0| null|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



In [8]:
no_label_df = final_df[final_df.label.isNull()]
no_label_df.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65|   NA| 415.0| null|
|  0|  8|  2|     UA|   549|ORD| 334| 11.08|      85|   NA| 538.0| null|
|  5|  8|  0|     UA|   574|ORD| 235| 14.48|      79|   NA| 378.0| null|
|  1| 13|  3|     US|  2189|LGA| 214|  20.0|      82|   NA| 344.0| null|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



In [9]:
final_df = final_df[final_df.label.isNotNull()]
final_df = final_df.dropna()
final_df.count()

47022

In [36]:
train_df, test_df = final_df.randomSplit([0.8, 0.2])

In [43]:
indexer_carrier = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
indexer_org = StringIndexer(inputCol='org', outputCol='org_idx')


onehot = OneHotEncoderEstimator(inputCols=['carrier_idx', 'org_idx', 'dow'],
                       outputCols=['carrier_dummy', 'org_dummy', 'dow_dummy'])

input_cols = ['km', 'carrier_idx', 'org_idx', 'dow', 'mon', 'dom', 'depart', 'duration']
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

pipeline_preprocessing = Pipeline(stages=[indexer_carrier, indexer_org, onehot, assembler])

model = LogisticRegression()

pipeline = Pipeline(stages=[pipeline_preprocessing, model])

In [12]:
model = pipeline.fit(train_df)
predictions = model.transform(test_df)
predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
predictions.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 5 rows



In [13]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
prediction_and_label = predictions.select(['prediction', 'label'])
metrics = MulticlassMetrics(prediction_and_label.rdd)
metrics_binary = BinaryClassificationMetrics(prediction_and_label.rdd)

print('Accuracy: {:.4f}'.format(metrics.accuracy))
print('Precisions: {:.4f}'.format(metrics.precision(label=1)))
print('Recall: {:.4f}'.format(metrics.recall(label=1)))
print('f1_score: {:.4f}'.format(metrics.fMeasure()))
print('AUC: {:.4f}'.format(metrics_binary.areaUnderROC))

Accuracy: 0.5902
Precisions: 0.5785
Recall: 0.7383
f1_score: 0.5902
AUC: 0.5864


In [None]:
import pandas as pd
result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
result_confusion_matrix

Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,1969.0,2563.0
Actual Pos,1247.0,3518.0


In [None]:
model.save('logistic_model_flights_2')

In [None]:
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import PipelineModel
model2 = PipelineModel.load('logistic_model_flights_2')

In [None]:
predictions2 = model2.transform(no_label_df)
predictions2 = predictions2.withColumn('label', predictions2.label.cast(DoubleType()))
predictions2.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
| null|       0.0|
| null|       1.0|
| null|       1.0|
| null|       1.0|
| null|       0.0|
+-----+----------+
only showing top 5 rows



Part 4: Decision Tree, Random Forest

In [None]:
tree = DecisionTreeClassifier()
pipeline_decision_tree = Pipeline(stages=[pipeline_preprocessing, tree])

model = pipeline_decision_tree.fit(train_df)
predictions = model.transform(test_df)
predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
predictions.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       0.0|
+-----+----------+
only showing top 5 rows



In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
prediction_and_label = predictions.select(['prediction', 'label'])
metrics = MulticlassMetrics(prediction_and_label.rdd)
metrics_binary = BinaryClassificationMetrics(prediction_and_label.rdd)

print('Accuracy: {:.4f}'.format(metrics.accuracy))
print('Precisions: {:.4f}'.format(metrics.precision(label=1)))
print('Recall: {:.4f}'.format(metrics.recall(label=1)))
print('f1_score: {:.4f}'.format(metrics.fMeasure()))
print('AUC: {:.4f}'.format(metrics_binary.areaUnderROC))

Accuracy: 0.5664
Precisions: 0.5753
Recall: 0.5885
f1_score: 0.5664
AUC: 0.5659


Random Forest

In [25]:
model = RandomForestClassifier()
pipeline_random_forest = Pipeline(stages=[pipeline_preprocessing, model])

pipe = pipeline_random_forest.fit(train_df)
predictions = pipe.transform(test_df)
predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
predictions.select('label', 'prediction').show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 5 rows



In [24]:
pipe.stages[-1].getNumTrees

20

In [27]:
pipe.stages[-1].featureImportances

SparseVector(22, {0: 0.133, 1: 0.0052, 2: 0.0399, 3: 0.003, 4: 0.1524, 5: 0.0006, 6: 0.0444, 7: 0.0535, 8: 0.1589, 9: 0.1237, 10: 0.0132, 11: 0.0172, 12: 0.0149, 13: 0.0767, 14: 0.0918, 15: 0.0218, 16: 0.003, 17: 0.002, 18: 0.0027, 19: 0.0104, 20: 0.0016, 21: 0.0301})

In [28]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
prediction_and_label = predictions.select(['prediction', 'label'])
metrics = MulticlassMetrics(prediction_and_label.rdd)
metrics_binary = BinaryClassificationMetrics(prediction_and_label.rdd)

print('Accuracy: {:.4f}'.format(metrics.accuracy))
print('Precisions: {:.4f}'.format(metrics.precision(label=1)))
print('Recall: {:.4f}'.format(metrics.recall(label=1)))
print('f1_score: {:.4f}'.format(metrics.fMeasure()))
print('AUC: {:.4f}'.format(metrics_binary.areaUnderROC))

Accuracy: 0.5871
Precisions: 0.5652
Recall: 0.8422
f1_score: 0.5871
AUC: 0.5805


In [29]:
predictions.select('label', 'prediction', 'probability').show(5, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |1.0       |[0.4365815214830092,0.5634184785169908] |
|1.0  |1.0       |[0.4362594203733951,0.563740579626605]  |
|1.0  |1.0       |[0.46150440466494286,0.5384955953350572]|
|0.0  |1.0       |[0.4336975644634159,0.5663024355365842] |
|1.0  |1.0       |[0.46534751834823307,0.5346524816517669]|
+-----+----------+----------------------------------------+
only showing top 5 rows



Boosting - Gradient-Boosted Trees

In [59]:
list_model = [('Logistic Regression', LogisticRegression()),
              ('Decision Tree', DecisionTreeClassifier()),
              ('Random Forest', RandomForestClassifier()),
              ('Gradient Boosting', GBTClassifier()),
              ]

for name, model in list_model:
    pipeline = Pipeline(stages=[pipeline_preprocessing, model])

    pipe = pipeline.fit(train_df)
    predictions = pipe.transform(test_df)
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
    # predictions.select('label', 'prediction').show(5)
    print('Model : {}'.format(name))
    print('Classification result: ')
    from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
    from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)

    bi_evaluator = BinaryClassificationEvaluator()
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')

    print('  Accuracy:\t {:.4f}'.format(metrics.accuracy))
    # print('  Accuracy:\t {:.4f}'.format(evaluator.evaluate(predictions)))
    print('  Precisions:\t {:.4f}'.format(metrics.precision(label=1)))
    print('  Recall:\t {:.4f}'.format(metrics.recall(label=1)))
    print('  f1_score:\t {:.4f}'.format(metrics.fMeasure()))
    print('  AUC:   \t {:.4f}'.format(bi_evaluator.evaluate(predictions)))

    print('-'*40)

Model : Logistic Regression
Classification result: 
  Accuracy:	 0.6100
  Accuracy:	 0.6100
  Precisions:	 0.6141
  Recall:	 0.6474
  f1_score:	 0.6100
  AUC:   	 0.6456
----------------------------------------
Model : Decision Tree
Classification result: 
  Accuracy:	 0.6368
  Accuracy:	 0.6368
  Precisions:	 0.6411
  Recall:	 0.6650
  f1_score:	 0.6368
  AUC:   	 0.6356
----------------------------------------
Model : Random Forest
Classification result: 
  Accuracy:	 0.6320
  Accuracy:	 0.6320
  Precisions:	 0.6304
  Recall:	 0.6851
  f1_score:	 0.6320
  AUC:   	 0.6794
----------------------------------------
Model : Gradient Boosting
Classification result: 
  Accuracy:	 0.6625
  Accuracy:	 0.6625
  Precisions:	 0.6626
  Recall:	 0.6985
  f1_score:	 0.6625
  AUC:   	 0.7239
----------------------------------------


In [52]:
predictions.show(5, False)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+-------------+----------------------------------------+------------------------------------------+----------------------------------------+----------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|carrier_dummy|org_dummy    |dow_dummy    |features                                |rawPrediction                             |probability                             |prediction|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+-----------+-------+-------------+-------------+-------------+----------------------------------------+------------------------------------------+----------------------------------------+----------+
|0  |1  |2  |AA     |73    |ORD|4243|9.08  |560     |39   |6828.0|1.0  |1.0        |0.0    |(8,[1],[1.0])|(7,[0],[1.0])|(6,[2],[1.0])|[6828.0,1.0,0.0,2.0,0.0,1.0,9.08,560.0] |[-0.80556270