In [None]:
# Environment bootstrap: install Java 8, download and unpack Spark 3.1.1 (Hadoop 2.7 build),
# and install findspark.
# NOTE(review): the archive is extracted into the current working directory, while SPARK_HOME
# below points at /content — assumes this cell runs with /content as the cwd; confirm.
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
# Tell the tooling where the JDK and the unpacked Spark distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
import findspark
# Must run before the pyspark imports in the cells below.
findspark.init()

In [None]:
# Mount Google Drive and switch into the data folder; the CSV files read further down are
# opened by relative name, so they are resolved against this directory.
from google.colab import drive
drive.mount("/content/gdrive")
# NOTE(review): hard-coded personal Drive path — adjust when running under another account.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_5'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_5


In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Build (or reuse) the local Spark session. getOrCreate() keeps this cell safe to re-run:
# calling SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = (
    SparkSession.builder
    .master("local")
    .appName("New Spark Context")
    .getOrCreate()
)
# Keep `sc` bound for any code that expects the underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Load the labelled churn data: first row is the header, column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("customer_churn.csv")
)
df.show(n=5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [None]:
# Number of rows in the labelled churn dataset.
df.count()

900

In [None]:
# Show the inferred column names and types.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [None]:
# Class balance of the target column.
churn_counts = df.groupBy('Churn').count()
churn_counts.show()

+-----+-----+
|Churn|count|
+-----+-----+
|    1|  150|
|    0|  750|
+-----+-----+



In [None]:
# How many customers do / do not have an account manager assigned.
account_manager_counts = df.groupBy('Account_Manager').count()
account_manager_counts.show()

+---------------+-----+
|Account_Manager|count|
+---------------+-----+
|              1|  433|
|              0|  467|
+---------------+-----+



In [None]:
# Cardinality of the Company column.
df.select('Company').distinct().count()

873

In [None]:
# Inspect raw addresses without truncation to see where the state code sits.
df.select('Location').show(n=20, truncate=False)

+--------------------------------------------------------+
|Location                                                |
+--------------------------------------------------------+
|10265 Elizabeth Mission Barkerburgh, AK 89518           |
|6157 Frank Gardens Suite 019 Carloshaven, RI 17756      |
|1331 Keith Court Alyssahaven, DE 90114                  |
|13120 Daniel Mount Angelabury, WY 30645-4695            |
|765 Tricia Row Karenshire, MH 71730                     |
|6187 Olson Mountains East Vincentborough, PR 74359      |
|4846 Savannah Road West Justin, IA 87713-3460           |
|25271 Roy Expressway Suite 147 Brownport, FM 59852-6150 |
|3725 Caroline Stravenue South Christineview, MA 82059   |
|363 Sandra Lodge Suite 144 South Ann, WI 51655-7561     |
|Unit 8120 Box 9160 DPO AA 43432                         |
|Unit 1895 Box 0949 DPO AA 40249                         |
|897 Kelley Overpass Suite 349 West Rebekahport, AZ 44793|
|11488 Weaver Cape Hernandezberg, WI 63417-8544         

Nhận xét:
- Các biến numeric: Age, Total_Purchase, Years, Num_Sites.
- Biến categoric: Account_Manager (0,1).
- Biến Company có đến 873 distinct / total 900 mẫu, nên gần như không có giá trị dự đoán.
- Biến Onboard_date có thể tách ra thành Onboard_year và Onboard_month.
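- Biến Location có thể tách ra để lấy mã bang của Mỹ (ví dụ: AK là Alaska, AZ là Arizona...). Nước Mỹ có 50 bang; tính thêm Washington D.C., các vùng lãnh thổ/khu vực dùng mã bưu chính Mỹ (PR, FM, MH...) và mã bưu chính quân đội (AA, AE, AP) thì có khoảng 60 mã khác nhau.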

In [None]:
# Derive calendar features (year and month) from the onboarding timestamp.
df = (
    df
    .withColumn('Onboard_year', year('Onboard_date'))
    .withColumn('Onboard_month', month('Onboard_date'))
)
df.show(n=5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------+-------------+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|Onboard_year|Onboard_month|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------+-------------+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|        2013|            8|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|        2013|            8|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|        2016|            6|
|   

In [None]:
import re

# The greedy leading `.*` makes the capture group land on the LAST two-letter uppercase token
# that is surrounded by whitespace, e.g. "... Barkerburgh, AK 89518" -> "AK" and
# "Unit 8120 Box 9160 DPO AA 43432" -> "AA".
pattern = r'.*\s([A-Z]{2})\s.*'


def extract_state(location):
    """Return the two-letter state/region code contained in an address string.

    Args:
        location: Address text, or None for a missing value.

    Returns:
        The captured two-letter code, or None when `location` is None or contains no
        such token. The previous `re.split(pattern, x)[1]` raised IndexError / TypeError
        in those cases, which aborted the whole Spark job.
    """
    if location is None:
        return None
    match = re.search(pattern, location)
    return match.group(1) if match else None


# Explicit return type (StringType is also the udf default) so the schema is obvious.
func_extract_state = udf(extract_state, StringType())
df = df.withColumn('State', func_extract_state(col('Location')))
df.show(5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------+-------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|Onboard_year|Onboard_month|State|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+------------+-------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|        2013|            8|   AK|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|        2013|            8|   RI|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|  

In [None]:
# Cardinality of the extracted State column.
df.select('State').distinct().count()

62

In [None]:
# Per-class weight = share of rows that belong to the OTHER class, so the rarer class
# receives the larger weight.
# NOTE(review): the name `count` rebinds whatever `count` was imported via
# `from pyspark.sql.functions import *`; it is kept because a later cell joins on it.
total = df.count()
count = (
    df.groupBy('Churn')
    .count()
    .withColumn('weight', (total - col('count')) / total)
)
count.show(n=5)

+-----+-----+-------------------+
|Churn|count|             weight|
+-----+-----+-------------------+
|    1|  150| 0.8333333333333334|
|    0|  750|0.16666666666666666|
+-----+-----+-------------------+



In [None]:
# Attach the per-class statistics (the 'count' and 'weight' columns) to every row by
# joining on Churn; 'weight' is used later as the classifiers' weightCol.
df = df.join(count, on='Churn', how='left_outer')

In [None]:
# Spark ML classifiers read the target from a column named 'label' by default.
df = df.withColumnRenamed('Churn', 'label')

# handleInvalid='keep' sends states that were not present when the indexer was fitted
# (and null states) to an extra index instead of raising. Without it, transforming the
# held-out split or new_customers.csv fails as soon as an unseen state shows up.
indexer_state = StringIndexer(inputCol='State', outputCol='State_idx', handleInvalid='keep')
onehot = OneHotEncoder(inputCols=['State_idx'], outputCols=['State_dummy'])

input_cols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_year', 'State_dummy']
assembler = VectorAssembler(inputCols=input_cols, outputCol='features_not_scale')
scaler = MinMaxScaler(inputCol="features_not_scale", outputCol="features")

pipeline_preprocessing = Pipeline(stages=[indexer_state, onehot, assembler, scaler])

# Split BEFORE fitting the preprocessing so that the state vocabulary and the min/max
# used for scaling are learned from the training rows only (no leakage from the test rows).
train_raw_df, test_raw_df = df.randomSplit([0.7, 0.3], seed=42)
pipe_preprocessing = pipeline_preprocessing.fit(train_raw_df)

train_df = pipe_preprocessing.transform(train_raw_df)
test_df = pipe_preprocessing.transform(test_raw_df)

# Full labelled set with the same preprocessing applied; used further down to refit the
# selected model before scoring new customers.
final_df = pipe_preprocessing.transform(df)

Nhận xét:
- Mục đích chính của đề bài đưa ra là dự đoán chính xác các khách hàng sẽ không tiếp tục sử dụng dịch vụ (Churn - Stop buying service - label=1), để từ đó có các chiến lược phù hợp để giữ chân khách hàng.
- Vì vậy sẽ tập trung vào metrics f1_score (trung hoà giữa precision và recall) của label = 1.
- Vì data nhỏ nên có thể test nhiều model khác nhau và tuning model bằng Grid search.
- Đồng thời data bị lệch sang 0 nên sử dụng weight đã tính toán để cân bằng lại, đưa ra forecast tối ưu nhất cho label = 1 (bị lệch).
- Lưu ý, weightCol cho các model dạng cây (Decision Tree, Random Forest, Gradient Boosting) chỉ hoạt động trên pyspark 3.0 trở lên.

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Baseline candidates; each one is trained with the per-class 'weight' column.
list_model = [('Logistic Regression', LogisticRegression(weightCol='weight')),
              ('Decision Tree', DecisionTreeClassifier(seed=1, weightCol='weight')),
              ('Random Forest', RandomForestClassifier(seed=1, weightCol='weight')),
              ('Gradient Boosting', GBTClassifier(seed=1, weightCol='weight')),
              ('Linear SVC', LinearSVC(weightCol='weight')),
            #   ('Multilayer Perception', MultilayerPerceptronClassifier(maxIter=100, layers=[5, 4, 3], blockSize=128, seed=1)),
]

# The "AUC" line has to come from BinaryClassificationEvaluator (default metric:
# areaUnderROC). MulticlassClassificationEvaluator() defaults to F1, so the previous code
# printed an F1 score under the AUC label. Created once because it does not depend on the model.
evaluator = BinaryClassificationEvaluator()

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    # MulticlassMetrics is fed (prediction, label) pairs; cast the integer label to double
    # so both members of the pair have the same numeric type.
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)

    print('-'*30)
    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=1)))
    print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=1)))
    print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=1.0)))
    print('  AUC \t\t\t: {:.4f}'.format(evaluator.evaluate(predictions)))
    result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
    display(result_confusion_matrix)

------------------------------
[1mLogistic Regression[0m

  Accuracy 		: 0.8197
  Precisions (label=1)	: 0.4630
  Recall (label=1)	: 0.6579
  f1_score (label=1)	: 0.5435
  AUC 			: 0.8316


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,166.0,29.0
Actual Pos,13.0,25.0


------------------------------
[1mDecision Tree[0m

  Accuracy 		: 0.7983
  Precisions (label=1)	: 0.4366
  Recall (label=1)	: 0.8158
  f1_score (label=1)	: 0.5688
  AUC 			: 0.8195


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,155.0,40.0
Actual Pos,7.0,31.0


------------------------------
[1mRandom Forest[0m

  Accuracy 		: 0.8798
  Precisions (label=1)	: 0.6087
  Recall (label=1)	: 0.7368
  f1_score (label=1)	: 0.6667
  AUC 			: 0.8843


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,177.0,18.0
Actual Pos,10.0,28.0


------------------------------
[1mGradient Boosting[0m

  Accuracy 		: 0.8498
  Precisions (label=1)	: 0.5306
  Recall (label=1)	: 0.6842
  f1_score (label=1)	: 0.5977
  AUC 			: 0.8571


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,172.0,23.0
Actual Pos,12.0,26.0


------------------------------
[1mLinear SVC[0m

  Accuracy 		: 0.7210
  Precisions (label=1)	: 0.3291
  Recall (label=1)	: 0.6842
  f1_score (label=1)	: 0.4444
  AUC 			: 0.7535


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,142.0,53.0
Actual Pos,12.0,26.0


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Base Random Forest; the hyperparameters and the weight column are supplied through the grid.
model_rdn_forest = RandomForestClassifier(seed=42)

# 2 (numTrees) x 4 (maxDepth) x 3 (featureSubsetStrategy) x 1 (weightCol) = 24 candidates.
param_rdn_forest = (
    ParamGridBuilder()
    .addGrid(model_rdn_forest.numTrees, [100, 200])
    .addGrid(model_rdn_forest.maxDepth, [2, 3, 6, 7])
    .addGrid(model_rdn_forest.featureSubsetStrategy, ['sqrt', 'log2', 'auto'])
    .addGrid(model_rdn_forest.weightCol, ['weight'])
    .build()
)

def tunning_model(model, model_name, param, metric, seed=42):
    """Tune `model` over `param` with a train/validation split and report test-set metrics.

    The search is fitted on the notebook-level `train_df`; the winning model is then
    evaluated on `test_df` and its accuracy, label=1 precision/recall/F1, AUC, elapsed
    time and confusion matrix are printed/displayed.

    Args:
        model: Unfitted Spark ML estimator to tune.
        model_name: Heading printed above the metrics.
        param: List of parameter maps, e.g. the result of `ParamGridBuilder().build()`.
        metric: `metricName` for the MulticlassClassificationEvaluator that ranks candidates.
        seed: Seed for the internal train/validation split, so repeated runs select the
            same candidate.

    Returns:
        An unfitted copy of `model` with the best-scoring parameter map applied, ready to
        be refit with `.fit(...)`. The previous `getEstimator()` handed back the original
        estimator without any of the tuned parameters (including a weightCol set through
        the grid), so the later "refit the best model" step silently trained an untuned model.
    """
    import time
    from pyspark.ml.tuning import TrainValidationSplit
    from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

    # NOTE(review): with metric='f1' the score used for ranking is not necessarily the
    # label=1 F1 printed below — confirm this is the intended tuning objective.
    tuning_evaluator = MulticlassClassificationEvaluator(metricName=metric)

    tic = time.time()
    tvs = TrainValidationSplit(estimator=model,
                               estimatorParamMaps=param,
                               evaluator=tuning_evaluator,
                               trainRatio=0.8,
                               seed=seed)

    best_model = tvs.fit(train_df)
    predictions = best_model.transform(test_df)
    toc = time.time()

    # MulticlassMetrics is fed (prediction, label) pairs; cast the label to double first.
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)
    bi_evaluator = BinaryClassificationEvaluator()

    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=1)))
    print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=1)))
    print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=1.0)))
    print('  AUC \t\t\t: {:.4f}'.format(bi_evaluator.evaluate(predictions)))
    print('  Total time tunning model: {:.2f} seconds'.format(toc-tic))
    result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
    display(result_confusion_matrix)

    # validationMetrics holds one score per entry of `param`, in the same order; take the
    # first best entry, honouring whether the metric is to be maximised or minimised.
    scores = best_model.validationMetrics
    pick_best = max if tuning_evaluator.isLargerBetter() else min
    best_index = pick_best(range(len(scores)), key=lambda i: scores[i])
    return model.copy(param[best_index])

# Tune the Random Forest on the grid above, ranking candidates by the 'f1' metric.
# `best_model` receives whatever tunning_model returns: an estimator that is refit on the
# full labelled data further down, not an already-fitted model.
best_model = tunning_model(model_rdn_forest, 'Random Forest', param_rdn_forest, 'f1')

[1mRandom Forest[0m

  Accuracy 		: 0.8884
  Precisions (label=1)	: 0.6154
  Recall (label=1)	: 0.8421
  f1_score (label=1)	: 0.7111
  AUC 			: 0.9217
  Total time tunning model: 48.61 seconds


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,175.0,20.0
Actual Pos,6.0,32.0


In [None]:
# model_boosting = GBTClassifier(seed=42, weightCol='weight')

# param_boosting = ParamGridBuilder()\
#             .addGrid(model_boosting.maxIter, [50, 100])\
#             .addGrid(model_boosting.maxDepth, [2, 3, 6, 7, 8])\
#             .addGrid(model_boosting.featureSubsetStrategy, ['sqrt', 'log2', 'auto'])\
#             .build()
# tunning_model(model_boosting, 'Gradient Boosting', param_boosting, 'f1')

Nhận xét:
- Model Random Forest sau khi Tunning có kết quả cải thiện đáng kể, Recall đạt đến 84%, có nghĩa là xác định được đến 84% các trường hợp Churn trong tổng số actual Churn customer.
- Sử dụng best_model tốt nhất, fit lại trên toàn bộ data set, rồi sử dụng để dự đoán data mới new_customers.csv

In [None]:
# Load the unlabeled customers to score (same columns as the training file, minus Churn).
df_new = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("new_customers.csv")
)
df_new.show(n=5)

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|  

In [None]:
# Number of new customers to score.
df_new.count()

6

In [None]:
# Recreate the engineered columns on the new customers, then apply the already-fitted
# preprocessing pipeline so the features match what the model was trained on.
df_new = (
    df_new
    .withColumn('Onboard_year', year('Onboard_date'))
    .withColumn('Onboard_month', month('Onboard_date'))
    .withColumn('State', func_extract_state(col('Location')))
)
test_new_df = pipe_preprocessing.transform(df_new)

# Refit the estimator returned by tunning_model on the full labelled data, then score.
best_trained_model = best_model.fit(final_df)
predictions = best_trained_model.transform(test_new_df)
# Show the predicted class as an integer (0/1) instead of a double.
predictions = predictions.withColumn('Prediction', col('Prediction').cast('int'))
show_col = ['Names', 'Age', 'Total_Purchase', 'Prediction']
predictions.select(show_col).show()

+--------------+----+--------------+----------+
|         Names| Age|Total_Purchase|Prediction|
+--------------+----+--------------+----------+
| Andrew Mccall|37.0|       9935.53|         0|
|Michele Wright|23.0|       7526.94|         0|
|  Jeremy Chang|65.0|         100.0|         0|
|Megan Ferguson|32.0|        6487.5|         0|
|  Taylor Young|32.0|      13147.71|         0|
| Jessica Drake|22.0|       8445.26|         0|
+--------------+----+--------------+----------+



Forecast bằng model cho kết quả tất cả các mẫu đều thuộc nhóm NOT Churn, có nghĩa là các khách hàng đều ở lại tiếp tục sử dụng dịch vụ.