In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

In [27]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_6'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_3/data_day_6


In [28]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Start the local Spark session, or reuse the one already running.
# getOrCreate() makes this cell safe to re-run: constructing SparkContext
# directly a second time raises "Cannot run multiple SparkContexts at once".
spark = (
    SparkSession.builder
    .master("local")
    .appName("New Spark Context")
    .getOrCreate()
)
# Keep `sc` available for code that works with the context directly.
sc = spark.sparkContext

In [29]:
# Read College.csv from the working directory: the first row supplies the
# column names and column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("College.csv")
# Preview the first five rows.
df.show(n=5)

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [30]:
# Print the column names and the types inferred for the loaded DataFrame.
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [31]:
def result_null_data(df):
    """Summarise missing values per column of a Spark DataFrame.

    Parameters
    ----------
    df : Spark DataFrame to inspect.

    Returns
    -------
    pandas.DataFrame indexed by column name with two columns:
    'count_na'   -- number of rows where the column is null
    'percent_na' -- 'count_na' divided by the total row count
                    (a fraction, not multiplied by 100)
    """
    n_rows = df.count()

    # One aggregate expression per column: count() ignores nulls, and when()
    # without otherwise() yields null for rows where the column is not null,
    # so each expression counts exactly the null rows.
    null_counters = []
    for name in df.columns:
        null_counters.append(count(when(col(name).isNull(), name)).alias(name))

    # The aggregate is a single row; transpose it so columns become the index.
    counts_pdf = df.select(null_counters).toPandas()
    summary = counts_pdf.T.rename(columns={0: 'count_na'})
    summary['percent_na'] = summary['count_na'] / n_rows
    return summary
result_null_data(df)

Unnamed: 0,count_na,percent_na
School,0,0.0
Private,0,0.0
Apps,0,0.0
Accept,0,0.0
Enroll,0,0.0
Top10perc,0,0.0
Top25perc,0,0.0
F_Undergrad,0,0.0
P_Undergrad,0,0.0
Outstate,0,0.0


In [32]:
# Number of rows per value of 'Private' (the class balance of the target).
private_counts = df.groupBy('Private').count()
private_counts.show()

+-------+-----+
|Private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



In [33]:
# Encode the target as an integer label: 1 when Private == 'Yes', else 0.
df = df.withColumn('label', when(col('Private') == 'Yes', 1).otherwise(0))

# Pick feature columns by name instead of by position (previously
# df.columns[2:-1]), so that a change in column order or an extra column
# appended to df cannot silently shift the target or an identifier into the
# feature set. For the current schema this yields the same list as before.
target_col = 'label'
non_feature_cols = {'School', 'Private', target_col}
input_cols = [c for c in df.columns if c not in non_feature_cols]

In [34]:
# Combine every predictor column into the single vector column 'features',
# the default features column name of the classifiers used below.
assembler = VectorAssembler().setInputCols(input_cols).setOutputCol('features')
final_df = assembler.transform(df)

In [35]:
# Seeded random split of the assembled data: weight 0.7 train, 0.3 test.
split_weights = [0.7, 0.3]
train_df, test_df = final_df.randomSplit(split_weights, seed=2)

In [44]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression
# Imported once at the top of the cell rather than on every loop iteration.
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Explicit seed for the learners. When none is given, PySpark derives the
# default from Python's hash() of the class name, which Python 3 randomises per
# interpreter process -- scores would then differ between kernel restarts.
MODEL_SEED = 2

# Label value treated as the positive class in the per-class metrics below.
POSITIVE_LABEL = 1.0

# Candidate classifiers, all left at their default hyper-parameters.
list_model = [('Decision Tree', DecisionTreeClassifier(seed=MODEL_SEED)),
              ('Random Forest', RandomForestClassifier(seed=MODEL_SEED)),
              ('Gradient Boosting', GBTClassifier(seed=MODEL_SEED)),
]

# Loop-invariant, so built once. With default settings it reports the area
# under the ROC curve.
bi_evaluator = BinaryClassificationEvaluator()

for model_name, model in list_model:
    # Fit on the training split and score the held-out test split.
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    # MulticlassMetrics works on (prediction, label) pairs of doubles, while
    # 'label' was created as an integer column.
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)

    print('-'*30)
    print(model_name)
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=POSITIVE_LABEL)))
    print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=POSITIVE_LABEL)))
    print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=POSITIVE_LABEL)))
    print('  AUC \t\t\t: {:.4f}'.format(bi_evaluator.evaluate(predictions)))
    # Rows are actual classes and columns are predicted classes, both in
    # ascending label order (0 = negative, 1 = positive).
    result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
    display(result_confusion_matrix)

------------------------------
Decision Tree
  Accuracy 		: 0.9336
  Precisions (label=1)	: 0.9874
  Recall (label=1)	: 0.9235
  f1_score (label=1)	: 0.9544
  AUC 			: 0.9259


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,54.0,2.0
Actual Pos,13.0,157.0


------------------------------
Random Forest
  Accuracy 		: 0.9513
  Precisions (label=1)	: 0.9649
  Recall (label=1)	: 0.9706
  f1_score (label=1)	: 0.9677
  AUC 			: 0.9696


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,50.0,6.0
Actual Pos,5.0,165.0


------------------------------
Gradient Boosting
  Accuracy 		: 0.9204
  Precisions (label=1)	: 0.9524
  Recall (label=1)	: 0.9412
  f1_score (label=1)	: 0.9467
  AUC 			: 0.9517


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,48.0,8.0
Actual Pos,10.0,160.0


In [52]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Explicit seed for both the forest and the train/validation split. PySpark's
# default seed comes from Python's hash() of the class name, which Python 3
# randomises per interpreter process, so without this the selected parameters
# and the reported scores could change between kernel restarts.
TUNING_SEED = 2

model = RandomForestClassifier(seed=TUNING_SEED)

# 3 x 4 = 12 candidate settings (number of trees x maximum tree depth).
paramGrid = (ParamGridBuilder()
             .addGrid(model.numTrees, [20, 50, 100])
             .addGrid(model.maxDepth, [2, 3, 4, 5])
             .build())

# One evaluator serves both model selection and the final AUC line; with
# default settings it measures the area under the ROC curve.
# To select on accuracy instead, pass
# MulticlassClassificationEvaluator(metricName='accuracy') as `evaluator`.
bi_evaluator = BinaryClassificationEvaluator()

tvs = TrainValidationSplit(estimator=model,
                           estimatorParamMaps=paramGrid,
                           evaluator=bi_evaluator,
                           trainRatio=0.8,
                           seed=TUNING_SEED,
)

# tvs.fit returns the fitted TrainValidationSplitModel; its transform() scores
# the test split with the parameter setting that was selected.
best_model = tvs.fit(train_df)
predictions = best_model.transform(test_df)

# MulticlassMetrics works on (prediction, label) pairs of doubles, while
# 'label' was created as an integer column.
predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
prediction_and_label = predictions.select(['prediction', 'label'])
metrics = MulticlassMetrics(prediction_and_label.rdd)
print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
print('  Precisions (label=1)\t: {:.4f}'.format(metrics.precision(label=1.0)))
print('  Recall (label=1)\t: {:.4f}'.format(metrics.recall(label=1.0)))
print('  f1_score (label=1)\t: {:.4f}'.format(metrics.fMeasure(label=1.0)))
print('  AUC \t\t\t: {:.4f}'.format(bi_evaluator.evaluate(predictions)))
# Rows are actual classes and columns are predicted classes, both in ascending
# label order (0 = negative, 1 = positive).
result_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray(), columns=['Predict Neg', 'Predict Pos'], index=['Actual Neg', 'Actual Pos'])
display(result_confusion_matrix)

  Accuracy 		: 0.9425
  Precisions (label=1)	: 0.9486
  Recall (label=1)	: 0.9765
  f1_score (label=1)	: 0.9623
  AUC 			: 0.9690


Unnamed: 0,Predict Neg,Predict Pos
Actual Neg,47.0,9.0
Actual Pos,4.0,166.0
