In [160]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
from pyspark.sql import *
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [8]:
# Obtain the SparkSession used by every cell below (application name: supervised_ml).
spark = (
    SparkSession.builder
    .appName('supervised_ml')
    .getOrCreate()
)

In [9]:
# Load the regression dataset: the first CSV row supplies the column names
# and column types are inferred from the data.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Dataset shape as a (row count, column count) tuple.
print((df.count(), len(df.columns)))

(1232, 6)


In [7]:
# Show each column's name, inferred type and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)



In [10]:
# Preview the first 15 rows of the raw regression data.
df.show(15)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
|  583|  794|   55|0.302|0.236|0.371|
|  676|  746|   72|0.317|0.265|  0.4|
|  767|  699|   89|0.332|0.274|0.433|
|  637|  597|   86|0.317|0.252|0.374|
|  609|  724|   69|0.308|0.244|0.382|
+-----+-----+-----+-----+-----+-----+
only showing top 15 rows



Regression 

In [47]:
# Feature engineering: configure an assembler that packs the five predictor
# columns into a single vector column named 'features'.
vec_assmebler = VectorAssembler(
    inputCols=[
        'var_1',
        'var_2',
        'var_3',
        'var_4',
        'var_5',
    ],
    outputCol='features',
)

In [51]:
# Append the assembled 'features' vector column to the regression frame.
# A 'features' column left over from an earlier run of this cell is dropped
# first (DataFrame.drop ignores names that are not present), so re-executing
# the cell no longer fails because the output column already exists.
df = vec_assmebler.transform(df.drop('features'))

In [52]:
# Inspect the frame now that the 'features' vector column has been added.
df.show()

+-----+-----+-----+-----+-----+-----+--------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|
+-----+-----+-----+-----+-----+-----+--------------------+
|  734|  688|   81|0.328|0.259|0.418|[734.0,688.0,81.0...|
|  700|  600|   94| 0.32|0.247|0.389|[700.0,600.0,94.0...|
|  712|  705|   93|0.311|0.247|0.417|[712.0,705.0,93.0...|
|  734|  806|   69|0.315| 0.26|0.415|[734.0,806.0,69.0...|
|  613|  759|   61|0.302| 0.24|0.378|[613.0,759.0,61.0...|
|  748|  676|   85|0.318|0.255|0.422|[748.0,676.0,85.0...|
|  669|  588|   97|0.315|0.251|0.411|[669.0,588.0,97.0...|
|  667|  845|   68|0.324|0.251|0.381|[667.0,845.0,68.0...|
|  758|  890|   64| 0.33|0.274|0.436|[758.0,890.0,64.0...|
|  726|  670|   88|0.335|0.268|0.422|[726.0,670.0,88.0...|
|  583|  794|   55|0.302|0.236|0.371|[583.0,794.0,55.0...|
|  676|  746|   72|0.317|0.265|  0.4|[676.0,746.0,72.0...|
|  767|  699|   89|0.332|0.274|0.433|[767.0,699.0,89.0...|
|  637|  597|   86|0.317|0.252|0.374|[637.0,597.0,86.0..

In [53]:
# Show only the two columns the regressors consume: the feature vector and the target.
df.select('features', 'label').show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



In [55]:
# Split the data into train (75%) and test (25%) sets.
# A fixed seed makes the split, and therefore every metric computed below,
# repeatable across kernel restarts.
train, test = df.randomSplit([0.75, 0.25], seed=42)

In [59]:
# Build and train a linear regression model on the training split.
# The column names are spelled out; they are the same ones the estimator
# uses when none are given.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)
lr_model = lr.fit(train)

In [60]:
# Score the held-out test split with the fitted linear model; the result
# carries the original columns plus a 'prediction' column.
predictions_df=lr_model.transform(test)

In [61]:
# Compare 'label' against 'prediction' for the first test rows.
predictions_df.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  486|  610|   61|0.293|0.233|0.332|[486.0,610.0,61.0...|  0.319523752689337|
|  495|  752|   50|0.277|0.221|0.327|[495.0,752.0,50.0...|0.33322253929570955|
|  510|  588|   72|0.298|0.231|0.317|[510.0,588.0,72.0...|0.32372786185483404|
|  511|  576|   76| 0.29|0.231|0.329|[511.0,576.0,76.0...|0.33001600659149755|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33103700410186143|
|  532|  690|   69|0.303|0.245|0.351|[532.0,690.0,69.0...| 0.3395877748586824|
|  536|  531|   83|0.292|0.214|0.318|[536.0,531.0,83.0...| 0.3271572884145917|
|  543|  747|   60|  0.3|0.238|0.342|[543.0,747.0,60.0...| 0.3433508995433038|
|  545|  661|   61| 0.31|0.251| 0.35|[545.0,661.0,61.0...| 0.3397315256756518|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77

In [62]:
# Evaluate the linear model on the test set.
# NOTE: despite its name, `model_predictions` holds an evaluation summary here
# (its r2 and meanSquaredError attributes are read in the next cells); the same
# name is later reused for prediction DataFrames.

model_predictions=lr_model.evaluate(test)

In [65]:
# R2 of the linear model, read from the test-set evaluation summary.
model_predictions.r2

0.8750608374757474

In [66]:
# Mean squared error of the linear model, read from the test-set evaluation summary.
model_predictions.meanSquaredError

0.00013349687769823547

In [68]:
# Build and train a decision tree regressor on the same training split.
dec_tree = DecisionTreeRegressor()
dec_tree_model = dec_tree.fit(train)
# Displayed as the cell output: the fitted tree's per-feature importances.
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9603, 1: 0.0159, 2: 0.0055, 3: 0.007, 4: 0.0113})

In [69]:
# Score the test split with the decision tree and preview the predictions.
# `model_predictions` is rebound here from the linear model's evaluation
# summary to a DataFrame of tree predictions.

model_predictions = dec_tree_model.transform(test)
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  486|  610|   61|0.293|0.233|0.332|[486.0,610.0,61.0...|0.31923076923076926|
|  495|  752|   50|0.277|0.221|0.327|[495.0,752.0,50.0...| 0.3276666666666667|
|  510|  588|   72|0.298|0.231|0.317|[510.0,588.0,72.0...|0.31923076923076926|
|  511|  576|   76| 0.29|0.231|0.329|[511.0,576.0,76.0...|0.31923076923076926|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33475000000000005|
|  532|  690|   69|0.303|0.245|0.351|[532.0,690.0,69.0...|0.33475000000000005|
|  536|  531|   83|0.292|0.214|0.318|[536.0,531.0,83.0...|0.31923076923076926|
|  543|  747|   60|  0.3|0.238|0.342|[543.0,747.0,60.0...|              0.355|
|  545|  661|   61| 0.31|0.251| 0.35|[545.0,661.0,61.0...|0.33475000000000005|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77

In [73]:
# R2 of the decision tree predictions on the test split.
# The label/prediction column names are spelled out; they are the same ones
# the evaluator uses when none are given.
dt_evaluator = RegressionEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='r2',
)
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(dt_r2)

0.8190687807285287


In [74]:
# Root mean squared error of the decision tree predictions on the test split.
dt_evaluator = RegressionEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='rmse',
)
dt_rmse = dt_evaluator.evaluate(model_predictions)
print(dt_rmse)

0.013904104200400598


Classification

In [90]:
# Read the bank dataset for the classification part. This rebinds `df`,
# replacing the regression frame used above.
df = spark.read.csv(
    'bank_data.csv',
    header=True,
    inferSchema=True,
)

In [91]:
# Total number of rows in the bank dataset.
df.count()

41188

In [95]:
# Keep the customer attribute columns plus the target, capped at 9500 rows.
# NOTE(review): limit() is applied without an explicit ordering, so which rows
# are kept may not be stable between runs — confirm this subset is intended.
df = df.select(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'target_class']).limit(9500)

In [96]:
# Preview the selected bank columns.
df.show()

+---+-----------+--------+-------------------+-------+-------+----+------------+
|age|        job| marital|          education|default|housing|loan|target_class|
+---+-----------+--------+-------------------+-------+-------+----+------------+
| 56|  housemaid| married|           basic.4y|     no|     no|  no|          no|
| 57|   services| married|        high.school|unknown|     no|  no|          no|
| 37|   services| married|        high.school|     no|    yes|  no|          no|
| 40|     admin.| married|           basic.6y|     no|     no|  no|          no|
| 56|   services| married|        high.school|     no|     no| yes|          no|
| 45|   services| married|           basic.9y|unknown|     no|  no|          no|
| 59|     admin.| married|professional.course|     no|     no|  no|          no|
| 41|blue-collar| married|            unknown|unknown|     no|  no|          no|
| 24| technician|  single|professional.course|     no|    yes|  no|          no|
| 25|   services|  single|  

In [128]:
# Feature engineering: derive a numeric 'label' column from target_class,
# 0 where it equals 'no' and 1 for every other value.
df = df.withColumn(
    "label",
    F.when(F.col("target_class") == 'no', F.lit(0)).otherwise(F.lit(1)),
)

In [129]:
# Class balance check: number of rows for each label value.
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0| 9189|
|    1|  311|
+-----+-----+



In [147]:
def cat_to_num(df,
               categorical_cols=('marital', 'education', 'default', 'housing', 'loan'),
               numeric_cols=('age',),
               label_col='label'):
    """Encode categorical columns and assemble a single feature vector.

    Each column in ``categorical_cols`` is passed through a ``StringIndexer``
    (producing ``<name>_index``) and then a ``OneHotEncoder`` (producing
    ``<name>_vec``). The ``numeric_cols`` followed by the ``*_vec`` columns,
    in the order given, are combined by a ``VectorAssembler`` into a
    ``features`` column.

    Only the listed columns are encoded. Looping over every column of ``df``
    would also fit indexers and encoders on the numeric ``age`` column, on
    ``target_class`` and on the label itself; none of those feed the assembler,
    so doing so only costs extra Spark jobs and leaves label-derived columns
    lying around in the intermediate frame.

    ``job`` is not in the default list, so the default feature vector is
    the same as before; add it to ``categorical_cols`` to include it.

    Args:
        df: Input DataFrame containing the categorical, numeric and label columns.
        categorical_cols: Names of the columns to index and one-hot encode.
        numeric_cols: Names of columns placed into the feature vector as-is.
        label_col: Name of the label column carried through to the result.

    Returns:
        A DataFrame with exactly two columns: ``features`` and ``label_col``.

    Raises:
        ValueError: If any requested column is missing from ``df``.
    """
    categorical_cols = list(categorical_cols)
    numeric_cols = list(numeric_cols)

    # Fail early with a readable message instead of a deep Spark error.
    required = categorical_cols + numeric_cols + [label_col]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise ValueError("cat_to_num: missing columns: " + ", ".join(missing))

    encoded = df
    for name in categorical_cols:
        index_col = name + "_index"
        vec_col = name + "_vec"
        # Category string -> numeric index.
        indexer = StringIndexer(inputCol=name, outputCol=index_col)
        indexed = indexer.fit(encoded).transform(encoded)
        # Numeric index -> one-hot vector.
        encoder = OneHotEncoder(inputCol=index_col, outputCol=vec_col)
        encoded = encoder.fit(indexed).transform(indexed)

    df_assembler = VectorAssembler(
        inputCols=numeric_cols + [name + "_vec" for name in categorical_cols],
        outputCol="features",
    )
    return df_assembler.transform(encoded).select(['features', label_col])

In [148]:
# Build the model-ready frame ('features' vector + 'label') from the bank data.
df_new=cat_to_num(df)

In [149]:
# Preview the assembled feature vectors alongside their labels.
df_new.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(16,[0,1,7,11,12,...|    0|
|(16,[0,1,4,12,14]...|    0|
|(16,[0,1,4,11,13,...|    0|
|(16,[0,1,9,11,12,...|    0|
|(16,[0,1,4,11,12,...|    0|
|(16,[0,1,6,12,14]...|    0|
|(16,[0,1,8,11,12,...|    0|
|(16,[0,1,10,12,14...|    0|
|(16,[0,2,8,11,13,...|    0|
|(16,[0,2,4,11,13,...|    0|
|(16,[0,1,10,12,14...|    0|
|(16,[0,2,4,11,13,...|    0|
|(16,[0,2,4,11,12,...|    0|
|(16,[0,3,7,11,13,...|    0|
|(16,[0,1,9,11,13,...|    0|
|(16,[0,1,6,13,15]...|    0|
|(16,[0,1,9,11,13,...|    0|
|(16,[0,1,9,13,15]...|    0|
|(16,[0,1,6,11,13,...|    0|
|(16,[0,2,6,12,14]...|    0|
+--------------------+-----+
only showing top 20 rows



In [150]:
# Split the data into train (75%) and test (25%) sets.
# A fixed seed makes the split, and therefore the AUROC values reported below,
# repeatable across kernel restarts.
train, test = df_new.randomSplit([0.75, 0.25], seed=42)

In [152]:
# Build and train a binary classifier with logistic regression.
# The column names are spelled out; they are the same ones the estimator
# uses when none are given.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)
lr_model = lr.fit(train)

In [153]:
# Score the test split with the logistic regression model; the columns
# inspected and evaluated in the following cells come from this frame.

model_predictions = lr_model.transform(test)

In [155]:
# Show the true label, class probabilities and predicted class for 10 rows, untruncated.
model_predictions.select(['label','probability', 'prediction']).show(10,False)

+-----+------------------------------------------+----------+
|label|probability                               |prediction|
+-----+------------------------------------------+----------+
|0    |[0.996000744459765,0.003999255540234947]  |0.0       |
|1    |[0.9955227362870526,0.004477263712947432] |0.0       |
|0    |[0.9964772622062369,0.003522737793763109] |0.0       |
|0    |[0.9963616951017137,0.0036383048982863153]|0.0       |
|0    |[0.9962423509939713,0.003757649006028707] |0.0       |
|0    |[0.9962119105411723,0.0037880894588276925]|0.0       |
|0    |[0.9959918379074197,0.004008162092580259] |0.0       |
|0    |[0.9954398294853619,0.004560170514638084] |0.0       |
|0    |[0.9722201945341459,0.027779805465854124] |0.0       |
|0    |[0.9715564080215768,0.028443591978423166] |0.0       |
+-----+------------------------------------------+----------+
only showing top 10 rows



In [157]:
# Evaluator for the area under the ROC curve.
# The label column name is spelled out; it is the same one the evaluator
# uses when none is given.
lr_evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    metricName='areaUnderROC',
)

In [158]:
# Area under ROC for the logistic regression predictions on the test split.
lr_auroc = lr_evaluator.evaluate(model_predictions)

In [159]:
# Report the logistic regression AUROC.
print(lr_auroc)

0.5110503058324896


In [161]:
# Build and train a binary classifier with Naive Bayes on the same training split.
# The column names are spelled out; they are the same ones the estimator
# uses when none are given.
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
)
nb_model = nb.fit(train)

In [162]:
# Score the test split with the Naive Bayes model. This rebinds
# `model_predictions`, replacing the logistic regression predictions.
model_predictions = nb_model.transform(test)

In [163]:
# Show the true label, class probabilities and predicted class for 10 rows, untruncated.
model_predictions.select(['label','probability', 'prediction']).show(10,False)

+-----+-----------------------------------------+----------+
|label|probability                              |prediction|
+-----+-----------------------------------------+----------+
|0    |[0.9714328683803048,0.028567131619695114]|0.0       |
|1    |[0.972559331855972,0.027440668144028044] |0.0       |
|0    |[0.9705415154916822,0.029458484508317825]|0.0       |
|0    |[0.970877742459704,0.029122257540295918] |0.0       |
|0    |[0.9712102457077758,0.02878975429222407] |0.0       |
|0    |[0.971292794225895,0.028707205774105085] |0.0       |
|0    |[0.9718642353190503,0.028135764680949724]|0.0       |
|0    |[0.9731292078454521,0.02687079215454787] |0.0       |
|0    |[0.9678541840266542,0.03214581597334577] |0.0       |
|0    |[0.9681289844549504,0.03187101554504961] |0.0       |
+-----+-----------------------------------------+----------+
only showing top 10 rows



In [164]:
# Evaluate the Naive Bayes predictions on the test split using area under ROC.
nb_evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    metricName='areaUnderROC',
)
nb_auroc = nb_evaluator.evaluate(model_predictions)

In [165]:
# Report the Naive Bayes AUROC.
print(nb_auroc)

0.481662091219958
