## Configure PySpark Setup

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"


import findspark
findspark.init()


import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("App").getOrCreate()
spark

In [2]:
# check number of cores PySpark is using
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")

You are working with 1 core(s)


In [3]:
!cp /content/drive/MyDrive/Datasets.zip .
!unzip Datasets.zip

Archive:  Datasets.zip
   creating: Datasets/
  inflating: Datasets/fifa19.csv     
  inflating: Datasets/.DS_Store      
  inflating: Datasets/zomato.csv     
  inflating: Datasets/nyc_air_bnb.csv  
  inflating: Datasets/supermarket_sales.csv  
  inflating: Datasets/users3.parquet  
  inflating: Datasets/Toddler Autism dataset July 2018.csv  
   creating: Datasets/uw-madison-courses/
  inflating: Datasets/uw-madison-courses/course_offerings.csv  
  inflating: Datasets/uw-madison-courses/sections.csv  
  inflating: Datasets/uw-madison-courses/schedules.csv  
  inflating: Datasets/uw-madison-courses/database.sqlite3  
  inflating: Datasets/uw-madison-courses/rooms.csv  
  inflating: Datasets/uw-madison-courses/teachings.csv  
  inflating: Datasets/uw-madison-courses/subjects.csv  
  inflating: Datasets/uw-madison-courses/subject_memberships.csv  
  inflating: Datasets/uw-madison-courses/grade_distributions.csv  
  inflating: Datasets/uw-madison-courses/instructors.csv  
  inflating: Dat

# Load Libraries

In [4]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler,OneHotEncoder,StandardScaler
from pyspark.sql.types import * 

from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql.functions import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd

# ML Model Building

In [5]:
path ="Datasets/"
df = spark.read.csv(path+'Toddler Autism dataset July 2018.csv',inferSchema=True,header=True)

In [6]:
df.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes


In [7]:
df.printSchema()

root
 |-- Case_No: integer (nullable = true)
 |-- A1: integer (nullable = true)
 |-- A2: integer (nullable = true)
 |-- A3: integer (nullable = true)
 |-- A4: integer (nullable = true)
 |-- A5: integer (nullable = true)
 |-- A6: integer (nullable = true)
 |-- A7: integer (nullable = true)
 |-- A8: integer (nullable = true)
 |-- A9: integer (nullable = true)
 |-- A10: integer (nullable = true)
 |-- Age_Mons: integer (nullable = true)
 |-- Qchat-10-Score: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- Jaundice: string (nullable = true)
 |-- Family_mem_with_ASD: string (nullable = true)
 |-- Who completed the test: string (nullable = true)
 |-- Class/ASD Traits : string (nullable = true)



In [8]:
df.groupBy("Class/ASD Traits ").count().show()

+-----------------+-----+
|Class/ASD Traits |count|
+-----------------+-----+
|               No|  326|
|              Yes|  728|
+-----------------+-----+



In [9]:
# input and output columns
input_columns = df.columns              # Collect the column names as a list
input_columns = input_columns[1:-1]     # keep only relevant columns: from column 1 to 

dependent_var = 'Class/ASD Traits '

In [10]:
# Using StringIndexer built in method to format output label with zero index

# change data type to string type
renamed = df.withColumn("label_str", df[dependent_var].cast(StringType())) 

# transform with string indexer
indexer = StringIndexer(inputCol="label_str", outputCol="label")  
indexed = indexer.fit(renamed).transform(renamed)

In [11]:
renamed.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No,No
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes,Yes
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes,Yes


In [12]:
indexed.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No,No,1.0
1,2,1,1,0,0,0,1,1,0,0,0,36,4,m,White European,yes,no,family member,Yes,Yes,0.0
2,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes,Yes,0.0


In [13]:
df.rdd.id(),renamed.rdd.id(), indexed.rdd.id()

(40, 46, 52)

In [14]:
# train test split
train = indexed.sampleBy("label", fractions={0: 0.7, 1: 0.7}, seed=10)
test = indexed.subtract(train)

In [15]:
train.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  494|
|  1.0|  230|
+-----+-----+



In [16]:
test.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  234|
|  1.0|   96|
+-----+-----+



In [17]:
train.count(),test.count()

(724, 330)

In [18]:
# Convert String data type to numeric

numeric_inputs = []
string_inputs = []
labels = {}

for column in input_columns:
    
    if str(train.schema[column].dataType) == 'StringType':
        
        # convert string to numeric
        indexer = StringIndexer(inputCol=column, outputCol=column+"_num") 
        indexer = indexer.fit(train)
        train  =  indexer.transform(train)
        test    =  indexer.transform(test)
        labels[column]=indexer.labels

        # encode numeric string to one hot encoder -default drop lasts category
        encoder = OneHotEncoder(inputCol=column+"_num", outputCol=column+"_enc")
        encoder = encoder.fit(train)
        train = encoder.transform(train)
        test = encoder.transform(test)

        new_col_name = column+"_enc"
        string_inputs.append(new_col_name)
    
    else:
       numeric_inputs.append(column)

In [19]:
train.limit(3).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label,Sex_num,Sex_enc,Ethnicity_num,Ethnicity_enc,Jaundice_num,Jaundice_enc,Family_mem_with_ASD_num,Family_mem_with_ASD_enc,Who completed the test_num,Who completed the test_enc
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No,No,1.0,1.0,(0.0),2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,(0.0),0.0,(1.0),0.0,"(1.0, 0.0, 0.0, 0.0)"
1,3,1,0,0,0,0,0,1,1,0,1,36,4,m,middle eastern,yes,no,family member,Yes,Yes,0.0,0.0,(1.0),2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,(0.0),0.0,(1.0),0.0,"(1.0, 0.0, 0.0, 0.0)"
2,5,1,1,0,1,1,1,1,1,1,1,20,9,f,White European,no,yes,family member,Yes,Yes,0.0,1.0,(0.0),0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,(1.0),1.0,(0.0),0.0,"(1.0, 0.0, 0.0, 0.0)"


In [20]:
labels

{'Ethnicity': ['White European',
  'asian',
  'middle eastern',
  'black',
  'south asian',
  'Others',
  'Hispanic',
  'Latino',
  'mixed',
  'Pacifica',
  'Native Indian'],
 'Family_mem_with_ASD': ['no', 'yes'],
 'Jaundice': ['no', 'yes'],
 'Sex': ['m', 'f'],
 'Who completed the test': ['family member',
  'Health Care Professional',
  'Health care professional',
  'Others',
  'Self']}

In [21]:
# treating outlier by caopping quantiles value
d = {}
for col in numeric_inputs: 
    d[col] = train.approxQuantile(col,[0.01,0.99],0.25)
d

{'A1': [0.0, 1.0],
 'A10': [0.0, 1.0],
 'A2': [0.0, 1.0],
 'A3': [0.0, 1.0],
 'A4': [0.0, 1.0],
 'A5': [0.0, 1.0],
 'A6': [0.0, 1.0],
 'A7': [0.0, 1.0],
 'A8': [0.0, 1.0],
 'A9': [0.0, 1.0],
 'Age_Mons': [12.0, 36.0],
 'Qchat-10-Score': [0.0, 10.0]}

In [22]:
# treatment for skewness and outlier

for key,data in {'train':train,'test':test}.items():
    for col in numeric_inputs:
        
        # compute skewness    
        skew = data.agg(skewness(data[col])).collect() 
        skew = skew[0][0]

        # treatement for right skew - floor, cap and log(x+1)
        if skew > 1: 
            data = data.withColumn(col,log(when(data[col] < d[col][0],d[col][0]).when(data[col] > d[col][1], d[col][1]).otherwise(data[col] ) +1).alias(col))
            print(f"{key} {col} has been treated for positive (right) skewness. (skew = {skew})")
        
        # treatment for left skew - floor, cap and exp(x)
        elif skew < -1:
            data = data.withColumn(col,exp(when(data[col] < d[col][0],d[col][0]).when(data[col] > d[col][1], d[col][1]).otherwise(data[col] )).alias(col))
            print(f"{key} {col} has been treated for negative (left) skewness. (skew = {skew})")

In [23]:
features_list = numeric_inputs + string_inputs
features_list

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'A6',
 'A7',
 'A8',
 'A9',
 'A10',
 'Age_Mons',
 'Qchat-10-Score',
 'Sex_enc',
 'Ethnicity_enc',
 'Jaundice_enc',
 'Family_mem_with_ASD_enc',
 'Who completed the test_enc']

In [24]:
features_names = []
for name in features_list:
    if "_enc" in name:
        x = name.split('_enc')[0]
         # last category is dropped hence not adding to feature names
        for idx in range(0,len(labels[x])-1):
            enc = labels[x][idx]
            value = f"{x}-{enc}"
            features_names.append(value)
    else:
        value = name
        features_names.append(value)
features_names

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'A6',
 'A7',
 'A8',
 'A9',
 'A10',
 'Age_Mons',
 'Qchat-10-Score',
 'Sex-m',
 'Ethnicity-White European',
 'Ethnicity-asian',
 'Ethnicity-middle eastern',
 'Ethnicity-black',
 'Ethnicity-south asian',
 'Ethnicity-Others',
 'Ethnicity-Hispanic',
 'Ethnicity-Latino',
 'Ethnicity-mixed',
 'Ethnicity-Pacifica',
 'Jaundice-no',
 'Family_mem_with_ASD-no',
 'Who completed the test-family member',
 'Who completed the test-Health Care Professional',
 'Who completed the test-Health care professional',
 'Who completed the test-Others']

In [25]:
len(features_names)

29

In [26]:
# vectorized input features
assembler = VectorAssembler(inputCols=features_list,outputCol='features')
train_vec = assembler.transform(train)
train_vec.limit(1).toPandas()

Unnamed: 0,Case_No,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Qchat-10-Score,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,label_str,label,Sex_num,Sex_enc,Ethnicity_num,Ethnicity_enc,Jaundice_num,Jaundice_enc,Family_mem_with_ASD_num,Family_mem_with_ASD_enc,Who completed the test_num,Who completed the test_enc,features
0,1,0,0,0,0,0,0,1,1,0,1,28,3,f,middle eastern,yes,no,family member,No,No,1.0,1.0,(0.0),2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,(0.0),0.0,(1.0),0.0,"(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."


In [27]:
# select input and output columm
train_vec = train_vec.select('features','label')
train_vec.limit(3).toPandas()

Unnamed: 0,features,label
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...",1.0
1,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...",0.0
2,"(1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0


In [28]:
test_vec = assembler.transform(test)
test_vec = test_vec.select('features','label')
test_vec.limit(3).toPandas()

Unnamed: 0,features,label
0,"(1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...",0.0
1,"(1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...",0.0
2,"(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0


In [29]:
# scaling the vectors
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures" )
scaler = scaler.fit(train_vec)

final_train = scaler.transform(train_vec)
final_test = scaler.transform(test_vec)

final_train = final_train.select('label','scaledFeatures').withColumnRenamed("scaledFeatures","features")
final_test = final_test.select('label','scaledFeatures').withColumnRenamed("scaledFeatures","features")

final_train.limit(3).toPandas()

Unnamed: 0,label,features
0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0778303943445..."
1,0.0,"(2.010932390109971, 0.0, 0.0, 0.0, 0.0, 0.0, 2..."
2,0.0,"(2.010932390109971, 2.011561859255275, 0.0, 1...."


In [30]:
# compute class weight
neg_count = train.where(train['label'] == 1).count() 
data_size = train.count()
balance_ratio =  (data_size - neg_count)/ data_size
balance_ratio

0.6823204419889503

In [31]:
# function to assing class weight to each tow
get_class_weights = udf(lambda x: 1 * balance_ratio if x == 0 else (1 * (1.0 - balance_ratio)), DoubleType())

final_train = final_train.withColumn("class_weight", get_class_weights("label"))
final_test = final_test.withColumn("class_weight", get_class_weights("label"))


final_train.limit(3).toPandas()

Unnamed: 0,label,features,class_weight
0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0778303943445...",0.31768
1,0.0,"(2.010932390109971, 0.0, 0.0, 0.0, 0.0, 0.0, 2...",0.68232
2,0.0,"(2.010932390109971, 2.011561859255275, 0.0, 1....",0.68232


In [32]:
# model evaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction') 


mc_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

## Logistic Regression


In [33]:
classifier = LogisticRegression(weightCol="class_weight",featuresCol='features', labelCol='label')
model = classifier.fit(final_train)
prediction = model.transform(final_test)

In [34]:
prediction.limit(3).show()

+-----+--------------------+------------------+--------------------+--------------------+----------+
|label|            features|      class_weight|       rawPrediction|         probability|prediction|
+-----+--------------------+------------------+--------------------+--------------------+----------+
|  0.0|(29,[0,1,3,4,6,10...|0.6823204419889503|[83.1369133695920...|[1.0,7.8360514913...|       0.0|
|  0.0|(29,[0,1,4,5,6,8,...|0.6823204419889503|[181.260967183362...|[1.0,1.9026642925...|       0.0|
|  0.0|(29,[0,1,2,3,4,5,...|0.6823204419889503|[327.535860172602...|[1.0,5.6621751785...|       0.0|
+-----+--------------------+------------------+--------------------+--------------------+----------+



In [35]:
prediction.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  234|
|       1.0|   96|
+----------+-----+



In [36]:
model.summary.predictions.describe().show()

+-------+-------------------+-------------------+-------------------+
|summary|              label|       class_weight|         prediction|
+-------+-------------------+-------------------+-------------------+
|  count|                724|                724|                724|
|   mean|0.31767955801104975| 0.5664814871340962|0.31767955801104975|
| stddev| 0.4658959774100717|0.16988472104455643| 0.4658959774100717|
|    min|                0.0|0.31767955801104975|                0.0|
|    max|                1.0| 0.6823204419889503|                1.0|
+-------+-------------------+-------------------+-------------------+



In [37]:
[model.summary.falsePositiveRateByLabel,
model.summary.truePositiveRateByLabel,
model.summary.precisionByLabel,
model.summary.recallByLabel,
model.summary.accuracy,
model.summary.weightedFalsePositiveRate,
model.summary.weightedTruePositiveRate,
model.summary.weightedFMeasure(),
model.summary.weightedPrecision,
model.summary.weightedRecall]

[[0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0], 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]

In [38]:
#important: need to cast to float type, and order by prediction, else it won't work

prediction = prediction.withColumn("label", prediction["label"].cast(FloatType())) 
prediction = prediction.withColumn("prediction", prediction["prediction"].cast(FloatType())) 

preds_and_labels = prediction.select(['prediction','label']).orderBy('prediction')
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

[[234.   0.]
 [  0.  96.]]


In [39]:
# Overall statistics
print(f"Accuracy {metrics.accuracy}")

# Statistics by class
labels = [0.0,1.0]
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

# Weighted stats
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Accuracy 1.0
Class 0.0 precision = 1.0
Class 0.0 recall = 1.0
Class 0.0 F1 Measure = 1.0
Class 1.0 precision = 1.0
Class 1.0 recall = 1.0
Class 1.0 F1 Measure = 1.0
Weighted recall = 1.0
Weighted precision = 1.0
Weighted F(1) Score = 1.0
Weighted F(0.5) Score = 1.0
Weighted false positive rate = 0.0


In [40]:
len(features_names)

29

In [41]:
model.coefficientMatrix.toArray().shape

(1, 29)

In [42]:
coeff=model.coefficientMatrix.toArray()[0,:].tolist()
score = spark.createDataFrame(zip(features_names,coeff), schema=['feature','coeff'])
score.show(truncate=False)

+------------------------+-------------------+
|feature                 |coeff              |
+------------------------+-------------------+
|A1                      |-15.950664498498998|
|A2                      |-21.819768862854446|
|A3                      |-18.669951219085146|
|A4                      |-19.406783877606994|
|A5                      |-17.456711403135092|
|A6                      |-16.428954386063094|
|A7                      |-17.807156166738167|
|A8                      |-20.009924200895362|
|A9                      |-20.453743899013013|
|A10                     |-15.262479549696668|
|Age_Mons                |4.154841800882121  |
|Qchat-10-Score          |-36.330078047845   |
|Sex-m                   |0.4021956898875008 |
|Ethnicity-White European|9.714454633201173  |
|Ethnicity-asian         |5.0914974659666195 |
|Ethnicity-middle eastern|7.733229776950162  |
|Ethnicity-black         |5.3499788450763734 |
|Ethnicity-south asian   |3.640500991961943  |
|Ethnicity-Ot

In [43]:
model.interceptVector

DenseVector([10.2714])

## Perceptron Classifier

In [44]:
# Count how many features you have
features = final_train.select(['features']).collect()
features_count = len(features[0][0])

# Count how many classes you have 
class_count = final_train.select(countDistinct("label")).collect()
classes = class_count[0][0]

# define layers - input, 1st hidden, 2nd hidden, output
layers = [features_count, features_count+1, features_count, classes]

# Instaniate the classifier
classifier = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# Fit the model
model = classifier.fit(final_train)

# Print the model Weights
print('\033[1m' + "Model Weights: "+ '\033[0m',model.weights.size)
   
# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

[1mModel Weights: [0m 1859
Accuracy:  99.39393939393939


## Naive Bayes

In [45]:
# Instaniate the classifier
classifier = NaiveBayes()

# Fit the model
model = classifier.fit(final_train)

# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

Accuracy:  92.42424242424242


## Support Vector Machine

In [46]:
# Instaniate the classifier
classifier = LinearSVC()

# Fit the model
model = classifier.fit(final_train)

# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

Accuracy:  99.39393939393939


In [47]:
model.coefficients.toArray().shape

(29,)

In [48]:
coeff=model.coefficients.toArray().tolist()
score = spark.createDataFrame(zip(features_names,coeff), schema=['feature','coeff'])
score.show(truncate=False)

+------------------------+--------------------+
|feature                 |coeff               |
+------------------------+--------------------+
|A1                      |-0.9626037513284502 |
|A2                      |-1.3192705206779651 |
|A3                      |-0.8696585364757117 |
|A4                      |-1.0758019059004766 |
|A5                      |-0.9198731850046276 |
|A6                      |-0.9558246596143612 |
|A7                      |-1.065362678117776  |
|A8                      |-1.1636261726315236 |
|A9                      |-1.2005388082487953 |
|A10                     |-0.8165198841917235 |
|Age_Mons                |0.4060187386903567  |
|Qchat-10-Score          |-2.1171335511181555 |
|Sex-m                   |0.03955975737865233 |
|Ethnicity-White European|0.5256061755623616  |
|Ethnicity-asian         |0.0883029886588798  |
|Ethnicity-middle eastern|0.4360643581468559  |
|Ethnicity-black         |0.25173292772750333 |
|Ethnicity-south asian   |0.147510827933

## Decision Tree Classifier


In [49]:
# Instaniate the classifier
classifier = DecisionTreeClassifier()

# Fit the model
model = classifier.fit(final_train)

# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

Accuracy:  100.0


In [50]:
imp_score =model.featureImportances.toArray().tolist()
score = spark.createDataFrame(zip(features_names,imp_score), schema=['feature','imp score'])
score.show(truncate=False)

+------------------------+---------+
|feature                 |imp score|
+------------------------+---------+
|A1                      |0.0      |
|A2                      |0.0      |
|A3                      |0.0      |
|A4                      |0.0      |
|A5                      |0.0      |
|A6                      |0.0      |
|A7                      |0.0      |
|A8                      |0.0      |
|A9                      |0.0      |
|A10                     |0.0      |
|Age_Mons                |0.0      |
|Qchat-10-Score          |1.0      |
|Sex-m                   |0.0      |
|Ethnicity-White European|0.0      |
|Ethnicity-asian         |0.0      |
|Ethnicity-middle eastern|0.0      |
|Ethnicity-black         |0.0      |
|Ethnicity-south asian   |0.0      |
|Ethnicity-Others        |0.0      |
|Ethnicity-Hispanic      |0.0      |
+------------------------+---------+
only showing top 20 rows



## Random Forest

In [51]:
# Instaniate the classifier
classifier = RandomForestClassifier(maxDepth=5)

# Fit the model
model = classifier.fit(final_train)

# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

Accuracy:  100.0


In [52]:
model.featureImportances.toArray()
imp_score =model.featureImportances.toArray().tolist()
score = spark.createDataFrame(zip(features_names,imp_score), schema=['feature','imp score'])
score.show(truncate=False)

+------------------------+---------------------+
|feature                 |imp score            |
+------------------------+---------------------+
|A1                      |0.07871928839597764  |
|A2                      |0.02414839401092051  |
|A3                      |0.017816012372591526 |
|A4                      |0.03576924139260175  |
|A5                      |0.09412356564155808  |
|A6                      |0.06803484817608683  |
|A7                      |0.10572241231002515  |
|A8                      |0.03663690745236217  |
|A9                      |0.08455059775335107  |
|A10                     |0.00777176331837468  |
|Age_Mons                |0.008316120289854964 |
|Qchat-10-Score          |0.420198253901028    |
|Sex-m                   |0.001377935610914058 |
|Ethnicity-White European|0.0011866918776980515|
|Ethnicity-asian         |0.0010477900550730126|
|Ethnicity-middle eastern|0.0028008189525761847|
|Ethnicity-black         |0.0                  |
|Ethnicity-south asi

## Gradient Boosting

In [53]:
# Instaniate the classifier
classifier = GBTClassifier(maxIter=20,maxDepth=5,maxBins=3)

# Fit the model
model = classifier.fit(final_train)

# Generate predictions on test dataframe
predictions = model.transform(final_test)

# Calculate accuracy score
accuracy = (mc_evaluator.evaluate(predictions))*100

# Print accuracy score
print("Accuracy: ",accuracy)

Accuracy:  100.0


In [54]:
imp_score =model.featureImportances.toArray().tolist()
score = spark.createDataFrame(zip(features_names,imp_score), schema=['feature','imp score'])
score.show(truncate=False)

+------------------------+----------------------+
|feature                 |imp score             |
+------------------------+----------------------+
|A1                      |0.0                   |
|A2                      |0.0                   |
|A3                      |1.0910696012422205E-16|
|A4                      |0.0                   |
|A5                      |2.150666040910148E-17 |
|A6                      |3.0686332534937454E-17|
|A7                      |0.0                   |
|A8                      |0.0                   |
|A9                      |3.1473161574294823E-18|
|A10                     |2.8011113801122394E-16|
|Age_Mons                |8.046638309161375E-16 |
|Qchat-10-Score          |0.9999999999999961    |
|Sex-m                   |0.0                   |
|Ethnicity-White European|0.0                   |
|Ethnicity-asian         |0.0                   |
|Ethnicity-middle eastern|0.0                   |
|Ethnicity-black         |2.0850969542970332E-17|


## Cross Validation - Finetuning Gradient Boosted Tree

In [55]:
classifier = GBTClassifier()

# define parameters
paramGrid = (ParamGridBuilder().addGrid(classifier.maxDepth, [2, 5, 10]) \
                                .addGrid(classifier.maxBins, [10, 20]) \
                                .addGrid(classifier.maxIter, [10, 15])
                                .build())

#Cross Validator requires all of the following parameters:
crossval = CrossValidator(estimator=classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=2) 

# Fit Model: Run cross-validation, and choose the best set of parameters.
model = crossval.fit(final_train)
best_model = model.bestModel

feature_importances = best_model.featureImportances.toArray()
print("Feature Importances: ",feature_importances)
    
predictions = model.transform(final_test)
accuracy = (mc_evaluator.evaluate(predictions))*100

print(" ")
print("Accuracy: ",accuracy)

Feature Importances:  [0.00000000e+00 0.00000000e+00 6.28015576e-16 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.27645442e-16 1.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 7.07674310e-16 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.95804351e-15
 0.00000000e+00]
 
Accuracy:  100.0


In [56]:
best_model.getMaxBins(),best_model.getMaxDepth()

(10, 2)

In [57]:
imp_score =best_model.featureImportances.toArray().tolist()
score = spark.createDataFrame(zip(features_names,imp_score), schema=['feature','imp score'])
score.show(truncate=False)

+------------------------+----------------------+
|feature                 |imp score             |
+------------------------+----------------------+
|A1                      |0.0                   |
|A2                      |0.0                   |
|A3                      |6.280155758562281E-16 |
|A4                      |0.0                   |
|A5                      |0.0                   |
|A6                      |0.0                   |
|A7                      |0.0                   |
|A8                      |0.0                   |
|A9                      |0.0                   |
|A10                     |0.0                   |
|Age_Mons                |1.2764544224720084E-16|
|Qchat-10-Score          |0.9999999999999956    |
|Sex-m                   |0.0                   |
|Ethnicity-White European|0.0                   |
|Ethnicity-asian         |0.0                   |
|Ethnicity-middle eastern|0.0                   |
|Ethnicity-black         |0.0                   |


### Extras Correlation Matrix Syntax

In [58]:
from pyspark.ml.stat import Correlation
corr = Correlation.corr(final_train, 'features', 'pearson').collect()
corr

[Row(pearson(features)=DenseMatrix(29, 29, [1.0, 0.4743, 0.2522, 0.2447, 0.2889, 0.3771, 0.3411, 0.2219, ..., -0.0059, -0.0048, 0.0389, 0.0285, -0.3103, -0.0111, -0.0042, 1.0], False))]

In [59]:
print(str(corr[0][0]).replace('nan', 'NaN'))

DenseMatrix([[ 1.00000000e+00,  4.74253038e-01,  2.52236475e-01,
               2.44651964e-01,  2.88920672e-01,  3.77063322e-01,
               3.41149623e-01,  2.21928011e-01,  3.56717268e-01,
               1.32033318e-01,  5.80931015e-02,  6.20835843e-01,
               1.45973013e-01,  8.27926546e-02, -3.03656891e-02,
              -7.75247866e-02,  1.82539381e-02,  1.12241323e-02,
               1.47905263e-02,  1.80821012e-03, -3.56030847e-02,
              -4.08223709e-02,  2.92106955e-02,  1.98823908e-02,
              -9.39199698e-02, -6.05621399e-02,  8.84369136e-02,
              -2.88056591e-02,  5.77308437e-02],
             [ 4.74253038e-01,  1.00000000e+00,  2.05470334e-01,
               2.64158397e-01,  2.65273001e-01,  3.05920463e-01,
               2.92556826e-01,  2.70477744e-01,  2.81435051e-01,
               1.71006516e-01,  3.47166815e-02,  5.94435783e-01,
               6.97696884e-02,  5.77164817e-02, -1.21696253e-01,
              -4.39218528e-02,  1.3373266