#Lecture 1

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
# It has to be bound to the name `spark`: the data-loading cells below call
# `spark.read` and `spark.sql`, so a differently named session would only work
# on platforms that happen to pre-define `spark`.
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [3]:
# Importing classifiers
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier

In [4]:
# Read the sample dataset; the file is stored in libsvm format
df = spark.read.load(
    '/FileStore/tables/sample_libsvm_data.txt',
    format='libsvm',
)

In [5]:
# Preview the loaded DataFrame (show() prints the first 20 rows by default)
df.show()

In [6]:
# Split the rows into roughly 70% training and 30% test data.
# A fixed seed keeps the split (and therefore every metric computed below)
# the same across re-runs of the notebook.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Build one untrained estimator per tree-based algorithm.
# No column names are passed, so each estimator uses its default
# label/features columns.
gbt = GBTClassifier()
rfc = RandomForestClassifier(numTrees=100)
dtc = DecisionTreeClassifier()

In [8]:
# Fit each estimator on the training split.
# Every model must come from its own estimator: the GBT model is trained with
# `gbt`, not `dtc`, otherwise the "GBT" results below would just be a second
# decision tree.
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [9]:
# Apply every fitted model to the held-out test split
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [10]:
# Showing predictions
# Decision-tree output: the test rows plus the columns added by transform()
dtc_preds.show()

In [11]:
# Random-forest predictions on the test split
rfc_preds.show()

In [12]:
# Predictions produced by `gbt_model` on the test split
gbt_preds.show()

In [13]:
# Score each model's test predictions by accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracy_eval = MulticlassClassificationEvaluator(metricName='accuracy')
for caption, scored in (
    ('DTC Accuracy:', dtc_preds),
    ('RFC Accuracy:', rfc_preds),
    ('GBT Accuracy:', gbt_preds),
):
    print(caption, accuracy_eval.evaluate(scored))

In [14]:
# Display the per-feature importance scores of the fitted random forest.
# A larger score means the feature contributed more to the model.
# Left as the last expression so the notebook renders the value.
rfc_model.featureImportances

#Lecture 2

In [16]:
# Importing the dataset
# Loads every column of the `College_csv` table through Spark SQL.
# NOTE: this rebinds `df`, replacing the Lecture 1 DataFrame of the same name.
df = spark.sql('SELECT * FROM College_csv')

In [17]:
# Inspect column names and types before building the feature vector
df.printSchema()

In [18]:
# Pack the predictor columns into a single vector column named 'features'
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    outputCol='features',
    # every column from the third one onwards is used as an input;
    # presumably the first two are an identifier and the 'Private' label -- verify against the schema
    inputCols=df.columns[2:],
)
output = assembler.transform(df)

In [19]:
# Encode the string-valued 'Private' target as a numeric 'PrivateIndex' column
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(
    inputCol='Private',
    outputCol='PrivateIndex',
)
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [20]:
# Check that the 'features' and 'PrivateIndex' columns were added
output_fixed.printSchema()

In [21]:
# Keep only the two columns the classifiers consume: feature vector and numeric label
final_df = output_fixed.select(['features', 'PrivateIndex'])

In [22]:
# Split the rows into roughly 70% training and 30% test data.
# A fixed seed keeps the split (and the AUC / accuracy figures computed from
# it) the same across re-runs of the notebook.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

In [23]:
# Build the three tree-based estimators, all reading the assembled 'features'
# vector and predicting the indexed 'PrivateIndex' label
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

dtc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
)
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
    numTrees=150,
)
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='PrivateIndex',
)

In [24]:
# Train the classifiers on the training split (decision tree, random forest, GBT -- in that order)
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data)
    for estimator in (dtc, rfc, gbt)
)

In [25]:
# Apply every fitted model to the held-out test split
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data)
    for fitted in (dtc_model, rfc_model, gbt_model)
)

In [26]:
# Compare the models by area under the curve (the evaluator's default metric),
# measured against the 'PrivateIndex' label
from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_binary_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')
for caption, scored in (
    ('DTC area under the curve:', dtc_preds),
    ('RFC area under the curve:', rfc_preds),
    ('GBT area under the curve:', gbt_preds),
):
    print(caption, my_binary_eval.evaluate(scored))

In [27]:
# Inspect which columns the GBT model added to the test rows
gbt_preds.printSchema()

In [28]:
# Score each model's test predictions by accuracy against 'PrivateIndex'
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

accuracy_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='PrivateIndex',
)
for caption, scored in (
    ('DTC Accuracy:', dtc_preds),
    ('RFC Accuracy:', rfc_preds),
    ('GBT Accuracy:', gbt_preds),
):
    print(caption, accuracy_eval.evaluate(scored))