In [132]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [133]:
spark = SparkSession.builder.appName('L6Q1').getOrCreate()

In [134]:
df = spark.read.csv('datasets/dt_data.csv', inferSchema=True, header=True)

In [135]:
df.show(5)

+---+----+-------+----------+--------------+------+--------------+
| id| age| income| education|marital_status|region|defaulted_loan|
+---+----+-------+----------+--------------+------+--------------+
|  1|NULL|   NULL|HighSchool|      Divorced|  NULL|           Yes|
|  2|36.0|   NULL|    Master|       Married| North|            No|
|  3|NULL|   NULL|    Master|       Married|  NULL|           Yes|
|  4|39.0|   NULL|HighSchool|        Single| South|           Yes|
|  5|NULL|91438.0|HighSchool|      Divorced| North|           Yes|
+---+----+-------+----------+--------------+------+--------------+
only showing top 5 rows


In [136]:
df.describe().show()

+-------+-----------------+------------------+-----------------+---------+--------------+------+--------------+
|summary|               id|               age|           income|education|marital_status|region|defaulted_loan|
+-------+-----------------+------------------+-----------------+---------+--------------+------+--------------+
|  count|              500|               242|              248|      401|           382|   414|           500|
|   mean|            250.5| 40.79752066115702| 64836.9314516129|     NULL|          NULL|  NULL|          NULL|
| stddev|144.4818327679989|11.365135321733483|20285.73657135096|     NULL|          NULL|  NULL|          NULL|
|    min|                1|              22.0|          31152.0| Bachelor|      Divorced|  East|            No|
|    max|              500|              60.0|          99796.0|      PhD|        Single|  West|           Yes|
+-------+-----------------+------------------+-----------------+---------+--------------+------+--------

In [137]:
df = df.na.drop(subset=['region', 'defaulted_loan', 'education', 'marital_status'])

In [138]:
means = df.agg({'age': 'avg', 'income': 'avg'}).collect()
age_mean = int(means[0]['avg(age)'])
income_mean = int(means[0]['avg(income)'])

In [139]:
df = df.na.fill(age_mean, subset=['age'])
df = df.na.fill(income_mean, subset=['income'])

In [140]:
indexer = StringIndexer(inputCols=['education', 'marital_status', 'region', 'defaulted_loan'], outputCols=['education_i', 'marital_status_i', 'region_i', 'defaulted_loan_i'])
df = indexer.fit(df).transform(df)

In [141]:
vectorizer = VectorAssembler(inputCols=['age', 'income', 'education_i', 'marital_status_i', 'region_i'], outputCol='features')
df = vectorizer.transform(df)

In [142]:
df_train, df_test = df.randomSplit([0.8, 0.2], seed=819)

In [143]:
dt = DecisionTreeClassifier(labelCol="defaulted_loan_i", featuresCol="features")
model = dt.fit(df_train)
pred = model.transform(df_test)

In [144]:
eval_acc = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='defaulted_loan_i', metricName='accuracy')
eval_pre = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='defaulted_loan_i', metricName='weightedPrecision')
eval_rec = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='defaulted_loan_i', metricName='weightedRecall')

In [145]:
accuracy = eval_acc.evaluate(pred)
precision = eval_pre.evaluate(pred)
recall = eval_rec.evaluate(pred)

print(f'Precision: {precision}, Accuracy: {accuracy}, Recall: {recall}')

Precision: 0.6751089324618736, Accuracy: 0.6481481481481481, Recall: 0.6481481481481481


In [146]:
spark.stop()