In [25]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('EndSem5').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/27 00:20:01 WARN Utils: Your hostname, Karthikeya, resolves to a loopback address: 127.0.1.1; using 172.25.191.235 instead (on interface zt4homnczt)
25/10/27 00:20:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/27 00:20:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [69]:
df = spark.read.csv('file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/Week6/datasets/dt_data.csv', inferSchema=True, header=True)

In [70]:
df.show(5)

+---+----+-------+----------+--------------+------+--------------+
| id| age| income| education|marital_status|region|defaulted_loan|
+---+----+-------+----------+--------------+------+--------------+
|  1|NULL|   NULL|HighSchool|      Divorced|  NULL|           Yes|
|  2|36.0|   NULL|    Master|       Married| North|            No|
|  3|NULL|   NULL|    Master|       Married|  NULL|           Yes|
|  4|39.0|   NULL|HighSchool|        Single| South|           Yes|
|  5|NULL|91438.0|HighSchool|      Divorced| North|           Yes|
+---+----+-------+----------+--------------+------+--------------+
only showing top 5 rows


In [71]:
df = df.na.drop()

In [72]:
df.show(4)

+---+----+-------+---------+--------------+------+--------------+
| id| age| income|education|marital_status|region|defaulted_loan|
+---+----+-------+---------+--------------+------+--------------+
| 20|38.0|65375.0|      PhD|      Divorced| North|            No|
| 26|37.0|61704.0|   Master|      Divorced| North|           Yes|
| 29|35.0|74075.0|   Master|       Married|  West|           Yes|
| 67|34.0|99307.0| Bachelor|        Single|  East|            No|
+---+----+-------+---------+--------------+------+--------------+
only showing top 4 rows


In [73]:
string_indexer = StringIndexer(inputCols=['education', 'marital_status', 'region', 'defaulted_loan'], outputCols=['Ieducation', 'Imarital_status', 'Iregion', 'Idefaulted_loan'])
vectorizer = VectorAssembler(inputCols=['Ieducation', 'Imarital_status', 'Iregion', 'age'], outputCol='features')
indxtostr = IndexToString(inputCol='prediction', outputCol='Pdefaulted_loan')

In [74]:
train, test = df.randomSplit([0.8, 0.2])

In [75]:
dc = DecisionTreeClassifier(featuresCol='features', labelCol='Idefaulted_loan', maxDepth=5, impurity='gini')

In [79]:
pipe = Pipeline(stages=[string_indexer, vectorizer, dc])

In [80]:
model = pipe.fit(train)

In [81]:
pred = model.transform(train)

In [82]:
pred.show(5)

+---+----+-------+----------+--------------+------+--------------+----------+---------------+-------+---------------+------------------+-------------+-----------+----------+
| id| age| income| education|marital_status|region|defaulted_loan|Ieducation|Imarital_status|Iregion|Idefaulted_loan|          features|rawPrediction|probability|prediction|
+---+----+-------+----------+--------------+------+--------------+----------+---------------+-------+---------------+------------------+-------------+-----------+----------+
| 20|38.0|65375.0|       PhD|      Divorced| North|            No|       2.0|            0.0|    2.0|            1.0|[2.0,0.0,2.0,38.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|
| 29|35.0|74075.0|    Master|       Married|  West|           Yes|       1.0|            1.0|    0.0|            0.0|[1.0,1.0,0.0,35.0]|    [1.0,1.0]|  [0.5,0.5]|       0.0|
| 67|34.0|99307.0|  Bachelor|        Single|  East|            No|       3.0|            2.0|    1.0|            1.0|[3.0,2.0,1.0,

In [83]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Idefaulted_loan', metricName='accuracy')
acc = evaluator.evaluate(pred)

In [84]:
acc

0.9

In [85]:
indxtostr = IndexToString(inputCol='Idefaulted_loan', outputCol='Pdefaulted_loan')
pred = indxtostr.transform(pred)

In [86]:
pred.show(5)

+---+----+-------+----------+--------------+------+--------------+----------+---------------+-------+---------------+------------------+-------------+-----------+----------+---------------+
| id| age| income| education|marital_status|region|defaulted_loan|Ieducation|Imarital_status|Iregion|Idefaulted_loan|          features|rawPrediction|probability|prediction|Pdefaulted_loan|
+---+----+-------+----------+--------------+------+--------------+----------+---------------+-------+---------------+------------------+-------------+-----------+----------+---------------+
| 20|38.0|65375.0|       PhD|      Divorced| North|            No|       2.0|            0.0|    2.0|            1.0|[2.0,0.0,2.0,38.0]|    [0.0,2.0]|  [0.0,1.0]|       1.0|             No|
| 29|35.0|74075.0|    Master|       Married|  West|           Yes|       1.0|            1.0|    0.0|            0.0|[1.0,1.0,0.0,35.0]|    [1.0,1.0]|  [0.5,0.5]|       0.0|            Yes|
| 67|34.0|99307.0|  Bachelor|        Single|  East

In [87]:
spark.stop()