In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [2]:
training = spark.read.option("inferSchema","true").csv("Classification_Train.csv",header=True)
testing = spark.read.option("inferSchema","true").csv("Classification_Test.csv",header=True)

In [3]:
training = training.select("Education Level","Married","Salary Income","Depressed")
testing = testing.select("Education Level","Married","Salary Income","Depressed")

In [5]:
training = training.na.drop()
testing = testing.na.drop()

In [6]:
from pyspark.sql.functions import when

training = training.withColumn("Education Level",when(training["Education Level"] == "Low",0)
                                                .when(training["Education Level"] == "Intermediate",1)
                                                .otherwise(2))

testing = testing.withColumn("Education Level",when(testing["Education Level"] == "Low",0)
                                                .when(testing["Education Level"] == "Intermediate",1)
                                                .otherwise(2))

training = training.withColumn("Married",when(training["Married"] == "No",0)
                                        .otherwise(1))

testing = testing.withColumn("Married",when(testing["Married"] == "No",0)
                                        .otherwise(1))

training = training.withColumn("Depressed",when(training["Depressed"] == "No",0)
                                        .otherwise(1))

testing = testing.withColumn("Depressed",when(testing["Depressed"] == "No",0)
                                        .otherwise(1))

In [9]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

cols = training.columns
cols.remove("Depressed")

training = VectorAssembler(inputCols=cols, outputCol="Features").transform(training)
testing = VectorAssembler(inputCols=cols, outputCol="Features").transform(testing)

In [12]:
scaler = StandardScaler(inputCol="Features",outputCol="Scaled Features")
training = scaler.fit(training).transform(training)
testing = scaler.fit(testing).transform(testing)

In [15]:
from pyspark.ml.classification import LogisticRegression

model = LogisticRegression(featuresCol="Scaled Features",labelCol="Depressed",maxIter=10).fit(training)
prediction = model.transform(testing)

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Depressed")
result = evaluator.evaluate(prediction)
print("Accuracy: {}%".format(result * 100))

Accuracy: 87.06157923010207%


In [14]:
training.show(3,False)
testing.show(3,False)

+---------------+-------+-------------+---------+----------------+----------------------------------------------------------+
|Education Level|Married|Salary Income|Depressed|Features        |Scaled Features                                           |
+---------------+-------+-------------+---------+----------------+----------------------------------------------------------+
|1              |1      |85000000     |0        |[1.0,1.0,8.5E7] |[1.2959574236207398,2.0004291810120303,2.0520111267842664]|
|1              |0      |14000000     |0        |[1.0,0.0,1.4E7] |[1.2959574236207398,0.0,0.3379783032350556]               |
|0              |0      |148000000    |1        |[0.0,0.0,1.48E8]|[0.0,0.0,3.5729134913420166]                              |
+---------------+-------+-------------+---------+----------------+----------------------------------------------------------+
only showing top 3 rows

+---------------+-------+-------------+---------+----------------+---------------------------