In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.getOrCreate()

In [2]:
df_train = spark.read.option('inferSchema', 'true').csv('Classification_Train.csv', header = True)
df_test = spark.read.option('inferSchema', 'true').csv('Classification_Test.csv', header = True)

In [3]:
df_train.show()

+-------------------+------+------+---------------+---------+-------+-------------+---------+
|               Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+-------------------+------+------+---------------+---------+-------+-------------+---------+
|      Sax Tesseyman|Female|   174|   Intermediate|     Blue|    Yes|     85000000|       No|
|        Niels Greet|  Male|   165|   Intermediate|    Black|     No|     14000000|       No|
|     Minetta Santry|Female|   160|            Low|    Black|     No|    148000000|      Yes|
|     Sherm Gossipin|Female|   144|           High|    Black|     No|     50000000|      Yes|
|   Cathie Blackmuir|  Male|   168|   Intermediate|    Black|    Yes|    101000000|       No|
|     Early Cardenas|  Male|   151|            Low|    Black|    Yes|    145000000|      Yes|
|   Willard Pendrick|Female|   141|   Intermediate|    Brown|     No|     55000000|      Yes|
|   Penelopa Spensly|Female|   144|   Intermediate|     Blue

In [4]:
df_test.show()

+------------------+------+------+---------------+---------+-------+-------------+---------+
|              Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+------------------+------+------+---------------+---------+-------+-------------+---------+
|      Lila Bracher|  Male|   151|            Low|    Black|     No|     74000000|      Yes|
|  Archibaldo Bigly|  Male|   162|           High|     Blue|    Yes|     29000000|       No|
|      Dion Stopher|  Male|   155|   Intermediate|    Brown|    Yes|    134000000|       No|
|    Genna Hallagan|  Male|   143|   Intermediate|     Blue|     No|     13000000|      Yes|
|         Alis Fass|Female|   163|            Low|     Blue|    Yes|      7000000|      Yes|
|Ellene Cumberbatch|Female|   150|   Intermediate|     Gray|     No|    106000000|      Yes|
| Ardella Rossander|  Male|   153|           High|     Gray|     No|     35000000|      Yes|
|  Wallache Wandtke|Female|   183|           High|     Gray|    Yes|  

In [5]:
df_train = df_train.select('Education Level', 'Married', 'Salary Income', 'Depressed')
df_test = df_test .select('Education Level', 'Married', 'Salary Income', 'Depressed')

In [6]:
df_train = df_train.na.drop()
df_test = df_test.na.drop()

In [7]:
def transform(df):
    df = df.withColumn("Education Level", when(df['Education Level'] == 'Low', 0).when(df['Education Level'] == 'Intermediate', 1).otherwise(2))
    df = df.withColumn("Married", when(df['Married'] == 'No', 0).otherwise(1))
    df = df.withColumn("Depressed", when(df['Depressed'] == "No", 0).otherwise(1))
    
    return df

In [8]:
df_train = transform(df_train)
df_test = transform(df_test)
df_train.show()

+---------------+-------+-------------+---------+
|Education Level|Married|Salary Income|Depressed|
+---------------+-------+-------------+---------+
|              1|      1|     85000000|        0|
|              1|      0|     14000000|        0|
|              0|      0|    148000000|        1|
|              2|      0|     50000000|        1|
|              1|      1|    101000000|        0|
|              0|      1|    145000000|        1|
|              1|      0|     55000000|        1|
|              1|      1|     51000000|        0|
|              2|      0|     97000000|        1|
|              0|      0|     41000000|        1|
|              2|      1|     27000000|        0|
|              2|      1|      3000000|        0|
|              2|      0|      9000000|        1|
|              1|      0|     12000000|        1|
|              1|      1|     81000000|        0|
|              2|      0|     53000000|        1|
|              0|      0|     10000000|        1|


In [9]:
def normalize(df):
    cols = df.columns
    cols.remove('Depressed')
    
    df = VectorAssembler(inputCols = cols, outputCol='Features').transform(df)
    df = StandardScaler(inputCol = 'Features', outputCol = 'Scaled Features').fit(df).transform(df)
    
    return df

In [10]:
df_train = normalize(df_train)
df_test = normalize(df_test)
df_train.show()

+---------------+-------+-------------+---------+-------------------+--------------------+
|Education Level|Married|Salary Income|Depressed|           Features|     Scaled Features|
+---------------+-------+-------------+---------+-------------------+--------------------+
|              1|      1|     85000000|        0|    [1.0,1.0,8.5E7]|[1.29595742362073...|
|              1|      0|     14000000|        0|    [1.0,0.0,1.4E7]|[1.29595742362073...|
|              0|      0|    148000000|        1|   [0.0,0.0,1.48E8]|[0.0,0.0,3.572913...|
|              2|      0|     50000000|        1|    [2.0,0.0,5.0E7]|[2.59191484724147...|
|              1|      1|    101000000|        0|   [1.0,1.0,1.01E8]|[1.29595742362073...|
|              0|      1|    145000000|        1|   [0.0,1.0,1.45E8]|[0.0,2.0004291810...|
|              1|      0|     55000000|        1|    [1.0,0.0,5.5E7]|[1.29595742362073...|
|              1|      1|     51000000|        0|    [1.0,1.0,5.1E7]|[1.29595742362073...|

In [11]:
model = LogisticRegression(featuresCol = "Scaled Features", labelCol = "Depressed").fit(df_train)
prediction = model.transform(df_test)
prediction.show()

+---------------+-------+-------------+---------+-------------------+--------------------+--------------------+--------------------+----------+
|Education Level|Married|Salary Income|Depressed|           Features|     Scaled Features|       rawPrediction|         probability|prediction|
+---------------+-------+-------------+---------+-------------------+--------------------+--------------------+--------------------+----------+
|              0|      0|     74000000|        1|    [0.0,0.0,7.4E7]|[0.0,0.0,1.803968...|[-1.9261546431117...|[0.12717681419040...|       1.0|
|              2|      1|     29000000|        0|    [2.0,1.0,2.9E7]|[2.54974382356388...|[1.65744753049940...|[0.83989506676115...|       0.0|
|              1|      1|    134000000|        0|   [1.0,1.0,1.34E8]|[1.27487191178194...|[2.65562370463603...|[0.93435675966068...|       0.0|
|              1|      0|     13000000|        1|    [1.0,0.0,1.3E7]|[1.27487191178194...|[-2.2037390843082...|[0.09941522026129...|    

In [12]:
evaluator = BinaryClassificationEvaluator(labelCol = 'Depressed')
print(evaluator.evaluate(prediction) * 100)

87.06157923010207
