In [None]:
# Import Libraries

# Steps of Classification

# 1. Load Data
# 2. Select Features
# 3. Data Preprocessing -> Filtering data
# 4. Transform Data -> Normalization (puts the features on a comparable scale)
# 5. Generate Model
# 6. Model test and Evaluation

In [None]:
# Import Libraries

# Import SparkSession
from pyspark.sql import SparkSession
# Import the ML Classification -> LogisticRegression
from pyspark.ml.classification import LogisticRegression
# Import Evaluation -> BinaryClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Import when (if else condition in pyspark)
from pyspark.sql.functions import when
# For Normalization import VectorAssembler, StandardScaler
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [None]:
# Step 1 Create Spark Session

# `builder` is an attribute of SparkSession and `getOrCreate()` is a method on
# that builder: it returns the already-running session if there is one
# (e.g. when this cell is re-run) and starts a new one otherwise.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Step 2 Load Data

# Goal: find the factors that have a big impact on the output column 'Depressed'.


def _read_csv_with_schema(path):
    """Read the headered CSV file at `path`, letting Spark infer the column types."""
    return spark.read.option("inferSchema", "true").csv(path, header=True)


# Training split
df_train = _read_csv_with_schema("data/Classification_Train.csv")

# Testing split
df_test = _read_csv_with_schema("data/Classification_Test.csv")

df_test.show()

In [None]:
# Step 3 Data Preprocessing

# Discard every row that contains at least one null value, in both splits.
df_train, df_test = (frame.dropna() for frame in (df_train, df_test))

df_test.show()

In [None]:
# Step 4 Select Feature (Basically SELECT)

# Columns kept in both splits; every other column is discarded.
selected_columns = ["Education Level", "Eye Color", "Married", "Salary Income", "Depressed"]

df_train = df_train.select(*selected_columns)
df_test = df_test.select(*selected_columns)

In [None]:
# Step 5 Data Transformation -> Converting string data into integers
# The values must be integers so that they are easy to plot (e.g. in a scatter plot)

# Ex : Education Level : High = 3, Intermediate = 2, Low = 1
# Ex 2 : Married / Depressed : Yes = 1, No = 0 (any other value becomes NULL)

def transform(df):
    """Replace string categories with integer codes.

    Re-encodes the "Education Level", "Married" and "Depressed" columns of
    `df` and returns the resulting DataFrame. The `when` chains have no
    `otherwise`, so a value that is not listed below becomes null.
    """
    # column name -> ordered (label, code) pairs, checked in this order
    encodings = {
        "Education Level": (("High", 3), ("Intermediate", 2), ("Low", 1)),
        "Married": (("Yes", 1), ("No", 0)),
        "Depressed": (("Yes", 1), ("No", 0)),
    }

    for column, pairs in encodings.items():
        (first_label, first_code), *remaining = pairs
        encoded = when(df[column] == first_label, first_code)
        for label, code in remaining:
            encoded = encoded.when(df[column] == label, code)
        df = df.withColumn(column, encoded)

    return df

# Encode both splits with the same mapping.
df_train, df_test = transform(df_train), transform(df_test)

# Preview the encoded training data
df_train.show()

In [None]:
# Build the feature list dynamically: every column except the label.
# (`DataFrame.columns` is the list of column names; "Depressed" is the output.)
cols = [name for name in df_train.columns if name != "Depressed"]

# NOTE(review): VectorAssembler only accepts numeric, boolean and vector
# columns. "Eye Color" and "Salary Income" are not encoded anywhere in this
# notebook -- confirm they are numeric in the CSV files, or encode them first.
assembler = VectorAssembler(inputCols=cols, outputCol="Features")
scaler = StandardScaler(inputCol="Features", outputCol="ScaledFeatures")

df_train = assembler.transform(df_train)
# Fit the scaling statistics on the training data only and reuse the fitted
# model for the test data, so both splits are scaled identically and no
# information from the test set leaks into preprocessing.
scaler_model = scaler.fit(df_train)
df_train = scaler_model.transform(df_train)

df_test = assembler.transform(df_test)
df_test = scaler_model.transform(df_test)

df_test.show()

# just going to test it
# df_train already carries "Features" and "ScaledFeatures"; running the
# assembler on it again would fail because the output column already exists,
# so simply inspect the transformed training frame.
test = df_train
test.show()

In [None]:
# Generate Model with LogisticRegression

# Estimator: learns to predict "Depressed" from the "ScaledFeatures" vector.
log_reg = LogisticRegression(featuresCol="ScaledFeatures", labelCol="Depressed")
model = log_reg.fit(df_train)

# Score the test split and preview the first few predictions.
prediction = model.transform(df_test)
preview = prediction.select("Depressed", "Prediction", "ScaledFeatures")
preview.show(5)

In [None]:
# Model Testing and Evaluation

# BinaryClassificationEvaluator reports the area under the ROC curve by
# default -- that is a ranking metric, not accuracy -- so name it explicitly
# and print it under its own label.
evaluator = BinaryClassificationEvaluator(labelCol="Depressed", metricName="areaUnderROC")
area_under_roc = round(evaluator.evaluate(prediction) * 100, 2)

# Accuracy = share of test rows whose predicted class equals the true label.
total_rows = prediction.count()
correct_rows = prediction.where(prediction["Depressed"] == prediction["prediction"]).count()
# Guard against an empty test set to avoid a division by zero.
accuracy = round(correct_rows / total_rows * 100, 2) if total_rows else 0.0

print(f"Area under ROC: {area_under_roc}%")
print(f"Accuracy: {accuracy}%")