In [42]:
import sys
# Extend the module search path so the import below can be resolved.
# The path is relative, so it is interpreted against the notebook's current
# working directory (presumably the folder holding judeKaggleLibrary -- verify).
sys.path.append('../../../kaggleLibrary')

from judeKaggleLibrary import load_data, create_csv

In [43]:
# Load the train/test DataFrames through the project helper; the string selects
# which dataset to load (presumably the Kaggle competition name -- confirm in load_data).
train_df, test_df = load_data("playground-series-s4e1")

In [44]:
# Preview the first rows of the test set (20 is the default row limit of show()).
test_df.show(n=20)

+------+----------+----------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+
|    id|CustomerId|   Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|
+------+----------+----------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+
|165034|  15773898|  Lucchese|        586|   France|Female|23.0|     2|      0.0|            2|      0.0|           1.0|      160976.75|
|165035|  15782418|      Nott|        683|   France|Female|46.0|     2|      0.0|            1|      1.0|           0.0|       72549.27|
|165036|  15807120|        K?|        656|   France|Female|34.0|     7|      0.0|            2|      1.0|           0.0|      138882.09|
|165037|  15808905| O'Donnell|        681|   France|  Male|36.0|     8|      0.0|            1|      1.0|           0.0|      113931.57|
|165038|  15607314|   Higgins|        752

In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct

# Preview the first rows of the training data.
train_df.show()

# Summary statistics for the training columns.
train_df.describe().show()

# Count unique values for each column.
# All distinct counts are requested in a single aggregation, so the data is
# collected once instead of running one agg()/collect() round trip per column.
distinct_counts = train_df.agg(
    *[countDistinct(column).alias(column) for column in train_df.columns]
).collect()[0]

for column in train_df.columns:
    print(f"Number of unique values in {column}: {distinct_counts[column]}")

+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
| id|CustomerId|       Surname|CreditScore|Geography|Gender| Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---+----------+--------------+-----------+---------+------+----+------+---------+-------------+---------+--------------+---------------+------+
|  0|  15674932|Okwudilichukwu|        668|   France|  Male|33.0|     3|      0.0|            2|      1.0|           0.0|      181449.97|     0|
|  1|  15749177| Okwudiliolisa|        627|   France|  Male|33.0|     1|      0.0|            2|      1.0|           1.0|        49503.5|     0|
|  2|  15694510|         Hsueh|        678|   France|  Male|40.0|    10|      0.0|            2|      1.0|           0.0|      184866.69|     0|
|  3|  15741417|           Kao|        581|   France|  Male|34.0|     2|148882.54|            1|      1.0|           1.0|       84

Dropping Useless Features

In [46]:
# Surname is not among the model's feature columns, so remove it from the training frame.
train_df = train_df.drop("Surname")

Creating the Model

In [47]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# Feature types
categorical_features = ['Geography', 'Gender']
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Fixed seed so that refitting the forest after a kernel restart gives the same model.
RANDOM_SEED = 42

# Preprocessing stages. They are left unfitted here; Pipeline.fit() below fits
# every estimator stage on the training data.
# NOTE(review): StringIndexer's default handleInvalid="error" raises if the test
# set contains a category not seen during fit -- confirm that cannot happen here.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in categorical_features
]
encoders = [
    OneHotEncoder(inputCols=[column + "_index"], outputCols=[column + "_vec"])
    for column in categorical_features
]
# One-hot vectors first, then the raw numerical columns, packed into "features".
assembler = VectorAssembler(
    inputCols=[column + "_vec" for column in categorical_features] + numerical_features,
    outputCol="features",
)

# Initialize the random forest classifier with the correct label column
rf = RandomForestClassifier(featuresCol="features", labelCol="Exited", seed=RANDOM_SEED)

# Combine all stages into a pipeline: index -> one-hot encode -> assemble -> classify
pipeline = Pipeline(stages=indexers + encoders + [assembler, rf])

# Fit the pipeline on the training data
model = pipeline.fit(train_df)

Make Predictions

In [48]:
# Make predictions on the test data
predictions = model.transform(test_df)

# Show some prediction results
predictions.select("id", "prediction").show()

+------+----------+
|    id|prediction|
+------+----------+
|165034|       0.0|
|165035|       1.0|
|165036|       0.0|
|165037|       0.0|
|165038|       0.0|
|165039|       0.0|
|165040|       0.0|
|165041|       0.0|
|165042|       1.0|
|165043|       0.0|
|165044|       0.0|
|165045|       0.0|
|165046|       0.0|
|165047|       0.0|
|165048|       0.0|
|165049|       0.0|
|165050|       0.0|
|165051|       0.0|
|165052|       0.0|
|165053|       0.0|
+------+----------+
only showing top 20 rows



Create Submission File

In [49]:
# Hand the scored test DataFrame to the project helper together with the same
# dataset string used for loading (presumably writes the submission CSV --
# confirm output columns and location in create_csv).
create_csv("playground-series-s4e1", predictions)