Data Load

In [458]:
from pyspark.sql import SparkSession

# Get (or reuse) the Spark session used throughout this notebook
spark = SparkSession.builder.appName('KaggleData').getOrCreate()

# Treat the first line of each CSV as the header and let Spark infer column types
csv_options = {"header": True, "inferSchema": True}
train_df = spark.read.csv('../data/playground-series-s3e24/train.csv', **csv_options)
test_df = spark.read.csv('../data/playground-series-s3e24/test.csv', **csv_options)

# Print the first rows as a quick check that the training data loaded
train_df.show()

+---+---+----------+----------+---------+--------------+---------------+-------------+--------------+--------+----------+-------------------+-----------+------------+---+---+----------+-------------+----------------+---+---+---+-------------+-------+
| id|age|height(cm)|weight(kg)|waist(cm)|eyesight(left)|eyesight(right)|hearing(left)|hearing(right)|systolic|relaxation|fasting blood sugar|Cholesterol|triglyceride|HDL|LDL|hemoglobin|Urine protein|serum creatinine|AST|ALT|Gtp|dental caries|smoking|
+---+---+----------+----------+---------+--------------+---------------+-------------+--------------+--------+----------+-------------------+-----------+------------+---+---+----------+-------------+----------------+---+---+---+-------------+-------+
|  0| 55|       165|        60|     81.0|           0.5|            0.6|            1|             1|     135|        87|                 94|        172|         300| 40| 75|      16.5|            1|             1.0| 22| 25| 27|            0|     

Exploratory Data Analysis (EDA)

In [459]:
# Compute Spark's built-in summary statistics for the columns of train_df
# and print the resulting table.
summary_stats = train_df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+-------------------+
|summary|               id|               age|       height(cm)|        weight(kg)|        waist(cm)|    eyesight(left)|   eyesight(right)|      hearing(left)|     hearing(right)|          systolic|       relaxation|fasting blood sugar|       Cholesterol|      triglyceride|               HDL|               LDL|        hemoglobin|      Urine protein|   serum creatinine|               AST|               ALT|               Gtp|      dental caries|            smoking|
+-------+-----------------+------------------+----------------

In [460]:
# Sampling parameters: the fraction of rows to keep and a fixed seed for the draw
seed = 42
fraction = 0.01

# Sample without replacement on the Spark side, then pull only the small
# sample into Pandas so it can be used for visualization
sampled_df = train_df.sample(False, fraction, seed)
sampled_pd_df = sampled_df.toPandas()

In [461]:
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession

# Convert the Spark DataFrame to a Pandas DataFrame
# df_pandas = train_df.toPandas()

# Use Seaborn to create a pair plot
# sns.pairplot(sampled_pd_df, hue='smoking')

# Show the plot
# plt.show()

Check for missing values

In [462]:
from pyspark.sql.functions import col, count, isnan, when

# Report the share of missing values (empty string, NULL, or NaN) per column.
# The row count does not depend on the column, so compute it once up front.
total_count = train_df.count()

# Count the missing entries of every column in a single aggregation instead of
# launching a separate filter/count job per column. `when` without `otherwise`
# yields NULL for rows that do not match, and `count` skips NULLs, so each
# aggregate is the number of rows in which that column is missing.
# The loop variable is `column_name` rather than `col`, so the `col` function
# imported from pyspark.sql.functions is no longer overwritten.
missing_counts = train_df.select([
    count(
        when(
            (train_df[column_name] == "")
            | train_df[column_name].isNull()
            | isnan(train_df[column_name]),
            1,
        )
    ).alias(column_name)
    for column_name in train_df.columns
]).first()

for column_name in train_df.columns:
    missing_count = missing_counts[column_name]
    # Guard against an empty DataFrame, which would otherwise divide by zero.
    missing_pct = (missing_count / total_count) * 100 if total_count else 0.0
    print(f"Column {column_name} has {missing_pct}% missing values")


Column id has 0.0% missing values
Column age has 0.0% missing values
Column height(cm) has 0.0% missing values
Column weight(kg) has 0.0% missing values
Column waist(cm) has 0.0% missing values
Column eyesight(left) has 0.0% missing values
Column eyesight(right) has 0.0% missing values
Column hearing(left) has 0.0% missing values
Column hearing(right) has 0.0% missing values
Column systolic has 0.0% missing values
Column relaxation has 0.0% missing values
Column fasting blood sugar has 0.0% missing values
Column Cholesterol has 0.0% missing values
Column triglyceride has 0.0% missing values
Column HDL has 0.0% missing values
Column LDL has 0.0% missing values
Column hemoglobin has 0.0% missing values
Column Urine protein has 0.0% missing values
Column serum creatinine has 0.0% missing values
Column AST has 0.0% missing values
Column ALT has 0.0% missing values
Column Gtp has 0.0% missing values
Column dental caries has 0.0% missing values
Column smoking has 0.0% missing values


Data Cleaning and Feature Encoding

In [463]:
# Print the inferred column names and types to decide how each column should be encoded.
train_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- height(cm): integer (nullable = true)
 |-- weight(kg): integer (nullable = true)
 |-- waist(cm): double (nullable = true)
 |-- eyesight(left): double (nullable = true)
 |-- eyesight(right): double (nullable = true)
 |-- hearing(left): integer (nullable = true)
 |-- hearing(right): integer (nullable = true)
 |-- systolic: integer (nullable = true)
 |-- relaxation: integer (nullable = true)
 |-- fasting blood sugar: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- triglyceride: integer (nullable = true)
 |-- HDL: integer (nullable = true)
 |-- LDL: integer (nullable = true)
 |-- hemoglobin: double (nullable = true)
 |-- Urine protein: integer (nullable = true)
 |-- serum creatinine: double (nullable = true)
 |-- AST: integer (nullable = true)
 |-- ALT: integer (nullable = true)
 |-- Gtp: integer (nullable = true)
 |-- dental caries: integer (nullable = true)
 |-- smoking: integer (n

In [464]:
# Count how many distinct values the "Urine protein" column takes.
unique_values_count = train_df.select("Urine protein").distinct().count()

# The previous message contained a stray, unbalanced quote; name the column explicitly.
print(f"Number of unique values in 'Urine protein': {unique_values_count}")


Number of unique values': 6


In [465]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Columns passed into the feature vector as-is
numeric_feature_cols = [
    "age", "height(cm)", "weight(kg)", "waist(cm)", "eyesight(left)", "eyesight(right)",
    "systolic", "relaxation", "fasting blood sugar", "Cholesterol", "triglyceride", "HDL", "LDL",
    "hemoglobin", "Urine protein", "serum creatinine", "AST", "ALT", "Gtp",
]

# One-hot vector columns produced by the encoders below
encoded_feature_cols = ["dentalCariesVec", "hearingLeftVec", "hearingRightVec"]

# Step 1: map each categorical column to a category index
indexer1 = StringIndexer(outputCol="dentalCariesIndex", inputCol="dental caries")
indexer3 = StringIndexer(outputCol="hearingLeftIndex", inputCol="hearing(left)")
indexer4 = StringIndexer(outputCol="hearingRightIndex", inputCol="hearing(right)")

# Step 2: one-hot encode each category index
encoder1 = OneHotEncoder(outputCol="dentalCariesVec", inputCol="dentalCariesIndex")
encoder3 = OneHotEncoder(outputCol="hearingLeftVec", inputCol="hearingLeftIndex")
encoder4 = OneHotEncoder(outputCol="hearingRightVec", inputCol="hearingRightIndex")

# Step 3: concatenate the numeric columns and the one-hot vectors into "features"
assembler = VectorAssembler(
    inputCols=numeric_feature_cols + encoded_feature_cols,
    outputCol="features",
)

# Chain the stages: all indexers, then all encoders, then the assembler
pipeline = Pipeline(
    stages=[indexer1, indexer3, indexer4, encoder1, encoder3, encoder4, assembler]
)

Train Model

In [466]:
from pyspark.ml.classification import LogisticRegression

# Learn the indexer/encoder mappings from the training rows, then use them to
# add the "features" vector column to the training data
pipeline_model = pipeline.fit(train_df)
train_df_transformed = pipeline_model.transform(train_df)

# Fit a logistic regression that predicts "smoking" from "features"
lr = LogisticRegression(labelCol='smoking', featuresCol='features')
lr_model = lr.fit(train_df_transformed)

Create Predictions

In [467]:
# Reuse the pipeline that was already fitted on the training data. Re-fitting
# on test_df would let each StringIndexer learn its own label-to-index mapping
# from the test rows (and possibly a different one-hot width), so the feature
# vectors would no longer be guaranteed to line up with the ones lr_model was
# trained on.
test_df_transformed = pipeline_model.transform(test_df)

In [468]:
# Score the transformed test rows with the fitted logistic regression model
predictions = lr_model.transform(test_df_transformed)

# Preview the row id next to its predicted class
predictions.select(["id", "prediction"]).show()

+------+----------+
|    id|prediction|
+------+----------+
|159256|       0.0|
|159257|       0.0|
|159258|       1.0|
|159259|       0.0|
|159260|       1.0|
|159261|       1.0|
|159262|       0.0|
|159263|       0.0|
|159264|       0.0|
|159265|       0.0|
|159266|       0.0|
|159267|       0.0|
|159268|       0.0|
|159269|       0.0|
|159270|       0.0|
|159271|       1.0|
|159272|       1.0|
|159273|       1.0|
|159274|       0.0|
|159275|       0.0|
+------+----------+
only showing top 20 rows



Create Submission File

In [469]:
import pandas as pd

# Destination of the submission file, next to the competition data
output_csv_path = "../data/playground-series-s3e24/submission.csv"

# Only the id and the predicted class are needed; collect just those two
# columns from Spark into a Pandas DataFrame
pandas_df = predictions.select("id", "prediction").toPandas()

# NOTE(review): the file is written with the columns "id" and "prediction" and
# hard 0.0/1.0 class labels; confirm this matches the header and value type the
# competition expects before uploading.
pandas_df.to_csv(output_csv_path, index=False)