# Kaggle - Space Titanic - Spark framework

This notebooks is used as practice for ML applied to Kaggle Space Titanic training problem and pySpark practice
https://www.kaggle.com/competitions/spaceship-titanic/data?select=train.csv

In [76]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split
from pyspark.ml import pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import pandas as pd

## Import data to local Spark Framework

Load data from CSV and organize schema

In [98]:
# set env viroment for localhost if needed
spark = SparkSession.builder.config("spark.driver.host", "localhost").getOrCreate()

train_spark = spark.read.format('csv').option("header",True).load('./train.csv')
test_spark = spark.read.format('csv').option("header",True).load('./test.csv')

In [99]:
train_spark.show()
train_spark.printSchema()

+-----------+----------+---------+-----+-------------+----+-----+-----------+---------+------------+------+------+------------------+-----------+
|PassengerId|HomePlanet|CryoSleep|Cabin|  Destination| Age|  VIP|RoomService|FoodCourt|ShoppingMall|   Spa|VRDeck|              Name|Transported|
+-----------+----------+---------+-----+-------------+----+-----+-----------+---------+------------+------+------+------------------+-----------+
|    0001_01|    Europa|    False|B/0/P|  TRAPPIST-1e|39.0|False|        0.0|      0.0|         0.0|   0.0|   0.0|   Maham Ofracculy|      False|
|    0002_01|     Earth|    False|F/0/S|  TRAPPIST-1e|24.0|False|      109.0|      9.0|        25.0| 549.0|  44.0|      Juanna Vines|       True|
|    0003_01|    Europa|    False|A/0/S|  TRAPPIST-1e|58.0| True|       43.0|   3576.0|         0.0|6715.0|  49.0|     Altark Susent|      False|
|    0003_02|    Europa|    False|A/0/S|  TRAPPIST-1e|33.0|False|        0.0|   1283.0|       371.0|3329.0| 193.0|      Sola

## Data wrangling

In [118]:
# cast types
train_spark = train_spark.withColumn("CryoSleep",col("CryoSleep").cast("boolean"))
train_spark = train_spark.withColumn("VIP",col("VIP").cast("boolean"))
train_spark = train_spark.withColumn("Transported",col("Transported").cast("boolean"))
train_spark = train_spark.withColumn("Age",col("Age").cast("float"))
train_spark = train_spark.withColumn("RoomService",col("RoomService").cast("float"))
train_spark = train_spark.withColumn("FoodCourt",col("FoodCourt").cast("float"))
train_spark = train_spark.withColumn("ShoppingMall",col("ShoppingMall").cast("float"))
train_spark = train_spark.withColumn("Spa",col("Spa").cast("float"))
train_spark = train_spark.withColumn("VRDeck",col("VRDeck").cast("float"))

# breakdown cabin address
train_spark =  train_spark.withColumn('deck', split(train_spark['Cabin'], '/').getItem(0)) \
       .withColumn('num', split(train_spark['Cabin'], '/').getItem(1)) \
       .withColumn('side', split(train_spark['Cabin'], '/').getItem(2))
train_spark = train_spark.withColumn("num",col("num").cast("float"))

# one hot encode
train_spark_ohe = StringIndexer(inputCol='HomePlanet', outputCol='HomePlanet_idx').fit(train_spark).transform(train_spark)
train_spark_ohe = StringIndexer(inputCol='Destination', outputCol='Destination_idx').fit(train_spark_ohe).transform(train_spark_ohe)
train_spark_ohe = StringIndexer(inputCol='deck', outputCol='deck_idx', handleInvalid='keep').fit(train_spark_ohe).transform(train_spark_ohe)
train_spark_ohe = StringIndexer(inputCol='side', outputCol='side_idx', handleInvalid='keep').fit(train_spark_ohe).transform(train_spark_ohe)

# normalize numerical data


train_spark_ohe.show()
train_spark_ohe.printSchema()

+-----------+----------+---------+-----+-------------+----+-----+-----------+---------+------------+------+------+------------------+-----------+-------+-------+-------+----+----+----+--------------+---------------+--------+--------+
|PassengerId|HomePlanet|CryoSleep|Cabin|  Destination| Age|  VIP|RoomService|FoodCourt|ShoppingMall|   Spa|VRDeck|              Name|Transported|Cabin_1|Cabin_2|Cabin_3|deck| num|side|HomePlanet_idx|Destination_idx|deck_idx|side_idx|
+-----------+----------+---------+-----+-------------+----+-----+-----------+---------+------------+------+------+------------------+-----------+-------+-------+-------+----+----+----+--------------+---------------+--------+--------+
|    0001_01|    Europa|    false|B/0/P|  TRAPPIST-1e|39.0|false|        0.0|      0.0|         0.0|   0.0|   0.0|   Maham Ofracculy|      false|      B|      0|      P|   B| 0.0|   P|           1.0|            0.0|     3.0|     1.0|
|    0002_01|     Earth|    false|F/0/S|  TRAPPIST-1e|24.0|false

In [None]:
# onehot encode and normalization



In [115]:
train_spark_ohe.select("Cabin_1").distinct().show()

+-------+
|Cabin_1|
+-------+
|      F|
|      E|
|      T|
|      B|
|      D|
|      C|
|      A|
|      G|
|   NULL|
+-------+



In [42]:
Correlation.corr(column=)

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Transported']