# Spaceship Titanic dataset analysis

Import required packages

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the dataset

In [4]:
# Read the training split from the local "data" directory into a DataFrame.
filename = "train.csv"
data_path = os.path.join("data", filename)
df = pd.read_csv(data_path)

# Inspect the dataset

In [9]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [24]:
# Report the frame's dimensions and per-column dtypes, each followed by a blank line.
print(f"Shape: {df.shape}", "", f"Datatypes:\n{df.dtypes}", "", sep="\n")

Shape: (8693, 14)

Datatypes:
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object



## Inspect missing values

In [19]:
# Show every passenger record that has at least one missing field.
has_missing = df.isna().any(axis=1)
df[has_missing]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False
16,0014_01,Mars,False,F/3/P,55 Cancri e,27.0,False,1286.0,122.0,,0.0,0.0,Flats Eccle,False
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,0.0,0.0,Mollen Mcfaddennon,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,9250_01,Europa,False,E/597/P,TRAPPIST-1e,29.0,False,0.0,2972.0,,28.0,188.0,Chain Reedectied,True
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False
8675,9259_01,Earth,,F/1893/P,TRAPPIST-1e,44.0,False,1030.0,1015.0,0.0,11.0,,Annah Gilleyons,True
8684,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True


In [20]:
# How many rows have 1, 2, 3, ... missing fields (complete rows are excluded).
missing_per_row = df.isna().sum(axis=1)
missing_per_row[missing_per_row > 0].value_counts()

1    1867
2     203
3      17
Name: count, dtype: int64

In [16]:
# Among passengers whose HomePlanet is missing, count the missing values in each column.
unknown_home = df["HomePlanet"].isna()
df.loc[unknown_home].isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep         2
Cabin             6
Destination       4
Age               2
VIP               3
RoomService       6
FoodCourt         2
ShoppingMall      7
Spa               3
VRDeck            1
Name              7
Transported       0
dtype: int64

In [32]:
# Frequency of each CryoSleep value; missing entries are left out of the tally.
df["CryoSleep"].value_counts(dropna=True)

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

In [40]:
# Summary statistics of the numeric columns for passengers flagged as being in cryosleep.
# Rows with a missing CryoSleep value compare unequal to True and are therefore excluded.
in_cryosleep = df["CryoSleep"].eq(True)
df.loc[in_cryosleep].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,2955.0,2969.0,2967.0,2941.0,2972.0,2975.0
mean,27.405415,0.0,0.0,0.0,0.0,0.0
std,15.080469,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,0.0,0.0,0.0,0.0,0.0
max,78.0,0.0,0.0,0.0,0.0,0.0


Conclusions:
 - Dataset contains 8693 rows described by 14 attributes
 - Columns are either floats or `object` dtype (strings, plus the boolean-like 'CryoSleep' and 'VIP' columns). Only the target column is a true bool.
 - Almost all columns contain around 200 missing values. The majority of rows have 0 or 1 missing values.
    - 1867 rows have one missing value. Because these rows make up a large share of the dataset (about 21%), some imputation will be needed.
    - Only 203 rows have 2 missing values and only 17 rows have 3. No row has more than 3 missing values.
 - Column 'Transported' is target, that we have to predict.
 - The 'PassengerId' column contains two numbers: the first identifies the group (ticket number?) the passenger is travelling with; the second is their number within that group.
 - The 'CryoSleep' column indicates whether the passenger is confined to their cabin. Those passengers don't spend any money - this information may be helpful for data imputation.
 - A VIP passenger has paid for special services.
 - The last name of a passenger may indicate whether the passengers in a group are a family.
 

Things to do during feature engineering and data cleaning phase:
 - impute missing values or remove rows containing them.
 - divide columns PassengerId and Cabin into more atomic attributes
 - bucketize 'Age' and expenses columns
 - Create new features: 'is_travelling_alone', 'is_travelling_with_family'

# Data preparation

Data cleaning and feature engineering that will facilitate analysis and help to get better predictions.

## Impute values

### Rows with three missing values

In [58]:
# Rows in which exactly three fields are missing.
# NOTE(review): the saved output lists derived columns (TicketId, LastName, Deck, ...)
# that are only created in later cells, so it appears to come from an out-of-order run.
# In that state a missing Name is presumably also counted as a missing LastName;
# re-run on a fresh kernel to confirm the displayed rows.
three_missing = df.isna().sum(axis=1).eq(3)
df[three_missing]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,"(TicketId, InvidualId)",TicketId,InvidualId,LastName,Deck,Num,Side
813,0865_01,Europa,True,B/35/S,55 Cancri e,36.0,False,0.0,,0.0,...,0.0,,True,"[0865, 01]",865,1,,B,35,S
833,0890_01,Mars,False,F/184/P,TRAPPIST-1e,16.0,False,172.0,,338.0,...,0.0,,False,"[0890, 01]",890,1,,F,184,P
1203,1284_01,Mars,True,F/247/S,,,False,0.0,,0.0,...,0.0,Hal Knité,True,"[1284, 01]",1284,1,Knité,F,247,S
1207,1287_02,Mars,True,F/259/P,55 Cancri e,4.0,False,0.0,0.0,0.0,...,0.0,,True,"[1287, 02]",1287,2,,F,259,P
1616,1711_01,Earth,,F/327/S,TRAPPIST-1e,51.0,False,437.0,244.0,908.0,...,0.0,,False,"[1711, 01]",1711,1,,F,327,S
1617,1712_01,Mars,,F/339/P,TRAPPIST-1e,17.0,False,139.0,0.0,959.0,...,0.0,,False,"[1712, 01]",1712,1,,F,339,P
1718,1829_01,Europa,False,D/56/P,55 Cancri e,48.0,False,0.0,3578.0,80.0,...,,,True,"[1829, 01]",1829,1,,D,56,P
1726,1834_01,Mars,False,F/354/S,,26.0,False,17.0,0.0,1582.0,...,0.0,,True,"[1834, 01]",1834,1,,F,354,S
1755,1865_04,Earth,False,G/292/S,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,...,,,False,"[1865, 04]",1865,4,,G,292,S
1855,1978_01,,True,G/311/S,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,...,0.0,,True,"[1978, 01]",1978,1,,G,311,S


### Impute names

### Impute Age

### Impute CryoSleep

### Impute Cabin

### Impute Destination

### Impute HomePlanet

### Impute Expenses

If a passenger was in cryosleep, all missing values in these columns will be filled with `0`. Otherwise they will be filled with the mean value for passengers with a similar `Age` and `VIP` status.

In [57]:
# For each spending column, look at the rows where the amount is missing and
# tally how many of those passengers were (or were not) in cryosleep.
expenses_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for col in expenses_columns:
    cryo_when_missing = df.loc[df[col].isna(), "CryoSleep"].value_counts()
    # Column name, its tally, then a blank separator line.
    print(col, cryo_when_missing, "", sep="\n")

RoomService
CryoSleep
False    111
True      68
Name: count, dtype: int64

FoodCourt
CryoSleep
False    110
True      70
Name: count, dtype: int64

ShoppingMall
CryoSleep
False    104
True      96
Name: count, dtype: int64

Spa
CryoSleep
False    113
True      65
Name: count, dtype: int64

VRDeck
CryoSleep
False    119
True      62
Name: count, dtype: int64



## Create new columns

### Divide id column 

In [48]:
# Break PassengerId at the underscore into its two parts and store them as new columns.
id_parts = df["PassengerId"].str.split("_", expand=True)
df[["TicketId", "InvidualId"]] = id_parts

### Divide name column

In [53]:
# Split the full name into first name and surname.
# Splitting from the right, at most once, never yields more than two columns, so a
# name with extra words (e.g. a middle name) keeps its final token as LastName
# instead of breaking the two-column assignment. Two-word names give the same
# result as a plain split, and missing names stay missing in both columns.
df[["FirstName", "LastName"]] = df["Name"].str.rsplit(" ", n=1, expand=True)

Actually, I don't think that we need FirstName for this analysis

In [55]:
# FirstName is not used in the analysis, so discard it and keep only LastName.
df = df.drop(["FirstName"], axis="columns")

### Divide cabin column

In [56]:
# Break the slash-separated Cabin value into its three components.
cabin_parts = df["Cabin"].str.split("/", expand=True)
df[["Deck", "Num", "Side"]] = cabin_parts

## Bin continuous values

# Data analysis

# Predictions - stage 1

# Predictions - stage 2