## Exploration

In [297]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [298]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [300]:
titanic = pd.read_csv("train.csv")
titanic.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [318]:
titanic.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
5495,5860_03,Mars,False,F/1118/S,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Chex Raste,True
3481,3745_01,Earth,False,G/608/P,TRAPPIST-1e,19.0,False,,2.0,14.0,755.0,0.0,Ellena Mckinsond,False
2747,2949_01,Earth,True,G/482/P,TRAPPIST-1e,22.0,False,0.0,0.0,0.0,,0.0,Florey Boltertley,False
5631,5986_01,Europa,,D/188/P,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Zubeneb Flesping,True
8318,8881_01,Earth,False,F/1719/S,55 Cancri e,34.0,False,0.0,0.0,717.0,0.0,0.0,Timmy Garnettiz,True


In [303]:
titanic.corr(numeric_only=True)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
Age,1.0,0.068723,0.130421,0.033133,0.12397,0.101007,-0.075026
RoomService,0.068723,1.0,-0.015889,0.05448,0.01008,-0.019581,-0.244611
FoodCourt,0.130421,-0.015889,1.0,-0.014228,0.221891,0.227995,0.046566
ShoppingMall,0.033133,0.05448,-0.014228,1.0,0.013879,-0.007322,0.010141
Spa,0.12397,0.01008,0.221891,0.013879,1.0,0.153821,-0.221131
VRDeck,0.101007,-0.019581,0.227995,-0.007322,0.153821,1.0,-0.207075
Transported,-0.075026,-0.244611,0.046566,0.010141,-0.221131,-0.207075,1.0


### Explore cabin column

In [304]:
titanic.sample(10)["Cabin"]

1250      E/97/S
6954    G/1198/S
7271     D/246/P
3058     G/525/S
6526    F/1319/S
4743     C/196/S
519       C/20/P
8415     B/291/P
962      G/152/P
8505     E/595/S
Name: Cabin, dtype: object

In [305]:
split_cabin = titanic["Cabin"].apply(lambda x: pd.Series(str(x).split("/")))
split_cabin

Unnamed: 0,0,1,2
0,B,0,P
1,F,0,S
2,A,0,S
3,A,0,S
4,F,1,S
...,...,...,...
8688,A,98,P
8689,G,1499,S
8690,G,1500,S
8691,E,608,S


In [306]:
print(split_cabin[0].unique())
print(split_cabin[1].unique())
print(len(split_cabin[1]))
print(type(split_cabin[1][42]))
print(split_cabin[2].unique())

['B' 'F' 'A' 'G' 'nan' 'E' 'D' 'C' 'T']
['0' '1' '2' ... '1892' '1893' '1894']
8693
<class 'str'>
['P' 'S' nan]


## Preparation

In [307]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def transform_feature_data(raw_df):
    df = raw_df.drop(columns=['PassengerId', 'Name'], axis='columns')
    df[['Cabin_1', 'Cabin_2', 'Cabin_3']] = df["Cabin"].apply(lambda x: pd.Series(str(x).split("/")))
    df = df.drop(columns=['Cabin', 'Cabin_2'], axis='columns')

    categorical_cols = df.select_dtypes("object").columns.tolist() + df.select_dtypes("bool").columns.tolist()
    numerical_cols = df.select_dtypes("number").columns.tolist()

    assert len(df.columns) == len(numerical_cols) + len(categorical_cols)

    categorical_steps = [
        ("one_hot_encoder", OneHotEncoder()), 
        ('categorical_imputer', SimpleImputer(strategy="most_frequent"))
    ]
    numerical_steps = [
        ('numerical_imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler())
    ]
    
    categorical_pipeline = Pipeline(categorical_steps)
    numerical_pipeline = Pipeline(numerical_steps)

    transformer = ColumnTransformer(transformers=[
        ('number_transformer', numerical_pipeline, numerical_cols),
        ('category_transformer', categorical_pipeline, categorical_cols)
    ])

    return transformer.fit_transform(df)

def transform_target_data(y):
    return y.astype(int) ez nem egy classification problema? probald randomforestclassifier-rel

In [308]:
from sklearn.model_selection import train_test_split

X = titanic.drop('Transported', axis=1)
y = titanic.Transported

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [309]:
X_train = transform_feature_data(X_train)
X_test = transform_feature_data(X_test)

y_train = transform_target_data(y_train)
y_test = transform_target_data(y_test)

In [310]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6954, 32)
(1739, 31)
(6954,)
(1739,)


## Training


In [311]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [10, 100]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
grid_search_rand_forest = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True, n_jobs=8)
grid_search_rand_forest.fit(X_train, y_train)

In [312]:
print(grid_search_rand_forest.best_params_)
print(grid_search_rand_forest.best_score_)

{'n_estimators': 100}
-0.14795131912836026


## Submission

In [313]:
titanic_submission_data = pd.read_csv("test.csv")
passenger_ids = titanic_submission_data.PassengerId

In [314]:
X = transform_feature_data(titanic_submission_data)
submission_predictions = grid_search_rand_forest.predict(X)

In [315]:
submission_predictions[submission_predictions > 0.5] = True
submission_predictions[submission_predictions <= 0.5] = False
submission_predictions = [bool(prediction) for prediction in submission_predictions]
submission_predictions[0:10]

[True, False, True, True, True, True, True, True, True, True]

In [316]:
submission_predictions = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': submission_predictions})
submission_predictions

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [317]:
submission_predictions.to_csv("submission.csv", index=False)

# Ideas, questions:
 
- what about stratification