In [1]:
import numpy as np
import pandas as pd

In [2]:
filepath_train = "files/train.csv"
filepath_test = "files/test.csv"
sample = 'files/sample_submission.csv'
ss_titanic = pd.read_csv(filepath_train, index_col = 'PassengerId')

In [3]:
ss_titanic

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


## Clean and prepare data

In [24]:
def clean(filepath):
    # read in CSV, set index to PassengerID
    df = pd.read_csv(filepath, index_col = 'PassengerId')
    
    # drops null values
    df.dropna(inplace=True)
    
    # converts target column from bool to int
    df["Transported"] = df["Transported"].apply(lambda x: int(x))
    
    #split cabin into three columns: Deck, room number, and ship side
    df['Cabin'] = df['Cabin'].apply(lambda x: x.split('/'))
    df['deck'] = df['Cabin'].apply(lambda x: x[0])
    df['room_num'] = df['Cabin'].apply(lambda x: x[1])
    df['ship_side'] = df['Cabin'].apply(lambda x: x[2])
    df.drop(['Cabin'], axis=1, inplace=True)
    
    #convert columns with two categoaries into 0,1
    df['ship_side'] = df['ship_side'].replace('P', 1).replace('S', 0)
    df['HomePlanet'] = ss_titanic['HomePlanet'].replace('Earth', 0).replace('Europa', 1)
    df['CryoSleep'] = df['CryoSleep'].apply(lambda x: int(x))
    df['VIP'] = df['VIP'].apply(lambda x: int(x))
    
    # drop High cardinality columns
    df.drop(['Name'], axis=1, inplace=True)
    
    return df
    

In [25]:
df = clean(filepath_train)

In [26]:
df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,room_num,ship_side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,1,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,1
0002_01,0,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,0
0003_01,1,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,0
0003_02,1,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,0
0004_01,0,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,1,0,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,A,98,1
9278_01,0,1,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,0,G,1499,0
9279_01,0,0,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,G,1500,0
9280_01,1,0,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,E,608,0


## Split Data

In [28]:
from sklearn.model_selection import train_test_split

target = 'Transported'
X = df.drop(columns=target)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

## Baseline Accuracy

In [29]:
baseline_acc = y_train.value_counts(normalize=True).max()
print('Baseline Accuracy Score:', baseline_acc)

Baseline Accuracy Score: 0.5030280090840272


## Build Model

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

model_lr = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(),
            LogisticRegression())

model_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['HomePlanet', 'Destination', 'deck',
                                      'room_num'],
                                mapping=[{'col': 'HomePlanet',
                                          'data_type': dtype('O'),
                                          'mapping': 1       1
Mars    2
0       3
NaN    -2
dtype: int64},
                                         {'col': 'Destination',
                                          'data_type': dtype('O'),
                                          'mapping': 55 Cancri e      1
TRAPPIST-1e      2
PSO J318.5-22    3
NaN             -2
dtype: int64},
                                         {'col': 'deck',
                                          'data_type': dtype('O'),
                                          'mapping': A      1
C      2
F      3
G      4
B      5
D      6
E      7
T      8
NaN   -2
dtype: int64},
                                         {'col': 'roo

In [39]:
from sklearn.metrics import accuracy_score
training_acc = accuracy_score(y_train, model_lr.predict(X_train))
print("Training accuracy: ", training_acc)

val_acc = accuracy_score(y_val, model_lr.predict(X_val))
print("Validation Accuracy:", val_acc)

Training accuracy:  0.7848221044663134
Validation Accuracy: 0.7768532526475038


In [36]:
from sklearn.tree import DecisionTreeClassifier
model_dt = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(),
            DecisionTreeClassifier(random_state=42))
model_dt.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['HomePlanet', 'Destination', 'deck',
                                      'room_num'],
                                mapping=[{'col': 'HomePlanet',
                                          'data_type': dtype('O'),
                                          'mapping': 1       1
Mars    2
0       3
NaN    -2
dtype: int64},
                                         {'col': 'Destination',
                                          'data_type': dtype('O'),
                                          'mapping': 55 Cancri e      1
TRAPPIST-1e      2
PSO J318.5-22    3
NaN             -2
dtype: int64},
                                         {'col': 'deck',
                                          'data_type': dtype('O'),
                                          'mapping': A      1
C      2
F      3
G      4
B      5
D      6
E      7
T      8
NaN   -2
dtype: int64},
                                         {'col': 'roo

In [37]:
training_acc = accuracy_score(y_train, model_dt.predict(X_train))
print("Training accuracy: ", training_acc)

Training accuracy:  0.9998107494322483


In [38]:
val_acc = accuracy_score(y_val, model_dt.predict(X_val))
print("Validation Accuracy: ", val_acc)

Validation Accuracy:  0.7322239031770046
