In [1]:
import numpy as np
import pandas as pd

In [2]:
filepath_train = "files/train.csv"
filepath_test = "files/test.csv"
sample = 'files/sample_submission.csv'
ss_titanic = pd.read_csv(filepath_train, index_col = 'PassengerId')

In [3]:
ss_titanic

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


## Clean and prepare data

In [4]:
def clean(filepath):
    # read in CSV, set index to PassengerID
    df = pd.read_csv(filepath, index_col = 'PassengerId')

    
    #split cabin into three columns: Deck, room number, and ship side
    df['Cabin'] = df['Cabin'].apply(lambda x: x.split('/'))
    df['deck'] = df['Cabin'].apply(lambda x: x[0])
    df['room_num'] = df['Cabin'].apply(lambda x: x[1])
    df['ship_side'] = df['Cabin'].apply(lambda x: x[2])
    df.drop(['Cabin'], axis=1, inplace=True)
    
    #convert columns with two categoaries into 0,1
    df['ship_side'] = df['ship_side'].replace('P', 1).replace('S', 0)
    df['HomePlanet'] = ss_titanic['HomePlanet'].replace('Earth', 0).replace('Europa', 1)
    df['CryoSleep'] = df['CryoSleep'].apply(lambda x: int(x))
    df['VIP'] = df['VIP'].apply(lambda x: int(x))
    
    # drop High cardinality columns
    df.drop(['Name'], axis=1, inplace=True)
    
    return df
    

In [5]:
df = clean(filepath_train)

AttributeError: 'float' object has no attribute 'split'

In [None]:
df

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

target = 'Transported'
X = df.drop(columns=target)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42)

## Baseline Accuracy

In [None]:
baseline_acc = y_train.value_counts(normalize=True).max()
print('Baseline Accuracy Score:', baseline_acc)

## Build Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

model_lr = make_pipeline(
            OneHotEncoder(),
            SimpleImputer(),
            LogisticRegression())

model_lr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
training_acc = accuracy_score(y_train, model_lr.predict(X_train))
print("Training accuracy: ", training_acc)

val_acc = accuracy_score(y_val, model_lr.predict(X_val))
print("Validation Accuracy:", val_acc)

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_dt = make_pipeline(
            OneHotEncoder(),
            SimpleImputer(),
            DecisionTreeClassifier(random_state=42))
model_dt.fit(X_train, y_train)

In [None]:
training_acc = accuracy_score(y_train, model_dt.predict(X_train))
print("Training accuracy: ", training_acc)

In [None]:
val_acc = accuracy_score(y_val, model_dt.predict(X_val))
print("Validation Accuracy: ", val_acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
model_rfc = make_pipeline(
            OneHotEncoder(),
            SimpleImputer(),
            RandomForestClassifier(random_state=42))
model_rfc.fit(X_train, y_train)

training_acc = accuracy_score(y_train, model_rfc.predict(X_train))
val_acc = accuracy_score(y_val, model_rfc.predict(X_val))

print("Training accuracy: ", training_acc)
print("Validation accuracy: ", val_acc)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
model_gb = make_pipeline(
            OneHotEncoder(),
            SimpleImputer(),
            GradientBoostingClassifier(random_state=42))
model_gb.fit(X_train, y_train)

training_acc = accuracy_score(y_train, model_gb.predict(X_train))
val_acc = accuracy_score(y_val, model_gb.predict(X_val))

print("Training Accuracy: ", training_acc)
print("Validation Accuracy: ", val_acc)

In [None]:
from xgboost import XGBClassifier
model_xgb = make_pipeline(
            OneHotEncoder(),
            SimpleImputer(),
            StandardScaler(),
            XGBClassifier(booster='dart'))
model_xgb.fit(X_train, y_train)

training_acc = accuracy_score(y_train, model_xgb.predict(X_train))
val_acc = accuracy_score(y_val, model_xgb.predict(X_val))

print("Training Accuracy: ", training_acc)
print("Validation Accuracy: ", val_acc)

In [None]:
test_df = clean(filepath_test)

In [None]:
predictions = pd.DataFrame(data=model_xgb.predict(test_df), index = test_df.index)

In [None]:
predictions.rename(columns={0:'Transported'}, inplace = True)

In [None]:
predictions

In [None]:
predictions.to_csv("new_submission.csv")