In [2]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score

In [6]:
train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
train_df.shape

(8693, 14)

In [8]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [9]:
test_df.shape

(4277, 13)

In [10]:
train_profile = ProfileReport(train_df, title="Spaceship Titanic", explorative=True, dark_mode=True)

In [11]:
train_profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
train_copy = train_df.copy()

In [13]:
def split(x):
    if len(str(x).split('/'))<3:
        return ['Missing', 'Missing', 'Missing']
    else:
        return str(x).split('/')

In [14]:
def preprocessing(df):
    # Filling missing values in HomePlanet
    df['HomePlanet'].fillna('unknown', inplace=True)
    #df.dropna(inplace = True, axis=0, subset=['CryoSleep'])
    df['CryoSleep'].fillna('unknown', inplace=True)
    # Filling missing values in Destination
    df['Destination'].fillna('unknown', inplace=True)
    df['Cabin'] = df['Cabin'].apply(lambda x: split(x))
    df['Deck'] = df['Cabin'].apply(lambda x:x[0])
    df['Side'] = df['Cabin'].apply(lambda x:x[2])
    df.drop('Cabin', axis=1, inplace=True)
    #treating missing values in age
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    #df.dropna(inplace = True, axis=0, subset=['VIP'])
    df['VIP'].fillna('unknown', inplace=True)
    df['RoomService'].fillna(0, inplace=True)
    df['FoodCourt'].fillna(0, inplace=True)
    df['ShoppingMall'].fillna(0, inplace=True)
    df['Spa'].fillna(0, inplace=True)
    df['VRDeck'].fillna(0, inplace=True)
    df.drop('Name', axis=1, inplace = True)
    df.dropna(inplace=True)
    
    return df
    

In [15]:
preprocessing(train_copy)
train_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S


In [16]:
train_copy = pd.get_dummies(train_copy, columns=['HomePlanet', 'Destination',
                                                 'Deck', 'Side', 'VIP', 'CryoSleep'])
'''
train_copy['VIP'] = train_copy['VIP'].replace({True:1, False:0})
train_copy['CryoSleep'] = train_copy['CryoSleep'].replace({True:1, False:0})
train_copy['Transported'] = train_copy['Transported'].replace({True:1, False:0})
'''

"\ntrain_copy['VIP'] = train_copy['VIP'].replace({True:1, False:0})\ntrain_copy['CryoSleep'] = train_copy['CryoSleep'].replace({True:1, False:0})\ntrain_copy['Transported'] = train_copy['Transported'].replace({True:1, False:0})\n"

In [13]:
train_copy.dtypes

PassengerId                   object
Age                          float64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
Transported                     bool
HomePlanet_Earth               uint8
HomePlanet_Europa              uint8
HomePlanet_Mars                uint8
HomePlanet_unknown             uint8
Destination_55 Cancri e        uint8
Destination_PSO J318.5-22      uint8
Destination_TRAPPIST-1e        uint8
Destination_unknown            uint8
Deck_A                         uint8
Deck_B                         uint8
Deck_C                         uint8
Deck_D                         uint8
Deck_E                         uint8
Deck_F                         uint8
Deck_G                         uint8
Deck_Missing                   uint8
Deck_T                         uint8
Side_Missing                   uint8
Side_P                         uint8
S

In [17]:
X = train_copy.drop(['Transported','PassengerId'], axis=1)
y = train_copy['Transported']

In [18]:
X.shape

(8693, 32)

In [19]:
X.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_unknown', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_Missing', 'Deck_T', 'Side_Missing', 'Side_P',
       'Side_S', 'VIP_False', 'VIP_True', 'VIP_unknown', 'CryoSleep_False',
       'CryoSleep_True', 'CryoSleep_unknown'],
      dtype='object')

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state = 12)

In [21]:
X_train.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_unknown,...,Deck_T,Side_Missing,Side_P,Side_S,VIP_False,VIP_True,VIP_unknown,CryoSleep_False,CryoSleep_True,CryoSleep_unknown
7503,29.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,...,0,0,0,1,1,0,0,0,1,0
7300,26.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,...,0,0,0,1,1,0,0,0,1,0
1853,25.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,0,0,1,1,0,0,0,1,0
5962,28.82793,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
4805,27.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0


In [22]:
y_train.head()

7503    True
7300    True
1853    True
5962    True
4805    True
Name: Transported, dtype: bool

In [23]:
test_copy = test_df.copy()

In [24]:
preprocessing(test_copy)
test_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S


In [25]:
test_copy = pd.get_dummies(test_copy, columns=['HomePlanet', 'Destination', 'Deck',
                                               'Side', 'VIP', 'CryoSleep'])
'''
test_copy['VIP'] = test_copy['VIP'].replace({True:1, False:0})
test_copy['CryoSleep'] = test_copy['CryoSleep'].replace({True:1, False:0})
'''

"\ntest_copy['VIP'] = test_copy['VIP'].replace({True:1, False:0})\ntest_copy['CryoSleep'] = test_copy['CryoSleep'].replace({True:1, False:0})\n"

In [26]:
test_copy.drop('PassengerId', inplace=True, axis=1)
test_copy.shape

(4277, 32)

In [27]:
test_copy.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_unknown', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_Missing', 'Deck_T', 'Side_Missing', 'Side_P',
       'Side_S', 'VIP_False', 'VIP_True', 'VIP_unknown', 'CryoSleep_False',
       'CryoSleep_True', 'CryoSleep_unknown'],
      dtype='object')

**Setting up ML Pipeline**

In [28]:
pipelines = {
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=12)),
    'ex' : make_pipeline(StandardScaler(), ExtraTreesClassifier(random_state=12)),
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=12)),
    'dt' : make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=12)), 
}

In [29]:
grid = {
    'rf' : {
        'randomforestclassifier__n_estimators':[100, 200, 300]
    },
    'ex' : {
        'extratreesclassifier__n_estimators':[100,200, 300]
    },
    'gb' : {
        'gradientboostingclassifier__n_estimators':[100,200, 300]
    },
    'dt' : {
        'decisiontreeclassifier__criterion': ['gini', 'entropy']
    }
}

In [30]:
fit_model = {}
for algo, pipeline in pipelines.items():
    print(f'Training the {algo} model.')
    model = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10)
    model.fit(X_train, y_train)
    fit_model[algo] = model

Training the rf model.
Training the ex model.
Training the gb model.
Training the dt model.


In [31]:
fit_model

{'rf': GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                        ('randomforestclassifier',
                                         RandomForestClassifier(random_state=12))]),
              n_jobs=-1,
              param_grid={'randomforestclassifier__n_estimators': [100, 200,
                                                                   300]}),
 'ex': GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                        ('extratreesclassifier',
                                         ExtraTreesClassifier(random_state=12))]),
              n_jobs=-1,
              param_grid={'extratreesclassifier__n_estimators': [100, 200, 300]}),
 'gb': GridSearchCV(cv=10,
              estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                        ('gradientboostingclassifier',
                             

In [32]:
for algo, model in fit_model.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    #r2 = r2_score(y_test, y_pred)
    print(f'Metrics for {algo}:Accuracy : {acc},Precision : {prec}, recall : {recall}')
    

Metrics for rf:Accuracy : 0.7843588269120184,Precision : 0.8134777376654633, recall : 0.7544642857142857
Metrics for ex:Accuracy : 0.7705577918343876,Precision : 0.8049079754601227, recall : 0.7321428571428571
Metrics for gb:Accuracy : 0.7952846463484762,Precision : 0.7980132450331126, recall : 0.8069196428571429
Metrics for dt:Accuracy : 0.7326049453709028,Precision : 0.7370737073707371, recall : 0.7477678571428571


In [33]:
import pickle
with open('gradientboosted.pkl', 'wb') as f:
    pickle.dump(fit_model['gb'], f)

In [34]:
with open('gradientboosted.pkl', 'rb') as f:
    Model = pickle.load(f)

In [35]:
Model

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier(random_state=12))]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__n_estimators': [100, 200,
                                                                      300]})

In [36]:
sub = Model.predict(test_copy)

In [37]:
submission = pd.DataFrame([test_df.PassengerId, sub]).T
submission.columns = ['PassengerId', 'Transported']

In [38]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [39]:
submission.to_csv('Submission.csv')