In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the Titanic train/test CSV files from the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('titanictrain.csv', 'titanictest.csv')
)

# X_train is the same DataFrame object as train_data (no copy is made), so it
# still carries the 'Survived' column; y_train is that column on its own.
X_train = train_data
y_train = X_train['Survived']

In [3]:
# Preview the first five rows of the training frame.
X_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Per-column dtypes and non-null counts, to see which columns have missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
X_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Number of rows for each value of 'Sex'.
X_train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
# Number of rows for each passenger class.
X_train['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:
# Number of rows for each embarkation port (missing values are not counted).
X_train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
# Class balance of the target: rows per 'Survived' label.
y_train.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [10]:
# Median of the non-missing ages.
X_train['Age'].median()

28.0

# Preprocessing the data
Cleaning / working on the data

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.impute import SimpleImputer

>Numerical part of the data

Custom Transformer for adding new attribute

In [12]:
# Positional indices of the 'SibSp' and 'Parch' columns inside the numeric
# array handed to Addattribute.transform (array columns, not DataFrame labels).
SibSp, Parch = 1,2
class Addattribute():
    """Optionally append a "relatives" feature (Parch + SibSp) to a 2-D array.

    Parameters
    ----------
    Relatives : bool, default True
        When True, ``transform`` appends one column holding
        ``X[:, Parch] + X[:, SibSp]``. When False, ``transform`` returns
        ``X`` unchanged.

    NOTE(review): unlike CommonCategory, this class does not inherit from
    BaseEstimator/TransformerMixin, so it has no get_params/set_params;
    consider adding those bases if the flag needs to be tuned via a search.
    """
    def __init__(self, Relatives = True):
        self.Relatives = Relatives

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return X with the relatives column appended, or X itself when disabled."""
        if not self.Relatives:
            # Previously this branch fell through and returned None, which
            # would break whatever step consumes the output.
            return X
        relatives_per_individual = X[:, Parch] + X[:, SibSp]
        return np.c_[X, relatives_per_individual]

Pipeline for Numerical attributes

In [13]:
# Numeric branch: median-impute missing values, append the relatives column,
# then standardise every resulting column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('attrb_adder', Addattribute()),
        ('stdscale', StandardScaler()),
    ]
)

>Categorical Part of the data

Custom Transformer for filling in the missing values

In [14]:
class CommonCategory(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value.

    ``fit`` expects a DataFrame and stores, per column, the first entry of
    ``value_counts()`` in ``self.commoncat`` (a Series indexed by column name).
    ``transform`` fills missing entries from that Series.
    """
    def __init__(self, missing_data = True):
        # Stored as given; fit/transform do not consult this flag.
        self.missing_data = missing_data

    def fit(self, X, y=None):
        """Record the most frequent value of every column of X."""
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.commoncat = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of X with missing entries replaced per column."""
        filled = X.fillna(self.commoncat)
        return filled

Pipeline for Categorical Attributes

In [15]:
# OneHotEncoder's dense-output flag is `sparse_output` from scikit-learn 1.2
# onward, and the older `sparse` keyword was removed in 1.4. Try the new name
# first and fall back to the old one so this cell runs on either side of the
# rename.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical branch: fill missing values with the most common category, then
# one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('com_cat', CommonCategory()),
    ('cat_encode', cat_encoder)
])

# Combining the two pipelines (numerical & categorical) into a single pipeline

In [16]:
from sklearn.compose import ColumnTransformer

# Columns routed to each branch. The order of the numeric list matters:
# Addattribute reads SibSp and Parch by position (indices 1 and 2).
num_attrb_name = ['Age', 'SibSp', 'Parch', 'Fare']
cat_attrb_name = ['Pclass', 'Sex', 'Embarked']

complete_pipeline = ColumnTransformer(
    transformers=[
        ('num_attributes', num_pipeline, num_attrb_name),
        ('cat_attributes', cat_pipeline, cat_attrb_name),
    ]
)

In [17]:
# Fit the preprocessing on the training frame and produce the model's feature matrix.
train_set = complete_pipeline.fit_transform(X_train)

In [18]:
# (rows, features): 4 numeric + 1 relatives + 3 Pclass + 2 Sex + 3 Embarked = 13 columns.
train_set.shape

(891, 13)

# Training a model
Ensemble Learning

>Algorithm used:
RandomForestClassifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score

In [20]:
# Baseline random forest with default hyperparameters; random_state is fixed
# so repeated runs give the same forest.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(train_set,y_train)

RandomForestClassifier(random_state=42)

Checking the accuracy

In [21]:
# 10-fold cross-validation scores for the baseline forest, then their mean.
rfc_score = cross_val_score(estimator=rfc, X=train_set, y=y_train, cv=10)
np.mean(rfc_score)

0.8036704119850187

# Fine-tuning the rfc model to improve its accuracy using RandomizedSearchCV

In [22]:
# Search space: integer distributions sampled by RandomizedSearchCV
# (scipy's randint excludes the upper bound).
param_distribs = dict(
    n_estimators=randint(1, 300),
    max_features=randint(1, 10),
)

# Ten sampled candidates, each scored by 10-fold cross-validated accuracy.
rnd_search = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_distribs,
    n_iter=10,
    cv=10,
    scoring='accuracy',
    random_state=42,
)
rnd_search.fit(train_set, y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025321C08280>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025321BD57C0>},
                   random_state=42, scoring='accuracy')

In [23]:
# Hyperparameters of the best-scoring candidate from the randomized search.
rnd_search.best_params_

{'max_features': 8, 'n_estimators': 100}

In [24]:
cvres = rnd_search.cv_results_
# One line per sampled candidate: mean CV score followed by its parameters.
for candidate in zip(cvres["mean_test_score"], cvres["params"]):
    print(*candidate)

0.8171785268414482 {'max_features': 7, 'n_estimators': 271}
0.8250312109862673 {'max_features': 8, 'n_estimators': 189}
0.8149313358302122 {'max_features': 5, 'n_estimators': 103}
0.8115355805243445 {'max_features': 3, 'n_estimators': 215}
0.8272534332084893 {'max_features': 8, 'n_estimators': 100}
0.8238951310861424 {'max_features': 8, 'n_estimators': 152}
0.8070536828963796 {'max_features': 3, 'n_estimators': 150}
0.8115730337078653 {'max_features': 5, 'n_estimators': 258}
0.8227840199750313 {'max_features': 8, 'n_estimators': 294}
0.8070536828963796 {'max_features': 2, 'n_estimators': 192}


0.8272534332084893 is better than the score before tuning (i.e., 0.8036704119850187)

In [25]:
# Keep the estimator that the search built with the best-scoring parameters.
final_model = rnd_search.best_estimator_

Sanity check: comparing the prediction for one training example with its true label

In [26]:
# Predict a single row; wrapping it in a list makes the input 2-D (one sample).
final_model.predict([train_set[779]])

array([1], dtype=int64)

In [27]:
# True label of the same row, for comparison with the prediction.
y_train[779]

1

# Deploying the model for the test set

Transforming the test set first:

In [28]:
# Apply the preprocessing that was already fitted on the training data.
# Calling fit_transform here would re-learn the imputation medians, scaling
# statistics and one-hot categories from the test set, so the features would
# no longer match what the model was trained on.
test_set = complete_pipeline.transform(test_data)
test_set.shape

(418, 13)

In [29]:
# Predicted 'Survived' label for every row of the transformed test matrix.
final_prediction = final_model.predict(test_set)

In [30]:
# Two-column submission frame: passenger id alongside the predicted label.
output = test_data[['PassengerId']].assign(Survived=final_prediction)
output.to_csv('Prediction_table', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
