In [275]:
import warnings
warnings.filterwarnings('ignore')

In [276]:
import pandas as pd
import numpy as np

In [459]:
data = pd.read_csv('train.csv')
print(data.head())
print(data.shape)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [278]:
#find number of missing values in each feature
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Data Preprocessing

Remove columns not required for data modeling

find if data is imbalanced

In [460]:
#find instances belonging to class Survived=0 and Survived=1
n = data.shape[0]
no = data.Survived.value_counts()[0]
yes = data.Survived.value_counts()[1]
print((yes/n)*100)
print((no/n)*100)
print(n)

38.38383838383838
61.61616161616161
891


data is not imbalanced

Now left with 9 features that will determine whether a person survived or not.'Survived'-binary target class

In [461]:
def data_preprocessing(data):
    data = data.drop(columns=['PassengerId','Name',])
    #drop 'Cabin' feature- too many missing values
    data = data.drop(columns='Cabin')
    #for 'Age': impute missing values with -1
    data['Age'].fillna(-1, inplace=True)
    #drop 2 rows where 'Embarked' has missing values
    data.dropna(subset=['Embarked'], inplace=True)
    
    #handling categorical variables- one hot encoding
    cat_cols = ['Sex', 'Ticket', 'Embarked']
    data = pd.get_dummies(data, columns=cat_cols)
    
    #replace values of Pclass : 1 with 3, and 3 with 1
    data = data.replace({'Pclass':{1:5}})
    data = data.replace({'Pclass':{3:1}})
    data = data.replace({'Pclass':{5:3}})
    
    return data

In [462]:
data = data_preprocessing(data)

In [463]:
y_data = data['Survived'] # 0:No, 1:Yes
x_data = data.drop(columns='Survived')

split data into train and test

In [464]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_data, 
    y_data,
    random_state=0,
    shuffle=True,
    stratify=y_data
)

In [284]:
from sklearn import preprocessing

In [465]:
# normalise the data
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [466]:
scaler = preprocessing.StandardScaler().fit(x_data)
x_data = scaler.transform(x_data)

In [287]:
x_data.shape

(889, 690)

dimensionality reduction using PCA

In [265]:
from sklearn.decomposition import PCA

In [467]:
pca = PCA(n_components=500)
pca.fit(x_train)
x_train = pca.transform(x_train)
a = pca.explained_variance_ratio_
print("variance of data explained:",sum(a))
# x_train = pca.fit_transform(x_train)

variance of data explained: 0.9367355670485212


In [468]:
x_test = pca.transform(x_test)
x_test.shape

(223, 500)

In [469]:
pca2 = PCA(n_components=500)
x_data = pca2.fit_transform(x_data)
x_data.shape

(889, 500)

# Data Modeling
## Apply classifiers: logistic regression, Naive bayes, SVM, Decision tree, Random Forests, XGBoost, MLP, 
### Apply 10-fold cross validation

logistic regression

In [487]:
from sklearn.metrics import accuracy_score

In [488]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

y_pred = LogisticRegression().fit(x_train,y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.820627802690583

Naive Bayes

In [489]:
from sklearn.naive_bayes import GaussianNB

nb_classifier = GaussianNB()
y_pred = GaussianNB().fit(x_train,y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.3811659192825112

grid search

In [490]:
from sklearn.model_selection import GridSearchCV

SVM

In [273]:
from sklearn.svm import SVC

In [491]:
parameters = {'kernel':('linear','poly','rbf','sigmoid'), 'C':[1,10],}
svm_classifier = GridSearchCV(SVC(random_state=0), param_grid=parameters, cv=5, n_jobs=-1)
svm_classifier.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [493]:
y_pred = svm_classifier.predict(x_test)

In [494]:
svm_classifier.best_params_

{'C': 1, 'kernel': 'linear'}

In [495]:
accuracy_score(y_test, y_pred)

0.820627802690583

Decision Tree

In [496]:
from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(
    criterion='gini', 
    random_state=0,
    max_depth=20
)
dt_classifier.fit(x_train, y_train)
y_pred = dt_classifier.predict(x_test)

In [498]:
accuracy_score(y_test, y_pred)

0.7892376681614349

In [499]:
np.mean(cross_val_score(dt_classifier, x_data, y_data))

0.7412905329571995

Random Forest

In [500]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [501]:
parameters = {'n_estimators':[1,5,10,15,20,25,30], 'criterion':('gini', 'entropy')}
rf_classifier = RandomForestClassifier(n_estimators=5, criterion='gini', random_state=0)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

In [502]:
accuracy_score(y_test, y_pred)

0.7668161434977578

Gradient Boosting Classifier

In [503]:
gb_classifier = GradientBoostingClassifier(learning_rate=0.01, n_estimators=100, random_state=0)
y_pred = gb_classifier.fit(x_train, y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.8251121076233184

XGBoost

In [505]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(learning_rate=0.01, n_estimators=150, random_state=0)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

In [506]:
accuracy_score(y_test, y_pred)

0.7892376681614349

MLP

In [309]:
from sklearn.neural_network import MLPClassifier

In [310]:
parameters = {
    'activation':('identity','logistic','tanh','relu'),
    'solver':('adam','sgd','lbfgs'),
    'alpha':[0.01,0.1,0.5]
    
}
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50,50,50), random_state=0)
mlp_classifier = GridSearchCV(mlp_classifier, param_grid=parameters, cv=5, n_jobs=-1)
mlp_classifier.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'activation': ('identity', 'logistic', 'tanh', 'relu'), 'solver': ('adam', 'sgd', 'lbfgs'), 'alpha': [0.01, 0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [318]:
mlp_classifier.best_params_

{'activation': 'logistic', 'alpha': 0.5, 'solver': 'lbfgs'}

In [508]:
mlp_classifier2 = MLPClassifier(hidden_layer_sizes=(30,30,30), activation='logistic', alpha=0.5, solver='lbfgs', momentum=0.01)
y_pred = mlp_classifier2.fit(x_train, y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.8161434977578476

## Choosing Gradient boosting as final model for predicting test data as it has the highest accuracy(82.51%)

In [436]:
test = pd.read_csv('test.csv')
passenger_id = test['PassengerId']

In [437]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [438]:
test = data_preprocessing(test)

In [439]:
#handle missing value in 'Fare' - replace by mean value
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

In [440]:
test = np.array(test)

In [441]:
extra = np.zeros((418,317))

In [442]:
test = np.hstack((test, extra))

In [443]:
test = pca.transform(test)

In [510]:
test_pred = gb_classifier.predict(test)

In [511]:
passenger_id = np.array(passenger_id)
output = np.vstack((passenger_id, test_pred))
output = output.T
output.shape

(418, 2)

In [512]:
with open('result2.csv','a') as file:
    for entry in output:
        file.write(str(entry[0]) + ',' + str(entry[1]) + '\n')