In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/train.csv')
df.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.isnull().sum()
df.count()
df.drop(columns = ['Cabin','Ticket'], inplace=True)
# df.dropna(inplace=True)
df['Sex'] = df['Sex'].map({'male':0, 'female':1})
df['Embarked'] = df['Embarked'].map({'S':1, 'C':2,'Q':3})
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.count()
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [4]:
X = df.drop(columns=['PassengerId','Survived','Name'])
X['FamilyMember'] = X.Parch + X.SibSp +1
X.drop(columns=['Parch','SibSp'],inplace=True)
print(X.isnull().sum())
y = df.Survived
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=.3)
sc= StandardScaler()
X_train_ = sc.fit_transform(X_train)
X_test_ = sc.transform(X_test)

Pclass          0
Sex             0
Age             0
Fare            0
Embarked        0
FamilyMember    0
dtype: int64


In [5]:
knnparam = {
    'n_neighbors':list(range(3, 21)),
    'weights':['uniform', 'distance'],
    'metric':['euclidean', 'manhattan']
}
knn = GridSearchCV(KNeighborsClassifier(), knnparam, cv = 5, scoring='accuracy')
knn.fit(X_train_, y_train)
print("KNN",knn.best_params_)
print("KNN",knn.best_score_)
print("KNN",classification_report(y_test, knn.predict(X_test_)))

KNN {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}
KNN 0.8234064516129032
KNN               precision    recall  f1-score   support

           0       0.80      0.94      0.86       157
           1       0.88      0.67      0.76       111

    accuracy                           0.82       268
   macro avg       0.84      0.80      0.81       268
weighted avg       0.83      0.82      0.82       268



In [6]:
nb = GaussianNB()
nb.fit(X_train_,y_train)
nbpredict = nb.predict(X_test_)
print("NB",accuracy_score(y_test,nbpredict))
confusion_matrix(y_test,nbpredict)
print('NB',classification_report(y_test, nbpredict))

NB 0.7985074626865671
NB               precision    recall  f1-score   support

           0       0.82      0.83      0.83       157
           1       0.76      0.75      0.75       111

    accuracy                           0.80       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [7]:
lrparam = {
    'C': [0.01, 0.1, 1, 10],  
    'penalty': ['l2'],
    'solver': ['liblinear'] 
}
lr = GridSearchCV(LogisticRegression(max_iter=500), lrparam, cv=5, scoring='accuracy')
lr.fit(X_train_,y_train)
lrpredict = lr.predict(X_test_)
print('LR:',lr.best_params_)
print('LR:',lr.best_score_)
print('LR:',classification_report(y_test, lrpredict))

LR: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
LR: 0.8009290322580647
LR:               precision    recall  f1-score   support

           0       0.81      0.86      0.84       157
           1       0.78      0.72      0.75       111

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [8]:
rfparam = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf = GridSearchCV(RandomForestClassifier(random_state=42),rfparam,cv=5, scoring='accuracy')
rf.fit(X_train,y_train)
rfpredict = rf.predict(X_test)
print('RF:',rf.best_params_)
print('RF:',rf.best_score_)
print('RF:',classification_report(y_test, rfpredict))

RF: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
RF: 0.8314838709677419
RF:               precision    recall  f1-score   support

           0       0.79      0.90      0.84       157
           1       0.82      0.67      0.74       111

    accuracy                           0.80       268
   macro avg       0.81      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



In [9]:
test = pd.read_csv('data/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
test['FamilyMember'] = test['SibSp'] + test['Parch'] + 1
test['Sex'] = test['Sex'].map({'male':0, 'female':1})
test['Embarked'] = test['Embarked'].map({'S':1, 'C':2,'Q':3})
result=test.PassengerId.to_frame()
test.drop(columns=['PassengerId','Name','Cabin','SibSp','Parch','Ticket'], inplace =True)
test.isnull().sum()
test.fillna(test.Age.mean(), inplace = True)

In [11]:
result['Survived'] = rf.predict(test)

In [12]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [23]:
result.to_csv('data/Result.csv', index = False)