In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('./titanic_data/train.csv')
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [2]:
def drop_add_unwanted_columns(df_):
    # drop less useful columns
    df_.drop(['Name','Ticket','Cabin'],axis='columns', inplace=True)

    # combine & drop
    df_['family_mem_count'] = df_.SibSp + df_.Parch
    df_.drop(['SibSp','Parch'],axis='columns', inplace=True)
    
    return df_;
    
def convert_categorial_columns(df_):
    df_['Sex'] = df_['Sex'].astype('category')
    df_['Pclass'] = df_['Pclass'].astype('category')
    df_['Embarked'] = df_['Embarked'].astype('category')
    if 'Survived' in list(df_.columns):
        df_['Survived'] = df_['Survived'].astype('category')
    return df_;

def fill_missing_values(df_):
    df_ = df_.fillna({'Embarked': df_.Embarked.value_counts(dropna=False).index[0], 'Age': np.mean(df_.Age), 'Fare': np.mean(df_.Fare)})
    df_.Embarked.value_counts(dropna=False)
    return df_;

def get_feature_target_columns(df_):
    X = df_[['PassengerId', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
       'family_mem_count']]
    y = df_.Survived
    
    return X,y

In [3]:
df = drop_add_unwanted_columns(df)
df = convert_categorial_columns(df)
df = fill_missing_values(df)

df.isnull().sum().sum()

0

In [4]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline

column_transf = make_column_transformer(
    #(StandardScaler(), ['Age','Fare']),
    (OneHotEncoder(), ['Pclass','Sex','Embarked']), 
    remainder='passthrough')

clf = SVC(random_state=1)
pipe = make_pipeline(column_transf, clf)

X, y = get_feature_target_columns(df)
# sorted(pipe.get_params().keys())[len(pipe.get_params().keys())-20:]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# pipe.fit(X_train, y_train)

In [None]:
# pipe.score(X,y)

In [None]:
# pipe.score(X_test, y_test)

In [None]:
sorted(pipe.get_params().keys())[len(pipe.get_params().keys())-20:]

In [None]:
c_values = np.linspace(0.1, 1, 10).tolist()
gamma_values = np.linspace(0.00001, 1, 20)
kernel_values = ['linear', 'rbf', 'poly']

params = {'svc__C':c_values, 'svc__gamma': gamma_values, 'svc__kernel' : kernel_values}

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, scoring='accuracy', cv=10, param_grid=params)

grid.fit(X_train,y_train)

In [None]:
grid.score(X_test, y_test)

### **0.8765432098765432**

In [None]:
df_test = pd.read_csv('./titanic_data/test.csv')

df_test = drop_add_unwanted_columns(df_test)
df_test = convert_categorial_columns(df_test)
df_test = fill_missing_values(df_test)

df_test.isnull().sum()

In [None]:
test_y_predicted = pipe.predict(df_test)

submission_data = {'PassengerId':list(df_test.PassengerId), 'Survived':list(test_y_predicted)}

submission_df = pd.DataFrame(submission_data)

submission_df.sort_values('PassengerId',ascending=True, inplace=True)
submission_df.to_csv('titanic_submission.csv',index=False)

In [None]:
y_pred = pipe.predict(X_test)
cm = confusion_matrix(y_pred, y_test)

import seaborn as sns
import matplotlib.pyplot as plt

print(pipe.score(X_test, y_test))

plt.figure(figsize=(8,5))
sns.heatmap(cm, annot=True)
plt.show()