## Data Cleaning using scikit-learn

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [62]:
# Load train.csv (path relative to the current working directory) into a DataFrame.
raw_data = pd.read_csv('train.csv')

In [63]:
# Separate the 'Survived' target from the remaining columns, then hold out
# a test split (seeded so the split is reproducible).
y = raw_data['Survived']
X = raw_data.drop('Survived', axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [64]:
# Name feature
def get_title(x):
    """Extract the title from a name shaped like 'Surname, Title. Given names'.

    Takes the second comma-separated segment, keeps the text before its
    first period, and strips surrounding whitespace.

    Args:
        x: Name string, e.g. 'Braund, Mr. Owen Harris'.

    Returns:
        The title text, e.g. 'Mr'.

    Raises:
        IndexError: If ``x`` contains no comma.
    """
    after_surname = x.split(',')[1]
    # A single strip() is sufficient; the second chained call was a no-op.
    return after_surname.split('.')[0].strip()
def title(x):
    """Add a 'Title' column derived from each row's 'Name'.

    The frame is still modified in place, as before, and is now also
    returned so that a FunctionTransformer wrapping this function yields
    the frame from transform() instead of None.

    Args:
        x: DataFrame with a 'Name' column of strings.

    Returns:
        The same DataFrame, now carrying the 'Title' column.
    """
    x['Title'] = x['Name'].apply(get_title)
    return x
# Wrap title() in a transformer and run it on both splits. The return values
# are discarded; title() assigns the 'Title' column on the frame it is given.
title_trans = FunctionTransformer(title)
title_trans.transform(x_train)
title_trans.transform(x_test)

In [65]:
# Age feature
# Per-title average age, computed separately for each split.
# NOTE(review): the names say "median" but the aggregate taken is the mean —
# confirm which statistic is intended.
train_median_age = x_train['Age'].groupby(x_train['Title']).mean()
test_median_age = x_test['Age'].groupby(x_test['Title']).mean()
def train_fill_age(x):
    """Look up the per-title age value of the training split for one row.

    Args:
        x: Row-like object exposing a 'Title' entry.

    Returns:
        The value stored in ``train_median_age`` under ``x['Title']``, or
        None when that title is not present in the table.
    """
    # Direct label lookup replaces the scan over every (title, value) pair;
    # Series.get returns None for an unknown label, matching the old
    # fall-through when no title matched.
    return train_median_age.get(x['Title'])
def test_fill_age(x):
    """Look up the per-title age value of the test split for one row.

    Args:
        x: Row-like object exposing a 'Title' entry.

    Returns:
        The value stored in ``test_median_age`` under ``x['Title']``, or
        None when that title is not present in the table.
    """
    # Direct label lookup replaces the scan over every (title, value) pair;
    # Series.get returns None for an unknown label, matching the old
    # fall-through when no title matched.
    return test_median_age.get(x['Title'])
# Impute missing ages row by row: a NaN Age is replaced with the per-title
# value looked up for that split; every other row keeps its Age.
x_train['Age'] = x_train.apply(lambda x: train_fill_age(x) if np.isnan(x['Age']) else x['Age'], axis=1)
x_test['Age'] = x_test.apply(lambda x: test_fill_age(x) if np.isnan(x['Age']) else x['Age'], axis=1)
# Bin edges and the labels for the five intervals between them.
groups = [0, 5, 17, 25, 50, 80]
labels = ['Infant', 'Kid', 'Young', 'Adult', 'Old']
# Replace the numeric Age with its age-band label.
# NOTE(review): with pd.cut's default right-closed bins, ages outside (0, 80]
# would become NaN — confirm none occur in the data.
x_train['Age'] = pd.cut(x_train['Age'], bins = groups, labels=labels)
x_test['Age'] = pd.cut(x_test['Age'], bins = groups, labels=labels)

In [66]:
def test(x):
    """Add a 'FamilySize' column equal to SibSp + Parch + 1.

    Args:
        x: Tabular data with 'SibSp' and 'Parch' columns; anything the
            DataFrame constructor accepts.

    Returns:
        The DataFrame carrying the new 'FamilySize' column. The function
        used to return nothing, so the column was only visible to the
        caller if the re-wrapped frame happened to share storage with
        the input.
    """
    frame = pd.DataFrame(x)
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    return frame
# Wrap the family-size helper in a transformer and apply it to both splits;
# the return values of transform() are not used.
fam_trans = FunctionTransformer(test)
fam_trans.transform(x_train)
fam_trans.transform(x_test)

In [67]:
# One-hot encoder reused by the preprocessing steps below; categories not seen
# during fit are ignored at transform time instead of raising.
encode = OneHotEncoder(handle_unknown='ignore')

In [68]:
# Embarked handling: fill missing values with the most frequent one, then
# one-hot encode the result.
embark_cleaner = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='most_frequent')),
        ('encoding', encode),
    ]
)

In [69]:
# Column-wise preprocessing: 'Embarked' goes through the impute+encode
# pipeline; 'Sex', 'Age' and 'Name' are one-hot encoded directly.
# NOTE(review): 'Name' is encoded here while the derived 'Title' and
# 'FamilySize' columns are not selected — confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('Clean_Embarkment_point', embark_cleaner, ['Embarked']),
        ('clean_sex', encode, ['Sex', 'Age', 'Name']),
    ]
)

### 1. KNN Pipeline

In [70]:
# Preprocessing followed by an 18-neighbour KNN classifier.
final_pipeline_knn = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('estimator', KNeighborsClassifier(n_neighbors=18)),
    ]
)

In [71]:
# Fit the preprocessing steps and the KNN classifier on the training split.
final_pipeline_knn.fit(x_train, y_train)

In [72]:
# Score the KNN pipeline on the held-out split.
y_pred = final_pipeline_knn.predict(x_test)
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
accuracy_score(y_test, y_pred)

0.7533632286995515

### 2. Logistic Regression Pipeline

In [73]:
# Shared preprocessing followed by a logistic-regression classifier
# (default hyperparameters).
final_pipeline_logistic = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('estimator', LogisticRegression()),
    ]
)
final_pipeline_logistic.fit(x_train, y_train)
y_pred = final_pipeline_logistic.predict(x_test)
# Held-out accuracy. accuracy_score is declared as (y_true, y_pred), so the
# ground truth is passed first.
logistic_acc = accuracy_score(y_test, y_pred)
logistic_acc

0.7847533632286996

### 3. SVM Pipeline

In [74]:
# Shared preprocessing followed by a linear SVM (C=100, seeded, with a high
# iteration cap).
final_pipeline_svm = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('estimator', LinearSVC(max_iter=50000, random_state=42, C=100)),
    ]
)
final_pipeline_svm.fit(x_train, y_train)
lsvc_pred = final_pipeline_svm.predict(x_test)
# Training-split accuracy as a percentage, rounded to two decimals.
acc_linear_svc = round(final_pipeline_svm.score(x_train, y_train) * 100, 2)
# Held-out accuracy. accuracy_score is declared as (y_true, y_pred), so the
# ground truth is passed first.
lsvc = accuracy_score(y_test, lsvc_pred)
lsvc

0.7847533632286996

### 4. Decision Tree

In [75]:
# Shared preprocessing followed by a seeded decision tree.
final_pipeline_dt = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('estimator', DecisionTreeClassifier(random_state=0)),
    ]
)
final_pipeline_dt.fit(x_train, y_train)
dt_pred = final_pipeline_dt.predict(x_test)
# Training-split accuracy as a percentage, rounded to two decimals.
acc_decision_tree = round(final_pipeline_dt.score(x_train, y_train) * 100, 2)
# Held-out accuracy. accuracy_score is declared as (y_true, y_pred), so the
# ground truth is passed first.
dt = accuracy_score(y_test, dt_pred)
dt

0.7802690582959642

### 5. Random Forest Pipeline

In [76]:
# Shared preprocessing followed by a seeded 100-tree random forest.
final_pipeline_rand = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('estimator', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)
final_pipeline_rand.fit(x_train, y_train)
rn_pred = final_pipeline_rand.predict(x_test)
# Training-split accuracy as a percentage, rounded to two decimals.
acc_random_forest = round(final_pipeline_rand.score(x_train, y_train) * 100, 2)
# Held-out accuracy. accuracy_score is declared as (y_true, y_pred), so the
# ground truth is passed first.
rn = accuracy_score(y_test, rn_pred)
rn

0.7802690582959642

## Submission

In [77]:
# Load test.csv (path relative to the current working directory); this is the
# frame the submission predictions are made on.
sub = pd.read_csv('test.csv')

In [78]:
# Run the title and family-size transformers on the submission frame, then
# average Age per title within the submission data itself.
title_trans.transform(sub)
fam_trans.transform(sub)
sub_median_age = sub['Age'].groupby(sub['Title']).mean()
def fill_age(x):
    """Look up the per-title age value of the submission data for one row.

    Args:
        x: Row-like object exposing a 'Title' entry.

    Returns:
        The value stored in ``sub_median_age`` under ``x['Title']``, or
        None when that title is not present in the table.
    """
    # Direct label lookup replaces the scan over every (title, value) pair;
    # Series.get returns None for an unknown label, matching the old
    # fall-through when no title matched.
    return sub_median_age.get(x['Title'])
# Impute missing ages row by row: a NaN Age is replaced with the per-title
# value from fill_age(); every other row keeps its Age.
sub['Age'] = sub.apply(lambda x: fill_age(x) if np.isnan(x['Age']) else x['Age'], axis=1)
# Bin edges and band labels (same values as used for the train/test splits).
groups = [0, 5, 17, 25, 50, 80]
labels = ['Infant', 'Kid', 'Young', 'Adult', 'Old']
# Replace the numeric Age with its age-band label.
sub['Age'] = pd.cut(sub['Age'], bins = groups, labels=labels)

In [80]:
# Predict on the submission frame with every fitted pipeline, in the order
# KNN, logistic regression, SVM, decision tree, random forest.
(
    sub_pred_knn,
    sub_pred_logistic,
    sub_pred_svm,
    sub_pred_dt,
    sub_pred_rand,
) = [
    fitted.predict(sub)
    for fitted in (
        final_pipeline_knn,
        final_pipeline_logistic,
        final_pipeline_svm,
        final_pipeline_dt,
        final_pipeline_rand,
    )
]

In [81]:
# Wrap each prediction array in a pandas Series (default integer index).
(
    sub_pred_series_knn,
    sub_pred_series_logistic,
    sub_pred_series_svm,
    sub_pred_series_dt,
    sub_pred_series_rand,
) = [
    pd.Series(predictions)
    for predictions in (
        sub_pred_knn,
        sub_pred_logistic,
        sub_pred_svm,
        sub_pred_dt,
        sub_pred_rand,
    )
]

In [82]:
# Attach the logistic-regression predictions to the submission frame as a
# 'Survived' column, then write only PassengerId and Survived to disk.
submission = pd.concat(
    [sub, sub_pred_series_logistic.rename('Survived')],
    axis=1,
)
submission.to_csv(
    'Submission.csv',
    columns=['PassengerId', 'Survived'],
    index=False,
)