In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the Kaggle train/test splits and stack them so every preprocessing
# step below is applied to both in one pass. Rows keep their order: all of
# `train` first, then `test`; the frames are split apart again by position.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# ignore_index gives the stacked frame unique row labels (both CSVs start
# at 0), so later label-based assignments are never ambiguous.
dataset_joined = pd.concat([train, test], ignore_index=True)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
dataset_joined.info()

Some features we'd like to talk about:

### Age:
According to the in-class lab, age clearly plays an important role in predicting whether a person survived. However, there are nulls in the Age column. We fill them with random numbers drawn from a normal distribution whose centre and spread are estimated separately for each sex.

In [None]:
# Encode Sex as integer codes (LabelEncoder numbers the labels in sorted order).
from sklearn.preprocessing import LabelEncoder

labelencoder_sex = LabelEncoder()
dataset_joined['Sex'] = labelencoder_sex.fit_transform(dataset_joined['Sex'])

# Fill missing ages separately for each sex code with draws from
# Normal(median, std) of that sex's known ages.
# A dedicated, seeded generator keeps the imputation reproducible without
# touching NumPy's global random state.
age_rng = np.random.RandomState(42)
for sex_code in (0, 1):
    is_sex = dataset_joined['Sex'] == sex_code
    known_ages = dataset_joined.loc[is_sex, 'Age'].dropna()
    needs_fill = (is_sex & dataset_joined['Age'].isnull()).values
    if not needs_fill.any():
        continue
    # One independent draw per missing row. Passing a single scalar to
    # fillna would give every missing passenger the same age.
    draws = age_rng.normal(known_ages.median(), known_ages.std(),
                           size=needs_fill.sum())
    # A normal draw can come out negative; an age cannot.
    dataset_joined.loc[needs_fill, 'Age'] = np.maximum(draws, 0)
dataset_joined.info()
print('\n Now age is filled')

### Pclass:
Pictures from the in-class lab show that there's a relationship between Pclass and the chance of survival. We need to turn the Pclass feature into dummy variables.

In [None]:
# One-hot encode the passenger class. drop_first removes the first level so
# the remaining indicator columns are not perfectly collinear.
dataset_joined = dataset_joined.pipe(
    pd.get_dummies,
    columns=['Pclass'],
    drop_first=True,
)
dataset_joined.head()

### Titles: 
One couldn't have failed to notice the titles in the passengers' names. We simply divided them into three categories: 'Mr', 'Miss'/'Mrs', and others. Again we turn them into dummy variables, dropping the first level to avoid the dummy variable trap.

In [None]:
# Pull the title out of names shaped like "Surname, Title. Given names":
# take the part after the comma, then the part before the first period.
titles = (
    dataset_joined['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

# Three title groups: 'Mr' -> 1, 'Miss'/'Mrs' -> 0, every other title -> 2.
# A vectorised lookup replaces the per-row loop that rewrote the whole series
# on each iteration; unknown or missing titles fall into the "other" group
# instead of being left behind as raw strings.
TITLE_CODES = {'Mr': 1, 'Miss': 0, 'Mrs': 0}
OTHER_TITLE_CODE = 2
dataset_joined['Name'] = titles.map(TITLE_CODES).fillna(OTHER_TITLE_CODE).astype(int)

# One-hot encode the title group; drop_first removes the group-0 column.
dataset_joined = pd.get_dummies(dataset_joined, columns=['Name'], drop_first=True)
dataset_joined.head()

### Alone
According to the lab, whether a person is alone means a lot to the chance of survival.

In [None]:
# A passenger is alone when SibSp + Parch is zero.
# The flag is 1 for passengers travelling alone and 0 otherwise; the previous
# `> 0` test produced the opposite of what the column name says.
relatives_aboard = dataset_joined['SibSp'] + dataset_joined['Parch']
dataset_joined['Alone'] = (relatives_aboard == 0).astype(int)
dataset_joined.info()

### Deal with other features and cleanse, preprocessing

In [None]:
# Embarked: fill the missing ports with 'S', label-encode, then one-hot encode.
# The filled column is assigned back explicitly: `df[col].fillna(...,
# inplace=True)` works on a temporary object and leaves the frame unchanged
# under pandas copy-on-write.
dataset_joined['Embarked'] = dataset_joined['Embarked'].fillna('S')
labelencoder = LabelEncoder()
dataset_joined['Embarked'] = labelencoder.fit_transform(dataset_joined['Embarked'])
dataset_joined = pd.get_dummies(dataset_joined, columns=['Embarked'], drop_first=True)
# Fare: replace missing fares with the mean fare of the joined data.
dataset_joined['Fare'] = dataset_joined['Fare'].fillna(dataset_joined['Fare'].mean())

In [None]:
# Split the joined frame back into train and test. The training rows were
# stacked first, so the boundary is the length of the original training
# frame rather than a hard-coded row count.
n_train_rows = len(train)
dataset_train = dataset_joined.iloc[:n_train_rows, :]
dataset_test = dataset_joined.iloc[n_train_rows:, :]

# Split into model inputs and the target.
# Columns are selected by name: positional indices only hit the intended
# columns when pd.concat happens to sort the columns alphabetically, which
# depends on the pandas version.
dataset_train.info()
FEATURE_COLUMNS = [
    'Age', 'Fare', 'Parch', 'Sex', 'SibSp',
    'Pclass_2', 'Pclass_3',
    'Name_1', 'Name_2',
    'Alone',
    'Embarked_1', 'Embarked_2',
]
x_train = dataset_train[FEATURE_COLUMNS]
y_train = dataset_train[['Survived']]  # kept as a one-column DataFrame
x_test = dataset_test[FEATURE_COLUMNS]
print('\nx_train head is:')
x_train.head()

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the mean and variance on the training features only and apply the
# same transformation to the test features. Fitting a second scaler on the
# test set would put the two sets on different scales.
feature_scaler = StandardScaler()
x_train = feature_scaler.fit_transform(x_train)
x_test = feature_scaler.transform(x_test)
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# 2-D NumPy array on old and new versions.
y_train = y_train.values

The features we chose for the model are: Age, Fare, Parch, Sex, SibSp, Pclass, Name (title group), Alone, and Embarked.

### Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

def models(random_state=None):
    """Return a fresh list of unfitted candidate classifiers.

    Parameters
    ----------
    random_state : int or None, optional
        When given, it is passed as ``random_state`` to every estimator that
        accepts it, so a comparison can be reproduced. The default ``None``
        builds each estimator with its library defaults, exactly as before.

    Returns
    -------
    list
        Nine new estimator instances, in a fixed order.
    """
    # Only forward the keyword when a seed was requested, so the default call
    # stays identical to constructing the estimators with no arguments.
    seeded = {} if random_state is None else {'random_state': random_state}
    return [LogisticRegression(**seeded),
            SVC(**seeded),
            LinearSVC(**seeded),
            RandomForestClassifier(**seeded),
            KNeighborsClassifier(),
            GaussianNB(),
            DecisionTreeClassifier(**seeded),
            # (15,) is a one-element tuple: a single hidden layer of 15 units.
            # The previous `(15)` was just the integer 15 in parentheses.
            MLPClassifier(alpha=0.01, hidden_layer_sizes=(15,), max_iter=10000,
                          **seeded),
            XGBClassifier(**seeded)]

In [None]:
def name(model):
    """Return the class name of ``model`` as a string."""
    model_class = type(model)
    return model_class.__name__


def best_model(model2predict):
    """Print the score ranking and return the key of the top-scoring entry.

    ``model2predict`` maps a key to a ``(predictions, score)`` pair.
    """
    ranking = series_best_first(model2predict)
    print(ranking)
    # The ranking is sorted best-first, so the first index label is the winner.
    top_key = ranking.index[0]
    return top_key


def series_best_first(model2predict):
    """Return a Series of scores indexed by key, sorted from highest to lowest.

    ``model2predict`` maps a key to a ``(predictions, score)`` pair; only the
    score (element 1) is kept.
    """
    score_by_key = dict(
        (key, result[1]) for key, result in model2predict.items()
    )
    scores = pd.Series(score_by_key)
    return scores.sort_values(ascending=False)


def write_submission(predict, submission_csv):
    """Write a submission file and return its last 10 rows.

    Pairs the ``PassengerId`` column of the module-level ``test`` frame with
    the predictions in ``predict``, saves the two columns to
    ``submission_csv`` without the index, then reads the file back.
    """
    columns = {
        "PassengerId": test["PassengerId"],
        "Survived": predict
    }
    pd.DataFrame(columns).to_csv(submission_csv, index=False)
    # Re-read the file so the preview shows exactly what was written.
    written = pd.read_csv(submission_csv)
    return written.tail(10)

### Model Selection based on the `score` method

In [None]:
def predict_and_score(model, inputs):
    """Fit ``model`` and return ``(test predictions, training score)``.

    ``inputs`` is a 3-tuple ``(X, y, X_test)``. The score comes from
    ``model.score`` on the same ``X``/``y`` the model was fitted on.
    """
    features, target, features_test = inputs
    model.fit(features, target)
    predictions = model.predict(features_test)
    training_score = model.score(features, target)
    return predictions, training_score

# Fit every candidate, rank them by score on the training data, and submit
# the predictions of the top-ranked one.
inputs = (x_train, y_train.ravel(), x_test)
table = {}
for candidate in models():
    table[name(candidate)] = predict_and_score(candidate, inputs)
model = best_model(table)
print(table.keys())
best_predictions = table[model][0].astype(np.int8)
write_submission(best_predictions, 'submission.csv')

### Model Selection based on the `cross_val_score` function

In [None]:
from sklearn.model_selection import cross_val_score

def predict_and_cv_score(model, inputs):
    """Return ``(test predictions, mean 10-fold CV accuracy)`` for ``model``.

    ``inputs`` is a 3-tuple ``(X, y, X_test)``. The cross-validated accuracy
    is computed first; the model is then fitted on all of ``X``/``y`` before
    predicting ``X_test``.
    """
    features, target, features_test = inputs
    fold_accuracies = cross_val_score(
        model, features, target, cv=10, scoring='accuracy'
    )
    mean_accuracy = fold_accuracies.mean()
    model.fit(features, target)
    predictions = model.predict(features_test)
    return predictions, mean_accuracy


# Same comparison as before, but ranked by mean cross-validated accuracy.
table = dict(
    (name(candidate), predict_and_cv_score(candidate, inputs))
    for candidate in models()
)
model = best_model(table)
cv_best_predictions = table[model][0].astype(np.int8)
write_submission(cv_best_predictions, 'submission_sv.csv')

## <font color=blue>Here we achieved our best score: 0.80861.</font>

### Improve prediction by voting

In [None]:
# Majority vote among the N_VOTERS models with the highest training score.
N_VOTERS = 5

model2predict = {name(m): predict_and_score(m, inputs) for m in models()}
series = series_best_first(model2predict)
print(series)
best_models = series.keys()[0:N_VOTERS].tolist()
print(best_models)
ys = [v[0] for k, v in model2predict.items() if k in best_models]
print(len(ys))
# Each prediction is 0 or 1, so the column-wise sum is the number of models
# voting "survived" for each passenger.
sum_ys = np.sum(ys, axis=0)
# A strict majority is derived from the number of voters. The previous
# `sum_ys // 3` was only a majority vote for exactly five voters.
majority = len(ys) // 2 + 1
votes = sum_ys >= majority

write_submission(votes.astype(np.int8), 'submission_vote.csv')

### Improve prediction by parameter tuning

In [None]:
# Sweep the SVC parameter C over step * 0.1 for step = 1..99, once ranked by
# training score (table1) and once by cross-validated score (table2).
table1 = {}
for step in range(1, 100):
    table1[step] = predict_and_score(SVC(C=step * 0.1), inputs)

table2 = {}
for step in range(1, 100):
    table2[step] = predict_and_cv_score(SVC(C=step * 0.1), inputs)

# Submit the predictions of the best C under each ranking.
best_step_by_score = best_model(table1)
write_submission(table1[best_step_by_score][0].astype(np.int8), 'submission_sv1.csv')

best_step_by_cv = best_model(table2)
write_submission(table2[best_step_by_cv][0].astype(np.int8), 'submission_sv2.csv')

We let the parameter C of the SVC model range from 0.1 to 9.9 in steps of 0.1 and evaluated each value with both the training score and the cross-validation score. It turns out that the best result we can achieve is the same.

### Link to one of our best submissions:
https://www.kaggle.com/submissions/6746327/6746327.zip