# Loading Dataset And Dependencies

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Train Data pre-processing

## 1. Extra Feature Extraction

In [None]:
def get_title(name):
    """Extract the honorific from a passenger name.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    segment that follows the first comma is taken, truncated at its first
    period, and stripped of surrounding whitespace.

    Parameters
    ----------
    name : str
        Raw value from the ``Name`` column.

    Returns
    -------
    str
        The extracted title, or ``'Unknown'`` when ``name`` is not a string,
        contains no period, or contains no comma.
    """
    # A missing name arrives as NaN (a float), which supports neither ``in``
    # nor ``split``; treat every non-string value as an unknown title.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    parts = name.split(',')
    # Without a comma there is no "Surname, Title." layout to parse, and
    # indexing parts[1] would raise IndexError.
    if len(parts) < 2:
        return 'Unknown'

    return parts[1].split('.')[0].strip()

def title_mapping(title):
    """Collapse a raw title into the label used for the ``Title`` feature.

    ``Mr``, ``Mrs``, ``Miss`` and ``Master`` are returned unchanged, the other
    listed titles are merged into ``'Rare'``, and any value that appears in
    neither list becomes ``'Unknown'``.
    """
    kept_titles = ('Mr', 'Mrs', 'Miss', 'Master')
    merged_titles = (
        'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir',
        'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer', 'Dona',
    )

    if title in kept_titles:
        return title
    return 'Rare' if title in merged_titles else 'Unknown'
    
def title_mapping_encode(title):
    """Return the integer code of the group a raw title belongs to.

    ``1`` for ``Mr``/``Mrs``/``Miss``/``Master``, ``2`` for the other listed
    titles, and ``0`` for any value found in neither list.
    """
    coded_groups = (
        (1, ('Mr', 'Mrs', 'Miss', 'Master')),
        (2, ('Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir',
             'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer', 'Dona')),
    )
    for code, members in coded_groups:
        if title in members:
            return code
    return 0

# Pull the raw title out of each name once; both title columns derive from it.
raw_titles = train_data['Name'].map(get_title)

# Title as a categorical label (as returned by title_mapping)
train_data['Title'] = raw_titles.map(title_mapping)

# Title as an integer group code (as returned by title_mapping_encode)
train_data['Title_encode'] = raw_titles.map(title_mapping_encode)

In [None]:
def get_deck(cabin):
    """Return the deck letter of a cabin value.

    The deck is the first non-whitespace character of the cabin text.

    Parameters
    ----------
    cabin : str or missing value
        Raw value from the ``Cabin`` column.

    Returns
    -------
    str
        The first character of the stripped cabin text, or ``'Unknown'`` when
        the value is missing, empty, or only whitespace.
    """
    if pd.isna(cabin):
        return 'Unknown'

    # Strip first so a leading space is not reported as the deck, and guard
    # against an empty string, for which indexing [0] would raise IndexError.
    cabin_text = str(cabin).strip()
    return cabin_text[0] if cabin_text else 'Unknown'
    
def deck_mapping(deck):
    """Translate a deck label into its integer code.

    ``'Unknown'`` is 0 and ``'A'`` through ``'G'`` followed by ``'T'`` are
    1 through 8; a label that is not in the table is also encoded as 0.
    """
    ordered_labels = ('Unknown', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T')
    code_by_label = {label: code for code, label in enumerate(ordered_labels)}
    try:
        return code_by_label[deck]
    except KeyError:
        return 0

# Deck label taken from the cabin value ('Unknown' when the cabin is missing)
train_data['Deck'] = train_data['Cabin'].map(get_deck)

# Integer code of that deck label
train_data['Deck_encode'] = train_data['Deck'].map(deck_mapping)

In [None]:
# Family size = siblings/spouses + parents/children + the passenger.
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
# 1 when the family size is exactly 1, otherwise 0.
train_data['IsAlone'] = train_data['FamilySize'].eq(1).astype('int64')

## 2. Imputation of Missing Values in the Age Column

Tip: in the selection cell below, uncomment the imputation method you want to use (and comment out the others) before running the code.

In [None]:
"""
Method 1: Linear Regression Imputation
"""
from sklearn.linear_model import LinearRegression

def linear_regression_impute(df):
    """Fill missing ``Age`` values with linear-regression predictions.

    A ``LinearRegression`` model is fitted on the rows whose ``Age`` is known,
    using ``Pclass``, ``SibSp``, ``Parch``, ``Fare`` and ``Title_encode`` as
    predictors. Its predictions are written into the rows whose ``Age`` is
    missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and the five predictor columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the missing ages filled.
    """
    feature_columns = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Title_encode']
    missing_age = df['Age'].isnull()

    # Nothing to impute: asking the model to predict on zero rows raises in
    # scikit-learn, so return the frame untouched.
    if not missing_age.any():
        return df

    lr = LinearRegression()
    lr.fit(df.loc[~missing_age, feature_columns], df.loc[~missing_age, 'Age'])

    # The same mask selects the predictor rows and the target cells, so the
    # predictions line up with the rows they are written to.
    df.loc[missing_age, 'Age'] = lr.predict(df.loc[missing_age, feature_columns])

    return df

"""
Method 2: Random Forest Imputation
"""
from sklearn.ensemble import RandomForestRegressor

def random_forest_impute(df):
    """Fill missing ``Age`` values with random-forest predictions.

    A ``RandomForestRegressor`` (100 trees, ``random_state=0``) is fitted on
    the rows whose ``Age`` is known, using ``Pclass``, ``SibSp``, ``Parch``,
    ``Fare`` and ``Title_encode`` as predictors. Its predictions are written
    into the rows whose ``Age`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and the five predictor columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the missing ages filled.
    """
    feature_columns = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Title_encode']
    missing_age = df['Age'].isnull()

    # Nothing to impute: asking the model to predict on zero rows raises in
    # scikit-learn, so return the frame untouched.
    if not missing_age.any():
        return df

    rfr = RandomForestRegressor(random_state=0, n_estimators=100)
    rfr.fit(df.loc[~missing_age, feature_columns], df.loc[~missing_age, 'Age'])

    # The same mask selects the predictor rows and the target cells, so the
    # predictions line up with the rows they are written to.
    df.loc[missing_age, 'Age'] = rfr.predict(df.loc[missing_age, feature_columns])

    return df

"""
Method 3: K-Nearest Neighbors (KNN) Imputation
"""
from sklearn.impute import KNNImputer

def knn_impute(df):
    """Fill missing ``Age`` values with a 5-neighbour ``KNNImputer``.

    The imputer is fitted on ``Age`` together with ``Pclass``, ``SibSp``,
    ``Parch``, ``Fare`` and ``Title_encode``; only the resulting ``Age``
    column is written back. ``df`` is modified in place and returned.
    """
    knn_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Title_encode']
    imputer = KNNImputer(n_neighbors=5)
    completed = imputer.fit_transform(df[knn_columns])

    # 'Age' is listed first, so it is column 0 of the imputed matrix; keep it
    # two-dimensional to match the single-column frame being assigned.
    df[['Age']] = completed[:, [0]]

    return df

In [None]:
# Choose the Age imputation strategy; later cells call it through the name
# `impute_function`. Keep exactly one of the assignments below active.
# impute_function = linear_regression_impute
impute_function = random_forest_impute
# impute_function = knn_impute

In [None]:
# Summary statistics of Age before imputation; the `count` row only counts
# non-missing values.
print("Before imputation:")
train_data[['Age']].describe() # for debugging

In [None]:
# Run the selected imputer. The impute functions defined above write into the
# frame they receive and return that same frame.
train_data = impute_function(train_data)
print("After imputation:")
train_data[['Age']].describe() # for debugging

## 3. One-hot Encoding

In [None]:
# One indicator frame per categorical column; `prefix` becomes the start of
# every generated column name (e.g. Deck_A, Sex_male).
title_dummies_train = pd.get_dummies(train_data['Title'], prefix='Title')
deck_dummies_train = pd.get_dummies(train_data['Deck'], prefix='Deck')
dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')
dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')

# Column order of the result follows the order of this list.
encoded_parts = [
    train_data,
    deck_dummies_train,
    dummies_Embarked,
    dummies_Sex,
    dummies_Pclass,
    title_dummies_train,
]

# Source columns that are replaced by indicators or are not used further.
# The raw 'Deck' column is not in this list and therefore stays in `df`.
columns_to_drop = [
    'Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked',
    'Title', 'Title_encode', 'Deck_encode',
]

df = pd.concat(encoded_parts, axis=1).drop(columns=columns_to_drop)

## 4. Standardization

In [None]:
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()

# The scaler expects a 2-D input, so each column is passed as an (n, 1) array.
# The same scaler object is re-fitted for every column, so after the loop it
# holds the statistics of the last one ('Fare').
for source_column in ('Age', 'Fare'):
    column_as_2d = df[[source_column]].to_numpy()
    df[source_column + '_scaled'] = scaler.fit_transform(column_as_2d)

df

# Test Data pre-processing

In [None]:
# Summary statistics of the numeric test columns; a `count` below the number
# of rows indicates missing values in that column.
test_data.describe() # for debugging

In [None]:
def fill_missing_fare(row):
    """Return the row's fare, or the mean fare of its class when it is missing.

    Reads the module-level ``fare_means`` lookup (indexed by ``Pclass``),
    which must be defined before this function is applied.
    """
    fare = row['Fare']
    return fare if not pd.isnull(fare) else fare_means[row['Pclass']]
    
# Average fare per passenger class, computed from the test set itself.
# fill_missing_fare reads this lookup through the global name `fare_means`.
fare_means = test_data.groupby('Pclass')['Fare'].mean()

# Title features: extract the raw title once and derive both columns from it.
test_titles = test_data['Name'].apply(get_title)
test_data['Title'] = test_titles.apply(title_mapping)
test_data['Title_encode'] = test_titles.apply(title_mapping_encode)

# Deck features
test_data['Deck'] = test_data['Cabin'].apply(get_deck)
test_data['Deck_encode'] = test_data['Deck'].apply(deck_mapping)

# Family features
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = test_data['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

# Fill missing fare first: Fare is one of the predictors used by the Age imputers.
test_data['Fare'] = test_data.apply(fill_missing_fare, axis=1)

# Fill missing age.
# NOTE(review): the imputer is fitted on the test rows themselves, not on the
# training rows; confirm this is intended.
test_data = impute_function(test_data)

# One-hot encoding
deck_dummies_test = pd.get_dummies(test_data['Deck'], prefix='Deck')
dummies_Embarked = pd.get_dummies(test_data['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(test_data['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(test_data['Pclass'], prefix='Pclass')
title_dummies_test = pd.get_dummies(test_data['Title'], prefix='Title')

df_test = pd.concat(
    [test_data, deck_dummies_test, dummies_Embarked, dummies_Sex, dummies_Pclass, title_dummies_test],
    axis=1,
)
df_test = df_test.drop(
    columns=['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Title_encode', 'Deck_encode']
)

# Standardize with statistics learned from the TRAINING frame `df`, so test
# features are on the same scale as the features the model is trained on.
# Re-fitting the scaler on the test set would centre and scale it differently.
age_scaler = preprocessing.StandardScaler().fit(df['Age'].values.reshape(-1, 1))
fare_scaler = preprocessing.StandardScaler().fit(df['Fare'].values.reshape(-1, 1))
df_test['Age_scaled'] = age_scaler.transform(df_test['Age'].values.reshape(-1, 1)).ravel()
df_test['Fare_scaled'] = fare_scaler.transform(df_test['Fare'].values.reshape(-1, 1)).ravel()

# Give the test frame the training frame's column layout (minus the label):
# indicator columns whose category never occurs in the test set are added as
# all-zero columns, columns unknown to the training frame are dropped, and the
# order matches `df`. This replaces hard-coded `Title_Unknown` / `Deck_T`
# assignments, which would also overwrite real indicator values if present.
df_test = df_test.reindex(columns=df.columns.drop('Survived'), fill_value=0)

df_test

# Training

As a simple baseline, we use logistic regression.

In [None]:
from sklearn import linear_model

# Keep the label and the model features, selected by column-name pattern.
# NOTE(review): `Cabin_.*` matches no column, because the cabin indicators are
# created with the prefix `Deck_`, so deck information is not used by the
# model. If the pattern is changed, the prediction cell must use the same one.
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|FamilySize|IsAlone|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*|Title_.*')
train_np = train_df.values  # raw array of the selected columns, label included

# Select the label by name instead of by position, so the split does not
# depend on 'Survived' being the first matched column.
y = train_df['Survived'].to_numpy()

X = train_df.drop(columns='Survived').to_numpy()

clf = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
clf.fit(X, y.astype("int"))

clf

Use the trained model to predict survival for the test set and write the submission file.

In [None]:
test = df_test.filter(regex='Age_.*|SibSp|Parch|FamilySize|IsAlone|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*|Title_.*')

# The model was fitted on a plain array, so features are matched by position.
# Put the test columns in the training feature order, and add an all-zero
# column for any training feature that is absent from the test frame.
feature_columns = train_df.columns.drop('Survived')
test = test.reindex(columns=feature_columns, fill_value=0)

# Pass a float array, the same kind of input (no column names) used for fitting.
predictions = clf.predict(test.to_numpy(dtype=float))

result = pd.DataFrame({'PassengerId':test_data['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)