# Loading Dataset And Dependencies

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Read the training and test splits from the local data/ directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

# Train Data pre-processing

## 1. Imputation of Missing Values in the Age Column

Tip: in the selection cell below, uncomment the imputation method you want to use before running the rest of the notebook.

In [None]:
"""
Method 1: Linear Regression Imputation
"""
from sklearn.linear_model import LinearRegression

def linear_regression_impute(df):
    """Fill missing ``Age`` values with predictions from a linear regression.

    The regression is fitted on the rows of ``df`` whose ``Age`` is known,
    using ``Pclass``, ``SibSp``, ``Parch`` and ``Fare`` as predictors, and is
    then used to predict ``Age`` for the rows where it is missing.

    Args:
        df: DataFrame with ``Age``, ``Pclass``, ``SibSp``, ``Parch`` and
            ``Fare`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with the missing ``Age`` values filled in.
    """
    feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare']
    missing_age = df['Age'].isnull()

    # Nothing to impute: predicting on an empty feature matrix makes
    # scikit-learn raise a ValueError, so return the frame untouched.
    if not missing_age.any():
        return df

    known_age = df[~missing_age]

    lr = LinearRegression()
    lr.fit(known_age[feature_cols], known_age['Age'])

    # Write the predictions back only into the rows that had no Age.
    df.loc[missing_age, 'Age'] = lr.predict(df.loc[missing_age, feature_cols])

    return df

"""
Method 2: Random Forest Imputation
"""
from sklearn.ensemble import RandomForestRegressor

def random_forest_impute(df):
    """Fill missing ``Age`` values with predictions from a random forest.

    A ``RandomForestRegressor`` (100 trees, ``random_state=0``) is fitted on
    the rows of ``df`` whose ``Age`` is known, using ``Pclass``, ``SibSp``,
    ``Parch`` and ``Fare`` as predictors, and is then used to predict ``Age``
    for the rows where it is missing.

    Args:
        df: DataFrame with ``Age``, ``Pclass``, ``SibSp``, ``Parch`` and
            ``Fare`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with the missing ``Age`` values filled in.
    """
    feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare']
    missing_age = df['Age'].isnull()

    # Nothing to impute: predicting on an empty feature matrix makes
    # scikit-learn raise a ValueError, so return the frame untouched.
    if not missing_age.any():
        return df

    known_age = df[~missing_age]

    rfr = RandomForestRegressor(random_state=0, n_estimators=100)
    rfr.fit(known_age[feature_cols], known_age['Age'])

    # Write the predictions back only into the rows that had no Age.
    df.loc[missing_age, 'Age'] = rfr.predict(df.loc[missing_age, feature_cols])

    return df

"""
Method 3: K-Nearest Neighbors (KNN) Imputation
"""
from sklearn.impute import KNNImputer

def knn_impute(df):
    """Fill missing ``Age`` values using k-nearest-neighbours imputation.

    A ``KNNImputer`` with 5 neighbours is run over the ``Age``, ``Pclass``,
    ``SibSp``, ``Parch`` and ``Fare`` columns; only the resulting ``Age``
    column is written back to ``df``.

    Args:
        df: DataFrame containing the five columns listed above. It is
            modified in place.

    Returns:
        The same DataFrame object, with ``Age`` replaced by the imputed column.
    """
    imputer_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
    knn = KNNImputer(n_neighbors=5)
    completed = knn.fit_transform(df[imputer_columns])

    # Age is the first column handed to the imputer, so keep column 0 only
    # (as a single-column 2-D array to match the [['Age']] target).
    df[['Age']] = completed[:, [0]]

    return df

In [None]:
# Select the strategy used to fill missing Age values: keep exactly one of the
# assignments below uncommented. The chosen function is applied to both the
# training data and the test data later in the notebook.
# impute_function = linear_regression_impute
impute_function = random_forest_impute
# impute_function = knn_impute

In [None]:
# Summary statistics of Age before the missing values are filled in.
print("Before imputation:")
train_data[['Age']].describe() # for debugging

In [None]:
# Fill the missing ages with the selected strategy, then show the Age summary
# again so it can be compared with the statistics printed before imputation.
train_data = impute_function(train_data)
print("After imputation:")
train_data[['Age']].describe() # for debugging

## 2. One-hot Encoding

In [None]:
def set_Cabin_type(df):
    """Collapse ``Cabin`` into a Yes/No flag.

    The cabin number itself is not analysed; only whether a passenger has a
    cabin recorded matters. Non-null entries become ``"Yes"`` and null entries
    become ``"No"``.

    Args:
        df: DataFrame with a ``Cabin`` column. It is modified in place.

    Returns:
        The same DataFrame object with the rewritten ``Cabin`` column.
    """
    has_cabin = df['Cabin'].notnull()
    df['Cabin'] = has_cabin.map({True: "Yes", False: "No"})
    return df

# Reduce Cabin to a Yes/No flag, then build indicator columns for each
# categorical feature.
train_data = set_Cabin_type(train_data)

dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass = (
    pd.get_dummies(train_data[col], prefix=col)
    for col in ('Cabin', 'Embarked', 'Sex', 'Pclass')
)

# Append the indicator columns, then drop the original categorical columns
# together with Name and Ticket.
encoded_parts = [train_data, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass]
df = pd.concat(encoded_parts, axis=1).drop(
    columns=['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
)
df

## 3. Standardization

In [None]:
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()

# StandardScaler expects a 2-D input, so each column is reshaped to (n, 1)
# before scaling. The same scaler object is re-fitted for every column, so
# after this cell it only holds the statistics of the last one (Fare).
for column in ('Age', 'Fare'):
    column_2d = df[column].values.reshape(-1, 1)
    df[f'{column}_scaled'] = scaler.fit_transform(column_2d)

# Test Data pre-processing

In [None]:
# Summary statistics of the test set, shown as a debugging aid before preprocessing.
test_data.describe() # for debugging

In [None]:
# calculate the average fare for each class
fare_means = test_data.groupby('Pclass')['Fare'].mean()

def fill_missing_fare(row):
    """Return the row's fare, or the mean fare of its Pclass when the fare is missing."""
    if pd.isnull(row['Fare']):
        return fare_means[row['Pclass']]
    return row['Fare']

# fill missing fare
test_data['Fare'] = test_data.apply(fill_missing_fare, axis=1)

# fill missing age (must run after the fare fill: the regression-based
# imputers use Fare as a predictor)
test_data = impute_function(test_data)

# one-hot encoding
test_data = set_Cabin_type(test_data)
dummies_Cabin = pd.get_dummies(test_data['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(test_data['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(test_data['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(test_data['Pclass'], prefix='Pclass')

df_test = pd.concat([test_data, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)

df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

# standardize
# The test features must be scaled with the mean/std of the TRAINING data so
# that they live on the same scale the model is trained on. Re-fitting the
# scaler on the test set would shift and stretch the features differently.
# One scaler per column is fitted on the training frame `df` and only
# `transform` is applied to the test columns.
age_scaler = preprocessing.StandardScaler().fit(df['Age'].values.reshape(-1, 1))
fare_scaler = preprocessing.StandardScaler().fit(df['Fare'].values.reshape(-1, 1))
df_test['Age_scaled'] = age_scaler.transform(df_test['Age'].values.reshape(-1, 1))
df_test['Fare_scaled'] = fare_scaler.transform(df_test['Fare'].values.reshape(-1, 1))

df_test

# Training

For a naive baseline, we use logistic regression.

In [None]:
from sklearn import linear_model

# Keep the target plus the engineered feature columns.
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

# Select the target by name rather than by position, so the split does not
# depend on Survived happening to be the first filtered column.
y = train_df['Survived'].astype(int).values

# Cast the features explicitly to float: depending on the pandas version the
# one-hot columns may be boolean, in which case `.values` on the mixed frame
# would yield an object array.
X = train_df.drop(columns='Survived').astype(float).values

# L2-regularised logistic regression with a tight convergence tolerance.
clf = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
clf.fit(X, y)

clf

Use the trained model to predict the results for the test data.

In [None]:
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

# Align the test features with the training features: same columns, same
# order. An indicator column that exists in training but has no occurrence in
# the test set is added as all zeros; test-only columns are dropped.
feature_columns = train_df.columns.drop('Survived')
test = test.reindex(columns=feature_columns, fill_value=0)

# The model was fitted on a plain ndarray, so predict on one as well.
predictions = clf.predict(test.astype(float).values)

# Write the predictions as PassengerId / Survived pairs.
result = pd.DataFrame({'PassengerId':test_data['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)