# Group Member: Lunjing Yuan, Zihao Li, Haorui Cheng, Mengyao Song

In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
def data_clean_transfer(df, fillna):
    '''
    data clean function, which dealing the missed value in embarked and age 
    and transfer embarked to dummies; we fill nan value in age by mean of age,
    and value in Embarked by S based on value_counts of Embarked; then return 
    the df_d for machine learning.
    '''
    df.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
    df.Age.fillna(fillna, inplace=True)
    df.Embarked.fillna("S", inplace=True)
    df_d = pd.get_dummies(df, columns=['Sex', "Embarked"]).drop(columns=['Sex_female'])
    X = df_d.iloc[:, 1:]
    y = df_d.iloc[:, 0]    
    
    return X, y

In [3]:
def scaler_fit(X):
    '''
    train the scaler and retrun the trained scaler
    '''
    s = StandardScaler().fit(X)
    return s

In [4]:
def scaler_trans(fitted_scaler, X):
    '''
    transform features by fitted scaler
    '''
    X_scaled = fitted_scaler.transform(X)
    return X_scaled

In [5]:
def fit_model(X, y, model=MLPClassifier(hidden_layer_sizes=(5,10),activation='relu',alpha=1,learning_rate_init=0.005, max_iter=900)):
    '''
    we try almost every classifier in sklearn and find MLP get highest roc_auc score with 
    default hyperparameter we get best params by gridsearchcv method in template, so we use them directly here.
    '''
    model.fit(X, y)
    return model

In [6]:
def evaluate_model(X_test, y_test, fitted_model):
    '''
    return roc_auc score
    '''
    y_pred = fitted_model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    return score

In [7]:
if __name__ == "__main__":
    models = [MLPClassifier(hidden_layer_sizes=(5,10),activation='relu',alpha=1,learning_rate_init=0.005, max_iter=900),
             DecisionTreeClassifier(criterion = 'entropy'),
             LogisticRegression()]
    df_t = pd.read_csv("Titanic_0.csv")
    df_k = pd.read_csv("testk.csv")
    
    agemean = df_t.Age.mean()
    
    X_train, y_train = data_clean_transfer(df_t, agemean)
    
    df_k.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
    df_k.Age.fillna(agemean, inplace=True)
    df_k.Embarked.fillna("S", inplace=True)
    X_test = pd.get_dummies(df_k, columns=['Sex', "Embarked"]).drop(columns=['Sex_female'])
    
    
   
    
    s = scaler_fit(X_train)
    X_train = scaler_trans(s, X_train)
    X_test = scaler_trans(s, X_test)
    for model in models:
        model.fit(X_train, y_train)
        res = evaluate_model(X_test, y_test, model)
        res_cross = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=10, n_jobs=-1)
    
        print(f"model is {model.__class__.__name__}, the titanic score is {round(res, 4)}, cross_val_score is {round(res_cross.mean(),4)}")
    

0      0
1      1
2      1
3      1
4      0
      ..
708    0
709    0
710    1
711    0
712    0
Name: Survived, Length: 713, dtype: int64
model is MLPClassifier, the titanic score is 0.7915, cross_val_score is 0.8468
model is DecisionTreeClassifier, the titanic score is 0.7557, cross_val_score is 0.7653
model is LogisticRegression, the titanic score is 0.843, cross_val_score is 0.845
