# Group Member: Lunjing Yuan, Zihao Li, Haorui Cheng, Mengyao Song

In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
def data_clean_transfer(df):
    '''
    data clean function, which dealing the missed value in embarked and age 
    and transfer embarked to dummies; we fill nan value in age by mean of age,
    and value in Embarked by S based on value_counts of Embarked; then return 
    the df_d for machine learning.
    '''
    df.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
    df.Age.fillna(df.Age.mean(), inplace=True)
    df.Embarked.fillna("S", inplace=True)
    df_d = pd.get_dummies(df, columns=['Sex', "Embarked"]).drop(columns=['Sex_female'])
    X = df_d.iloc[:, 1:]
    y = df_d.iloc[:, 0]
    return X, y

In [3]:
def scaler_fit(X):
    '''
    train the scaler and retrun the trained scaler
    '''
    s = StandardScaler().fit(X)
    return s

In [4]:
def scaler_trans(fitted_scaler, X):
    '''
    transform features by fitted scaler
    '''
    X_scaled = fitted_scaler.transform(X)
    return X_scaled

In [5]:
def fit_model(X, y, model=MLPClassifier(hidden_layer_sizes=(5,10),activation='relu')):
    '''
    we try almost every classifier in sklearn and find MLP get highest roc_auc score with 
    default hyperparameter we get best params by gridsearchcv method in template, so we use them directly here.
    '''
    model.fit(X, y)
    return model

In [6]:
def evaluate_model(X_test, y_test, fitted_model):
    '''
    return roc_auc score
    '''
    y_pred = fitted_model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    return score

In [7]:
if __name__ == "__main__":
    df_a = pd.read_csv('TestA.csv')
    df_t = pd.read_csv("Titanic_0.csv")
    
    X_t, y_t = data_clean_transfer(df_t)
    X_a, y_a = data_clean_transfer(df_a)

    s = scaler_fit(X_t)
    X_a = scaler_trans(s, X_a)
    
    model = fit_model(X=X_t, y=y_t)
    # res = evaluate_model(X_t, y_t, model)
    res_a = evaluate_model(X_a, y_a, model)
    print(f"the TestA score is {res_a}")
    

0.8030386280386281


