In [18]:
import numpy as np
import pandas as pd
import warnings
import math, copy         

warnings.filterwarnings('ignore')

# 1. Load data

In [19]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
combined = train_data.append(test_data)

In [20]:
combined.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


# 2. Fill missing data

In [21]:
def fill_age(x, grp):
    return grp[(grp.Pclass==x.Pclass)&(grp.Sex==x.Sex)]['Age'].values[0]

def missing_age(combined, train_data):
    #children will have common age
    age = train_data[train_data.Name.str.contains('Master')]['Age'].mean()
    combined.loc[combined.Age.isnull() & combined.Name.str.contains('Master'), ['Age']] = age
    #other people will be given age based on their Sex and Pclass features
    grp = train_data.groupby(['Pclass','Sex'])['Age'].mean().reset_index()[['Sex', 'Pclass', 'Age']]
    combined['Age'] = combined.apply(lambda x: fill_age(x, grp) if np.isnan(x['Age']) else x['Age'], axis=1)
    
def missing_fare(combined, train_data):
    grp = train_data.groupby(['Pclass','Embarked'])['Fare'].mean().reset_index()[['Embarked', 'Pclass', 'Fare']]
    fare = grp[(grp.Pclass==3)&(grp.Embarked=='S')]['Fare'].mean()
    combined.loc[combined.Fare.isnull(), 'Fare'] = fare
    
def missing_embarked(combined, train_data):
    combined.Embarked.fillna(train_data.Embarked.mode()[0], inplace = True)
    
def missing_all(combined, train_data):
    missing_age(combined, train_data)
    missing_fare(combined, train_data)
    missing_embarked(combined, train_data)

# 3. Normalize, encode and add new features to data

In [22]:
def normalize_z_score(df, feature):
    x = df.loc[:, feature]
    mi = sum(x)/len(x)
    gamma = math.sqrt(sum((x-mi)**2)/len(x))
    df.loc[:, feature] = (x-mi)/gamma
    
def normalize_min_max(df, feature):
    x = df.loc[:, feature]
    df.loc[:, feature] = (x-min(x))/(max(x)-min(x))

def map_sex(df):
    df['Sex'] = df['Sex'].map( {'male': 0, 'female': 1} ).astype(int)

def encode_embarked(df):
    new_embarked = pd.get_dummies(df['Embarked'])
    df.drop(['Embarked'], axis=1, inplace=True)
    df['S'] = new_embarked['S']
    df['C'] = new_embarked['C']
    df['Q'] = new_embarked['Q']

def add_alone_feature(df):
    df['Alone'] = 0
    df.loc[(df['Parch'] + df['SibSp']) > 0, ['Alone']] = 1
    
def add_title_feature(df):
    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    title_mapping = {"Mr": 1, "Miss": 4, "Mrs": 5, "Master": 3, "Rare": 2}
    
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)

def clean_all(df):
    map_sex(df)
    encode_embarked(df)
    normalize_min_max(df, 'Age')
    normalize_min_max(df, 'Fare')
    add_alone_feature(df)


# 4. Implement logistic regression

In [23]:
def predict(x, w, b):
    return np.dot(x,w) + b

def sigmoid(z):
    return 1/(1 + np.exp(-z))

def judgement(x, w, b):
    z = predict(x, w, b)
    g = sigmoid(z)
    return 1 if g>0.5 else 0

def loss(x, w, b, y):
    z = predict(x, w, b)
    g = sigmoid(z)
    return -y*np.log(g) - (1-y)*np.log(1-g)

def cost(X, y, w, b):
    cost = 0.0
    for i in range(m):
        cost += loss(X.iloc[i], w, b, y[i])
    return cost

def compute_gradient(X, y, w, b):
    dj_dw = np.zeros((n,))
    dj_db = 0.
    
    for i in range(m):
        z = predict(X.iloc[i], w, b)
        f_wb = sigmoid(z)
        err_i  = f_wb  - y[i]
        dj_db = dj_db + err_i
        for j in range(n):
            dj_dw[j] = dj_dw[j] + err_i * X.iloc[i,j]  
            
    dj_dw = dj_dw/m
    dj_db = dj_db/m
    return dj_db, dj_dw

def gradient_descent(X, y, w_in, b_in, alpha, num_iters):
    w = copy.deepcopy(w_in)
    b = b_in
    J_history = []

    for i in range(num_iters):
        dj_db, dj_dw = compute_gradient(X, y, w, b)   
        w = w - alpha * dj_dw               
        b = b - alpha * dj_db       
        
        if i<10000:
            J_history.append(cost(X, y, w, b) )
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")
    return w, b

# 5. Data spliting 

In [24]:
missing_all(combined, train_data)
clean_all(combined)

train_data = combined.iloc[0:891]
test_data =  combined.iloc[891::]

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'S', 'C', 'Q', 'Alone']
X = train_data[features]
T = test_data[features]
Y = train_data['Survived'].astype(int)

m,n = X.shape

# 6. Model training

In [25]:
w_in = np.ones((n,))
b_in = 1.6
alpha = 1.8
num_iters = 150
w,b = gradient_descent(X, Y, w_in, b_in, alpha, num_iters)
print(w,b)

Iteration    0: Cost 669.6039499679006   
Iteration   15: Cost 481.90990050090727   
Iteration   30: Cost 430.5517181677397   
Iteration   45: Cost 408.6200950037869   
Iteration   60: Cost 404.60542280657916   
Iteration   75: Cost 402.4527509881691   
Iteration   90: Cost 402.88523749051603   
Iteration  105: Cost 400.9697331497362   
Iteration  120: Cost 400.4260018049405   
Iteration  135: Cost 398.1085510148235   
[-0.83647225  2.69534136 -1.65450203 -0.48917192 -0.22975862  0.90463459
  0.49960775  0.80512203  0.72128384  0.82370681] 0.6260136139716678


In [26]:
df = pd.DataFrame()
data = []
for i in range(m):
    data.append(judgement(X.iloc[i], w, b))
    
df['Predict'] = data
df['Actual'] = Y

# 7. Accuracy on train_data

In [27]:
acc_train = 100 * df.loc[df['Predict'] == df['Actual']].count()[0] / Y.count()
print(str(acc_train) + '%')

79.57351290684625%


In [24]:
size = train_data.shape[0]
res = {'TP':0, 'FP':0, 'FN':0, 'TN':0}
for index, row in df.iterrows():
    if (row[0] == 1 and row[1] == 1):
        res['TP']+=1/size
    elif (row[0] == 0 and row[1] == 0):
        res['TN']+=1/size
    elif (row[0] == 1 and row[1] == 0):
        res['FN']+=1/size
    elif (row[0] == 0 and row[1] == 1):
        res['FP']+=1/size
        
print(res)

{'TP': 0.277216610549943, 'FP': 0.10662177328843983, 'FN': 0.09764309764309756, 'TN': 0.5185185185185166}


# 8. Accuracy on test_data

76.555%