In [366]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Read data


In [367]:
df = pd.read_csv('titanic.csv')
df = df.drop(columns=['PassengerId', 'Pclass', 'Name', 'Sex',
             'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
df


Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
886,0,27.0,13.0000
887,1,19.0,30.0000
888,0,,23.4500
889,1,26.0,30.0000


In [368]:
df.isnull().sum()


Survived      0
Age         177
Fare          0
dtype: int64

In [369]:
result = np.zeros((14, 11))
index = 0


In [370]:
def logistic_regression(titanic_data, test_size=0.3, random_state=0):
    X = titanic_data.drop(columns='Survived', axis=1)
    Y = titanic_data['Survived']

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    # print(X.shape, X_train.shape, X_test.shape)

    model = LogisticRegression()
    model.fit(X_train, Y_train)

    X_train_prediction = model.predict(X_train)
    training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
    # print('Accuracy score of training data : ', training_data_accuracy)

    X_test_prediction = model.predict(X_test)
    test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
    # print('Accuracy score of test data : ', test_data_accuracy)
    return test_data_accuracy


In [371]:
def write_result(titanic_data):
    global index
    for i in range(10):
        result[index][i] = logistic_regression(
            titanic_data=titanic_data, random_state=i)
    result[index][10] = np.mean(result[index][0:9])
    index += 1


In [372]:
def outliers(titanic_data):
    lower_boundary = titanic_data['Age'].mean() - 3 * titanic_data['Age'].std()
    uppper_boundary = titanic_data['Age'].mean(
    ) + 3 * titanic_data['Age'].std()
    titanic_data.loc[titanic_data['Age'] >= int(
        uppper_boundary), 'Age'] = int(uppper_boundary)

    IQR = titanic_data.Fare.quantile(0.75) - titanic_data.Fare.quantile(0.25)
    lower_bridge = titanic_data['Fare'].quantile(0.25) - (IQR*3)
    upper_bridge = titanic_data['Fare'].quantile(0.75) + (IQR*3)
    titanic_data.loc[titanic_data['Fare'] >= int(
        upper_bridge), 'Fare'] = int(upper_bridge)

    return titanic_data


# Mean


In [373]:
titanic_data = df.copy()
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [374]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# Median


In [375]:
titanic_data = df.copy()
titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [376]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# Mode


In [377]:
titanic_data = df.copy()
titanic_data['Age'].fillna(titanic_data['Age'].mode()[0], inplace=True)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [378]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# Random


In [379]:
titanic_data = df.copy()
random_samples = titanic_data['Age'].dropna().sample(
    n=titanic_data['Age'].isnull().sum(), random_state=0)
random_samples.index = titanic_data[titanic_data['Age'].isnull()].index
titanic_data.loc[titanic_data['Age'].isnull(), 'Age'] = random_samples
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [380]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# End of dist


In [381]:
titanic_data = df.copy()
extreme = titanic_data.Age.mean() + 3*titanic_data.Age.std()
titanic_data['Age'] = titanic_data['Age'].fillna(extreme)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [382]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# Arbitrary value


In [383]:
value = 50
titanic_data = df.copy()
titanic_data['Age'] = titanic_data['Age'].fillna(value)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
dtype: int64

In [384]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


# New feature


In [385]:
titanic_data = df.copy()
titanic_data['Age_NAN'] = np.where(titanic_data['Age'].isnull(), 1, 0)
titanic_data['Age'].fillna(titanic_data.Age.median(), inplace=True)
titanic_data.isnull().sum()


Survived    0
Age         0
Fare        0
Age_NAN     0
dtype: int64

In [386]:
write_result(titanic_data)
titanic_data = outliers(titanic_data)
write_result(titanic_data)


In [387]:
df = pd.DataFrame(result, index=['Mean', 'Mean outliers', 'Median', 'Median outliers', 'Mode', 'Mode outliers',
                                 'Random', 'Random outliers', 'End of dist', 'End of dist outliers', 'Arbitrary value',
                                 'Arbitrary value outliers', 'New feature', 'New feature outliers'],
                    columns=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Mean'])
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Mean
Mean,0.686567,0.619403,0.656716,0.641791,0.675373,0.645522,0.686567,0.630597,0.656716,0.630597,0.655473
Mean outliers,0.697761,0.623134,0.686567,0.623134,0.66791,0.69403,0.712687,0.660448,0.66791,0.63806,0.670398
Median,0.686567,0.623134,0.656716,0.645522,0.675373,0.63806,0.682836,0.634328,0.656716,0.634328,0.655473
Median outliers,0.701493,0.623134,0.682836,0.623134,0.66791,0.69403,0.712687,0.660448,0.66791,0.63806,0.670398
Mode,0.69403,0.623134,0.649254,0.641791,0.679104,0.649254,0.679104,0.623134,0.656716,0.63806,0.655058
Mode outliers,0.69403,0.626866,0.686567,0.623134,0.656716,0.69403,0.705224,0.660448,0.66791,0.63806,0.668325
Random,0.69403,0.626866,0.656716,0.626866,0.679104,0.652985,0.682836,0.626866,0.660448,0.63806,0.656302
Random outliers,0.701493,0.63806,0.686567,0.615672,0.664179,0.697761,0.705224,0.664179,0.66791,0.634328,0.671227
End of dist,0.69403,0.630597,0.664179,0.626866,0.701493,0.671642,0.697761,0.63806,0.66791,0.645522,0.665837
End of dist outliers,0.701493,0.630597,0.682836,0.623134,0.656716,0.679104,0.705224,0.671642,0.66791,0.645522,0.66874
