In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 38
        elif Pclass == 2:
            return 29
        else:
            return 25
    else:
        return Age


In [10]:
def clean_titanic_df(dataframe, use_pclass=False):
    dataframe['Age'] = dataframe[['Age', 'Pclass']].apply(impute_age, axis=1)
    dataframe = dataframe.drop('Cabin', axis=1)
    dataframe = dataframe.dropna()
    sex = pd.get_dummies(dataframe['Sex'], drop_first=True)
    embark = pd.get_dummies(dataframe['Embarked'], drop_first=True)
    embark.columns = ['embark_Q', 'embark_S']
    dataframe = dataframe.drop(
        [
            'Sex',
            'Embarked',
            'Name',
            'Ticket',
            'PassengerId',
        ],
        axis=1,
    )
    dataframe = pd.concat([dataframe, sex, embark], axis=1)

    if use_pclass:
        pclass = pd.get_dummies(dataframe['Pclass']).drop(3, axis=1)
        pclass.columns = ['Pclass_1', 'Pclass_2']
        dataframe = pd.concat([dataframe, pclass], axis=1)
        dataframe = dataframe.drop('Pclass', axis=1)

    return dataframe

In [4]:
train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

In [12]:
train_clean = clean_titanic_df(train)
train_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,embark_Q,embark_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [19]:
test_clean = clean_titanic_df(test)
test_clean.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,embark_Q,embark_S
0,3,34.5,0,0,7.8292,1,1,0
1,3,47.0,1,0,7.0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0
3,3,27.0,0,0,8.6625,1,0,1
4,3,22.0,1,1,12.2875,0,0,1


In [14]:
train_class = clean_titanic_df(train, use_pclass=True)
train_class.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,embark_Q,embark_S,Pclass_1,Pclass_2
0,0,22.0,1,0,7.25,1,0,1,0,0
1,1,38.0,1,0,71.2833,0,0,0,1,0
2,1,26.0,0,0,7.925,0,0,1,0,0
3,1,35.0,1,0,53.1,0,0,1,1,0
4,0,35.0,0,0,8.05,1,0,1,0,0


In [15]:
test_class = clean_titanic_df(test, use_pclass=True)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report


In [20]:
# X_train = train_clean.drop('Survived', axis=1)
# y_train = train_clean['Survived']
# X_test = test_clean.drop('Survived', axis=1)
# y_test = test_clean['Survived']

In [33]:
X = train_clean.drop('Survived', axis=1)
y = train_clean['Survived']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [38]:
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train, )
predictions = logmodel.predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.83      0.91      0.87       163
           1       0.84      0.70      0.76       104

    accuracy                           0.83       267
   macro avg       0.83      0.81      0.82       267
weighted avg       0.83      0.83      0.83       267



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
test_clean.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,embark_Q,embark_S
0,3,34.5,0,0,7.8292,1,1,0
1,3,47.0,1,0,7.0,0,0,1
2,2,62.0,0,0,9.6875,1,1,0
3,3,27.0,0,0,8.6625,1,0,1
4,3,22.0,1,1,12.2875,0,0,1


In [45]:
logmodel.predict(test_clean.head())

array([0, 0, 0, 0, 1])