In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
#count of elements in train_data
#train_data.count()

#check for null values in train_data
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
#count of elements in test_data
#test_data.count()

#check for null values in test_data
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
random.seed(100)
np.random.seed(100)

#drop Ticket and Cabin columns
train_data.drop(columns= ['Ticket', 'Cabin'], inplace=True)
train_data.set_index(keys=['PassengerId'], drop=True, inplace=True)
test_data.drop(columns=['Ticket', 'Cabin'], inplace=True)
test_data.set_index(keys=['PassengerId'], drop=True, inplace=True)

#take mean for Fare and mode for Embarked to fill null values
train_nan_map = {'Fare': train_data['Fare'].mean(), 'Embarked': train_data['Embarked'].mode()[0]}
test_nan_map = {'Fare': test_data['Fare'].mean(), 'Embarked': test_data['Embarked'].mode()[0]}
train_data.fillna(value=train_nan_map, inplace=True)
test_data.fillna(value=test_nan_map, inplace=True)

In [6]:
title_map = {'Name': {'Mr': 0, 'Capt': 0, 'Col': 0, 'Don': 0, 'Dr': 0, 'Rev': 0,
                      'Jonkheer': 0, 'Major': 0, 'Sir': 0, 'Rev': 0, 'Mrs': 1,
                      'Dona': 1, 'Lady': 1, 'Mme': 1, 'the Countess': 1, 'Miss': 2,
                      'Mlle': 2, 'Ms': 2, 'Master': 3}}

train_data.Name = train_data.Name.str.partition(", ")[2].str.partition(".")[0]
test_data.Name = test_data.Name.str.partition(", ")[2].str.partition(".")[0]
train_data.replace(title_map, inplace=True)
test_data.replace(title_map, inplace=True)

train_data.rename(columns={"Name": "Name_Prefix"}, inplace=True)
test_data.rename(columns={"Name": "Name_Prefix"}, inplace=True)

columns_map = {'Embarked': {'C': 0, 'Q': 1, 'S': 2}, 'Sex': {'male': 0, 'female': 1}}
train_data.replace(columns_map, inplace=True)
test_data.replace(columns_map, inplace=True)

In [7]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name_Prefix,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,0,0,22.0,1,0,7.25,2
2,1,1,1,1,38.0,1,0,71.2833,0
3,1,3,2,1,26.0,0,0,7.925,2
4,1,1,1,1,35.0,1,0,53.1,2
5,0,3,0,0,35.0,0,0,8.05,2


In [8]:
test_data.head()

Unnamed: 0_level_0,Pclass,Name_Prefix,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,0,0,34.5,0,0,7.8292,1
893,3,1,1,47.0,1,0,7.0,2
894,2,0,0,62.0,0,0,9.6875,1
895,3,0,0,27.0,0,0,8.6625,2
896,3,1,1,22.0,1,1,12.2875,2


In [9]:
#Use multivariate iterative feature imputation on the Age column for train_data.
train_data_imputer = IterativeImputer(random_state=42)
train_data_imputed = train_data_imputer.fit_transform(train_data)
train_data = pd.DataFrame(train_data_imputed)
train_data.rename(columns={0: 'Survived', 1: 'Name_Prefix', 2: 'Pclass', 3:'Sex', 4: 'Age'
                                   , 5: 'SibSp', 6: 'Parch', 7: 'Fare', 8: 'Embarked'}, inplace=True)
train_data = train_data.round(2)

In [10]:
#Use multivariate iterative feature imputation on the Age column for test_data.
test_data_imputer = IterativeImputer(random_state=42)
test_data_imputed = test_data_imputer.fit_transform(test_data)
test_data = pd.DataFrame(test_data_imputed)
test_data.rename(columns={0: 'Name_Prefix', 1: 'Pclass', 2:'Sex', 3: 'Age'
                                   , 4: 'SibSp', 5: 'Parch', 6: 'Fare', 7: 'Embarked'}, inplace=True)
test_data = test_data.round(2)

In [11]:
train_data.isnull().sum()

Survived       0
Name_Prefix    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [12]:
test_data.isnull().sum()

Name_Prefix    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [13]:
train_data.to_csv('cleaned_train_data.csv', header=True, sep=',')
test_data.to_csv('cleaned_test_data.csv', header=True, sep=',')

In [14]:
features = ['Name_Prefix', 'Pclass', 'Sex','Age', 'SibSp','Parch','Fare','Embarked']
X_train = train_data[features]
y_train = train_data['Survived']
X_test = test_data[features]

In [15]:
X_training, X_valid, y_training, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

In [16]:
model = LogisticRegression()
model.fit(X_training, y_training)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
model_predictions = model.predict(X_valid)

In [20]:
accuracy = accuracy_score(y_valid, model_predictions)
print(accuracy)

0.8116591928251121


In [21]:
confusion_matrix(y_valid, model_predictions, normalize='true')

array([[0.89208633, 0.10791367],
       [0.32142857, 0.67857143]])

In [20]:
prediction_data = model.predict(X_test)

In [21]:
passenger_ID_Doc = pd.read_csv("test.csv")

In [22]:
gender_submission = pd.DataFrame({ "PassengerId" : passenger_ID_Doc["PassengerId"], "Survived" : prediction_data})
gender_submission.to_csv("gender_submission.csv", index=False)