In [253]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [276]:
Data = pd.read_csv("T_train.csv", delimiter = ",")
Test = pd.read_csv("T_test.csv", delimiter = ",")

In [277]:
Data.shape

(668, 11)

In [278]:
Data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [279]:
Test.shape

(223, 10)

In [280]:
Data.isnull().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [281]:
Test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          45
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       173
Embarked      1
dtype: int64

* cabin column has around 76% missing values so we will ignore this column

In [282]:
Data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [283]:
Data.drop('Cabin', inplace = True, axis = 1)

In [284]:
print(Data["Age"].mean(skipna=True))
print(Data["Age"].median(skipna=True))

29.700559701492537
29.0


In [285]:
Data["Age"].fillna(Data["Age"].median(skipna=True), inplace=True)

In [286]:
E = Data["Embarked"].value_counts().idxmax()
Data["Embarked"].fillna(E, inplace = True)

In [287]:
Test.drop("Cabin", inplace = True, axis=1)
Test["Age"].fillna(Test["Age"].median(skipna=True), inplace=True)
E1 = Test["Embarked"].value_counts().idxmax()
Test["Embarked"].fillna(E, inplace = True)

In [288]:
Data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,29.0,0,0,A/5 2466,8.05,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,S,0


According to the Kaggle data dictionary, both SibSp and Parch relate to traveling with family. For simplicity I will combine the values in a categorical column which tells me if the person was travelling alone or not

In [289]:
Data['TravelAlone'] = np.where((Data["SibSp"] + Data["Parch"])>0, 0, 1)
Data.drop('SibSp', axis = 1, inplace = True)
Data.drop('Parch', axis = 1, inplace = True)

In [290]:
Data

Unnamed: 0,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,Survived,TravelAlone
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,228414,26.0000,S,1,0
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,29.0,A/5 2466,8.0500,S,0,1
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,250655,26.0000,S,0,1
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,349909,21.0750,S,0,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,SOTON/OQ 392076,7.0500,S,0,1
...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,SO/C 14885,10.5000,S,1,1
664,3,"Morrow, Mr. Thomas Rowan",male,29.0,372622,7.7500,Q,0,1
665,3,"Bing, Mr. Lee",male,32.0,1601,56.4958,S,1,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,7553,9.8375,S,0,1


Creating categorical variable for column - Pclass, Sex and Embarked for which I take Dummy values to predict output

In [292]:
training = pd.get_dummies(Data, columns = ["Pclass","Embarked","Sex"])
training.drop('Sex_female', axis = 1, inplace = True)
training.drop('Name', axis = 1, inplace = True)
training.drop('Ticket', axis = 1, inplace = True)

final_train = training
final_train.head()

Unnamed: 0,Age,Fare,Survived,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,29.0,26.0,1,0,0,1,0,0,0,1,0
1,29.0,8.05,0,1,0,0,1,0,0,1,1
2,39.0,26.0,0,1,0,1,0,0,0,1,1
3,29.0,21.075,0,0,0,0,1,0,0,1,0
4,25.0,7.05,0,1,0,0,1,0,0,1,1


In [293]:
##  Applying Changes to Testing Data
Test['TravelAlone'] = np.where((Test["SibSp"] + Test["Parch"])>0, 0, 1)
Test.drop('SibSp', axis = 1, inplace = True)
Test.drop('Parch', axis = 1, inplace = True)

testing = pd.get_dummies(Test, columns = ["Pclass","Embarked","Sex"])
testing.drop('Sex_female', axis = 1, inplace = True)
testing.drop('Name', axis = 1, inplace = True)
testing.drop('Ticket', axis = 1, inplace = True)

final_test = testing
final_test.head()

Unnamed: 0,Age,Fare,TravelAlone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_male
0,8.0,36.75,0,0,1,0,0,0,1,1
1,49.0,25.9292,1,1,0,0,0,0,1,0
2,27.0,7.7375,1,0,0,1,0,1,0,1
3,24.0,27.0,0,0,1,0,0,0,1,0
4,36.0,26.2875,1,1,0,0,0,0,1,1


In [306]:
cols = ['Age','Fare', 'TravelAlone', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Sex_male' ]
X = final_train[cols]
Y = final_train['Survived']
X_test = final_test[cols]

In [300]:
clf = LogisticRegression()
clf.fit(X, Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [307]:
Y_pred = clf.predict(X_test)
Y_pred

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0], dtype=int64)

In [313]:
np.savetxt("Predicted_Survived.csv", Y_pred, delimiter = ",", fmt='%i')