In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### Prepare Train data

In [2]:
train = pd.read_csv('titanic_train.csv')
# Dropping some columns I believe they are irrelevent
train = train[[#'PassengerId',
 'Survived',
 'Pclass',
 #'Name',
 'Sex',
 'Age',
 #'SibSp', #showing very low corr with survival of '-0.04'
 'Parch',
 #'Ticket',
 'Fare',
 #'Cabin',
 'Embarked'
]]

#Fill missing Embarked with most repetitve value of 'S'    
train['Embarked'].fillna('S', inplace= True) 

#Assign 1 for male and 0 for female as new column call Male
train['Male'] = np.nan
for i in range(len(train)):   
    if train.loc[i,'Sex'] == 'male':
        train.loc[i,'Male'] = 1
        
    elif train.loc[i,'Sex'] == 'female':
        train.loc[i,'Male'] = 0
        
    else:
        train.loc[i,'Male'] = '???????'
        
# Create dummy values for the category 'Embarked' column
Embarked = pd.get_dummies(train['Embarked'], drop_first=True)
train = pd.concat([train,Embarked], axis= 1)

# Drop the Sex, Embarked columns as we now have Male,Q ,S columns with values of (0,1)
train.drop(['Sex'],axis= 1, inplace=True)
train.drop(['Embarked'], axis = 1, inplace = True)




# Forecast the missing Ages using the other attributes in other columns:-
# Get actual filled ages, then using Linear Reggression with other columns to predict the missing age in the main data
df2 = train[train['Age'].isna() == False].copy()


#building the model and feed the X = 'attributes', y = 'Age' into it.
lm = LinearRegression()
X = df2[['Survived', 'Pclass', 'Parch', 'Fare', 'Male']]
y = df2[['Age']]
lm.fit(X ,y )

# Predict the null Age in the main data using the Model coeff

missing_age = train[train['Age'].isna()][['Survived', 'Pclass', 'Parch', 'Fare', 'Male']]
missing_age['Age']=lm.predict(missing_age).astype('int')

# Assiging predicted missing_age into the main data frame using the common indexing values
for i in missing_age.index:
    train.loc[i,'Age'] = missing_age.loc[i,'Age']





# Visualise the null values in the data
#sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
#plt.show()




train.head()


Unnamed: 0,Survived,Pclass,Age,Parch,Fare,Male,Q,S
0,0,3,22.0,0,7.25,1.0,0,1
1,1,1,38.0,0,71.2833,0.0,0,0
2,1,3,26.0,0,7.925,0.0,0,1
3,1,1,35.0,0,53.1,0.0,0,1
4,0,3,35.0,0,8.05,1.0,0,1


### Prepare Test data 

In [3]:
#Preparation same as Train data but with 2 differences:-
# 1) Test data dont have null values in Embarked colum   2) Test data have single null value in Fare column

test = pd.read_csv('titanic_test.csv')
# Dropping some columns I believe they are irrelevent
test = test[[#'PassengerId',
 'Survived',
 'Pclass',
 #'Name',
 'Sex',
 'Age',
 #'SibSp', #showing very low corr with survival of '-0.04'
 'Parch',
 #'Ticket',
 'Fare',
 #'Cabin',
 'Embarked'
]]

#Fill the single missing data in Fare column with total column avg
test['Fare'].fillna(test['Fare'].mean(), inplace= True)



#Assign 1 for male and 0 for female as new column call Male
test['Male'] = np.nan


for i in range(len(test)):   
    if test.loc[i,'Sex'] == 'male':
        test.loc[i,'Male'] = 1
        
    elif test.loc[i,'Sex'] == 'female':
        test.loc[i,'Male'] = 0
        
    else:
        test.loc[i,'Male'] = '???????'


# Create dummy values for the category 'Embarked' column
Embarked = pd.get_dummies(test['Embarked'], drop_first=True)
test = pd.concat([test,Embarked], axis= 1)



# Drop the Sex, Embarked columns as we now have Male,Q ,S columns with values of (0,1)
test.drop(['Sex'],axis= 1, inplace=True)
test.drop(['Embarked'], axis = 1, inplace = True)



# Forecast the missing Ages using the other attributes in other columns:-
# Get actual filled ages, then using Linear Reggression with other columns to predict the missing age in the main data
df2 = test[test['Age'].isna() == False].copy()


#building the model and feed the X = 'attributes', y = 'Age' into it.
lm = LinearRegression()
X = df2[['Survived', 'Pclass', 'Parch', 'Fare', 'Male']]
y = df2[['Age']]
lm.fit(X,y)



# Predict the null Age in the main data using the Model coeff

missing_age = test[test['Age'].isna()][['Survived', 'Pclass', 'Parch', 'Fare', 'Male']]
missing_age['Age']=lm.predict(missing_age).astype('int')

# Assiging predicted missing_age into the main data frame using the common indexing values
for i in missing_age.index:
    test.loc[i,'Age'] = missing_age.loc[i,'Age']





# Visualise the null values in the data
#sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
#plt.show()


test.head()

Unnamed: 0,Survived,Pclass,Age,Parch,Fare,Male,Q,S
0,0,3,34.5,0,7.8292,1.0,1,0
1,1,3,47.0,0,7.0,0.0,0,1
2,0,2,62.0,0,9.6875,1.0,1,0
3,0,3,27.0,0,8.6625,1.0,0,1
4,1,3,22.0,1,12.2875,0.0,0,1


In [4]:
#Final train data
train.head()

Unnamed: 0,Survived,Pclass,Age,Parch,Fare,Male,Q,S
0,0,3,22.0,0,7.25,1.0,0,1
1,1,1,38.0,0,71.2833,0.0,0,0
2,1,3,26.0,0,7.925,0.0,0,1
3,1,1,35.0,0,53.1,0.0,0,1
4,0,3,35.0,0,8.05,1.0,0,1


In [5]:
#Final train data
test.head()

Unnamed: 0,Survived,Pclass,Age,Parch,Fare,Male,Q,S
0,0,3,34.5,0,7.8292,1.0,1,0
1,1,3,47.0,0,7.0,0.0,0,1
2,0,2,62.0,0,9.6875,1.0,1,0
3,0,3,27.0,0,8.6625,1.0,0,1
4,1,3,22.0,1,12.2875,0.0,0,1


In [6]:
# Predict the Survival column using Logistics Reg


# Set X,y to predict X parameters
X_train = train[['Pclass', 'Age', 'Parch', 'Fare', 'Male', 'Q', 'S']]
y_train = train['Survived']
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)


X_test = test[['Pclass', 'Age', 'Parch', 'Fare', 'Male', 'Q', 'S']]
y_test = test['Survived']

predictions = pd.DataFrame(logmodel.predict(X_test), columns=['Predicted Survival'])
print(classification_report(y_test,predictions))


              precision    recall  f1-score   support

           0       0.96      0.94      0.95       266
           1       0.90      0.93      0.91       152

    accuracy                           0.94       418
   macro avg       0.93      0.93      0.93       418
weighted avg       0.94      0.94      0.94       418



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
