In [79]:
import pandas as pd
import numpy as np


* Download the dataset here: https://www.kaggle.com/c/titanic/data

# Exploring the Titanic dataset

In [80]:
titanic = pd.read_csv("titanic_data.csv")

In [81]:
pd.read_csv("titanic_data.csv")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Handling missing data in numerical values

In [82]:
titanic.fillna(titanic.mean(), inplace=True)


Here, we see that 'NaN'of the 'Age' in line 888 for example was replaced by a number (the mean of all ages):

In [83]:
titanic.tail(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


# Handling missing data in categorical data

1. Here, we check if the data-type of the column is of 'object' type by using the dtypes keyword.
2. Using the fillna() method, we impute the column's missing values with the most frequent value of that column with the value_counts() method and index attribute and assign it to cc_apps.

In [84]:
# Iterate over each column of cc_apps
for col in titanic.columns:
    # Check if the column is of object type
    if titanic[col].dtypes == 'object':
        # Impute with the most frequent value
        titanic = titanic.fillna(titanic[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print(titanic.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


Now, using label encoding, we will be convert all the non-numeric values into numeric ones

In [85]:
# Import LabelEncoder

from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder

le = LabelEncoder()
# Iterate over all the values of each column and extract their dtypes
for col in titanic.columns.values:
    # Compare if the dtype is object
    if titanic[col].dtypes=='object':
    # Use LabelEncoder to do the numeric transformation
        titanic[col]=le.fit_transform(titanic[col])

Verifying the result:

In [86]:
titanic.tail(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
884,885,0,3,793,1,25.0,0,0,650,7.05,146,3
885,886,0,3,684,0,39.0,0,5,480,29.125,146,2
886,887,0,2,548,1,27.0,0,0,101,13.0,146,3
887,888,1,1,303,0,19.0,0,0,14,30.0,30,3
888,889,0,3,413,0,29.699118,1,2,675,23.45,146,3
889,890,1,1,81,1,26.0,0,0,8,30.0,60,0
890,891,0,3,220,1,32.0,0,0,466,7.75,146,2


# Spliting the data

Here, we split the data in two sets (test set and training set). Also, we drop the PassengerId comumn, since it isn't a significant feature to our model. To create X and y variables, for X we take the columns Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin and Embarket. For y we take only column 'Survived' (our target)

In [87]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop the features 11 and 13 and convert the DataFrame to a NumPy array
titanic = titanic.drop(['PassengerId'], axis=1)
titanic = titanic.values

# Segregate features and labels into separate variables
X,y = titanic[:,1:10] , titanic[:,0]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

# Scaling data

1. Here, we scale the data using MinMaxScaler.
2. See the importance of scaling here: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#:~:text=Feature%20scaling%20through%20standardization%20(or,a%20standard%20deviation%20of%20one.
3. "When a dataset has varying ranges, one a small change in a particular feature may not have a significant effect on the other feature, which can cause a lot of problems when predictive modeling."
4. For example, here, when I didn't scale the dataset, I had a problem because the maximum number of iterations was reached (it didn't converge)

5. Another interesting source: https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/

In [89]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Fitting a logistic reg model

In [91]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train,y_train)

LogisticRegression()

# Analysing the confusion matrix:

In [92]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test,y_pred)

Accuracy of logistic regression classifier:  0.8135593220338984


array([[154,  21],
       [ 34,  86]], dtype=int64)

# Trying to improve our mode: GridSearchCV

In [93]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001 ,0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

In [94]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.790107 using {'max_iter': 100, 'tol': 0.01}
