In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train = pd.read_csv('/content/drive/My Drive/train.csv')

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
train.tail(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
x=train.columns
for i in x:
  print(i)

PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked


In [6]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
train.duplicated().sum()

0

In [8]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [9]:
train.shape

(891, 12)

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
#replacing the age null values
train['Age'].fillna(train['Age'].mean(), inplace = True)
train.isnull().sum() #checking whether all the nulls in age have become replaced

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
train['Cabin'].fillna(train['Cabin'].mode()[0], inplace = True)

In [13]:
train['Cabin'].isnull().sum()

0

In [14]:
train['Embarked'].fillna(train['Embarked'].mode()[0],inplace = True)

In [15]:
train['Embarked'].isnull().sum()

0

In [16]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [17]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [18]:
categorical_columns = []
non_categorical_columns = []

for column in train.columns:
  if train[column].dtype == 'object' or train[column].dtype == 'category':
    categorical_columns.append(column)
  else:
    non_categorical_columns.append(column)
print("categorical columns in train dataset are:")
print(categorical_columns)
print("\nnon_categorical columns in train dataset are:")
print(non_categorical_columns)

categorical columns in train dataset are:
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

non_categorical columns in train dataset are:
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


BUILDING THE MODEL

In [19]:
train.head(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [52]:
#encoding the categorical columns

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [53]:
train.loc[:,'Name'] = le.fit_transform(train.loc[:,'Name'])
train.loc[:,'Sex'] = le.fit_transform(train.loc[:,'Sex'])
train.loc[:,'Ticket'] = le.fit_transform(train.loc[:,'Ticket'])
train.loc[:,'Cabin'] = le.fit_transform(train.loc[:,'Cabin'])
train.loc[:,'Embarked'] = le.fit_transform(train.loc[:,'Embarked'])


In [54]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,47,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.0,0,0,669,7.925,47,2


In [23]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [24]:
non_categorical_columns

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

BUILDING THE MACHINE LEARNING LOGISTIC REGRESSION MODEL

In [55]:

x = train[['Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
y = train['Survived']

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
x_train, x_test, y_train,y_test = train_test_split(x, y,test_size = 0.2, random_state = 42)
model = LogisticRegression(max_iter = 1000)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)  # predicting the outcome

accuracy = accuracy_score(y_test,y_pred)
print(f"accuracy score:{accuracy}") #

accuracy score:0.8156424581005587


MODEL HYPERPARAMETER TUNING

In [57]:
from sklearn.model_selection import GridSearchCV


# Define the parameter grid
param_grid = {
  'C': [0.001,0.01,0.1,1]

}



In [58]:
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv=5)
grid_search.fit(x_train,y_train)
grid_search =GridSearchCV(estimator=model,param_grid=param_grid,cv=5)

grid_search.fit(x_train,y_train)

best_params = grid_search.best_params_

print(best_params)

{'C': 1}


In [59]:
model.fit(x_train,y_train)

best_model = LogisticRegression(max_iter = 1000,**best_params)

best_model.fit(x_train,y_train)



In [60]:
from sklearn.metrics import roc_auc_score
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
roc_auc_score = roc_auc_score(y_test,best_model.predict_proba(x_test)[:,1])
print(f"accuracy score:{accuracy}")

print(f"ROC AUC SCORE:, {roc_auc_score}")


accuracy score:0.8156424581005587
ROC AUC SCORE:, 0.8804375804375805


In [61]:
print(best_model.predict_proba(x_test).shape)

(179, 2)


TRYING HYPERPARAMETER TUNING 2

In [62]:
param_grid = {
  'penalty': ['l1','l2','elasticnet']

}

In [63]:
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv=5)
grid_search.fit(x_train,y_train)
grid_search =GridSearchCV(estimator=model,param_grid=param_grid,cv=5)

grid_search.fit(x_train,y_train)

best_params = grid_search.best_params_

print(best_params)

10 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------------------------

{'penalty': 'l2'}


10 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

---------------------------------------

In [64]:
model.fit(x_train,y_train)

best_model = LogisticRegression(max_iter=1000,**best_params)

best_model.fit(x_train,y_train)


In [65]:
from sklearn.metrics import roc_auc_score
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
y_pred_proba = best_model.predict_proba(x_test)[:, 1]
roc_auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy score: {accuracy}")
print(f"ROC AUC SCORE: {roc_auc_score}")



Accuracy score: 0.8156424581005587
ROC AUC SCORE: 0.8804375804375805


HYPERPAREMETER TUNING 3

**CONCLUSION**

---



Using C param_grid has the best model improvement performance to approximately 85% from 78%. We, therefore explore other classification models and find which fits best

In [36]:
#using random forest as the alternative

RANDOM FOREST DECISION TREE ALGORITHM

Choosing my X and Y variables for the random forest classifier

In [37]:
#dropping the column on property id
#train= train.drop(columns = 'PassengerId')

In [38]:
#confirming if passengerid was dropped
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,47,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.0,0,0,669,7.925,47,2


In [66]:
x = train[['Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
y = train['Survived']

In [67]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(x, y,test_size = 0.2, random_state = 2)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth = 3, random_state = 0)
clf_gini.fit(x_train,y_train)
y_pred = clf_gini.predict(x_test)
from sklearn.metrics import accuracy_score

print('accuracy score:{0:0.4f}'.format(accuracy_score(y_test,y_pred)))

accuracy score:0.7989


HYPERPARAMETER TUNING IN RANDOM FOREST

In [68]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [69]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
rf = RandomForestClassifier()
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(x_train, y_train)
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}


In [70]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [5],
    'min_samples_leaf': [1],

}





In [71]:
model.fit(x_train,y_train)

best_model =RandomForestClassifier(**random_search.best_params_)

best_model.fit(x_train,y_train)
from sklearn.metrics import roc_auc_score
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
y_pred_proba = best_model.predict_proba(x_test)[:, 1]
roc_auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy score: {accuracy}")
print(f"ROC AUC SCORE: {roc_auc_score}")

Accuracy score: 0.8156424581005587
ROC AUC SCORE: 0.8905063291139241
