Project : Titanic :  Logistic Regression 

Step 1 : Import Libraries 

In [68]:
# import library for project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier, BayesianRidge
from sklearn.tree import DecisionTreeClassifier, BaseDecisionTree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

Step 2 : Load and Inspect the Data 

In [None]:
# Read the training and test datasets from CSV files in the working directory
t_train, t_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Print a structural summary (columns, dtypes, non-null counts) of each
# dataset, with a separator line between the two reports
t_train.info()
print('---'*15)
t_test.info()


In [None]:
# Count the missing values in each column of the training set
t_train.isna().sum()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# training set; describe() summarises the numeric columns by default
t_train.describe()

Step 2 (continued) : Exploratory Data Analysis

In [None]:
# Mean survival rate per passenger class in the training set, highest first
survived_by_pclass = (
    t_train[['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
survived_by_pclass


In [None]:
# Mean survival rate per sex in the training set, highest first
survived_by_sex = (
    t_train[['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
survived_by_sex

In [None]:
# Mean survival rate per Parch value (parents/children aboard) in the
# training set, highest first
survived_by_Parch = (
    t_train[['Parch', 'Survived']]
    .groupby('Parch', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
survived_by_Parch

In [None]:
# Mean survival rate per SibSp value in the training set, highest first
survived_by_SibSp = (
    t_train[['SibSp', 'Survived']]
    .groupby('SibSp', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
survived_by_SibSp

In [None]:
# Grid of Age histograms: one row per Pclass value, one column per
# Survived value, 25 bins in each panel
plot = sns.FacetGrid(data=t_train, row='Pclass', col='Survived', height=2.5, aspect=2)
plot.map(plt.hist, 'Age', bins=25)

In [None]:
# Grid of bar plots of Fare by Sex: one row per Embarked value, one column
# per Survived value (barplot aggregates with the mean by default).

# FacetGrid.map draws every facet independently, so a categorical plot needs
# an explicit `order`; otherwise the bar positions of the Sex categories can
# differ from one facet to the next. The order is derived from the data so
# the cell keeps working whether Sex still holds labels or encoded values.
sex_order = sorted(t_train['Sex'].dropna().unique())

plot = sns.FacetGrid(t_train, col='Survived', row='Embarked', height=2.5, aspect=2)
plot.map(sns.barplot, 'Sex', 'Fare', order=sex_order, errorbar = None)

Step 3: Data Cleaning  

In [None]:
# Drop the columns that are not used as model features. PassengerId is kept
# in t_test because it is removed later, when x_test is built.
# errors='ignore' lets this cell be re-run after the columns are already gone.
t_train = t_train.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], errors='ignore')
t_test = t_test.drop(columns=['Cabin', 'Ticket', 'Name'], errors='ignore')

# Fill missing values. Every fill value is computed from the TRAINING set and
# reused for the test set, so both splits are imputed consistently and no
# test-set statistics influence preprocessing.

# Embarked: use the most frequent training value. (A forward fill depends on
# row order, leaves NaN when the first row is missing, and
# fillna(method='ffill') is deprecated in recent pandas releases.)
embarked_mode = t_train['Embarked'].mode()[0]
t_train['Embarked'] = t_train['Embarked'].fillna(embarked_mode)
t_test['Embarked'] = t_test['Embarked'].fillna(embarked_mode)

# Age: training mean for both splits
age_mean = t_train['Age'].mean()
t_train['Age'] = t_train['Age'].fillna(age_mean)
t_test['Age'] = t_test['Age'].fillna(age_mean)

# Fare: training mean
fare_mean = t_train['Fare'].mean()
t_test['Fare'] = t_test['Fare'].fillna(fare_mean)

Step 4: Feature Engineering 

In [None]:
## Integer codes for the categorical columns:
##   Sex      -> female = 0, male = 1
##   Embarked -> S = 1, C = 2, Q = 3
sex_codes = {'female': 0, 'male' : 1 }
embarked_codes = {'S': 1, 'C': 2, 'Q':3}

## Apply the same encoding to the train and test datasets
for frame in (t_train, t_test):
    frame['Sex'] = frame['Sex'].replace(sex_codes)
    frame['Embarked'] = frame['Embarked'].replace(embarked_codes)

Step 5: Split Data into Training  and Testing  Sets


In [None]:
## Training features are every column except the target; the target is Survived
y_train = t_train['Survived']
x_train = t_train.drop(columns='Survived')
## Test features: remove the PassengerId column from the test dataset
x_test = t_test.drop(columns='PassengerId')
## Show the shapes of the three objects
(x_train.shape, y_train.shape, x_test.shape)

Step 6: Standardize Features

In [None]:
## Standardize the continuous columns (zero mean, unit variance).
## The scaler is fitted on the training data ONLY; the test data is then
## transformed with those same training parameters. Calling fit_transform on
## x_test would refit the scaler on test statistics, so the two sets would be
## scaled differently and the models would see inconsistent features.
scaler = StandardScaler()
scaled_columns = ['Age', 'Fare']

x_train[scaled_columns] = scaler.fit_transform(x_train[scaled_columns])
x_test[scaled_columns] = scaler.transform(x_test[scaled_columns])

Step 7: Build and Train Machine Learning  Model 

In [None]:
## Reference labels for the test set are read from gender_submission.csv.
## NOTE(review): presumably this file is a sample submission rather than the
## true outcomes, so the 'Accuracy_with_Gender ' column measures agreement
## with that reference, not real test accuracy -- confirm.
y_test = pd.read_csv('gender_submission.csv')
y_test = y_test.drop('PassengerId', axis = 1)

## Fixed seed so that the stochastic estimators give the same scores on
## every Restart & Run All
RANDOM_STATE = 42

## Define the models to compare
model = {
    'Logistic Regression' : LogisticRegression(),
    'Support Vector Classification' : SVC(),
    'Stochastic Gradient Descent_C' : SGDClassifier(random_state=RANDOM_STATE),
    'Multi-Layer Perception classifier' : MLPClassifier(random_state=RANDOM_STATE),
    'Decision Tree Classifier' : DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest Classifier' : RandomForestClassifier(n_estimators= 3, random_state=RANDOM_STATE),
    'K-Nearest Neighbors' : KNeighborsClassifier(n_neighbors=7),
    'Gradient Boosting Classifier' : GradientBoostingClassifier(random_state=RANDOM_STATE)
}
score = []

## Fit every model on the training data and record its scores.
## The loop variable is named `estimator` so it does not overwrite the
## `model` dictionary that is being iterated.
for model_name, estimator in model.items():
    estimator.fit(x_train, y_train)
    y_pred = estimator.predict(x_test)
    # NOTE: the trailing space in 'Accuracy_with_Gender ' is kept on purpose;
    # the visualization cell looks the column up by this exact name.
    score.append({'Model': model_name,
                  'Training_Score' : estimator.score(x_train, y_train),
                  'Accuracy_with_Gender ': accuracy_score(y_test, y_pred)
                  })

score_df = pd.DataFrame(score)
display(score_df)
 

Step 8: Visualization of Model Accuracy and Training Model Score 

In [None]:

# Build the results table from the per-model score records
score_df = pd.DataFrame(score)

# Reshape to long format (one row per model and score type) so the score
# type can be used as the `hue` of the bar plot
melted_score_df = score_df.melt(
    id_vars="Model",
    value_vars=["Training_Score", "Accuracy_with_Gender "],
    var_name="Score_Type",
    value_name="Score",
)

# Grouped bar chart: one pair of bars (training score, reference accuracy) per model
plt.figure(figsize=(12, 6))
sns.barplot(
    data=melted_score_df,
    x="Model",
    y="Score",
    hue="Score_Type",
    palette=["blue", "green"],
)

# Titles, axis labels, fixed 0-1 score range and slanted model names
plt.title("Training Scores and Test Accuracy by Model")
plt.xlabel("Model")
plt.ylabel("Score")
plt.ylim(0, 1)
plt.xticks(rotation=45, ha="right")

# Legend, layout and rendering
plt.legend(title="Score Type")
plt.tight_layout()
plt.show()
