In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [130]:
import sys
!{sys.executable} -m pip install --upgrade nbformat



In [131]:
train=pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')

In [132]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


# Data Cleaning

In [133]:
# DataFrame.info() prints its report and returns None, so call it once per frame
# instead of building a tuple that renders as "(None, None)" in the cell output.
test.info()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass  

(None, None)

In [134]:
train.sample(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
660,661,1,1,"Frauenthal, Dr. Henry William",male,50.0,2,0,PC 17611,133.65,,S
428,429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q
228,229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18.0,0,0,236171,13.0,,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
515,516,0,1,"Walker, Mr. William Anderson",male,47.0,0,0,36967,34.0208,D46,S
606,607,0,3,"Karaic, Mr. Milan",male,30.0,0,0,349246,7.8958,,S
638,639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41.0,0,5,3101295,39.6875,,S


<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Handling Null Values 🚢</h3>

**The 'Cabin' column contains numerous null values, so we will drop it.**



In [135]:
# Reassign instead of mutating in place, and tolerate an already-dropped column,
# so that re-running this cell does not raise KeyError.
train = train.drop(columns=['Cabin'], errors='ignore')
test = test.drop(columns=['Cabin'], errors='ignore')

In [136]:
train.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [137]:
test.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64

In [138]:
# Assign the filled column back explicitly. fillna(inplace=True) on a column selection
# is chained assignment: pandas 2.x warns about it and, with Copy-on-Write enabled,
# it leaves `train` unchanged.
train['Embarked'] = train['Embarked'].fillna('S')

In [139]:
# Fill the single missing fare with the mean fare of the test set. The result is assigned
# back explicitly because fillna(inplace=True) on a column selection is chained
# assignment, which does not update `test` under pandas Copy-on-Write.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Creating a Unified DataFrame for Easier Manipulation 🧑‍💻</h3>


In [140]:
df=pd.concat([train,test],sort=True).reset_index(drop=True)

In [141]:
df.shape

(1309, 11)

In [142]:
df.head()

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [143]:
df.corr(numeric_only=True)['Age'].abs()

Age            1.000000
Fare           0.178314
Parch          0.150917
PassengerId    0.028814
Pclass         0.408106
SibSp          0.243699
Survived       0.077221
Name: Age, dtype: float64

In [144]:
# Median age for each (Sex, Pclass) group.
# Note: the variable name says "mean", but the statistic computed here is the median.
age_by_group = df.groupby(['Sex', 'Pclass'])['Age']
df_Age_mean = age_by_group.median()
df_Age_mean

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

In [145]:
df['Age']=df.groupby(['Sex','Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))

In [146]:
df.isnull().sum()

Age              0
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived       418
Ticket           0
dtype: int64

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Extracting Titles from Names and Adding as a Separate Column 📜</h3>


In [147]:
df['Title']=df['Name'].str.split(", ",expand=True)[1].str.split(".",expand=True)[0]

In [148]:
df['Title'].value_counts()

Title
Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Major             2
Mlle              2
Ms                2
Mme               1
Don               1
Sir               1
Lady              1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: count, dtype: int64

In [149]:
# Collapse the long tail of titles: uncommon honorifics become 'Rare', and the
# French / alternate spellings are folded into their common English equivalents.
rare_titles = ['Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_mapping = {title: 'Rare' for title in rare_titles}
title_mapping.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df['Title'] = df['Title'].replace(title_mapping)

In [150]:
df['Title'].value_counts()

Title
Mr        757
Miss      264
Mrs       198
Master     61
Rare       29
Name: count, dtype: int64

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Creating a Family Column by Combining Sibling and Parent Child Column 👨‍👩‍👧‍👦</h3>



In [151]:
df['Family_size']=df['SibSp'] + df['Parch'] + 1

In [152]:
df.sample(10)

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_size
728,25.0,S,26.0,"Bryhl, Mr. Kurt Arnold Gottfrid",0,729,2,male,1,0.0,236853,Mr,2
922,24.0,S,31.5,"Jefferys, Mr. Clifford Thomas",0,923,2,male,2,,C.A. 31029,Mr,3
415,22.0,S,8.05,"Meek, Mrs. Thomas (Annie Louise Rowley)",0,416,3,female,0,0.0,343095,Mrs,1
1255,25.0,C,55.4417,"Harder, Mrs. George Achilles (Dorothy Annan)",0,1256,1,female,1,,11765,Mrs,2
565,24.0,S,24.15,"Davies, Mr. Alfred J",0,566,3,male,2,0.0,A/4 48871,Mr,3
1297,23.0,S,10.5,"Ware, Mr. William Jeffery",0,1298,2,male,1,,28666,Mr,2
344,36.0,S,13.0,"Fox, Mr. Stanley Hubert",0,345,2,male,0,0.0,229236,Mr,1
450,36.0,S,27.75,"West, Mr. Edwy Arthur",2,451,2,male,1,0.0,C.A. 34651,Mr,4
731,11.0,C,18.7875,"Hassan, Mr. Houssein G N",0,732,3,male,0,0.0,2699,Mr,1
592,47.0,S,7.25,"Elsbury, Mr. William James",0,593,3,male,0,0.0,A/5 3902,Mr,1


In [153]:
df.drop(columns=['Name','Parch','SibSp','Ticket'],inplace=True)

In [154]:
df.sample(10)

Unnamed: 0,Age,Embarked,Fare,PassengerId,Pclass,Sex,Survived,Title,Family_size
512,36.0,S,26.2875,513,1,male,1.0,Mr,1
824,2.0,S,39.6875,825,3,male,0.0,Master,6
837,25.0,S,8.05,838,3,male,0.0,Mr,1
121,25.0,S,8.05,122,3,male,0.0,Mr,1
570,62.0,S,10.5,571,2,male,1.0,Mr,1
1019,42.0,S,13.0,1020,2,male,,Mr,1
1286,18.0,S,60.0,1287,1,female,,Mrs,2
709,25.0,C,15.2458,710,3,male,1.0,Master,3
89,24.0,S,8.05,90,3,male,0.0,Mr,1
1027,26.5,C,7.225,1028,3,male,,Mr,1


In [155]:
def family_size(number):
    """Map a numeric family size to a coarse category label.

    Returns "Alone" when the value equals 1, "Small" when it lies strictly
    between 1 and 5, and "Large" for every other value.
    """
    if number == 1:
        return "Alone"
    return "Small" if 1 < number < 5 else "Large"

In [156]:
df['Family_size']=df['Family_size'].apply(family_size)


# Exploratory Data Analysis

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1309 non-null   float64
 1   Embarked     1309 non-null   object 
 2   Fare         1309 non-null   float64
 3   PassengerId  1309 non-null   int64  
 4   Pclass       1309 non-null   int64  
 5   Sex          1309 non-null   object 
 6   Survived     891 non-null    float64
 7   Title        1309 non-null   object 
 8   Family_size  1309 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 92.2+ KB


In [158]:
df['Age'] = df['Age'].astype('int64')


In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1309 non-null   int64  
 1   Embarked     1309 non-null   object 
 2   Fare         1309 non-null   float64
 3   PassengerId  1309 non-null   int64  
 4   Pclass       1309 non-null   int64  
 5   Sex          1309 non-null   object 
 6   Survived     891 non-null    float64
 7   Title        1309 non-null   object 
 8   Family_size  1309 non-null   object 
dtypes: float64(2), int64(3), object(4)
memory usage: 92.2+ KB


<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Finding Survival Correlation with Every Other Column 📊</h3>

**We will analyze the correlation between survival and every other column to understand their relationships.**


In [160]:
# For each categorical feature, print the mean of Survived per category
# (rows with a missing Survived value do not contribute to the mean).
col = ['Embarked', 'Pclass', 'Sex', 'Title', 'Family_size']
for feature in col:
    survival_rate = df.groupby(feature)['Survived'].mean().reset_index()
    print('Survival Correlation by:', feature)
    print(survival_rate)
    print('-' * 10, '\n')

Survival Correlation by: Embarked
  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009
---------- 

Survival Correlation by: Pclass
   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
---------- 

Survival Correlation by: Sex
      Sex  Survived
0  female  0.742038
1    male  0.188908
---------- 

Survival Correlation by: Title
    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826
---------- 

Survival Correlation by: Family_size
  Family_size  Survived
0       Alone  0.303538
1       Large  0.161290
2       Small  0.578767
---------- 



<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Adding Graphs to Clarify the Picture 📈</h3>

**We will include some visualizations to make the data and its correlations clearer.**


In [161]:
import plotly.express as px
import plotly.graph_objects as go


# Overlaid age histograms for the two survival outcomes.
fig = go.Figure()

for outcome, label in [(0, 'Not Survived'), (1, 'Survived')]:
    fig.add_trace(go.Histogram(x=df.loc[df['Survived'] == outcome, 'Age'], name=label, opacity=0.5))

fig.update_layout(
    title='Age Distribution by Survival',
    xaxis_title='Age',
    # go.Histogram plots raw counts unless histnorm is set, so label the axis accordingly.
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
)

fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Overlaid fare histograms for the two survival outcomes.
fig = go.Figure()

for outcome, label in [(0, 'Not Survived'), (1, 'Survived')]:
    fig.add_trace(go.Histogram(x=df.loc[df['Survived'] == outcome, 'Fare'], name=label, opacity=0.5))

fig.update_layout(
    title='Fare Distribution by Survival',
    xaxis_title='Fare',
    # go.Histogram plots raw counts unless histnorm is set, so label the axis accordingly.
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
)

fig.show()

In [None]:
# Overlaid family-size category counts for the two survival outcomes.
fig = go.Figure()

for outcome, label in [(0, 'Not Survived'), (1, 'Survived')]:
    fig.add_trace(go.Histogram(x=df.loc[df['Survived'] == outcome, 'Family_size'], name=label, opacity=0.5))

fig.update_layout(
    title='Family Size Distribution by Survival',
    xaxis_title='Family Size',
    # go.Histogram plots raw counts unless histnorm is set, so label the axis accordingly.
    yaxis_title='Count',
    barmode='overlay',
    bargap=0.1,
)

fig.show()

In [None]:
from plotly.subplots import make_subplots
col = ['Embarked', 'Pclass', 'Sex', 'Title']

# One panel per categorical feature, each with grouped bars for the two outcomes.
fig = make_subplots(rows=1, cols=len(col), subplot_titles=col)

# (Survived value, legend label, bar colour) for the two traces drawn in every panel.
outcomes = [(1, 'Survived', 'green'), (0, 'Not Survived', 'red')]

for i, c in enumerate(col, start=1):
    grouped_data = df.groupby([c, 'Survived']).size().reset_index(name='Count')

    for outcome, label, colour in outcomes:
        subset = grouped_data[grouped_data['Survived'] == outcome]
        fig.add_trace(
            go.Bar(
                x=subset[c],
                y=subset['Count'],
                name=label,
                marker_color=colour,
                # Every panel adds the same two trace names. Group them and show the
                # legend entry for the first panel only, so the legend has two items
                # instead of two per panel.
                legendgroup=label,
                showlegend=(i == 1),
            ),
            row=1,
            col=i,
        )

    fig.update_xaxes(title_text=c, row=1, col=i)
    fig.update_yaxes(title_text='Count', row=1, col=i)

# Update figure layout
fig.update_layout(
    title='Countplot of Categorical Variables by Survival',
    height=400,
    showlegend=True,
    barmode='group',
)

# Show figure
fig.show()

In [None]:
sns.pairplot(df,hue='Survived')

<seaborn.axisgrid.PairGrid at 0x7f5cede87af0>

In [None]:
def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise correlations of df's numeric columns.

    The correlation matrix comes from ``df.corr(numeric_only=True)`` and is drawn
    on a new 14x12 matplotlib figure using a diverging seaborn palette.
    """
    _fig, axes = plt.subplots(figsize=(14, 12))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    correlations = df.corr(numeric_only=True)

    heatmap_options = dict(
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        annot=True,
        linewidths=0.1,
        vmax=1.0,
        linecolor='white',
        annot_kws={'fontsize': 12},
    )
    sns.heatmap(correlations, ax=axes, **heatmap_options)

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(df)


# Feature Engineering

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn import ensemble
from sklearn import gaussian_process
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn import svm
from sklearn import tree
from sklearn import discriminant_analysis
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score


In [None]:
df.head()

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Separating the Data into Train and Test Sets 🧪</h3>

**We will divide the data into training and testing sets to evaluate the model's performance.**


In [None]:
# Rows that came from train.csv are the ones with a known Survived label; the rows
# appended from test.csv have NaN there. Splitting on that mask avoids hard-coded
# row numbers, and .copy() gives each split its own frame so the later column
# edits do not operate on a slice of df.
has_label = df['Survived'].notna()
train = df[has_label].copy()
test = df[~has_label].copy()

In [None]:
test.head()

In [None]:
# The test rows have no labels, so the all-NaN Survived column is removed.
# Reassigning (rather than inplace=True) avoids mutating a slice of df.
test = test.drop(columns=['Survived'])

In [None]:
# Survived became float when the unlabelled test rows were concatenated; cast it back
# to integers. assign() returns a new frame, so this does not write into a slice of df.
train = train.assign(Survived=train['Survived'].astype('int64'))

In [None]:
train.head()

In [None]:
train.shape,test.shape

In [None]:
train = train.drop("PassengerId", axis=1)

In [None]:
X_train = train.drop("Survived", axis=1)
y_train = train["Survived"]

In [None]:
X_train.head()

In [None]:
X_train.shape,y_train.shape

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Creating a Pipeline with Column Transformer 🔄</h3>

**We will build a pipeline that incorporates a Column Transformer to streamline preprocessing and feature engineering.**


In [None]:
# First preprocessing stage: scale the numeric columns and encode the categorical ones.
# Columns are addressed by name rather than position, so the mapping stays correct if the
# column order of X_train changes. The names match the previous positions 0..6:
# Age, Embarked, Fare, Pclass, Sex, Title, Family_size.
# Output column order follows the transformer order below: scaled Age and Fare first,
# then the one-hot / ordinal encoded columns.
num_cat_tranformation = ColumnTransformer(
    [
        ('scaling', MinMaxScaler(), ['Age', 'Fare']),
        # handle_unknown='ignore' encodes a category absent from the fitted data as all
        # zeros instead of raising during cross-validation or prediction.
        ('onehotencolding1', OneHotEncoder(handle_unknown='ignore'), ['Embarked', 'Pclass']),
        ('ordinal', OrdinalEncoder(), ['Sex']),
        ('onehotencolding2', OneHotEncoder(handle_unknown='ignore'), ['Title', 'Family_size']),
    ],
    remainder='passthrough',
    # Always emit a dense array; otherwise the result switches to a sparse matrix
    # whenever the overall density falls below the default threshold.
    sparse_threshold=0,
)

In [None]:
# Second preprocessing stage: quantile-bin the two scaled numeric features.
# This step receives the OUTPUT of num_cat_tranformation, whose columns follow the order
# of its transformers: the 'scaling' block comes first, so scaled Age is column 0 and
# scaled Fare is column 1. Index 2 is already the first one-hot Embarked indicator, so the
# previous [0, 2] binned a binary column and left Fare unbinned.
bins = ColumnTransformer(
    [
        ('Kbins', KBinsDiscretizer(n_bins=15, encode='ordinal', strategy='quantile'), [0, 1]),
    ],
    remainder='passthrough',
)

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation.
# random_state makes the split (and every score computed from it) reproducible, and
# stratify keeps the survived / not-survived ratio the same in both parts.
# Note: this overwrites X_train / y_train, so re-running the cell on its own splits the
# already reduced training set again. Re-run from the cell that builds X_train instead.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [None]:
def create_pipeline(algo):
    """Build the modelling pipeline around the estimator ``algo``.

    The pipeline runs the ``num_cat_tranformation`` column transformer, then the
    ``bins`` column transformer, and finally ``algo`` as the 'classifier' step.
    """
    stages = [
        ('num_cat_transformation', num_cat_tranformation),
        ('bins', bins),
        ('classifier', algo),
    ]
    return Pipeline(steps=stages)

# Model Training

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Evaluating Various Machine Learning Models to Find the Best Performer 🤖</h3>

**We will test different machine learning models to identify which one provides the best performance for our data.**


In [None]:
# Candidate classifiers to compare, grouped by family.
# Every estimator is created with its default hyperparameters, except SVC and NuSVC,
# which are created with probability=True. No random_state is set on any of them.
algorithms=[
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    # SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    # Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    
    # XGBoost
    XGBClassifier()
]

In [None]:
# Score every candidate with 5-fold cross-validation on the training split and record
# its class name alongside the mean of the fold scores.
model_names = []
CV_Accuracy = []
for candidate in algorithms:
    fold_scores = cross_val_score(create_pipeline(candidate), X_train, y_train, cv=5)
    model_names.append(type(candidate).__name__)
    CV_Accuracy.append(fold_scores.mean())

In [None]:
model_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': CV_Accuracy})

In [None]:
model_df.sort_values(by='Accuracy', ascending=False, inplace=True)

<h3 style="background-color: #111; padding: 15px; font: bold 22px arial; color: lightgreen; border: 2px solid lime; border-radius: 8px">
♠ Tuning the Top 3 Machine Learning Models for Better Results 🔧</h3>

**We will focus on tuning the top 3 performing models from our data frame to enhance their performance and achieve better results.**


In [None]:
model_df

In [None]:
# Hand-tuned versions of the three short-listed models.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it is
# rejected when the estimator is fitted. For these classifiers 'auto' meant
# sqrt(n_features), so 'sqrt' keeps the same behaviour on every version.
algo1 = ensemble.RandomForestClassifier(criterion='gini',
                                           n_estimators=1750,
                                           max_depth=7,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features='sqrt',
                                           oob_score=True,
                                           random_state=42,
                                           n_jobs=-1,
                                           verbose=1)

algo2 = ensemble.GradientBoostingClassifier(max_depth=1, max_features='sqrt', n_estimators=3, random_state=42, warm_start=True)
algo3 = linear_model.RidgeClassifierCV()

In [None]:
# Build the final pipeline with the same helper used for model comparison, so the
# preprocessing steps are defined in one place only.
pipe1 = create_pipeline(algo1)

In [None]:
pipe1.fit(X_train,y_train)

In [None]:
y_pred = pipe1.predict(X_test)
accuracy_score(y_test,y_pred)


In [None]:
submission = pd.DataFrame()
submission["PassengerId"] = test["PassengerId"]
submission

In [None]:
test = test.drop("PassengerId", axis=1)
test.head()

In [None]:
predictions = pipe1.predict(test)
submission["Survived"] =  predictions
submission.to_csv("submission.csv", index=False)