In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [51]:
dataset = pd.read_csv("dataset/train.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
Id = dataset.iloc[:, 0]

y = dataset.iloc[:, 1]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [53]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [54]:
X = dataset.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])

In [55]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     204 non-null    object 
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [56]:
X.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [57]:
X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)
X['Age'] = X['Age'].fillna(X['Age'].mean())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)


In [58]:
X.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [59]:
X['Cabin'].nunique()

147

In [60]:
letter_mapping = {'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G'}

def map_cabin_value(cabin):
    if pd.notna(cabin):
        for letter, value in letter_mapping.items():
            if letter in cabin:
                return value
    return 0

X['Cabin'] = X['Cabin'].apply(map_cabin_value)

In [61]:
X['Cabin'].nunique()

8

In [62]:
X['Cabin']

0      0
1      C
2      0
3      C
4      0
      ..
886    0
887    B
888    0
889    C
890    0
Name: Cabin, Length: 891, dtype: object

In [63]:
X.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [64]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

X[['Sex', 'Embarked', 'Cabin']] = X[['Sex', 'Embarked', 'Cabin']].astype(str)

ct = ColumnTransformer(
    transformers=[('encoder', encoder, ['Sex', 'Embarked', 'Cabin'])],
    remainder='passthrough'
)

encoded_data = ct.fit_transform(X)

encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['Sex', 'Embarked', 'Cabin'])

remaining_columns = [col for col in X.columns if col not in ['Sex', 'Embarked', 'Cabin']]

final_column_names = list(encoded_column_names) + remaining_columns

encoded_dataset = pd.DataFrame(encoded_data, columns=final_column_names)

encoded_dataset.columns

X = encoded_dataset

In [65]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex_male    891 non-null    float64
 1   Embarked_Q  891 non-null    float64
 2   Embarked_S  891 non-null    float64
 3   Cabin_A     891 non-null    float64
 4   Cabin_B     891 non-null    float64
 5   Cabin_C     891 non-null    float64
 6   Cabin_D     891 non-null    float64
 7   Cabin_E     891 non-null    float64
 8   Cabin_F     891 non-null    float64
 9   Cabin_G     891 non-null    float64
 10  Pclass      891 non-null    float64
 11  Age         891 non-null    float64
 12  SibSp       891 non-null    float64
 13  Parch       891 non-null    float64
 14  Fare        891 non-null    float64
dtypes: float64(15)
memory usage: 104.5 KB


In [66]:
X.isnull().sum()

Sex_male      0
Embarked_Q    0
Embarked_S    0
Cabin_A       0
Cabin_B       0
Cabin_C       0
Cabin_D       0
Cabin_E       0
Cabin_F       0
Cabin_G       0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
dtype: int64

In [67]:
X.describe()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,0.08642,0.725028,0.016835,0.05275,0.066218,0.037037,0.037037,0.013468,0.004489,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.47799,0.281141,0.446751,0.128725,0.223659,0.248802,0.188959,0.188959,0.115332,0.06689,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,0.0,0.0,7.9104
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,35.0,1.0,0.0,31.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [68]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [69]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Cross Validation

In [70]:
from sklearn.model_selection import cross_val_score

def evaluate_model(model, X, y, cv=5, scoring='accuracy'):
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    
    print(f"Cross-validation scores: {scores}")
    print(f"Mean {scoring}: {scores.mean():.4f}")
    return scores.mean()


Decision tree

In [71]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
model = classifier
classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[116  23]
 [ 26  58]]


0.7802690582959642

In [72]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.75418994 0.76966292 0.76966292 0.76966292 0.79775281]
Mean accuracy: 0.7722


0.7721863034335572

K-NN

In [73]:

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[116  23]
 [ 21  63]]


0.8026905829596412

In [74]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.61452514 0.66853933 0.7247191  0.71910112 0.71348315]
Mean accuracy: 0.6881


0.6880735672588035

Logistic Regression

In [75]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[118  21]
 [ 22  62]]


0.8071748878923767

In [76]:
evaluate_model(model, X, y)

Cross-validation scores: [0.79329609 0.80337079 0.79213483 0.78089888 0.80898876]
Mean accuracy: 0.7957


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7957378695624883

Random Forest

In [77]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[126  13]
 [ 29  55]]


0.8116591928251121

In [78]:
evaluate_model(model, X, y, cv=10, scoring='f1_weighted')

Cross-validation scores: [0.74211538 0.78588533 0.73163495 0.86584626 0.87673316 0.78268624
 0.80374311 0.79319719 0.89914531 0.8526363 ]
Mean f1_weighted: 0.8134


0.8133623244994123

In [79]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[124  15]
 [ 21  63]]


0.8385650224215246

In [80]:
evaluate_model(model, X, y, cv=10, scoring='f1_weighted')

Cross-validation scores: [0.7345679  0.7752809  0.73447598 0.83321973 0.85432101 0.82980855
 0.7911255  0.74470894 0.89914531 0.84181337]
Mean f1_weighted: 0.8038


0.803846720233223

SVM

In [81]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[114  25]
 [ 25  59]]


0.7757847533632287

In [82]:
# evaluate_model(model, X, y, cv=5, scoring='accuracy')

Kernel SVM

In [83]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 42)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[118  21]
 [ 24  60]]


0.7982062780269058

In [84]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.58100559 0.71348315 0.69101124 0.68539326 0.69101124]
Mean accuracy: 0.6724


0.6723808925993346

Xgboost

In [85]:
import xgboost as xgb

classifier = xgb.XGBClassifier(
    n_estimators=100,      # Number of trees
    learning_rate=0.1,     # Learning rate
    max_depth=5,           # Tree depth
    subsample=0.8,         # % of data per tree
    colsample_bytree=0.8,  # % of features per tree
    random_state=42
)

model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[127  12]
 [ 19  65]]


0.8609865470852018