In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
dataset = pd.read_csv("dataset/train.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
Id = dataset.iloc[:, 0]

y = dataset.iloc[:, 1]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
X = dataset.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     204 non-null    object 
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [9]:
X.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)
X['Age'] = X['Age'].fillna(X['Age'].mean())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Embarked'].fillna(X['Embarked'].mode()[0], inplace=True)


In [11]:
X.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [12]:
X['Cabin'].nunique()

147

In [13]:
letter_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}

def map_cabin_value(cabin):
    if pd.notna(cabin):
        for letter, value in letter_mapping.items():
            if letter in cabin:
                return value
    return 0

X['Cabin'] = X['Cabin'].apply(map_cabin_value)

In [14]:
X['Cabin'].nunique()

8

In [15]:
X['Cabin']

0      0
1      3
2      0
3      3
4      0
      ..
886    0
887    2
888    0
889    3
890    0
Name: Cabin, Length: 891, dtype: int64

In [16]:
X.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

ct = ColumnTransformer(
    transformers=[('encoder', encoder, ['Sex', 'Embarked'])],
    remainder='passthrough'
)

encoded_data = ct.fit_transform(X)

encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['Sex', 'Embarked'])

remaining_columns = [col for col in X.columns if col not in ['Sex', 'Embarked']]

final_column_names = list(encoded_column_names) + remaining_columns

encoded_dataset = pd.DataFrame(encoded_data, columns=final_column_names)

encoded_dataset.columns

X = encoded_dataset

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex_male    891 non-null    float64
 1   Embarked_Q  891 non-null    float64
 2   Embarked_S  891 non-null    float64
 3   Pclass      891 non-null    float64
 4   Age         891 non-null    float64
 5   SibSp       891 non-null    float64
 6   Parch       891 non-null    float64
 7   Fare        891 non-null    float64
 8   Cabin       891 non-null    float64
dtypes: float64(9)
memory usage: 62.8 KB


In [19]:
X.isnull().sum()

Sex_male      0
Embarked_Q    0
Embarked_S    0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin         0
dtype: int64

In [20]:
X.describe()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Pclass,Age,SibSp,Parch,Fare,Cabin
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,0.08642,0.725028,2.308642,29.699118,0.523008,0.381594,32.204208,0.766554
std,0.47799,0.281141,0.446751,0.836071,13.002015,1.102743,0.806057,49.693429,1.56917
min,0.0,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,2.0,22.0,0.0,0.0,7.9104,0.0
50%,1.0,0.0,1.0,3.0,29.699118,0.0,0.0,14.4542,0.0
75%,1.0,0.0,1.0,3.0,35.0,1.0,0.0,31.0,0.0
max,1.0,1.0,1.0,3.0,80.0,8.0,6.0,512.3292,7.0


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Cross Validation

In [23]:
from sklearn.model_selection import cross_val_score

def evaluate_model(model, X, y, cv=5, scoring='accuracy'):
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    
    print(f"Cross-validation scores: {scores}")
    print(f"Mean {scoring}: {scores.mean():.4f}")
    return scores.mean()


Decision tree

In [24]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
model = classifier
classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[115  24]
 [ 24  60]]


0.7847533632286996

In [25]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.74301676 0.78651685 0.80898876 0.76966292 0.81460674]
Mean accuracy: 0.7846


0.7845584081350825

K-NN

In [26]:

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[115  24]
 [ 24  60]]


0.7847533632286996

In [27]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.62011173 0.6741573  0.73033708 0.71348315 0.74719101]
Mean accuracy: 0.6971


0.6970560542338837

Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[118  21]
 [ 19  65]]


0.820627802690583

In [29]:
evaluate_model(model, X, y)

Cross-validation scores: [0.80446927 0.80898876 0.79775281 0.79775281 0.83146067]
Mean accuracy: 0.8081


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8080848659845584

Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[122  17]
 [ 24  60]]


0.8161434977578476

In [31]:
evaluate_model(model, X, y, cv=10, scoring='f1_weighted')

Cross-validation scores: [0.77494241 0.74226025 0.72426291 0.83321973 0.88610043 0.79498078
 0.79319719 0.75981111 0.87728302 0.84073885]
Mean f1_weighted: 0.8027


0.8026796670510018

In [32]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[124  15]
 [ 21  63]]


0.8385650224215246

In [33]:
evaluate_model(model, X, y, cv=10, scoring='f1_weighted')

Cross-validation scores: [0.77649393 0.79775281 0.72426291 0.83321973 0.88694124 0.84054061
 0.79319719 0.77021909 0.91056418 0.84181337]
Mean f1_weighted: 0.8175


0.8175005061931062

SVM

In [34]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[115  24]
 [ 25  59]]


0.7802690582959642

In [35]:
# evaluate_model(model, X, y, cv=5, scoring='accuracy')

Kernel SVM

In [36]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 42)
model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[119  20]
 [ 25  59]]


0.7982062780269058

In [37]:
evaluate_model(model, X, y, cv=5, scoring='accuracy')

Cross-validation scores: [0.59217877 0.71348315 0.68539326 0.68539326 0.69101124]
Mean accuracy: 0.6735


0.6734919339652251

Xgboost

In [38]:
import xgboost as xgb

classifier = xgb.XGBClassifier(
    n_estimators=100,      # Number of trees
    learning_rate=0.1,     # Learning rate
    max_depth=5,           # Tree depth
    subsample=0.8,         # % of data per tree
    colsample_bytree=0.8,  # % of features per tree
    random_state=42
)

model = classifier

classifier.fit(X_train, y_train)


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[126  13]
 [ 20  64]]


0.852017937219731