In [78]:
import pandas as pd
import numpy as np

training_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

In [79]:
corr_matrix = training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [80]:
training_set.isna().sum()
training_set['Cabin'].value_counts()
#training_set['Embarked'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [81]:
###############################################################################
################ HANDLING MISSING DATA AND EXTRACINTG FEATURES ################
###############################################################################
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy="mean")
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

training_set[['Age']] = mean_imputer.fit_transform(training_set[['Age']])
training_set[['Embarked']] = most_frequent_imputer.fit_transform(training_set[['Embarked']])

training_set['Cabin'] = training_set['Cabin'].fillna('Missing')
training_set['Deck'] = training_set['Cabin'].str[0]
training_set['HasCabin'] = (training_set['Cabin'] != 'Missing').astype(int)
training_set['CabinCount'] = training_set['Cabin'].apply(lambda x:0 if x != 'Missing' else len(x.split()))
training_set['Title'] = training_set['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
training_set = training_set.drop(columns=['Name']) #===== DROPPING THE TITLE COLUMN BECAUSE I EXTRACTED TITLE FROM IT
training_set = training_set.drop(columns=['Cabin']) #===== DROPPING THE CABIN COLUMN BECAUSE I EXTRACTED USEFUL FEATURES OUT OF IT
training_set = training_set.drop(columns=['Ticket']) #===== DROPPING THE TICKET COLUMN BECAUSE IT SERVES NO PURPOSE

In [82]:
###############################################################################
########################## ENCODING CATEGORICAL DATA ##########################
###############################################################################
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

#================================================================================#
#==================== APPLYING LABEL ENCODING TO 'SEX' COLUMN ===================#
#================================================================================#
le = LabelEncoder()
training_set["Sex"] = le.fit_transform(training_set["Sex"])

#==================================================================================#
#======= APPLYING ONE HOT ENCODING TO 'DECK', 'TITLE' AND 'EMBARKED' COLUMNS ======#
#==================================================================================#
encoder = OneHotEncoder(sparse_output = False)
encoded = encoder.fit_transform(training_set[["Deck", "Title", "Embarked"]])

# Convert back to DataFrame with column names
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(["Deck", "Title", "Embarked"]))

# Combine with original data (dropping the old column)
training_set = pd.concat([training_set.reset_index(drop=True), encoded_df], axis=1).drop(columns=["Deck", "Title", "Embarked"])

#==================================================================================#
#========================= SCALING 'FARE' AND 'AGE' COLUMNS =======================#
#==================================================================================#
scaler = StandardScaler()
cols_to_scale = ['Age', 'Fare']
training_set[cols_to_scale] = scaler.fit_transform(training_set[cols_to_scale])


In [83]:
training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,HasCabin,CabinCount,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,-0.592481,1,0,-0.502445,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,0,0.638789,1,0,0.786845,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,1,3,0,-0.284663,0,0,-0.488854,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,0,0.407926,1,0,0.42073,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0,3,1,0.407926,0,0,-0.486337,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split


X = training_set.drop(["PassengerId", "Survived"], axis = 1)
y = training_set['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# 3. Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("Precision:", precision_score(y_val, y_pred))
    print("Recall:", recall_score(y_val, y_pred))
    print("F1-score:", f1_score(y_val, y_pred))
    print()

print("Accuracy For RFC:", accuracy_score(y_val, y_pred_rfc), "Accuracy For GBC:", accuracy_score(y_val, y_pred_gbc))
print("Precision For RFC:", precision_score(y_val, y_pred_rfc), "Precision For GBC:", precision_score(y_val, y_pred_gbc) )
#print("Recall:", recall_score(y_val, y_pred))
#print("F1-score:", f1_score(y_val, y_pred))
#print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))

--- Logistic Regression ---
Accuracy: 0.8156424581005587
Precision: 0.7662337662337663
Recall: 0.7972972972972973
F1-score: 0.7814569536423841

--- Random Forest ---
Accuracy: 0.8324022346368715
Precision: 0.8142857142857143
Recall: 0.7702702702702703
F1-score: 0.7916666666666666

--- Gradient Boosting ---
Accuracy: 0.8268156424581006
Precision: 0.7945205479452054
Recall: 0.7837837837837838
F1-score: 0.7891156462585034

Accuracy For RFC: 0.8324022346368715 Accuracy For GBC: 0.8268156424581006
Precision For RFC: 0.8142857142857143 Precision For GBC: 0.7945205479452054
