In [27]:
import pandas as pd
import numpy as np

training_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')
training_set.head()
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [28]:
corr_matrix = training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
training_set.isna().sum()
training_set['Cabin'].value_counts()
#training_set['Embarked'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [30]:
###############################################################################
################ HANDLING MISSING DATA AND EXTRACINTG FEATURES ################
###############################################################################
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy="mean")
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

training_set[['Age']] = mean_imputer.fit_transform(training_set[['Age']])
training_set[['Embarked']] = most_frequent_imputer.fit_transform(training_set[['Embarked']])

training_set['Cabin'] = training_set['Cabin'].fillna('Missing')
training_set['Deck'] = training_set['Cabin'].str[0]
training_set['HasCabin'] = (training_set['Cabin'] != 'Missing').astype(int)
training_set['CabinCount'] = training_set['Cabin'].apply(lambda x:0 if x != 'Missing' else len(x.split()))
training_set['Title'] = training_set['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
training_set = training_set.drop(columns=['Name']) #===== DROPPING THE TITLE COLUMN BECAUSE I EXTRACTED TITLE FROM IT
training_set = training_set.drop(columns=['Cabin']) #===== DROPPING THE CABIN COLUMN BECAUSE I EXTRACTED USEFUL FEATURES OUT OF IT
training_set = training_set.drop(columns=['Ticket']) #===== DROPPING THE TICKET COLUMN BECAUSE IT SERVES NO PURPOSE

In [31]:
###############################################################################
########################## ENCODING CATEGORICAL DATA ##########################
###############################################################################
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

#================================================================================#
#==================== APPLYING LABEL ENCODING TO 'SEX' COLUMN ===================#
#================================================================================#
le = LabelEncoder()
training_set["Sex"] = le.fit_transform(training_set["Sex"])

#==================================================================================#
#======= APPLYING ONE HOT ENCODING TO 'DECK', 'TITLE' AND 'EMBARKED' COLUMNS ======#
#==================================================================================#
encoder = OneHotEncoder(sparse_output = False, handle_unknown = "ignore")
encoded_training = encoder.fit_transform(training_set[["Deck", "Title", "Embarked"]])

# Convert back to DataFrame with column names
encoded_df = pd.DataFrame(encoded_training, columns=encoder.get_feature_names_out(["Deck", "Title", "Embarked"]), index = training_set.index)

# Combine with original data (dropping the old column)
training_set = pd.concat([training_set.reset_index(drop=True), encoded_df], axis=1).drop(columns=["Deck", "Title", "Embarked"])

#==================================================================================#
#========================= SCALING 'FARE' AND 'AGE' COLUMNS =======================#
#==================================================================================#
scaler = StandardScaler()
cols_to_scale = ['Age', 'Fare']
training_set[cols_to_scale] = scaler.fit_transform(training_set[cols_to_scale])


In [39]:
training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,HasCabin,CabinCount,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,-0.592481,1,0,-0.502445,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,1,0,0.638789,1,0,0.786845,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,1,3,0,-0.284663,0,0,-0.488854,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,1,0,0.407926,1,0,0.42073,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0,3,1,0.407926,0,0,-0.486337,0,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=300, subsample=1.0; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=200, subsample=1.0; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=300, subsample=0.8; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=3, min_samples_l

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV



from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score



X = training_set.drop(["PassengerId", "Survived"], axis = 1)
y = training_set['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

# sk_folds = StratifiedKFold(n_splits = 5)
# # 3. Train and evaluate
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_val)
#     cv_score = cross_val_score(model, X_train, y_train, cv =  sk_folds)
#     print(f"--- {name} ---")
#     print("Accuracy:", accuracy_score(y_val, y_pred))
#     print("Precision:", precision_score(y_val, y_pred))
#     print("Recall:", recall_score(y_val, y_pred))
#     print("F1-score:", f1_score(y_val, y_pred))
#     print("Cross-Val Score: ", cv_score)
    
#     print()


param_grid_rf = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search_rf = GridSearchCV(
    estimator=models["Random Forest"],
    param_grid=param_grid_rf,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 1.0]
}

grid_search_gb = GridSearchCV(
    estimator=models["Gradient Boosting"],
    param_grid=param_grid_gb,
    cv=5,                # 5-fold CV
    scoring='accuracy',  # or 'f1', 'roc_auc' if imbalanced data
    n_jobs=-1,           # parallelize
    verbose=2
)
grid_search_rf.fit(X_train, y_train)
# grid_search_gb.fit(X_train, y_train)



Fitting 5 folds for each of 486 candidates, totalling 2430 fits


In [34]:
###############################################################################
########### HANDLING MISSING DATA AND EXTRACINTG FEATURES (FOR THE TEST SET) ###########
###############################################################################
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy="mean")
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
test_set_copy = test_set.copy()

test_set_copy[['Age']] = mean_imputer.fit_transform(test_set_copy[['Age']])
test_set_copy[['Embarked']] = most_frequent_imputer.fit_transform(test_set_copy[['Embarked']])

test_set_copy['Cabin'] = test_set_copy['Cabin'].fillna('Missing')
test_set_copy['Deck'] = test_set_copy['Cabin'].str[0]
test_set_copy['HasCabin'] = (test_set_copy['Cabin'] != 'Missing').astype(int)
test_set_copy['CabinCount'] = test_set_copy['Cabin'].apply(lambda x:0 if x != 'Missing' else len(x.split()))
test_set_copy['Title'] = test_set_copy['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
test_set_copy = test_set_copy.drop(columns=['Name']) #===== DROPPING THE TITLE COLUMN BECAUSE I EXTRACTED TITLE FROM IT
test_set_copy = test_set_copy.drop(columns=['Cabin']) #===== DROPPING THE CABIN COLUMN BECAUSE I EXTRACTED USEFUL FEATURES OUT OF IT
test_set_copy = test_set_copy.drop(columns=['Ticket']) #===== DROPPING THE TICKET COLUMN BECAUSE IT SERVES NO PURPOSE
test_set_copy["Fare"] = test_set_copy["Fare"].fillna(training_set["Fare"].median())


###############################################################################
########################## ENCODING CATEGORICAL DATA ##########################
###############################################################################
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

#================================================================================#
#==================== APPLYING LABEL ENCODING TO 'SEX' COLUMN ===================#
#================================================================================#
le = LabelEncoder()
test_set_copy["Sex"] = le.fit_transform(test_set_copy["Sex"])

#==================================================================================#
#======= APPLYING ONE HOT ENCODING TO 'DECK', 'TITLE' AND 'EMBARKED' COLUMNS ======#
#==================================================================================#
encoded_test = encoder.transform(test_set_copy[["Deck", "Title", "Embarked"]])

# Convert back to DataFrame with column names
encoded_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(["Deck", "Title", "Embarked"]), index = test_set_copy.index)

# Combine with original data (dropping the old column)
test_set_copy = pd.concat([test_set_copy.reset_index(drop=True), encoded_df], axis=1).drop(columns=["Deck", "Title", "Embarked"])

#==================================================================================#
#========================= SCALING 'FARE' AND 'AGE' COLUMNS =======================#
#==================================================================================#
scaler = StandardScaler()
cols_to_scale = ['Age', 'Fare']
test_set_copy[cols_to_scale] = scaler.fit_transform(test_set_copy[cols_to_scale])



In [35]:

# predictions = models["Gradient Boosting"].predict(test_set_copy)

# submission = pd.DataFrame({
#     'PassengerId': test_set_copy['PassengerId'],
#     'Survived': predictions
# })

In [36]:
test_set_copy.head()
test_set_copy.isna().sum().sort_values(ascending=False).head(10)


PassengerId       0
Title_Mlle        0
Title_Don         0
Title_Dr          0
Title_Jonkheer    0
Title_Lady        0
Title_Major       0
Title_Master      0
Title_Miss        0
Title_Mme         0
dtype: int64

In [37]:
training_set.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'HasCabin', 'CabinCount', 'Deck_A', 'Deck_B', 'Deck_C',
       'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_M', 'Deck_T',
       'Title_Capt', 'Title_Col', 'Title_Countess', 'Title_Don', 'Title_Dr',
       'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master',
       'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs',
       'Title_Ms', 'Title_Rev', 'Title_Sir', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [38]:
passenger_ids = test_set_copy["PassengerId"]
X_test = test_set_copy.drop(columns=["PassengerId"])
# predictions = models["Gradient Boosting"].predict(X_test)
predictions = grid_search_rf.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": predictions
})
submission.to_csv('submission.csv', index = False)