In [167]:
import pandas as pd

In [168]:
train_df = pd.read_csv('train.csv')

train_df.info()

train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [169]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [170]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [171]:
train_df['Survived'].value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [172]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [173]:
train_df.groupby('Sex')['Survived'].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [174]:
train_df.groupby('Pclass')['Survived'].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

### Data Cleaning and Feature Engineering

In [175]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Cabin: extract Deck letter, fill missing with 'U' for Unknown
train_df['Deck'] = train_df['Cabin'].str[0].fillna('U')

In [176]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Deck'],
      dtype='object')

Create Derived Features

In [177]:
# Family size (total number of family members on board/Adding 1 accounts for the passenger themself)
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# 1 if the passenger is alone (FamilySize == 1) , 0 otherwise
train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)

# Extract title from name 
train_df['Title'] = train_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

Convert Categorical Variables

In [178]:
train_df = pd.get_dummies(train_df, columns=['Sex','Embarked','Title','Deck'],drop_first=True)

Features for Modeling

In [179]:
features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone'] + \
           [col for col in train_df.columns if col.startswith(('Sex_', 'Embarked_', 'Title_', 'Deck_'))]

X = train_df[features]
y = train_df['Survived']


In [180]:
print(X.shape)         
print(X.isnull().sum()) 
print(y.value_counts(normalize=True))

(891, 32)
Pclass            0
Age               0
Fare              0
FamilySize        0
IsAlone           0
Sex_male          0
Embarked_Q        0
Embarked_S        0
Title_Col         0
Title_Countess    0
Title_Don         0
Title_Dr          0
Title_Jonkheer    0
Title_Lady        0
Title_Major       0
Title_Master      0
Title_Miss        0
Title_Mlle        0
Title_Mme         0
Title_Mr          0
Title_Mrs         0
Title_Ms          0
Title_Rev         0
Title_Sir         0
Deck_B            0
Deck_C            0
Deck_D            0
Deck_E            0
Deck_F            0
Deck_G            0
Deck_T            0
Deck_U            0
dtype: int64
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [181]:
from sklearn.model_selection import train_test_split ,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score ,classification_report
from imblearn.over_sampling import SMOTE

SMOTE for class balancing

In [182]:
smote = SMOTE(random_state=42)
X_res , y_res = smote.fit_resample(X,y)

In [183]:
# check class balance after SMOTE
print(pd.Series(y).value_counts(normalize=True))   # Before
print(pd.Series(y_res).value_counts(normalize=True))  # After

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64
Survived
0    0.5
1    0.5
Name: proportion, dtype: float64


Split into train and validation sets

In [184]:
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

Initialize models

In [185]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

Train, predict, and evaluate

In [186]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    results[name] = acc

for model_name, acc in results.items():
    print(f"{model_name} Validation Accuracy: {acc:.4f}")

Logistic Regression Validation Accuracy: 0.8864
Random Forest Validation Accuracy: 0.8409
XGBoost Validation Accuracy: 0.8773


In [187]:
for name, model in models.items():
    cv_scores = cross_val_score(model, X_res, y_res, cv=5, scoring='accuracy')
    print(f"{name} CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Logistic Regression CV Accuracy: 0.8280 ± 0.0484
Random Forest CV Accuracy: 0.8371 ± 0.0568
XGBoost CV Accuracy: 0.8444 ± 0.0484


hyperparamter tuning 

In [188]:
# for random forest
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [4,6,8,None],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy',n_jobs=-1)
grid_search.fit(X_res, y_res)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best CV Accuracy: 0.8562141967621418


In [189]:
# for xgboost
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [3,4,5],
    'learning_rate': [0.01,0.05,0.1],
    'subsample': [0.8,1.0],
    'colsample_bytree': [0.8,1.0],
    'gamma': [0,0.1,0.2]
}

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_res,y_res)

print(" Best Parameters:", grid_search.best_params_)
print(f" Best CV Accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
 Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
 Best CV Accuracy: 0.8598


In [190]:
best_xgb = XGBClassifier(
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=200,
    subsample=1.0,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb.fit(X_res, y_res)

y_pred = best_xgb.predict(X_val) 

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))



Validation Accuracy: 0.9454545454545454
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       103
           1       0.96      0.93      0.95       117

    accuracy                           0.95       220
   macro avg       0.94      0.95      0.95       220
weighted avg       0.95      0.95      0.95       220



Retrain on the Full (Balanced) Data

In [192]:
best_xgb.fit(X_res, y_res)

y_pred_full = best_xgb.predict(X_res)
print("Training Accuracy:", accuracy_score(y_res, y_pred_full))
print(classification_report(y_res, y_pred_full))

Training Accuracy: 0.9380692167577414
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       549
           1       0.97      0.90      0.94       549

    accuracy                           0.94      1098
   macro avg       0.94      0.94      0.94      1098
weighted avg       0.94      0.94      0.94      1098

