# Model Building & Evaluation

In [40]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [41]:
path = r"G:\PROJECTS-2024\Titanic-ML from disaster\notebooks\train_cleaned_data.csv"
df_train = pd.read_csv(path)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,transformed_fare_boxcox,family_size
0,1,0.0,third,male,22.0,S,2.198188,1
1,2,1.0,first,female,38.0,C,4.569691,1
2,3,1.0,third,female,26.0,S,2.283608,0
3,4,1.0,first,female,35.0,S,4.313319,1
4,5,0.0,third,male,35.0,S,2.298741,0


In [42]:
X = df_train.drop('Survived', axis = 1)
y = df_train['Survived']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state = 42)

In [43]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the preprocessor (make sure it's consistent with preprocessing in preprocess.py)
numeric_features = ['Age', 'family_size','transformed_fare_boxcox']
ordinal_features = ['Pclass','Sex', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("ord", OrdinalEncoder(), ordinal_features)
    ],
    remainder='passthrough'
)

# 1. LogisticRegression

In [44]:
from sklearn.linear_model import LogisticRegression

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

lr_pipeline.fit(X_train, y_train)

In [45]:
y_pred_train_lr = lr_pipeline.predict(X_train)
y_pred_test_lr = lr_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_lr = accuracy_score(y_train, y_pred_train_lr)
accuracy_test_lr = accuracy_score(y_test, y_pred_test_lr)

print(f"Accuracy_train_LR: {accuracy_train_lr}\nAccuracy_test_LR: {accuracy_test_lr}")

cm = confusion_matrix(y_test, y_pred_test_lr)
print(f"Confusion_matrix :\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_lr = classification_report(y_test, y_pred_test_lr)

print(f"Classification_report:\n{classification_report_lr}")


Accuracy_train_LR: 0.7949438202247191
Accuracy_test_LR: 0.8100558659217877
Confusion_matrix :
 [[90 15]
 [19 55]]
Classification_report:
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84       105
         1.0       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [46]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(lr_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))


Cross-validated Accuracy: 0.79 (+/- 0.01)


In [52]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessor
numeric_features = ['Age', 'family_size', 'transformed_fare_boxcox']
ordinal_features = ['Pclass']
categorical_features = ['Sex', 'Embarked']

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("ord", OrdinalEncoder(), ordinal_features),
        ("cat", OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

# Train different classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting':GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

best_accuracy = 0
best_model_name = None

for model_name, model in classifiers.items():
    # Create a pipeline with the preprocessor and the classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = model_name

print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy}")

Random Forest Accuracy: 0.8324022346368715
Decision Tree Accuracy: 0.770949720670391
Gradient Boosting Accuracy: 0.8100558659217877
XGBoost Accuracy: 0.8212290502793296
Logistic Regression Accuracy: 0.8100558659217877

Best Model: Random Forest with Accuracy: 0.8324022346368715


# 2. RandomForest

In [91]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(max_depth= None, min_samples_leaf= 2, min_samples_split= 5, n_estimators=50))
])

rf_pipeline.fit(X_train, y_train)

In [92]:
y_pred_train_rf = rf_pipeline.predict(X_train)
y_pred_test_rf = rf_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming y_pred_train and y_pred_test are your predicted values
accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)

print(f"Accuracy_train_RF: {accuracy_train_rf}\nAccuracy_test_RF: {accuracy_test_rf}")

cm = confusion_matrix(y_test, y_pred_test_rf)
print(f"Confusion_matrix:\n {cm}")

# Rename the variable to avoid conflicts with the function name
classification_report_rf = classification_report(y_test, y_pred_test_rf)

print(f"Classification_report:\n{classification_report_rf}")

Accuracy_train_RF: 0.9157303370786517
Accuracy_test_RF: 0.8324022346368715
Confusion_matrix:
 [[92 13]
 [17 57]]
Classification_report:
              precision    recall  f1-score   support

         0.0       0.84      0.88      0.86       105
         1.0       0.81      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.83       179
weighted avg       0.83      0.83      0.83       179



In [59]:
from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(rf_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))


Cross-validated Accuracy: 0.83 (+/- 0.02)


# Hyperparameter tune

In [51]:
from sklearn.model_selection import GridSearchCV


rf_classifier = RandomForestClassifier()

# Define the hyperparameters and their possible values
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Create the pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', grid_search)
])

# Fit the pipeline to the data
rf_pipeline.fit(X_train, y_train)

# Get the best parameters from the search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
y_pred_train_rf = rf_pipeline.predict(X_train)
y_pred_test_rf = rf_pipeline.predict(X_test)

# Evaluate the model
accuracy_train_rf = accuracy_score(y_train, y_pred_train_rf)
accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)

print(f"Accuracy on Training Set: {accuracy_train_rf:.4f}")
print(f"Accuracy on Test Set: {accuracy_test_rf:.4f}")

cm_rf = confusion_matrix(y_test, y_pred_test_rf)
print(f"Confusion Matrix:\n{cm_rf}")

classification_report_rf = classification_report(y_test, y_pred_test_rf)
print(f"Classification Report:\n{classification_report_rf}")

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Accuracy on Training Set: 0.9115
Accuracy on Test Set: 0.8212
Confusion Matrix:
[[92 13]
 [19 55]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.88      0.85       105
         1.0       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



# Make Prediction

In [94]:
def predict_survival(passenger_id, model, df):
    passenger_data = df[df['PassengerId'] == passenger_id]

    if not passenger_data.empty:
        # Extract relevant features for prediction
        features = passenger_data.drop('Survived', axis=1)

        # Make prediction
        prediction = rf_pipeline.predict(features)

        return f"Passenger {passenger_id} is predicted to {'survive' if prediction[0] == 1 else 'not survive'}."

    return f"No data found for Passenger {passenger_id}."

# Example usage
passenger_id_to_predict =5
result = predict_survival(passenger_id_to_predict, rf_pipeline, df_train)
print(result)

Passenger 5 is predicted to not survive.


In [71]:
# Test data prediction

In [95]:
path = r"G:\PROJECTS-2024\Titanic-ML from disaster\notebooks\test_prepare_data.csv"
test_df = pd.read_csv(path)
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked,transformed_fare_boxcox,family_size
0,892,,third,male,34.5,Q,2.271871,0
1,893,,third,female,47.0,S,2.164835,1
2,894,,second,male,62.0,Q,2.480332,0
3,895,,third,male,27.0,S,2.370107,0
4,896,,third,female,22.0,S,2.719849,2


In [96]:
test_df.drop(columns = 'Survived', inplace = True)

In [97]:
test_df.shape

(418, 7)

In [98]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Embarked,transformed_fare_boxcox,family_size
0,892,third,male,34.5,Q,2.271871,0
1,893,third,female,47.0,S,2.164835,1
2,894,second,male,62.0,Q,2.480332,0
3,895,third,male,27.0,S,2.370107,0
4,896,third,female,22.0,S,2.719849,2


In [99]:
test_df.isnull().sum()

PassengerId                0
Pclass                     0
Sex                        0
Age                        0
Embarked                   0
transformed_fare_boxcox    0
family_size                0
dtype: int64

In [100]:
# Select features for prediction from the test data
test_features = test_df[['PassengerId', 'Pclass', 'Sex','Age', 'Embarked', 'transformed_fare_boxcox','family_size']]

# Make predictions on the test data
test_predictions = rf_pipeline.predict(test_features)

# Create a DataFrame with 'PassengerId' and predicted 'Survived'
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': test_predictions})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)