#Titanic Survival Prediction

### You will use the Titanic Survival Dataset to build a classification model that predicts whether a passenger survived the sinking of the Titanic, based on the attributes of each passenger in the data set. You'll start by building a Random Forest Classifier.

##Data Collection and Understanding

###Dataset: https://www.kaggle.com/datasets/yasserh/titanic-dataset/discussion?sort=hotness

###Upload Dataset

In [None]:
# Colab-only step: upload the dataset file (the load cell reads "Titanic-Dataset.csv").
# `uploaded` keeps whatever files.upload() returns.
from google.colab import files
uploaded = files.upload()


###Load Dataset

In [None]:
import pandas as pd


# Read the CSV from the working directory into the notebook-wide DataFrame `df`.
df = pd.read_csv("Titanic-Dataset.csv")

print(" Dataset Loaded Successfully")
# df.shape is (number of rows, number of columns).
print("Shape of dataset:", df.shape)
# Last expression of the cell, so the notebook renders the first rows as a table.
df.head()


###Column Information

In [None]:
# Print every column with its dtype and non-null count (gaps show up as lower counts).
df.info()


###Column Descriptions

Make a small table of column meanings

In [None]:
# Data dictionary: one human-readable description per raw dataset column.
col_descriptions = dict(
    [
        ("PassengerId", "Unique ID of each passenger"),
        ("Survived", "Target variable (0 = Did not survive, 1 = Survived)"),
        ("Pclass", "Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd)"),
        ("Name", "Passenger’s full name"),
        ("Sex", "Gender"),
        ("Age", "Age in years"),
        ("SibSp", "Number of siblings/spouses aboard"),
        ("Parch", "Number of parents/children aboard"),
        ("Ticket", "Ticket number"),
        ("Fare", "Fare paid"),
        ("Cabin", "Cabin number (many missing)"),
        ("Embarked", "Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)"),
    ]
)

# Render as a table: column names form the index, with a single "Description" column.
pd.Series(col_descriptions, name="Description").to_frame()


###Missing Values

In [None]:
# Count missing values per column to see which ones need cleaning (Age, Cabin, Embarked).
df.isnull().sum()


###Summary Statistics

In [None]:
# Summary statistics (mean, min, max, std, ...) for the numerical features.
df.describe()


###Short Report



✔ Dataset shape: 891 × 12

✔ Target variable: Survived

✔ Missing values: Age (177), Cabin (687), Embarked (2)

✔ Numerical features: Age, Fare, SibSp, Parch

✔ Categorical features: Sex, Embarked, Pclass, Cabin

✔ Insights: ~38% survived

# 1. Handle Missing Values


In [None]:
# Age: fill missing values with the median age of the column.
df["Age"] = df["Age"].fillna(df["Age"].median())



In [None]:
# Cabin has too many gaps to use directly, so keep only a presence flag:
# 'HasCabin' is 1 when a cabin value exists and 0 otherwise, then the raw
# column is removed. The membership test keeps the cell safe to re-run once
# 'Cabin' is already gone.
if "Cabin" in df.columns:
    df["HasCabin"] = df["Cabin"].notna().astype(int)
    df.drop(columns="Cabin", inplace=True)

In [None]:
# Embarked: fill missing with mode (most frequent value).
# mode() returns a Series, so [0] picks its first entry.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

In [None]:
# Re-count missing values per column to confirm the cleaning steps above worked.
print("Missing values after handling:")
print(df.isnull().sum())

# 2. Detect and Handle Outliers

In [None]:
import numpy as np

In [None]:
# Define function for capping outliers using IQR method
def cap_outliers(series):
    """Clamp values that fall outside the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence, values
    above ``Q3 + 1.5 * IQR`` by the upper fence, and all others are kept.

    Args:
        series: pandas Series of numeric values (its ``quantile`` method is used).

    Returns:
        A NumPy array (the result of ``np.where``) with one entry per input value.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    # Apply the upper cap first, then let the lower cap take precedence.
    capped_high = np.where(series > ceiling, ceiling, series)
    return np.where(series < floor, floor, capped_high)


In [None]:
# Cap extreme values in the two continuous columns with the IQR rule.
for column in ("Fare", "Age"):
    df[column] = cap_outliers(df[column])

# 3. Convert Categorical Variables

In [None]:
# Sex: encode as integers (male -> 0, female -> 1).
# NOTE(review): values outside this mapping become NaN, so running this cell a
# second time on the already-encoded column would blank it out.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

In [None]:
# Embarked: One-Hot Encoding (C, Q, S).
# drop_first=True drops the first category's dummy column, so one fewer
# indicator column is added than there are ports.
df = pd.get_dummies(df, columns=["Embarked"], drop_first=True)

In [None]:
# Drop irrelevant columns: Name, Ticket, PassengerId
# NOTE(review): the later Title-extraction step needs passenger names, which
# are no longer in df after this drop.
df.drop(["Name", "Ticket", "PassengerId"], axis=1, inplace=True)

In [None]:
# Report the shape of the cleaned DataFrame and preview its first rows.
print("\nFinal cleaned dataset shape:", df.shape)
df.head()

# **Exploratory Data Analysis**

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot style shared by the charts in this section. No figure is opened here:
# each chart creates its own, so a figure made in this cell would only show up
# as an empty extra plot.
sns.set_style("whitegrid")

# Work on a copy so the display-only columns below never reach the modelling data.
df_viz = df.copy()

# Turn the numeric Sex codes back into readable labels for the chart axes.
df_viz['Sex'] = df_viz['Sex'].map({0: 'Male', 1: 'Female'})

# Bucket ages into labelled groups using the bin edges 0, 12, 18, 35, 60, 100.
df_viz['AgeGroup'] = pd.cut(df_viz['Age'], bins=[0, 12, 18, 35, 60, 100], labels=['Child', 'Teen', 'Adult', 'Mid-Aged', 'Senior'])

Survival rate by Gender

In [None]:

# Percentage of survivors within each gender label.
gender_survival = df_viz.groupby('Sex')['Survived'].mean() * 100

plt.figure(figsize=(8, 5))
ax = sns.barplot(x=gender_survival.index, y=gender_survival.values, palette='viridis')
ax.set_title('Survival Rate by Gender', fontsize=16, fontweight='bold')
ax.set_ylabel('Survival Rate (%)')
ax.set_xlabel('Gender')

# Print the percentage just above each bar.
for bar_position, rate in enumerate(gender_survival.values):
    ax.text(bar_position, rate + 1, f'{rate:.1f}%', ha='center', fontweight='bold')

plt.show()

This chart shows the survival rate by gender: females have a much higher survival rate (74.2%) compared to males (18.9%).

# **Survival Rate by Passenger Class**

In [None]:

# Percentage of survivors within each passenger class.
class_survival = df_viz.groupby('Pclass')['Survived'].mean() * 100

plt.figure(figsize=(8, 5))
ax = sns.barplot(x=class_survival.index, y=class_survival.values, palette='magma')
ax.set_title('Survival Rate by Passenger Class', fontsize=16, fontweight='bold')
ax.set_ylabel('Survival Rate (%)')
ax.set_xlabel('Passenger Class')

# Print the percentage just above each bar.
for bar_position, rate in enumerate(class_survival.values):
    ax.text(bar_position, rate + 1, f'{rate:.1f}%', ha='center', fontweight='bold')

plt.show()

This bar plot shows the survival rate by passenger class. Class 1 has the highest survival rate (63.0%), followed by class 2 (47.3%), while class 3 has the lowest (24.2%).

Survival rate by Age Group

In [None]:

# Percentage of survivors within each age group.
# 'AgeGroup' is categorical, so observed=True is passed explicitly: it keeps
# only groups that actually contain passengers (an empty bucket would otherwise
# yield a NaN rate and a "nan%" label) and avoids depending on the changing
# pandas default for this argument.
agegroup_survival = df_viz.groupby('AgeGroup', observed=True)['Survived'].mean() * 100

plt.figure(figsize=(10, 5))
# Plain string labels so the bars line up one-to-one with the positions used below.
ax = sns.barplot(x=agegroup_survival.index.astype(str), y=agegroup_survival.values, palette='plasma')
plt.title('Survival Rate by Age Group', fontsize=16, fontweight='bold')
plt.ylabel('Survival Rate (%)')
plt.xlabel('Age Group')

# Print the percentage just above each bar.
for i, v in enumerate(agegroup_survival.values):
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', fontweight='bold')

plt.show()

This bar plot shows the survival rate of passengers by age group; children have the highest survival rate.

Histogram of Age Distribution

In [None]:
# Age distribution with a KDE overlay; the dashed red line marks the mean age.
mean_age = df_viz['Age'].mean()

plt.figure(figsize=(10, 5))
sns.histplot(df_viz['Age'].dropna(), kde=True, bins=30, color='skyblue')
plt.title('Distribution of Age on the Titanic', fontsize=16)
plt.xlabel('Age')
plt.ylabel('Count')
plt.axvline(mean_age, color='red', linestyle='--', label=f'Mean Age: {mean_age:.1f}')
plt.legend()
plt.show()

This histogram shows the age distribution of passengers. Most passengers are around 25 to 30 years old, and the histogram peaks near 30, which is close to the mean age of the passengers.

Box Plot of Fare by Passenger Class

In [None]:
# Box plot of Fare for each passenger class.
plt.figure(figsize=(10, 5))
sns.boxplot(x='Pclass', y='Fare', data=df_viz, palette='magma')
plt.title('Fare Distribution by Passenger Class', fontsize=16)
# Log-scaled y axis so the classes' fare ranges are comparable on one chart.
# NOTE(review): zero fares, if the data has any, cannot be placed on a log axis — confirm.
plt.yscale('log')
plt.ylabel('Fare (Log Scale)')
plt.xlabel('Passenger Class')
plt.show()

First class was  more expensive than second or third class

Second and third class tickets were more affordable and had less variation in their prices compared to first class.


In [None]:
# Passenger counts per gender (rows) split by the Survived value (columns).
survived_sex = pd.crosstab(df_viz['Sex'], df_viz['Survived'])

# Create the axes explicitly and hand them to pandas. Calling DataFrame.plot
# with figsize= opens a figure of its own, so a separate plt.figure() before it
# would be left behind as an empty extra figure.
fig, ax = plt.subplots(figsize=(8, 5))
survived_sex.plot(kind='bar', stacked=True, color=['#e74c3c', '#2ecc71'], ax=ax)
plt.title('Survival Count by Gender', fontsize=16, fontweight='bold')
plt.xlabel('Gender')
plt.ylabel('Number of Passengers')
plt.legend(['Did Not Survive', 'Survived'], title='Survival Outcome')
plt.xticks(rotation=0)
plt.show()

Correlation Analysis with survival

In [None]:

# Pairwise correlations between all columns of the cleaned DataFrame.
corr_matrix = df.corr()
plt.figure(figsize=(10, 6))

# Keep only the 'Survived' column, sort it from most positive to most negative
# and transpose it so the heatmap is a single annotated row. No mask is needed
# for a one-row heatmap, so none is built.
sns.heatmap(corr_matrix[['Survived']].sort_values(by='Survived', ascending=False).T,
            annot=True, cmap='RdBu_r', center=0, vmin=-1, vmax=1)
plt.title('Feature Correlation with Survival (Target Variable)\n', fontsize=16, fontweight='bold')
plt.show()

In [None]:

plt.figure(figsize=(12, 8))
sns.set(font_scale=1.1)

corr_matrix = df.corr()

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of features is shown only once.
all_cells = np.ones_like(corr_matrix, dtype=bool)
mask = np.triu(all_cells)

heatmap = sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    cbar_kws={"shrink": .8},
)

# Slanted x labels stay readable; y labels remain horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('Full Correlation Matrix Heatmap of Titanic Dataset Features\n', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

The "Sex" feature has a strong positive correlation with "Survived" (0.54)
which shows that female passengers were more likely to survive.

---


The "Fare" feature shows a strong negative correlation with "Pclass" (-0.72) that indicates that passengers in lower classes paid less for their tickets.  

---


"HasCabin" also has a strong positive correlation with survival.

# **Module-3 Feature Engineering**

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the cleaned DataFrame into the feature matrix X and the target vector y.
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Columns treated as categorical; every other feature column counts as numerical.
categorical_features = ["Sex", "Pclass", "HasCabin", "Embarked_Q", "Embarked_S"]  # from your cleaned df
numerical_features = [col for col in X.columns if col not in categorical_features]

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)



In [None]:
# Label-encode the Sex column in place.
# NOTE(review): Sex was already mapped to 0/1 during cleaning, so this
# presumably leaves the values unchanged — confirm it is still needed.
le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode only the categorical columns that are still present in df.
# 'Embarked' was already expanded into dummy columns during cleaning, and a
# ColumnTransformer that names a missing column fails as soon as it is fitted.
categorical_cols = [col for col in ["Sex", "Embarked"] if col in df.columns]
# Everything except the target and the categorical columns is passed through unchanged.
numerical_cols = [col for col in df.columns if col not in ["Survived"] + categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        ("cat", OneHotEncoder(drop="first"), categorical_cols)
    ]
)
df.head()

In [None]:
# Extract Title from Name.
# The cleaning step drops 'Name' from df, so when it is missing the column is
# re-read from the source CSV. This assumes df still has the CSV's original
# row order and index, so the titles line up row by row.
if 'Name' in df.columns:
    passenger_names = df['Name']
else:
    passenger_names = pd.read_csv("Titanic-Dataset.csv", usecols=['Name'])['Name']

# Raw string: '\.' inside a normal string literal is an invalid escape sequence.
# The title is the word that directly precedes the first full stop.
df['Title'] = passenger_names.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Replace rare titles into one category
df['Title'] = df['Title'].replace(
    ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major',
     'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare'
)

# Merge similar titles
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Check distribution of titles
print(df['Title'].value_counts())



In [None]:
# Bucket Age into labelled groups using the bin edges 0, 12, 18, 35, 60, 100.
df['AgeGroup'] = pd.cut(df['Age'],
                        bins=[0, 12, 18, 35, 60, 100],
                        labels=['Child', 'Teen', 'Adult', 'Mid-Aged', 'Senior'])
df.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize Age and Fare in place with a StandardScaler.
# NOTE(review): the scaler is fitted on the whole DataFrame, before any
# train/test split — confirm this is intended.
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

print(df[['Age', 'Fare']].head())


In [None]:
from google.colab import files
uploaded = files.upload()

import pandas as pd
df = pd.read_csv(list(uploaded.keys())[0])  # take uploaded file
print(" Dataset loaded successfully")
# ----- FEATURE ENGINEERING -----

# 0. Impute Age first. The dataset was just reloaded from the raw file, so Age
#    still has gaps; without this, NaNs would flow into AgeGroup and into the
#    'Age' feature handed to the model.
df['Age'] = df['Age'].fillna(df['Age'].median())

# 1. Create HasCabin feature (1 when a cabin value is present, else 0)
df['HasCabin'] = df['Cabin'].notnull().astype(int)

# 2. Create AgeGroup bins
df['AgeGroup'] = pd.cut(df['Age'],
                        bins=[0, 12, 18, 35, 50, 80],
                        labels=['Child','Teen','YoungAdult','Adult','Senior'])

# 3. Extract Title from Name: the word directly before the first full stop.
#    Raw string, because '\.' in a normal string literal is an invalid escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# 4. One-hot encode categorical features (drop_first removes one dummy per feature)
df = pd.get_dummies(df, columns=['Sex','Embarked','AgeGroup','Title'], drop_first=True)

# 5. Define final features
final_features = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'HasCabin',
    'Sex_male', 'Embarked_Q', 'Embarked_S'
]

# 6. Create feature matrix and target
X = df[final_features]
y = df['Survived']

print("Features and target ready!")
print("X shape:", X.shape)
X.head()





*   In the cleaning code above, 'Sex' was already mapped and 'Embarked' was already one-hot encoded, but since the task was to encode them again, the encoding is redone here with OneHotEncoder, LabelEncoder and ColumnTransformer.
*   Create the new features AgeGroup and Title.
*   Next, scale numerical features such as Age and Fare; some Age and Fare values are negative afterwards because standardization was applied.
*   Then train the model on the final features.



# **`Model Training`**

In [None]:
# ============================
# Titanic Survival Prediction with Random Forest
# ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Steps 1-5 (loading, cleaning, encoding, defining X and y) are done in the
# feature-engineering cell above; this cell expects X and y to exist already.

# 6. Train-test split
# stratify=y keeps the survived/died ratio the same in both parts, matching
# the other train/test splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Random Forest Model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight='balanced'
)
rf_model.fit(X_train, y_train)

# 8. Predictions
y_pred = rf_model.predict(X_test)

# 9. Evaluation
# Compute the confusion matrix once and reuse it for the printout and the plot.
conf_matrix = confusion_matrix(y_test, y_pred)
print(" Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 10. Confusion Matrix Heatmap
plt.figure(figsize=(5,4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest - Confusion Matrix")
plt.show()

# 11. Feature Importance Plot
importances = rf_model.feature_importances_
features = X.columns
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8,6))
sns.barplot(x=importances[indices], y=features[indices])
plt.title("Feature Importances (Random Forest)")
plt.show()


Extra Detailing

In [None]:
# ========================================
# Titanic Survival Prediction - Random Forest Classifier
# ========================================

#  Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# ========================================
# 1. Load Dataset
# ========================================
df = pd.read_csv("/content/Titanic-Dataset.csv")

print(" Dataset Shape:", df.shape)
print("\n First 5 Rows of Dataset:")
display(df.head())

# ========================================
# 2. Data Preprocessing
# ========================================

# Drop irrelevant columns. PassengerId is only a row identifier; leaving it in
# would make it a model feature, because every column except 'Survived'
# becomes part of X below.
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')

# Handle missing values. Assign the filled column back instead of calling
# fillna(..., inplace=True) on df[...]: the chained in-place form is deprecated
# in recent pandas and may leave df unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Feature Engineering
# FamilySize counts the passenger plus siblings/spouses and parents/children.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# IsAlone is 1 when the passenger travels without any family, else 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Encode categorical features
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

print("\n Preprocessed Dataset Shape:", df.shape)

# ========================================
# 3. Define Features and Target
# ========================================
y = df['Survived']
X = df.drop(columns='Survived')

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ========================================
# 4. Train Random Forest Model
# ========================================
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight='balanced',
)
rf_model.fit(X_train, y_train)

# Predictions on the held-out part
y_pred = rf_model.predict(X_test)

# ========================================
# 5. Evaluation Metrics
# ========================================
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nRandom Forest Evaluation Metrics:")
# Labels are pre-padded so the values line up in the printed output.
for metric_label, metric_value in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1-score :", f1),
):
    print(metric_label, round(metric_value, 4))

print("\n Classification Report:\n", classification_report(y_test, y_pred))

# ========================================
# 6. Confusion Matrix (Beautiful Plot)
# ========================================
# Confusion matrix of the held-out predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap="YlGnBu", cbar=False,
            xticklabels=['Died (0)', 'Survived (1)'],
            yticklabels=['Died (0)', 'Survived (1)'])
plt.title(" Random Forest - Confusion Matrix", fontsize=14)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# ========================================
# 7. Feature Importance Visualization
# ========================================
importances = rf_model.feature_importances_
features = X.columns
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(8,6))
sns.barplot(x=importances[indices], y=features[indices], palette="viridis")
plt.title(" Feature Importances (Random Forest)", fontsize=14)
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

# ========================================
# 8. Cross-Validation for Stable Performance
# ========================================
# 10-fold stratified CV over the full X and y (not only the training split).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_model, X, y, cv=cv, scoring='accuracy')

print("\n Cross-Validation Results:")
print("All 10-Fold Scores:", np.round(cv_scores, 4))
print("Mean Accuracy:", round(cv_scores.mean(), 4))
print("Standard Deviation:", round(cv_scores.std(), 4))

# ========================================
# 9. Extra Insight: Survival by Gender & Class
# ========================================
# Two side-by-side bar charts of the mean of 'Survived' per group.
plt.figure(figsize=(12,5))

plt.subplot(1,2,1)
# 'Sex_male' is the dummy column created when 'Sex' was one-hot encoded.
sns.barplot(x="Sex_male", y="Survived", data=df)
plt.title("Survival Rate by Gender")

plt.subplot(1,2,2)
sns.barplot(x="Pclass", y="Survived", data=df)
plt.title(" Survival Rate by Passenger Class")

plt.tight_layout()
plt.show()


## **Advanced Models & Comparison**

## 1. Import Libraries & Load Dataset


In [None]:

import os, warnings, time
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, RocCurveDisplay

import joblib

# try xgboost
# Record whether the optional xgboost package can be imported; any failure
# during the import simply marks it as unavailable.
try:
    from xgboost import XGBClassifier
    xgb_available = True
except Exception:
    xgb_available = False

print('XGBoost available:', xgb_available)

# Fail early with an actionable message when the dataset file is missing.
DATA_PATH = 'Titanic-Dataset.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Could not find {DATA_PATH} in the working directory. Upload the Kaggle train CSV as '{DATA_PATH}'.")
df = pd.read_csv(DATA_PATH)
print('Dataset shape:', df.shape)
df.head()


## 2. Basic Preprocessing & Feature Engineering



In [None]:

def preprocess_df(df):
    """Return a copy of *df* with Title, FamilySize and IsAlone columns added.

    Args:
        df: DataFrame with at least the columns 'Name', 'SibSp' and 'Parch'.
            It is not modified; all work happens on a copy.

    Returns:
        A new DataFrame with three extra columns:
        'Title' (honorific taken from 'Name', with rare titles grouped as
        'Rare' and the French/alternate forms merged into 'Miss'/'Mrs'),
        'FamilySize' (SibSp + Parch + 1) and 'IsAlone' (1 when FamilySize is 1).
    """
    df = df.copy()
    # The title is the text between the comma after the surname and the next
    # full stop. An optional leading "the " (as in "the Countess") is skipped so
    # the captured value matches the names in the 'Rare' list below. expand=False
    # returns a Series rather than a one-column DataFrame.
    df['Title'] = (
        df['Name']
        .str.extract(r',\s*(?:the\s+)?([^\.]+)\.', expand=False)
        .str.strip()
    )
    df['Title'] = df['Title'].replace(['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'],'Rare')
    df['Title'] = df['Title'].replace(['Mlle','Ms'],'Miss')
    df['Title'] = df['Title'].replace(['Mme'],'Mrs')
    # family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    return df

# Add the engineered columns, then pick the model inputs and the target.
df = preprocess_df(df)
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Title','FamilySize','IsAlone']
TARGET = 'Survived'
X = df[features]
y = df[TARGET]

print('Features used:', features)
X.head()

## 3. Train / Test Split

Stratified split: 80% train / 20% test.

In [None]:

# 80/20 split; stratify=y keeps the class ratio equal in both parts, and the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## 4. Preprocessing Pipelines

Numeric imputation+scaling and categorical imputation+one-hot encoding using ColumnTransformer.

In [None]:

# Column groups routed to the two preprocessing branches below.
numeric_features = ['Age','SibSp','Parch','Fare','FamilySize']
categorical_features = ['Pclass','Sex','Embarked','Title','IsAlone']

# Numeric branch: median imputation followed by standardization.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# handle_unknown='ignore' is set so unseen categories do not raise at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


## 5. Define Models & Hyperparameter Search

SVM (GridSearchCV), GradientBoosting (RandomizedSearchCV), XGBoost or HistGradientBoosting (RandomizedSearchCV).

In [None]:

from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

def make_pipeline(estimator):
    """Return a Pipeline that runs the shared preprocessor, then *estimator*.

    The estimator is registered under the step name 'clf', which is why the
    hyperparameter grids address its settings with the 'clf__' prefix.
    """
    stages = [('preprocessor', preprocessor), ('clf', estimator)]
    return Pipeline(steps=stages)

# SVM (probability=True because the evaluation step calls predict_proba)
svm = SVC(probability=True, random_state=42)
svm_param_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['rbf', 'linear'],
    'clf__gamma': ['scale', 'auto'],
}

# Gradient Boosting (sklearn)
gboost = GradientBoostingClassifier(random_state=42)
gboost_param_dist = {
    'clf__n_estimators': [50, 100, 200],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [3, 4, 6],
    'clf__subsample': [0.6, 0.8, 1.0],
}

# XGBoost when installed, otherwise sklearn's HistGradientBoosting as a stand-in
if xgb_available:
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb_param_dist = {
        'clf__n_estimators': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__max_depth': [3, 4, 6],
        'clf__subsample': [0.6, 0.8, 1.0],
        'clf__colsample_bytree': [0.6, 0.8, 1.0],
    }
else:
    xgb = HistGradientBoostingClassifier(random_state=42)
    xgb_param_dist = {
        'clf__max_iter': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__max_leaf_nodes': [15, 31, 63],
    }

# The same 5-fold stratified splitter is shared by all three searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
svm_search = GridSearchCV(
    make_pipeline(svm),
    svm_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
gboost_search = RandomizedSearchCV(
    make_pipeline(gboost),
    gboost_param_dist,
    n_iter=12,
    cv=cv,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
xgb_search = RandomizedSearchCV(
    make_pipeline(xgb),
    xgb_param_dist,
    n_iter=12,
    cv=cv,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

## 6. Train All Models (with hyperparameter search)



In [None]:

# Run each hyperparameter search on the training split and keep its best pipeline.
searches = {
    'SVM': svm_search,
    'GradientBoosting': gboost_search,
    'XGBoost_like': xgb_search,
}
fitted_models = {}
start_all = time.time()
for name in searches:
    search = searches[name]
    print(f"\n=== Training {name} ===")
    search.fit(X_train, y_train)
    print('Best CV ROC AUC:', search.best_score_)
    print('Best params:', search.best_params_)
    fitted_models[name] = search.best_estimator_
elapsed_seconds = time.time() - start_all
print('\nTotal training time (s):', elapsed_seconds)

## 7. Evaluate & Compare Models



In [None]:

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
# Score every tuned pipeline on the held-out test split and collect one row per model.
results = []
for name, model in fitted_models.items():
    print(f"\n--- {name} ---")
    y_pred = model.predict(X_test)
    # Second column of predict_proba; used as the score for ROC AUC.
    y_proba = model.predict_proba(X_test)[:,1]
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    print('Accuracy:', acc, 'ROC AUC:', roc)
    print('Confusion matrix:\n', cm)
    # plot confusion
    ConfusionMatrixDisplay(cm).plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.show()
    # ROC
    RocCurveDisplay.from_predictions(y_test, y_proba)
    plt.title(f'ROC Curve - {name}')
    plt.show()
    results.append({'model': name, 'accuracy': acc, 'roc_auc': roc})

# Comparison table, best ROC AUC first.
results_df = pd.DataFrame(results).sort_values(by='roc_auc', ascending=False).reset_index(drop=True)
results_df

## 8. Select Best Model & Save



In [None]:
# ==============================================
# TITANIC SURVIVAL PREDICTOR - GRADIO APP
# ==============================================

import pandas as pd
import numpy as np
import joblib
import gradio as gr
from sklearn.ensemble import RandomForestClassifier
import socket
from contextlib import closing

# Function to find an available port
def find_free_port(start_port=7860, end_port=7900):
    """Return the first port in [start_port, end_port] nothing is listening on.

    A port counts as free when a TCP connection attempt to it on localhost
    fails (``connect_ex`` returns a non-zero code). Returns None when every
    port in the range accepts the connection, or when the range is empty.
    """
    candidate = start_port
    while candidate <= end_port:
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        with closing(probe):
            connect_failed = probe.connect_ex(('localhost', candidate)) != 0
        if connect_failed:
            return candidate
        candidate += 1
    return None

# Create a model for demonstration
try:
    model = joblib.load('best_titanic_model.pkl')
    print("Model loaded successfully!")
except Exception:
    # Best-effort fallback: any failure to load the saved model leads to the
    # demo model below. Catching Exception (not a bare except) lets Ctrl-C and
    # SystemExit still propagate.
    print("Model file not found. Creating a demo model...")
    # Create and train a simple demo model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    # Create some demo data for training. The rows go into a DataFrame with
    # the same column names and order that the prediction input uses, so the
    # model is fitted and queried with matching feature names.
    demo_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                    'Embarked_C', 'Embarked_Q', 'Embarked_S']
    X_demo = pd.DataFrame(
        [[1, 0, 30, 0, 0, 50, 0, 0, 1],
         [3, 1, 25, 0, 0, 10, 1, 0, 0],
         [1, 0, 40, 1, 2, 100, 0, 0, 1],
         [2, 1, 20, 0, 0, 20, 0, 1, 0],
         [3, 1, 30, 2, 0, 15, 0, 0, 1]],
        columns=demo_columns,
    )
    y_demo = np.array([1, 0, 1, 0, 0])
    model.fit(X_demo, y_demo)

    # Save the demo model for future use
    joblib.dump(model, 'titanic_demo_model.joblib')
    print("Demo model created and saved as 'titanic_demo_model.joblib'")

# Create preprocessing function
def preprocess_input(pclass, sex, age, sibsp, parch, fare, embarked):
    """Build the single-row feature DataFrame for one passenger.

    'Sex' is encoded as 1 for 'male' and 0 for anything else, and the port of
    embarkation becomes three 0/1 indicator columns (Embarked_C/Q/S).

    Returns:
        A one-row DataFrame whose columns are, in order: Pclass, Sex, Age,
        SibSp, Parch, Fare, Embarked_C, Embarked_Q, Embarked_S.
    """
    expected_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

    # Every value is wrapped in a one-element list so each key becomes a column.
    row = {
        'Pclass': [pclass],
        'Sex': [1 if sex == 'male' else 0],
        'Age': [age],
        'SibSp': [sibsp],
        'Parch': [parch],
        'Fare': [fare],
    }
    for port in ('C', 'Q', 'S'):
        row['Embarked_' + port] = [1 if embarked == port else 0]

    # Select by name to fix the column order.
    return pd.DataFrame(row)[expected_columns]

# Create prediction function
def predict_survival(pclass, sex, age, sibsp, parch, fare, embarked):
    """Predict survival on the Titanic based on passenger details.

    Args:
        pclass: Passenger class (1, 2 or 3).
        sex: 'male' or 'female'.
        age: Age in years.
        sibsp: Number of siblings/spouses aboard.
        parch: Number of parents/children aboard.
        fare: Fare paid.
        embarked: Port of embarkation ('C', 'Q' or 'S').

    Returns:
        A text block with the predicted outcome, the model's confidence in
        percent and a few rule-based insights. If an input is missing or the
        prediction fails, a message starting with "Error making prediction"
        is returned instead; this function never raises.
    """
    # An unset form field arrives as None. Reject it up front: otherwise a
    # missing gender would silently be encoded as "not male" and a missing
    # port as "none of C/Q/S", giving a prediction for the wrong passenger.
    supplied = {
        "Passenger Class": pclass,
        "Gender": sex,
        "Age": age,
        "Number of Siblings/Spouses Aboard": sibsp,
        "Number of Parents/Children Aboard": parch,
        "Fare Paid": fare,
        "Port of Embarkation": embarked,
    }
    missing = [label for label, value in supplied.items() if value is None]
    if missing:
        return "Error making prediction: please provide " + ", ".join(missing)

    try:
        # Preprocess the input
        input_df = preprocess_input(pclass, sex, age, sibsp, parch, fare, embarked)

        # Make prediction
        prediction = model.predict(input_df)
        probability = model.predict_proba(input_df)

        # Format output
        survival = "Survived" if prediction[0] == 1 else "Did Not Survive"
        # predict_proba columns follow model.classes_, so look up the column of
        # the predicted class rather than using the class label as an index.
        class_index = list(model.classes_).index(prediction[0])
        confidence = probability[0][class_index] * 100

        # Additional insights based on passenger characteristics
        insights = []
        if sex == 'female':
            insights.append("• Females had higher survival rates on the Titanic.")
        if pclass == 1:
            insights.append("• First-class passengers had better survival chances.")
        if age < 18:
            insights.append("• Children were prioritized during evacuation.")
        if sibsp + parch > 0:
            insights.append("• Passengers with family members had varied survival rates.")

        insights_text = "\n".join(insights) if insights else "No specific insights available for this passenger profile."

        result = "Prediction: " + survival + " (" + str(round(confidence, 1)) + "% confidence)\n\n"
        result += "Additional Insights:\n" + insights_text

        return result

    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the app.
        return "Error making prediction: " + str(e)

# Create the Gradio interface
title = "Titanic Survival Predictor"
description = """
This app predicts whether a passenger would have survived the Titanic disaster based on their characteristics.
Enter the passenger details below and click 'Predict' to see the result.
"""

# Example rows; the values are in the same order as the inputs list below.
examples = [
    [1, "female", 29, 0, 0, 211.3375, "S"],  # First class female
    [3, "male", 25, 0, 0, 7.8958, "S"],      # Third class male
    [2, "female", 18, 1, 0, 23.0, "S"],      # Second class female with sibling
]

# Create the interface
# The inputs are listed in the same order as predict_survival's parameters.
iface = gr.Interface(
    fn=predict_survival,
    inputs=[
        gr.Dropdown([1, 2, 3], label="Passenger Class", info="1 = First, 2 = Second, 3 = Third"),
        gr.Radio(["male", "female"], label="Gender"),
        gr.Slider(0, 100, value=30, label="Age"),
        gr.Slider(0, 10, value=0, step=1, label="Number of Siblings/Spouses Aboard"),
        gr.Slider(0, 10, value=0, step=1, label="Number of Parents/Children Aboard"),
        gr.Number(value=32.0, label="Fare Paid (in pounds)"),
        gr.Radio(["C", "Q", "S"], label="Port of Embarkation", info="C = Cherbourg, Q = Queenstown, S = Southampton")
    ],
    outputs=gr.Textbox(label="Prediction Result", lines=5),
    title=title,
    description=description,
    examples=examples,
    allow_flagging="never"
)

# Launch the application
if __name__ == "__main__":
    print("Launching Titanic Survival Predictor App...")

    # Probe ports 7860-7900 and fall back to 7860 when none is reported free
    # (a port number is never 0, so `or` only replaces a None result).
    free_port = find_free_port(7860, 7900) or 7860

    print(f"The app will be available at: http://localhost:{free_port}")
    print("If it doesn't open automatically, copy and paste the above URL into your browser.")

    # Serve on all network interfaces at the chosen port, without a public share link.
    iface.launch(server_name="0.0.0.0", server_port=free_port, share=False)