<a href="https://colab.research.google.com/github/KaziShahHamza/Colab-Codes/blob/main/titanic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# ======================================
# 2. Load Dataset
# ======================================
# The Titanic CSV is read from Google Drive, so Drive has to be mounted
# into the Colab filesystem before pandas can open the file.
from google.colab import drive
drive.mount('/content/drive')

# Path of the dataset on the mounted Drive; change it here if the file moves.
TITANIC_CSV_PATH = "/content/drive/MyDrive/datasets/Titanic-Dataset.csv"
df = pd.read_csv(TITANIC_CSV_PATH)

# Quick sanity check: dimensions and the first few rows of the loaded frame.
print("Shape:", df.shape)
print(df.head())


Mounted at /content/drive
Shape: (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0    

In [None]:
# Clean the raw Titanic frame and derive the model features.
# NOTE: this cell rewrites `df` and drops the "Name" column at the end, so it
# cannot be re-run on its own output -- re-run the loading cell first.

# Handle missing values
# Assign the filled Series back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form works on an
# intermediate object, raises a pandas FutureWarning, and no longer updates
# `df` at all from pandas 3.0 onwards.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Feature engineering
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
# IsAlone is 1 when the passenger is the only member of their family group.
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

# Extract Title
# The title is the run of letters that follows a space and ends with a period
# inside the Name column (e.g. "Mr" in "Braund, Mr. Owen Harris").
df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
# Collapse the listed uncommon titles into a single "Rare" category ...
df["Title"] = df["Title"].replace(
    ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
# ... and fold the alternative spellings into Miss / Mrs.
df["Title"] = df["Title"].replace({'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'})

# Encode categorical
# Sex becomes a 0/1 column; Embarked and Title are one-hot encoded with the
# first level of each dropped (drop_first=True).
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df = pd.get_dummies(df, columns=["Embarked","Title"], drop_first=True)

# Drop the columns that are not used as model features.
df = df.drop(columns=["PassengerId","Name","Ticket","Cabin"])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [None]:
# Separate the target ("Survived") from the predictor columns.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for evaluation; the split is stratified on the
# target and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Baseline model: logistic regression fitted on the training split.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression:", logreg_accuracy)


Logistic Regression: 0.8212290502793296


In [None]:
# Hyper-parameter grid searched for the random forest
# (3 x 4 x 3 = 36 combinations, each evaluated with 5-fold CV).
params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
)

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the estimator from the best-scoring combination for later cells.
best_rf = grid.best_estimator_
print("Best Random Forest:", grid.best_params_)

# Evaluate the selected forest on the held-out split.
y_pred_rf = best_rf.predict(X_test)
print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))


Best Random Forest: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
RF Accuracy: 0.8212290502793296


In [None]:
# Gradient boosting with fixed (not searched) hyper-parameters.
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
gb.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting:", gb_accuracy)


Gradient Boosting: 0.8324022346368715


In [None]:
# Combine the three models built above into one classifier that uses
# hard voting (voting="hard"), then train it on the training split.
base_models = [("lr", logreg), ("rf", best_rf), ("gb", gb)]
ensemble = VotingClassifier(estimators=base_models, voting="hard")
ensemble.fit(X_train, y_train)

# Evaluate the ensemble on the held-out split.
y_pred_ens = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ens)
print("Ensemble Accuracy:", ensemble_accuracy)


Ensemble Accuracy: 0.8324022346368715


In [None]:
# Per-class precision / recall / F1 of the ensemble on the held-out split.
ensemble_report = classification_report(y_true=y_test, y_pred=y_pred_ens)
print(ensemble_report)


              precision    recall  f1-score   support

           0       0.84      0.90      0.87       110
           1       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

