<a href="https://colab.research.google.com/github/KesteHarshada87/MachineLearning/blob/main/TitanicSurvivalPrediction(ML_Project).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

csv_path = '/content/drive/MyDrive/MachineLearning/titanic.csv'
df = pd.read_csv(csv_path)
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# Step 1️⃣: Clean missing values in Titanic dataset

import numpy as np

# Check missing values before cleaning
print("Missing values before cleaning:\n", df.isnull().sum())

# Drop 'Cabin' column (too many missing)
df.drop(columns=['Cabin'], inplace=True)
print("\nDropped 'Cabin' column.")

# Extract Title from Name (Mr, Miss, Mrs, etc.)
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

df['Title'] = df['Name'].apply(extract_title)

# Simplify rare titles
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major',
               'Rev','Sir','Jonkheer','Dona']
df['Title'] = df['Title'].apply(lambda t: 'Rare' if t in rare_titles else t)

print("\nUnique titles:", df['Title'].unique())

# Add flag for missing Age
df['Age_was_missing'] = df['Age'].isnull().astype(int)

# Fill Age using median by Title
df['Age'] = df.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill Fare with median
df['Fare'].fillna(df['Fare'].median(), inplace=True)

# Fill Embarked with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Final check
print("\nMissing values after cleaning:\n", df.isnull().sum())

# Preview data
df.head()



Missing values before cleaning:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Dropped 'Cabin' column.

Unique titles: ['Mr' 'Mrs' 'Miss' 'Master' 'Rare']

Missing values after cleaning:
 PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Embarked           0
Title              0
Age_was_missing    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Age_was_missing
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Mr,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,Mrs,0
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Mr,0
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Mr,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Mrs,0


In [8]:
# Step 2️⃣: Encode categorical features

from sklearn.preprocessing import LabelEncoder

# Encode 'Sex', 'Embarked', and 'Title' columns
label_cols = ['Sex', 'Embarked', 'Title']
encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
    print(f"Encoded '{col}' -> {list(le.classes_)}")

# Preview after encoding
df.head()



Encoded 'Sex' -> ['female', 'male']
Encoded 'Embarked' -> ['C', 'Q', 'S']
Encoded 'Title' -> ['Master', 'Miss', 'Mr', 'Mrs', 'Rare']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Age_was_missing
0,892,0,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,1,2,0
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,2,3,0
2,894,0,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,1,2,0
3,895,0,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,2,2,0
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,2,3,0


In [9]:
# Step 3️⃣: Split into features (X) and target (y)

from sklearn.model_selection import train_test_split

# Select columns for modeling
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']]
y = df['Survived']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 334
Testing samples: 84


In [10]:
# Step 4️⃣: Train Logistic Regression model

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("\n✅ Logistic Regression model trained successfully!")



✅ Logistic Regression model trained successfully!


In [11]:
# Step 5️⃣: Evaluate model

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n🔹 Accuracy on test data: {accuracy:.4f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



🔹 Accuracy on test data: 1.0000

Confusion Matrix:
 [[50  0]
 [ 0 34]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [12]:
# Step 6️⃣: Feature importance

import pandas as pd

importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature importance (by coefficient):")
print(importance)



Feature importance (by coefficient):
    Feature  Coefficient
4     Parch     0.071025
3     SibSp     0.057420
7     Title     0.046984
5      Fare     0.002755
2       Age    -0.003439
0    Pclass    -0.062243
6  Embarked    -0.065099
1       Sex    -6.313006


In [13]:
# Step 7️⃣: Try Random Forest for comparison

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

print(f"\n🌲 Random Forest accuracy: {rf_acc:.4f}")



🌲 Random Forest accuracy: 1.0000
