In [6]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

# Step 2: Read the raw training data (train.csv must be in the working directory)
df = pd.read_csv('train.csv')

# Step 3: Report the per-column count of missing values before any cleaning
print("Missing values before cleaning:")
print(df.isna().sum())

# Step 4: Impute missing values
#   Age      -> mean of the known ages in the inclusive range [10, 50]
#               (rows with a missing Age never match the range, so they
#               do not contribute to the mean)
#   Embarked -> most frequent value (the first one if several modes tie)
age_mean_10_50 = df.loc[df['Age'].between(10, 50), 'Age'].mean()
df = df.fillna({
    'Age': age_mean_10_50,
    'Embarked': df['Embarked'].mode().iloc[0],
})

# Step 5: Remove the unwanted columns
df = df.drop(columns=['Cabin', 'Ticket', 'PassengerId', 'Name', 'Fare'])

# Re-check: report the missing-value counts again after cleaning
print("\nMissing values after cleaning:")
print(df.isna().sum())


from sklearn.preprocessing import StandardScaler

# Step 6: One-hot encode Embarked; drop_first=True omits the first category's
# indicator column.
# NOTE(review): 'Sex' is not encoded in this cell. If it holds text labels it
# will need encoding before modelling -- confirm this happens downstream.
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Step 7 (optional): Standardise Age. It is the only column scaled here,
# because 'Fare' was dropped earlier. The fitted scaler is kept in `scaler`.
scaler = StandardScaler()
df['Age'] = scaler.fit_transform(df[['Age']])[:, 0]

# Step 8: Split into target vector (y) and feature matrix (X)
y = df['Survived']
X = df.drop(columns='Survived')


Missing values before cleaning:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing values after cleaning:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64
