In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [2]:
# Read the Titanic passenger records from CSV into the working DataFrame `df`
titanic_csv_path = '/content/Titanic-Dataset.csv'
df = pd.read_csv(titanic_csv_path)

In [3]:
# Step 1: Handle Missing Values
# Report the per-column count of missing values before any imputation.
print("Missing values before handling:\n", df.isnull().sum())

# Impute missing 'Age' values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that form is chained assignment, which
# warns on recent pandas releases and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Impute missing 'Embarked' values with the most frequent value
# (mode() returns a Series; [0] picks its first entry).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop the 'Cabin' column since it has many missing values.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['Cabin'], errors='ignore')

# Impute missing 'Fare' values with the column median.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())


Missing values before handling:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
# Step 2: Handle Outliers
# Cap extreme values at the 1.5 * IQR fences instead of dropping rows
def handle_outliers(df, column, factor=1.5):
    """Cap values of ``df[column]`` that lie outside the IQR fences, in place.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column. Values beyond a
    fence are replaced by the fence value; no rows are removed. Missing
    values are skipped when computing the quantiles and are left missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column. It is modified in place.
    column : str
        Name of a numeric column in ``df``.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences. Must be non-negative.

    Returns
    -------
    None
        The capped values are written back to ``df[column]``.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would cross each other).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # A single clip applies both caps; NaN entries pass through unchanged.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

# Cap outliers in the two continuous features, 'Age' first and then 'Fare'
for outlier_col in ('Age', 'Fare'):
    handle_outliers(df, outlier_col)


In [5]:
# Step 3: Normalize or Scale Features
# Turn the categorical columns into numbers: 'Sex' becomes integer codes,
# 'Embarked' is expanded into one indicator column per category.
sex_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
df = pd.get_dummies(df, columns=['Embarked'])

# Numerical columns that get standardized
num_cols = ['Age', 'Fare', 'Parch', 'SibSp']

# Standardize those columns and write the scaled values back into df.
# NOTE(review): the scaler is fit on every row of df, before the train/test
# split performed later in the notebook, so the held-out rows influence the
# scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values


In [6]:
# Step 4: Split the Data into Training and Testing Sets
# The label plus the name, ticket and passenger-id columns are excluded from
# the feature matrix; 'Survived' is the target.
non_feature_cols = ['Survived', 'Name', 'Ticket', 'PassengerId']
y = df['Survived']
X = df.drop(columns=non_feature_cols)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each partition
for label, part in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
    ("Training set labels shape:", y_train),
    ("Testing set labels shape:", y_test),
):
    print(label, part.shape)


Training set shape: (712, 9)
Testing set shape: (179, 9)
Training set labels shape: (712,)
Testing set labels shape: (179,)
