In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the Titanic dataset
file_path = r"train.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Display initial dataset info
print("Initial Dataset Info:")
print(df.info())

# ------------------------------
# 1️⃣ Handle Missing Values
# ------------------------------
# Fill missing Age values with median
df["Age"].fillna(df["Age"].median(), inplace=True)

# Fill missing Embarked values with mode
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

# Drop Cabin column (too many missing values)
df.drop(columns=["Cabin"], inplace=True)

# ------------------------------
# 2️⃣ Convert Categorical Data to Numerical
# ------------------------------
# Convert 'Sex' column to binary (Male = 1, Female = 0)
df["Sex"] = df["Sex"].map({"male": 1, "female": 0})

# One-hot encode 'Embarked' column
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_embarked = pd.DataFrame(encoder.fit_transform(df[["Embarked"]]))
encoded_embarked.columns = encoder.get_feature_names_out(["Embarked"])

# Merge encoded features and drop original column
df = pd.concat([df, encoded_embarked], axis=1)
df.drop(columns=["Embarked"], inplace=True)

# ------------------------------
# 3️⃣ Handle Outliers using IQR (Fix Applied)
# ------------------------------
# Select only numeric columns
numeric_cols = df.select_dtypes(include=["number"]).columns

# Compute IQR for numeric columns only
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers for numeric columns only
df = df[~((df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)).any(axis=1)]

# ------------------------------
# 4️⃣ Normalize Numerical Features
# ------------------------------
scaler = StandardScaler()
df[["Age", "Fare"]] = scaler.fit_transform(df[["Age", "Fare"]])

# ------------------------------
# 5️⃣ Save Preprocessed Data
# ------------------------------
output_file = r"C:\Users\RAMESH\Downloads\titanic\preprocessed_titanic.csv"
df.to_csv(output_file, index=False)

print("\n✅ Data Preprocessing Completed! File saved as 'preprocessed_titanic.csv'.")


Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

✅ Data Preprocessing Completed! File saved as 'preprocessed_titanic.csv'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)
