In [10]:
import pandas as pd

In [9]:
import numpy as np

In [8]:
import matplotlib.pyplot as plt


In [7]:
import seaborn as sns


Matplotlib is building the font cache; this may take a moment.


In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [15]:
# Load the dataset from a CSV file resolved relative to the current working directory.
df = pd.read_csv('titanic.csv')


In [None]:
# Overview of the loaded frame: schema, missing-value counts, summary statistics.
print("=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("\n=== Missing Values ===")
print(df.isnull().sum())
print("\n=== Descriptive Statistics ===")
print(df.describe())

In [None]:
# Impute missing values in int64/float64 columns with each column's median.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        median_val = df[col].median()
        # Assign the filled Series back to the frame. The previous
        # `df[col].fillna(..., inplace=True)` mutated a temporary selection:
        # pandas deprecates that pattern, and with Copy-on-Write enabled it
        # leaves `df` unchanged.
        df[col] = df[col].fillna(median_val)
        print(f"Filled missing values in {col} with median: {median_val}")

In [None]:
# Impute missing values in object-dtype columns with each column's most frequent value.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with;
            # indexing the empty result would raise.
            print(f"Skipped {col}: no non-missing values to compute a mode from")
            continue
        # mode() can return several tied values; the first one is used.
        mode_val = modes.iloc[0]
        # Assign the filled Series back to the frame. The previous
        # `df[col].fillna(..., inplace=True)` mutated a temporary selection:
        # pandas deprecates that pattern, and with Copy-on-Write enabled it
        # leaves `df` unchanged.
        df[col] = df[col].fillna(mode_val)
        print(f"Filled missing values in {col} with mode: {mode_val}")

In [None]:
# Integer-encode the object-dtype columns that have at most 5 distinct values.
label_encoder = LabelEncoder()
object_cols = df.select_dtypes(include=['object']).columns
low_cardinality_cols = [name for name in object_cols if df[name].nunique() <= 5]
for col in low_cardinality_cols:
    # The single encoder instance is refit on every column, so after the loop
    # it only holds the classes of the last column it processed.
    encoded_values = label_encoder.fit_transform(df[col])
    df[col] = encoded_values
    print(f"Label encoded column: {col}")


In [None]:
# One-hot encode the object-dtype columns that have more than 5 distinct values.
text_cols = df.select_dtypes(include=['object']).columns
high_cardinality_cols = [name for name in text_cols if df[name].nunique() > 5]
df = pd.get_dummies(df, columns=high_cardinality_cols)
print("One-hot encoded remaining categorical columns")

In [None]:
# Standardize the int64/float64 columns into a separate frame, df_scaled.
scaler = StandardScaler()
# Select the numeric columns once so the fitted data and the column labels
# are guaranteed to come from the same selection.
numeric_features = df.select_dtypes(include=['int64', 'float64'])
scaled_features = scaler.fit_transform(numeric_features)
# Reuse df's index: fit_transform returns a bare array, and without an explicit
# index df_scaled would get a fresh RangeIndex that no longer lines up with df
# whenever df's index is not 0..n-1 (e.g. after rows have been filtered out).
df_scaled = pd.DataFrame(
    scaled_features,
    columns=numeric_features.columns,
    index=numeric_features.index,
)
print("Standardized numerical features")

In [None]:
# Box plots of the numeric features, for spotting outliers by eye.
# Only the int64/float64 columns are drawn. A bare df.boxplot() plots every
# column pandas treats as numeric, which can include one-hot dummy columns
# and makes the figure unreadable.
numeric_cols_to_plot = list(df.select_dtypes(include=['int64', 'float64']).columns)
plt.figure(figsize=(15, 10))
df.boxplot(column=numeric_cols_to_plot)
plt.title("Boxplot of Features to Detect Outliers")
plt.xticks(rotation=45)
plt.show()

In [None]:
def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[col]``. Both fences are
    inclusive. Rows where ``col`` is missing fail both comparisons and are
    therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` that passes the filter, with the original index
        labels kept.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # When the IQR is 0 both fences collapse onto the quartile value, so only
    # rows equal to that value are kept.
    within_fences = (values >= lower_bound) & (values <= upper_bound)
    return df[within_fences]

# Run the IQR filter over every int64/float64 column, one after another.
# The column list is taken once up front; df is replaced on each pass, so each
# column's quartiles are computed on the rows that survived the earlier columns.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    df = remove_outliers(df, col)
print("Removed outliers using IQR method")


In [None]:
# Final look at the processed frame: first rows, then schema and non-null counts.
print("\n=== Final Cleaned Dataset ===")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


In [None]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('cleaned_dataset.csv', index=False)