In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the 'titanic' example dataset through seaborn's dataset loader
df = sns.load_dataset('titanic')

# Preview the leading five rows
print(df.head(5))

# Tally the missing entries in each column, then report them
missing_per_column = df.isna().sum()
print("\nMissing Values Count:")
print(missing_per_column)


In [None]:
# Sum the per-column missing counts into one grand total for the whole frame
total_missing = df.isna().sum().sum()
print("\nTotal Missing Values:", total_missing)


In [None]:
# Report the shape first so the effect of de-duplication is visible
shape_before = df.shape
print("Shape before removing duplicates:", shape_before)

# Keep the first occurrence of each fully identical row; `df` is rebound to
# the de-duplicated frame, so later cells see the reduced data
df = df.drop_duplicates(keep="first")

shape_after = df.shape
print("Shape after removing duplicates:", shape_after)


In [None]:
# Labels of the columns whose dtype is a NumPy numeric type
numeric_cols = df.select_dtypes(include=[np.number]).columns


In [None]:
# IQR-based outlier screen over the numeric columns.
numeric_frame = df[numeric_cols]

# Per-column quartiles and interquartile range
Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR outside the quartiles
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean frame: True where a cell lies strictly outside its column's fences
outlier_condition = numeric_frame.lt(lower_fence) | numeric_frame.gt(upper_fence)

# Number of flagged cells in each row
outlier_count = outlier_condition.sum(axis="columns")

# Retain only the rows with at most 2 flagged cells
df_clean = df.loc[outlier_count.le(2)]

print("Shape after removing rows with >2 outliers:", df_clean.shape)


In [None]:
# Box plot of every numeric column in the cleaned frame, drawn on an
# explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
df_clean[numeric_cols].boxplot(ax=ax)

# Tilt the column labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Pairwise Pearson correlations, computed with numeric_only=True so that
# non-numeric columns are left out
corr_matrix = df_clean.corr(method="pearson", numeric_only=True)

print(corr_matrix)


In [None]:
# Collect every unordered column pair exactly once by keeping only the strict
# upper triangle of the correlation matrix (k=1 skips the diagonal).
# Self-correlations are excluded by position rather than by `value != 1`:
# a value test would discard two distinct columns that happen to be perfectly
# correlated, and could let through a diagonal entry that is not exactly 1.0
# after floating-point rounding.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# Cells outside the mask become NaN; dropna() removes them together with any
# undefined correlations. The result is a Series indexed by a pair of labels.
corr_pairs = corr_matrix.where(upper_triangle).unstack().dropna()

# Each pair now appears once, so no value-based de-duplication is needed
# (which could also have merged distinct pairs sharing the same coefficient).
most_positive = corr_pairs.sort_values(ascending=False).head(1)

print("\nMost Positively Correlated Pair:")
print(most_positive)


In [None]:
# Order the pairwise correlations from lowest to highest, collapse repeated
# values, and keep the single lowest entry
ascending_pairs = corr_pairs.sort_values(ascending=True)
most_negative = ascending_pairs.drop_duplicates().iloc[:1]

print("\nMost Negatively Correlated Pair:")
print(most_negative)


In [None]:
# Annotated heatmap of the correlation matrix on an explicitly created axes,
# using the diverging 'coolwarm' colormap
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
plt.show()
