# Task 1: Detect and Handle Missing Values

## Step 1: Load the Dataset

In [76]:
import seaborn as sns

# Fetch the "titanic" example dataset through seaborn's loader.
df = sns.load_dataset("titanic")

# Preview the first five rows to confirm the load worked.
print(df.head(5))

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


## Step 2: Identify Columns with Missing Values

In [64]:
# Share of missing entries per column, as a percentage, highest first.
missing_values = (
    df.isnull()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

# Report only the columns that actually contain missing data.
print(missing_values.loc[missing_values > 0])

deck           77.216611
age            19.865320
embarked        0.224467
embark_town     0.224467
dtype: float64


## Step 3: Handle Missing Values

In [66]:
# Mean imputation: work on a copy so the original frame stays untouched.
df_fill_mean = df.copy()

# Numeric columns are the only ones for which a mean is defined; this
# selection is reused by the later imputation cells.
numeric_cols = df_fill_mean.select_dtypes(include='number').columns

# Replace each gap with the mean of its own column.
column_means = df_fill_mean[numeric_cols].mean()
df_fill_mean[numeric_cols] = df_fill_mean[numeric_cols].fillna(column_means)

In [68]:
# Median imputation on a fresh copy; relies on `numeric_cols` computed in the
# mean-imputation cell.
df_fill_median = df.copy()

# Replace each gap with the median of its own column.
column_medians = df_fill_median[numeric_cols].median()
df_fill_median[numeric_cols] = df_fill_median[numeric_cols].fillna(column_medians)

In [70]:
# Mode imputation: fill every column (numeric or not) with its most frequent
# value, working on a copy so the original frame stays untouched.
df_fill_mode = df.copy()

# Only columns that actually contain missing values need any work.
has_missing = df_fill_mode.isna().any()
for column in has_missing[has_missing].index:
    modes = df_fill_mode[column].mode()
    # mode() skips NaN by default, so a column that is entirely missing yields
    # an empty result; leave such a column as-is instead of failing on the
    # lookup of a first element that does not exist.
    if modes.empty:
        continue
    # Several values can tie for most frequent; take the first one returned.
    df_fill_mode[column] = df_fill_mode[column].fillna(modes.iloc[0])

In [72]:
# Interpolation: estimate each numeric gap from neighbouring rows, using
# interpolate()'s default settings. Relies on `numeric_cols` computed in the
# mean-imputation cell.
df_interpolate = df.copy()
interpolated_numeric = df_interpolate[numeric_cols].interpolate()
df_interpolate[numeric_cols] = interpolated_numeric

## Step 4: Compare the Impact of Each Method on the Dataset

In [58]:
# Single source of truth: every missing-value strategy mapped to the frame it
# produces. Deriving both tables below from this mapping keeps their keys and
# order in sync and computes each dropna() variant only once.
method_frames = {
    'Original': df,
    'Drop NA': df.dropna(),               # drop rows containing any missing value
    'Drop Columns': df.dropna(axis=1),    # drop columns containing any missing value
    'Fill Mean': df_fill_mean,
    'Fill Median': df_fill_median,
    'Fill Mode': df_fill_mode,
    'Interpolate': df_interpolate,
}

# describe() output per strategy (count, mean, std, min, quartiles, max).
summary_stats = {
    method: frame.describe() for method, frame in method_frames.items()
}

# (rows, columns) per strategy, to show how much data each one keeps.
dataset_shapes = {
    method: frame.shape for method, frame in method_frames.items()
}

## Step 5: Display the Summary Statistics and Shapes

In [60]:
# Print each strategy's describe() table under its own heading.
for method in summary_stats:
    print(f"\nSummary Statistics for {method} Method:")
    print(summary_stats[method])

# Then list the (rows, columns) shape of every strategy's frame, one per line.
print("\nDataset Shapes:")
for method in dataset_shapes:
    shape = dataset_shapes[method]
    print(f"{method}: {shape}")


Summary Statistics for Original Method:
         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200

Summary Statistics for Drop NA Method:
         survived      pclass         age       sibsp       parch        fare
count  182.000000  182.000000  182.000000  182.000000  182.000000  182.000000
mean     0.675824    1.192308   35.623187    0.467033    0.47