
# Outlier Detection and Treatment

**Author:** IT24104348 – Edirisinghe E.A.R.A.

This notebook identifies and removes outliers using the IQR method.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the training data; index_col=False keeps pandas from using the first
# column as the row index.
train_df = pd.read_csv('../data/train-data.csv', index_col=False)

# Drop the 'Unnamed: 0' column when the CSV has one (errors='ignore' makes
# this a no-op otherwise), then remove exact duplicate rows.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
train_df = train_df.drop_duplicates()

# Convert numeric columns quickly
import re

def parse_numeric(value):
    """Return the first unsigned decimal number found in *value* as a float.

    The value is converted with ``str()`` and searched for the first run of
    digits with an optional decimal point (e.g. ``"998 CC"`` -> ``998.0``).
    Missing values (as judged by ``pd.isnull``) and text that contains no
    digits yield ``np.nan``. A leading minus sign is not part of the match,
    so ``"-5"`` yields ``5.0``.
    """
    if pd.isnull(value):
        return np.nan

    found = re.search(r"([0-9]*\.?[0-9]+)", str(value))
    if found is None:
        return np.nan
    return float(found.group(1))

# Build each numeric column by extracting the first number from the
# corresponding raw column.
for numeric_col, raw_col in (
    ('Mileage_Num', 'Mileage'),
    ('Engine_CC', 'Engine'),
    ('Power_BHP', 'Power'),
):
    train_df[numeric_col] = train_df[raw_col].apply(parse_numeric)

# IQR function
def remove_outliers(df, columns, *, factor=1.5, keep_na=False):
    """Return a copy of *df* without rows outside the IQR fences of *columns*.

    For each column in turn, rows are kept only when the value lies within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive). Columns
    are processed sequentially, so the quartiles of a later column are
    computed on the rows that survived the earlier columns; the order of
    *columns* can therefore affect the result.

    Args:
        df: Input DataFrame. It is not modified.
        columns: Iterable of numeric column names to filter on.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.5 is the value that was previously hard-coded.
        keep_na: Comparisons against NaN are always False, so by default a
            row with a missing value in any checked column is dropped along
            with the outliers. Pass ``True`` to keep such rows.

    Returns:
        A new DataFrame containing the retained rows with their original
        index labels.
    """
    cleaned = df.copy()
    for col in columns:
        values = cleaned[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr

        within_fences = (values >= lower) & (values <= upper)
        if keep_na:
            # Missing values are not outliers; keep them when asked to.
            within_fences = within_fences | values.isna()
        cleaned = cleaned[within_fences]
    return cleaned

# Columns filtered for outliers, in the order they are processed.
outlier_columns = ['Kilometers_Driven', 'Mileage_Num', 'Engine_CC', 'Power_BHP', 'Price']
cleaned_df = remove_outliers(train_df, outlier_columns)

# Report how many rows the filtering removed.
print('Original training size:', len(train_df), 'After removal:', len(cleaned_df))

# Side-by-side boxplots of Price before and after outlier removal.
plt.figure(figsize=(6, 3))
panels = ((train_df, 'Price (original)'), (cleaned_df, 'Price (cleaned)'))
for position, (frame, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.boxplot(frame['Price'])
    plt.title(panel_title)
plt.show()
