# 🚗 Automobile Dataset - Data Cleaning & ML Preparation
This notebook covers the full pipeline to clean and prepare the dataset for machine learning.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw CSV and turn the '?' placeholder cells into real NaN values
# so pandas' missing-data tools can see them.
raw_csv_path = "Automobile_data.csv"
df = pd.read_csv(raw_csv_path).replace('?', np.nan)

In [None]:
# Columns that should hold numbers; cast them all in one pass.
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_columns = [
    'normalized-losses',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'price',
]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Impute missing values.
#
# The filled Series is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# column selection, which raises a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.

# Continuous features: replace NaN with the column mean.
mean_imputed_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']
for col in mean_imputed_columns:
    df[col] = df[col].fillna(df[col].mean())

# 'price' uses the median rather than the mean.
# NOTE(review): 'price' is used as the prediction target further down;
# dropping rows with a missing target may be preferable to imputing it.
df['price'] = df['price'].fillna(df['price'].median())

# Categorical column: replace NaN with the most frequent value.
df['num-of-doors'] = df['num-of-doors'].fillna(df['num-of-doors'].mode()[0])

In [None]:
# Label-encode the categorical columns that are encoded as integer codes.
# A single encoder is re-fitted per column, so after this loop `le` holds
# the classes of the last column only ('num-of-doors').
le = LabelEncoder()
for label_col in ('fuel-type', 'aspiration', 'num-of-doors'):
    df[label_col] = le.fit_transform(df[label_col])

# Cylinder counts are spelled out as words; translate them to integers.
# Any spelling not listed here becomes NaN.
cylinder_word_to_count = {
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'eight': 8,
    'twelve': 12,
}
df['num-of-cylinders'] = df['num-of-cylinders'].map(cylinder_word_to_count)

# One-hot encode the remaining nominal columns; drop_first=True drops the
# first level of each column.
one_hot_columns = [
    'make',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'fuel-system',
]
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [None]:
# Standardize the continuous columns with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset, including 'price'
# (the target), before train_test_split runs. That lets test-set statistics
# influence the training features; consider fitting on the training split only.
numeric_to_scale = [
    'normalized-losses',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'engine-size',
    'price',
]
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[numeric_to_scale])
df[numeric_to_scale] = standardized_values

In [None]:
# Plot each scaled column and drop rows that fall outside the 1.5 * IQR
# fences in any of them.
#
# Every column's fences are computed on the same, unfiltered frame and the
# rows are removed once at the end. Filtering inside the loop would make
# each column's quartiles depend on which rows earlier columns had already
# removed, so the result would change with the column order.
keep_mask = pd.Series(True, index=df.index)
for col in numeric_to_scale:
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # between() is inclusive on both ends; NaN compares as False and is dropped.
    keep_mask &= df[col].between(lower, upper)

# .copy() gives an independent frame rather than a view of the old one.
df = df[keep_mask].copy()

In [None]:
# Separate the target ('price') from the features, then hold out 20% of the
# rows for testing. random_state pins the shuffle so the split is repeatable.
y = df['price']
X = df.drop(columns='price')
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each feature matrix.
for set_label, features in (("Training set:", X_train), ("Testing set:", X_test)):
    print(set_label, features.shape)