ARTI406 - Machine Learning

# Assignment 1: Exploratory Data Analysis (EDA)
## Dataset: Used Cars Price Analysis

EDA is the first and most important step in any Machine Learning project.
Before building models, we must understand:

- What does the data represent?
- Are there missing values?
- Are there outliers?
- What patterns exist?
- Which variables influence others?

If we do not understand the data, we cannot build a good model.

In [None]:
# Import necessary libraries
import pandas as pd               # CSV loading, grouping and summary statistics
import numpy as np                # not referenced in the cells visible here
import matplotlib.pyplot as plt   # all figures in this notebook
import seaborn as sns             # color palette and the correlation heatmap
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and runtime warnings raised by
# pandas/matplotlib/seaborn; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Display settings
# Order matters: Matplotlib is reset to its default style first, then the
# seaborn 'Set2' palette is installed as the color cycle. Resetting the style
# afterwards would discard the palette again.
plt.style.use('default')
sns.set_palette('Set2')

In [None]:
# Read the used-cars CSV into the DataFrame used by every later cell.
# The path is relative, so the file is looked up in the current working directory.
csv_path = 'used_cars_dataset.csv'
df = pd.read_csv(csv_path)
print('Dataset loaded successfully!')

## Check Missing Values

In [None]:
# Count the missing entries in each column (one row of output per column)
df.isna().sum()

## Check Duplicate Rows

In [None]:
# Check for duplicate rows
# duplicated() flags every row that repeats an earlier row across all columns
# (the first occurrence is not flagged); sum() counts the flagged rows.
df.duplicated().sum()

## Number of Rows and Columns

In [None]:
# Dataset shape: a (number_of_rows, number_of_columns) tuple
df.shape

## Data Types of Columns

In [None]:
# Data types: the dtype pandas inferred for each column when reading the CSV.
# Worth checking here that the columns used numerically later (prices, mileage,
# year, ...) were not read as text.
df.dtypes

## Descriptive Summary Statistics

In [None]:
# Statistical summary
# Called without arguments on a frame that mixes numeric and text columns,
# describe() reports only the numeric ones (count, mean, std, min, quartiles,
# max). Pass include='all' to also summarize the categorical columns.
df.describe()

## Univariate Analysis

### Distribution of Price

In [None]:
# Histogram of listing prices (30 equal-width bins)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Price_SAR'], bins=30, color='skyblue', edgecolor='black')
ax.set(
    title='Price Distribution',
    xlabel='Price (SAR)',
    ylabel='Frequency',
)
plt.show()

## Bivariate Analysis

### Price by Brand

In [None]:
# Mean price per brand, most expensive brand first
brand_price = (
    df.groupby('Brand')['Price_SAR']
    .mean()
    .sort_values(ascending=False)
)

# Draw the ranking as a bar chart on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
brand_price.plot.bar(ax=ax, color='lightgreen')
ax.set_title('Average Price by Brand')
ax.set_ylabel('Price (SAR)')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

### Price by Fuel Type

In [None]:
# Mean price per fuel type, highest average first
fuel_price = (
    df.groupby('Fuel_Type')['Price_SAR']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the averages on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
fuel_price.plot.bar(ax=ax, color='coral')
ax.set_title('Average Price by Fuel Type')
ax.set_ylabel('Price (SAR)')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

### Price by Transmission

In [None]:
# Mean price per transmission type (kept in groupby's label order, not sorted by value)
trans_price = df.groupby('Transmission')['Price_SAR'].mean()

# Bar chart of the averages on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
trans_price.plot.bar(ax=ax, color='salmon')
ax.set_title('Average Price by Transmission')
ax.set_ylabel('Price (SAR)')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

### Mileage vs Price Relationship

In [None]:
# Mileage against price, one semi-transparent point per car so that
# dense regions show up darker
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Mileage_KM'], df['Price_SAR'], alpha=0.5)
ax.set(
    title='Mileage vs Price',
    xlabel='Mileage (KM)',
    ylabel='Price (SAR)',
)
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric features and the price
numeric_cols = [
    'Model_Year',
    'Mileage_KM',
    'Engine_Size',
    'Number_of_Owners',
    'Price_SAR',
]
correlation = df.loc[:, numeric_cols].corr()

# Annotated heatmap; the diverging colormap is centered on zero correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

## Outlier Detection

In [None]:
# Flag price outliers with the 1.5 * IQR rule: anything below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR counts as an outlier.
price = df['Price_SAR']
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Boolean mask of rows outside the fences, then the matching rows
is_outlier = (price < lower_bound) | (price > upper_bound)
outliers = df[is_outlier]

print(f'Number of outliers: {len(outliers)}')
print(f'Percentage of outliers: {len(outliers)/len(df)*100:.2f}%')

In [None]:
# Visualize outliers using box plot
# Matplotlib's boxplot does not skip NaN: a single missing price turns the
# quartiles into NaN and the box is drawn empty. Missing prices are therefore
# dropped first, and a plain array is passed so the Series index plays no role.
prices = df['Price_SAR'].dropna().to_numpy()

plt.figure(figsize=(10, 6))
# Vertical is the default orientation, so the explicit `vert=True` flag is
# omitted; newer Matplotlib releases phase it out in favor of `orientation`.
plt.boxplot(prices)
plt.title('Price Box Plot - Outlier Detection')
plt.ylabel('Price (SAR)')
plt.grid(axis='y', alpha=0.3)
plt.show()

## Time-Based Analysis

### Price Trend by Model Year

In [None]:
# Mean price for each model year (groupby returns the years in ascending order)
year_price = df.groupby('Model_Year')['Price_SAR'].mean()

# Line chart with a marker at every year
fig, ax = plt.subplots(figsize=(12, 6))
year_price.plot.line(ax=ax, marker='o', color='purple')
ax.set_title('Average Price Trend by Model Year')
ax.set_xlabel('Model Year')
ax.set_ylabel('Average Price (SAR)')
ax.grid(True, alpha=0.3)
plt.show()