**Importing Libraries**

In [18]:
#Import libraries
import numpy as np
import pandas as pd

**Read the downloaded dataset**

In [2]:
# Load the raw dataset from data.csv; encoding_errors="ignore" means bytes
# that cannot be decoded are skipped rather than raising an error.
df = pd.read_csv(
    "data.csv",
    encoding_errors="ignore",
)
# Show (rows, columns) as the cell output
df.shape

(11914, 16)

In [3]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [4]:
# Summarize the frame: column names, non-null counts per column, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5), object(8)

**Handling Missing Values**

In [15]:
# Count the missing values in each column
df.isna().sum()

Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Market Category      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64

In [8]:
# Mode imputation for Engine Fuel Type.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
fuel_type_mode = df['Engine Fuel Type'].mode().iloc[0]
df['Engine Fuel Type'] = df['Engine Fuel Type'].fillna(fuel_type_mode)

In [9]:
# Median imputation for Engine HP.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
engine_hp_median = df['Engine HP'].median()
df['Engine HP'] = df['Engine HP'].fillna(engine_hp_median)

In [14]:
# Median imputation for Number of Doors.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
doors_median = df['Number of Doors'].median()
df['Number of Doors'] = df['Number of Doors'].fillna(doors_median)

In [11]:
# Median imputation for Engine Cylinders.
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
cylinders_median = df['Engine Cylinders'].median()
df['Engine Cylinders'] = df['Engine Cylinders'].fillna(cylinders_median)

In [12]:
# Replace missing Market Category values with the placeholder "Not Available".
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Market Category'] = df['Market Category'].fillna("Not Available")

**Remove Duplicates**

In [5]:
# Check for duplicates: count rows that exactly repeat an earlier row
df.duplicated().sum()

715

In [6]:
# Keep only the first occurrence of each fully duplicated row
df = df.drop_duplicates()

**Handle Outliers**

In [None]:
# First and third quartiles of MSRP, and the interquartile range between them
Q1, Q3 = (df['MSRP'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Cut-offs sit 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose MSRP falls strictly outside the cut-offs
below_lower = df['MSRP'] < lower_bound
above_upper = df['MSRP'] > upper_bound
outliers = df[below_lower | above_upper]

In [19]:
# Log transformation: store log(1 + MSRP) in a new 'Log(MSRP)' column
msrp_log = np.log1p(df['MSRP'])
df['Log(MSRP)'] = msrp_log

**Standardize Data**

In [16]:
# Make sure every Model value is a Python string
model_as_text = df['Model'].astype(str)
df['Model'] = model_as_text

**Store Cleaned Data in Excel**

In [21]:
# Write the cleaned frame to an Excel workbook, leaving out the row index
df.to_excel(
    "Cleaned_Data.xlsx",
    index=False,
)