# Essential Libraries

In [144]:
import numpy as np
import pandas as pd

In [145]:
# Column labels, listed one per line.
# NOTE(review): no cell shown here reads this list, and "Engine_CC" / "Power_bhp"
# differ from the "Engine" / "Power" labels the loaded frame reports — confirm
# the intended use before relying on it.
columns = [
    "Name",
    "Location",
    "Year",
    "Kilometers_Driven",
    "Fuel_Type",
    "Transmission",
    "Owner_Type",
    "Mileage",
    "Engine_CC",
    "Power_bhp",
    "Seats",
    "Price",
]

In [146]:
# Read the car listings from train.csv (path is relative to the working directory).
# Forward slashes are used because they are accepted as a separator on Windows
# as well as Linux/macOS, whereas a backslash is only a separator on Windows.
cardata = pd.read_csv('Downloads/Cars/train.csv')

# Preview the first five rows
cardata.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [147]:
# Print each column's dtype and non-null count to spot text-typed and incomplete columns
cardata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


# Data Cleaning

In [152]:
def clean_mileage(mileage):
    """Convert a mileage entry such as ``"26.6 km/kg"`` to a float.

    A string is split on whitespace and its first token is parsed with
    ``float``. Any non-string value (for example a NaN marking a missing
    entry) is handed back unchanged.
    """
    if not isinstance(mileage, str):
        return mileage
    number_text = mileage.split()[0]
    return float(number_text)

def clean_engine(engine):
    """Convert an engine-displacement entry such as ``"998 CC"`` to a float.

    Parameters
    ----------
    engine : str, float, int or other
        A raw cell value.

    Returns
    -------
    float
        * float input is returned unchanged (this includes NaN);
        * str input is split on whitespace and its first token parsed with
          ``float``;
        * integer input (Python ``int`` or a numpy integer) and numpy floating
          scalars are converted with ``float`` instead of being discarded;
        * anything else (``None``, booleans, other objects) yields ``np.nan``.

    Raises
    ------
    ValueError
        If a string's first token is not a valid number.
    IndexError
        If a string is empty or whitespace-only.
    """
    if isinstance(engine, float):
        return engine
    if isinstance(engine, str):
        return float(engine.split()[0])
    # bool is a subclass of int; it is not a measurement, so keep treating it as missing.
    if isinstance(engine, bool):
        return np.nan
    # An already-numeric column arrives as ints / numpy scalars; keep the value
    # rather than silently turning it into NaN.
    if isinstance(engine, (int, np.integer, np.floating)):
        return float(engine)
    return np.nan

def clean_power(power):
    """Convert a power entry such as ``"58.16 bhp"`` to a float.

    Parameters
    ----------
    power : str, float, int or other
        A raw cell value.

    Returns
    -------
    float
        * float input is returned unchanged (this includes NaN);
        * str input is split on whitespace and its first token parsed with
          ``float``; when there is no token or the token is not a number
          (``"null bhp"``, ``"null"``, ``""``) the result is ``np.nan``;
        * integer input (Python ``int`` or a numpy integer) and numpy floating
          scalars are converted with ``float`` instead of being discarded;
        * anything else (``None``, booleans, other objects) yields ``np.nan``.
    """
    if isinstance(power, float):
        return power
    if isinstance(power, str):
        tokens = power.split()
        if not tokens:
            # Empty or whitespace-only text carries no reading.
            return np.nan
        try:
            return float(tokens[0])
        except ValueError:
            # Placeholder text such as "null bhp" marks a missing reading.
            return np.nan
    # bool is a subclass of int; it is not a measurement, so keep treating it as missing.
    if isinstance(power, bool):
        return np.nan
    # An already-numeric column arrives as ints / numpy scalars; keep the value
    # rather than silently turning it into NaN.
    if isinstance(power, (int, np.integer, np.floating)):
        return float(power)
    return np.nan

In [163]:
# Copy of the frame taken before the conversions below
# (this name is reassigned by a later cell).
cleaned_df = cardata.copy()

# Convert the unit-suffixed text columns to floats, each with its own parser
cardata['Mileage'] = cardata['Mileage'].apply(clean_mileage)
cardata['Engine'] = cardata['Engine'].apply(clean_engine)
cardata['Power'] = cardata['Power'].apply(clean_power)

# Fill missing values in 'Seats' with the column median.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update the frame when pandas Copy-on-Write is active.
cardata['Seats'] = cardata['Seats'].fillna(cardata['Seats'].median())

# Summary statistics of the numeric columns after cleaning
cardata.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,6019.0,6019.0,6017.0,5983.0,5876.0,6019.0,6019.0
mean,2013.358199,58738.38,18.134961,1621.27645,113.25305,5.27679,9.479468
std,3.269742,91268.84,4.582289,601.355233,53.874957,0.806346,11.187917
min,1998.0,171.0,0.0,72.0,34.2,0.0,0.44
25%,2011.0,34000.0,15.17,1198.0,75.0,5.0,3.5
50%,2014.0,53000.0,18.15,1493.0,97.7,5.0,5.64
75%,2016.0,73000.0,21.1,1984.0,138.1,5.0,9.95
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0


In [166]:
# Build the modelling frame from cardata without touching cardata itself:
# discard rows lacking an 'Engine' or 'Power' value, then renumber the index from 0.
cleaned_df = (
    cardata
    .dropna(subset=['Engine', 'Power'])
    .reset_index(drop=True)
)

# Summary statistics of the remaining rows
cleaned_df.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,5876.0,5876.0,5874.0,5876.0,5876.0,5876.0,5876.0
mean,2013.476515,58320.26,18.273829,1625.466133,113.25305,5.283526,9.602665
std,3.165822,92139.23,4.371965,601.787379,53.874957,0.804841,11.246531
min,1998.0,171.0,0.0,72.0,34.2,2.0,0.44
25%,2012.0,33443.75,15.26,1198.0,75.0,5.0,3.5175
50%,2014.0,52609.0,18.2,1495.5,97.7,5.0,5.75
75%,2016.0,72402.75,21.1,1991.0,138.1,5.0,10.0125
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0
