###Import and load dataset

In [9]:
import pandas as pd
import numpy as np

df = pd.read_csv("train.csv")
print("First 5 rows:\n", df.head(), "\n")

First 5 rows:
    Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Fuel_Type Transmission Owner_Type     Mileage   Engine  \
0              41000    Diesel       Manual      First  19.67 kmpl  1582 CC   
1              46000    Petrol       Manual      First    13 km/kg  1199 CC   
2              87000    Diesel       Manual      First  20.77 kmpl  1248 CC   
3              40670    Diesel    Automatic     Second   15.2 kmpl  1968 CC   
4              86999    Diesel       Manual      First  23.08 kmpl  1461 CC   

       Power  Seats  New_Price  Price  
0  126.2 bhp    5.0        NaN  12.50  
1   88.7 bhp   

### strip units and convert to numbers


In [11]:
# Make a copy
cars = df.copy()

# Remove text units
cars['Mileage']   = cars['Mileage'].str.replace('kmpl','', regex=False).str.replace('km/kg','', regex=False)
cars['Engine']    = cars['Engine'].str.replace('CC','',   regex=False)
cars['Power']     = cars['Power'].str.replace('bhp','',   regex=False)
cars['New_Price'] = cars['New_Price'].str.replace('Lakh','', regex=False)

# Convert to numeric, turning bad strings into NaN
for c in ['Mileage','Engine','Power','New_Price']:
    cars[c] = pd.to_numeric(cars[c], errors='coerce')

# Quick sanity checks
print(cars[['Mileage','Engine','Power','New_Price']].head())
print("\nDtypes after cleaning:\n", cars[['Mileage','Engine','Power','New_Price']].dtypes)
print("\nMissing values in cleaned numeric columns:\n", cars[['Mileage','Engine','Power','New_Price']].isna().sum())

   Mileage  Engine   Power  New_Price
0    19.67  1582.0  126.20        NaN
1    13.00  1199.0   88.70       8.61
2    20.77  1248.0   88.76        NaN
3    15.20  1968.0  140.80        NaN
4    23.08  1461.0   63.10        NaN

Dtypes after cleaning:
 Mileage      float64
Engine       float64
Power        float64
New_Price    float64
dtype: object

Missing values in cleaned numeric columns:
 Mileage         2
Engine         36
Power          36
New_Price    5049
dtype: int64


###Handle missing values

In [12]:
# Check missing values before filling
print("Missing values before filling:\n", cars.isnull().sum(), "\n")

# Fill numeric columns with their mean
num_cols = ['Mileage', 'Engine', 'Power', 'Seats', 'New_Price', 'Price']
for col in num_cols:
    if cars[col].isnull().sum() > 0:
        mean_val = cars[col].mean()
        cars[col].fillna(mean_val, inplace=True)

# Fill categorical columns with their most frequent value (mode)
cat_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
for col in cat_cols:
    if cars[col].isnull().sum() > 0:
        mode_val = cars[col].mode()[0]
        cars[col].fillna(mode_val, inplace=True)

# Check again after filling
print("Missing values after filling:\n", cars.isnull().sum())

Missing values before filling:
 Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5049
Price                   0
dtype: int64 

Missing values after filling:
 Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
New_Price            0
Price                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars[col].fillna(mean_val, inplace=True)


In [13]:
# One-hot encode Fuel_Type and Transmission
cars = pd.get_dummies(cars, columns=['Fuel_Type', 'Transmission'], drop_first=True)

# New feature 'Car_Age'
cars['Car_Age'] = 2025 - cars['Year']

# Show sample rows
print("Data after encoding and adding Car_Age:\n")
print(cars[['Name', 'Year', 'Car_Age']].head())

Data after encoding and adding Car_Age:

                               Name  Year  Car_Age
0  Hyundai Creta 1.6 CRDi SX Option  2015       10
1                      Honda Jazz V  2011       14
2                 Maruti Ertiga VDI  2012       13
3   Audi A4 New 2.0 TDI Multitronic  2013       12
4            Nissan Micra Diesel XV  2013       12


### Data Manipulation

In [15]:
selected = cars[['Name', 'Used_Price' if 'Used_Price' in cars.columns else 'Price', 'Car_Age', 'Engine']]
print("Selected columns:\n", selected.head(), "\n")


filtered = cars[cars['Car_Age'] > 8]
print("Filtered (cars older than 8 years):\n", filtered[['Name', 'Year', 'Car_Age']].head(), "\n")


cars.rename(columns={'Price': 'Used_Price'}, inplace=True)
print("Renamed column:\n", cars[['Name', 'Used_Price']].head(), "\n")


cars['Price_per_CC'] = cars['Used_Price'] / cars['Engine']
print("Mutated column (Price_per_CC):\n", cars[['Name', 'Used_Price', 'Engine', 'Price_per_CC']].head(), "\n")


sorted_cars = cars.sort_values(by='Used_Price', ascending=False)
print("Top 5 most expensive cars:\n", sorted_cars[['Name', 'Used_Price']].head(), "\n")


grouped = cars.groupby('Fuel_Type_Petrol')['Used_Price'].mean()
print("Average Used_Price by Fuel Type:\n", grouped)


Selected columns:
                                Name  Used_Price  Car_Age  Engine
0  Hyundai Creta 1.6 CRDi SX Option       12.50       10  1582.0
1                      Honda Jazz V        4.50       14  1199.0
2                 Maruti Ertiga VDI        6.00       13  1248.0
3   Audi A4 New 2.0 TDI Multitronic       17.74       12  1968.0
4            Nissan Micra Diesel XV        3.50       12  1461.0 

Filtered (cars older than 8 years):
                                Name  Year  Car_Age
0  Hyundai Creta 1.6 CRDi SX Option  2015       10
1                      Honda Jazz V  2011       14
2                 Maruti Ertiga VDI  2012       13
3   Audi A4 New 2.0 TDI Multitronic  2013       12
4            Nissan Micra Diesel XV  2013       12 

Renamed column:
                                Name  Used_Price
0  Hyundai Creta 1.6 CRDi SX Option       12.50
1                      Honda Jazz V        4.50
2                 Maruti Ertiga VDI        6.00
3   Audi A4 New 2.0 TDI Multitronic