In [89]:
# Load the dataset.
# pandas reads the CSV; numpy is used later for NaN handling in price cleaning.
import pandas as pd
import numpy as np

# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv("./pakistan-used-cars-data.csv")
# A bare last expression renders the frame as the cell's output.
df

Unnamed: 0,Title,Mileage,Model,Color,BodyType,EngineCapacity,FuelType,Transmission,RegisteredIn,Assembly,Price,Location,LastUpdated,ReferenceNo
0,Toyota Yaris Sedan ATIV MT 1.3 2025,30 km,2025.0,White,Sedan,1300 cc,Petrol,Manual,Un-Registered,Local,PKR 51 lacs,Lahore Punjab,"Jun 30, 2025",10262585.0
1,Honda Civic 1.5 VTEC Turbo Oriel 2020,"72,000 km",2020.0,Crystal Black Pearl,Sedan,1500 cc,Petrol,Automatic,Islamabad,Local,PKR 65.75 lacs,"Shadman, Lahore Punjab","Jun 30, 2025",10138164.0
2,BMW 5 Series 520d 2012,"136,862 km",2012.0,White,Sedan,2000 cc,Diesel,Automatic,Islamabad,Local,PKR 1.2 crore,"I- 9, Islamabad Islamabad","Jun 30, 2025",9887693.0
3,Audi A6 1.8 TFSI Business Class Edition 2015,"90,768 km",2015.0,Black,Sedan,1800 cc,Petrol,Automatic,Islamabad,Imported,PKR 1.07 crore,"Defence Phase-2, Islamabad Islamabad","Jun 30, 2025",10262536.0
4,Toyota Corolla GLi 1.3 VVTi Special Edition 2019,"90,000 km",2019.0,Super White,Sedan,1300 cc,Petrol,Manual,Punjab,Local,PKR 42.5 lacs,"Satiana Road, Faisalabad Punjab","Jun 30, 2025",10257615.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2933,Suzuki Alto 2010,"100,000 km",2010.0,White,,660 cc,Petrol,Automatic,Sindh,Local,PKR 13.85 lacs,"G- 7, Islamabad Islamabad","Jun 29, 2025",10115450.0
2934,KIA Picanto 1.0 MT 2023,"28,000 km",2023.0,Black,Hatchback,1000 cc,Petrol,Manual,Sindh,Local,PKR 27.75 lacs,"Gulshan-e-Jamal, Karachi Sindh","Jun 29, 2025",10259846.0
2935,Suzuki Bolan 2017,"780,000 km",2017.0,White,,800 cc,Petrol,Manual,Punjab,Local,PKR 14.5 lacs,"Chungi No 9, Multan Punjab","Jun 29, 2025",10259845.0
2936,Suzuki Mehran VXR Euro II 2014,"49,596 km",2014.0,Grey,Hatchback,800 cc,Petrol,Manual,Punjab,Local,PKR 13 lacs,"BAHRIA PHASE 8, Rawalpindi Punjab","Jun 29, 2025",10259843.0


In [90]:
# Number of rows and columns, returned as a (rows, columns) tuple.
df.shape

(2938, 14)

In [91]:
# Check each column's dtype and non-null count to see which columns
# still hold text and which contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           2926 non-null   object 
 1   Mileage         2937 non-null   object 
 2   Model           2937 non-null   float64
 3   Color           2937 non-null   object 
 4   BodyType        2686 non-null   object 
 5   EngineCapacity  2903 non-null   object 
 6   FuelType        2937 non-null   object 
 7   Transmission    2937 non-null   object 
 8   RegisteredIn    2937 non-null   object 
 9   Assembly        2937 non-null   object 
 10  Price           2937 non-null   object 
 11  Location        2926 non-null   object 
 12  LastUpdated     2937 non-null   object 
 13  ReferenceNo     2937 non-null   float64
dtypes: float64(2), object(12)
memory usage: 321.5+ KB


The first and last columns are not useful as features: `Title` is the car's listing
title, which is almost unique in every row, and `ReferenceNo` is just a reference to
the original ad. I drop both of them, along with the `LastUpdated` column.

In [92]:
# Drop the columns that are not used in the rest of the analysis.
# Reassigning the result (rather than inplace=True) avoids mutating the frame
# behind the reader's back, and errors='ignore' lets this cell be re-run
# without a KeyError once the columns have already been removed.
df = df.drop(columns=['Title', 'ReferenceNo', 'LastUpdated'], errors='ignore')
df

Unnamed: 0,Mileage,Model,Color,BodyType,EngineCapacity,FuelType,Transmission,RegisteredIn,Assembly,Price,Location
0,30 km,2025.0,White,Sedan,1300 cc,Petrol,Manual,Un-Registered,Local,PKR 51 lacs,Lahore Punjab
1,"72,000 km",2020.0,Crystal Black Pearl,Sedan,1500 cc,Petrol,Automatic,Islamabad,Local,PKR 65.75 lacs,"Shadman, Lahore Punjab"
2,"136,862 km",2012.0,White,Sedan,2000 cc,Diesel,Automatic,Islamabad,Local,PKR 1.2 crore,"I- 9, Islamabad Islamabad"
3,"90,768 km",2015.0,Black,Sedan,1800 cc,Petrol,Automatic,Islamabad,Imported,PKR 1.07 crore,"Defence Phase-2, Islamabad Islamabad"
4,"90,000 km",2019.0,Super White,Sedan,1300 cc,Petrol,Manual,Punjab,Local,PKR 42.5 lacs,"Satiana Road, Faisalabad Punjab"
...,...,...,...,...,...,...,...,...,...,...,...
2933,"100,000 km",2010.0,White,,660 cc,Petrol,Automatic,Sindh,Local,PKR 13.85 lacs,"G- 7, Islamabad Islamabad"
2934,"28,000 km",2023.0,Black,Hatchback,1000 cc,Petrol,Manual,Sindh,Local,PKR 27.75 lacs,"Gulshan-e-Jamal, Karachi Sindh"
2935,"780,000 km",2017.0,White,,800 cc,Petrol,Manual,Punjab,Local,PKR 14.5 lacs,"Chungi No 9, Multan Punjab"
2936,"49,596 km",2014.0,Grey,Hatchback,800 cc,Petrol,Manual,Punjab,Local,PKR 13 lacs,"BAHRIA PHASE 8, Rawalpindi Punjab"


Next, I clean the Mileage and EngineCapacity columns by removing the unit suffixes
(`km`, `cc`) and converting them to numeric (float) values.

In [93]:
# Remove the unit suffixes and thousands separators, then convert to float.
# The suffix is removed with an anchored pattern because str.strip(" km")
# treats its argument as a *set of characters* to trim from both ends, not as
# a literal suffix. Float (not int) is used so missing values remain NaN.
df['Mileage'] = (
    df['Mileage']
    .str.replace(r"\s*km\s*$", "", regex=True)   # "72,000 km" -> "72,000"
    .str.replace(",", "", regex=False)           # "72,000"    -> "72000"
    .str.strip()
    .astype(float)
)
df['EngineCapacity'] = (
    df['EngineCapacity']
    .str.replace(r"\s*cc\s*$", "", regex=True)   # "1300 cc" -> "1300"
    .str.strip()
    .astype(float)
)
df

Unnamed: 0,Mileage,Model,Color,BodyType,EngineCapacity,FuelType,Transmission,RegisteredIn,Assembly,Price,Location
0,30.0,2025.0,White,Sedan,1300.0,Petrol,Manual,Un-Registered,Local,PKR 51 lacs,Lahore Punjab
1,72000.0,2020.0,Crystal Black Pearl,Sedan,1500.0,Petrol,Automatic,Islamabad,Local,PKR 65.75 lacs,"Shadman, Lahore Punjab"
2,136862.0,2012.0,White,Sedan,2000.0,Diesel,Automatic,Islamabad,Local,PKR 1.2 crore,"I- 9, Islamabad Islamabad"
3,90768.0,2015.0,Black,Sedan,1800.0,Petrol,Automatic,Islamabad,Imported,PKR 1.07 crore,"Defence Phase-2, Islamabad Islamabad"
4,90000.0,2019.0,Super White,Sedan,1300.0,Petrol,Manual,Punjab,Local,PKR 42.5 lacs,"Satiana Road, Faisalabad Punjab"
...,...,...,...,...,...,...,...,...,...,...,...
2933,100000.0,2010.0,White,,660.0,Petrol,Automatic,Sindh,Local,PKR 13.85 lacs,"G- 7, Islamabad Islamabad"
2934,28000.0,2023.0,Black,Hatchback,1000.0,Petrol,Manual,Sindh,Local,PKR 27.75 lacs,"Gulshan-e-Jamal, Karachi Sindh"
2935,780000.0,2017.0,White,,800.0,Petrol,Manual,Punjab,Local,PKR 14.5 lacs,"Chungi No 9, Multan Punjab"
2936,49596.0,2014.0,Grey,Hatchback,800.0,Petrol,Manual,Punjab,Local,PKR 13 lacs,"BAHRIA PHASE 8, Rawalpindi Punjab"


Next comes the Price column, which is the target `y` for the model we will train.

In [94]:
# Remove the currency prefix and all spaces so "PKR 51 lacs" becomes "51lacs".
# A literal replace is used because str.strip("PKR") would trim any of the
# characters P, K and R from both ends rather than removing the "PKR" token.
df['Price'] = (
    df['Price']
    .str.replace("PKR", "", regex=False)
    .str.replace(" ", "", regex=False)
)
df

Unnamed: 0,Mileage,Model,Color,BodyType,EngineCapacity,FuelType,Transmission,RegisteredIn,Assembly,Price,Location
0,30.0,2025.0,White,Sedan,1300.0,Petrol,Manual,Un-Registered,Local,51lacs,Lahore Punjab
1,72000.0,2020.0,Crystal Black Pearl,Sedan,1500.0,Petrol,Automatic,Islamabad,Local,65.75lacs,"Shadman, Lahore Punjab"
2,136862.0,2012.0,White,Sedan,2000.0,Diesel,Automatic,Islamabad,Local,1.2crore,"I- 9, Islamabad Islamabad"
3,90768.0,2015.0,Black,Sedan,1800.0,Petrol,Automatic,Islamabad,Imported,1.07crore,"Defence Phase-2, Islamabad Islamabad"
4,90000.0,2019.0,Super White,Sedan,1300.0,Petrol,Manual,Punjab,Local,42.5lacs,"Satiana Road, Faisalabad Punjab"
...,...,...,...,...,...,...,...,...,...,...,...
2933,100000.0,2010.0,White,,660.0,Petrol,Automatic,Sindh,Local,13.85lacs,"G- 7, Islamabad Islamabad"
2934,28000.0,2023.0,Black,Hatchback,1000.0,Petrol,Manual,Sindh,Local,27.75lacs,"Gulshan-e-Jamal, Karachi Sindh"
2935,780000.0,2017.0,White,,800.0,Petrol,Manual,Punjab,Local,14.5lacs,"Chungi No 9, Multan Punjab"
2936,49596.0,2014.0,Grey,Hatchback,800.0,Petrol,Manual,Punjab,Local,13lacs,"BAHRIA PHASE 8, Rawalpindi Punjab"


In [95]:
def clean_price(value):
    """Convert a price label such as '51lacs' or '1.2crore' to a float amount.

    Parameters
    ----------
    value : object
        The price label. It is converted with ``str()`` and surrounding
        whitespace and thousands separators (commas) are removed before
        parsing.

    Returns
    -------
    float
        The numeric part multiplied by 100,000 for 'lacs'/'lac' or by
        10,000,000 for 'crore'; the plain number when no unit is present;
        ``np.nan`` when the value is ``None`` or contains 'Call'
        (a listing with no stated price).

    Raises
    ------
    ValueError
        If the remaining text cannot be parsed as a number.
    """
    import numpy as np  # local import keeps the function self-contained

    lac = 100_000
    crore = 10_000_000

    if value is None:
        return np.nan

    # Commas are dropped so values like "95,000" do not break float().
    text = str(value).strip().replace(',', '')

    if 'Call' in text:
        return np.nan

    # 'lacs' is listed before 'lac' so the plural suffix is removed whole.
    for unit, multiplier in (('lacs', lac), ('lac', lac), ('crore', crore)):
        if unit in text:
            return float(text.replace(unit, '')) * multiplier

    return float(text)


# Convert every price label to a numeric amount using clean_price.
df['Price'] = df['Price'].apply(clean_price)

# Price is the prediction target, so rows where it is missing are removed.
# The result is reassigned instead of using inplace=True; the original row
# index is kept as-is.
df = df.dropna(subset=['Price'])

df
        

Unnamed: 0,Mileage,Model,Color,BodyType,EngineCapacity,FuelType,Transmission,RegisteredIn,Assembly,Price,Location
0,30.0,2025.0,White,Sedan,1300.0,Petrol,Manual,Un-Registered,Local,5100000.0,Lahore Punjab
1,72000.0,2020.0,Crystal Black Pearl,Sedan,1500.0,Petrol,Automatic,Islamabad,Local,6575000.0,"Shadman, Lahore Punjab"
2,136862.0,2012.0,White,Sedan,2000.0,Diesel,Automatic,Islamabad,Local,12000000.0,"I- 9, Islamabad Islamabad"
3,90768.0,2015.0,Black,Sedan,1800.0,Petrol,Automatic,Islamabad,Imported,10700000.0,"Defence Phase-2, Islamabad Islamabad"
4,90000.0,2019.0,Super White,Sedan,1300.0,Petrol,Manual,Punjab,Local,4250000.0,"Satiana Road, Faisalabad Punjab"
...,...,...,...,...,...,...,...,...,...,...,...
2933,100000.0,2010.0,White,,660.0,Petrol,Automatic,Sindh,Local,1385000.0,"G- 7, Islamabad Islamabad"
2934,28000.0,2023.0,Black,Hatchback,1000.0,Petrol,Manual,Sindh,Local,2775000.0,"Gulshan-e-Jamal, Karachi Sindh"
2935,780000.0,2017.0,White,,800.0,Petrol,Manual,Punjab,Local,1450000.0,"Chungi No 9, Multan Punjab"
2936,49596.0,2014.0,Grey,Hatchback,800.0,Petrol,Manual,Punjab,Local,1300000.0,"BAHRIA PHASE 8, Rawalpindi Punjab"
