Import Libraries and variables

In [149]:
import pandas as pd
import re

# Load the vehicle listings this notebook works on.
df = pd.read_csv("./datasets/cleaned_vehicles.csv")

# Seller "noise" phrases (condition, ownership, fuel, trim) that are stripped
# from the Model column further down. The list order is kept as authored.
noise_words = [
    # condition
    "brand new", "b/n", "brand n", "brandnew", "new", "used",
    # ownership / fuel / edition
    "1st", "1st owner", "petrol", "diesel", "hybrid", "electric", "Anniversary",
    "second", "2nd owner", "company maintained", "first", "owner", "owners", "manual",
    # trim level ("full-option" appears twice in the authored list; harmless)
    "full option", "full options", "full-option", "full-options", "full optioned", "full-option",
    "highest spec", "high spec", "full spec", "Pre Order", "Moon Roof",
    # bracketed / joined variants
    "moonroof", "(brandnew)", "(brand new)", "(moonroof)",
]


Add, remove and sort

In [150]:
#----------adding feature columns----------
# (new column, existing column it is inserted directly after).
# "Ground Clearance" is anchored on "Seating Capacity", so that pair must
# stay after the "Seating Capacity" entry.
feature_columns = (
    ("Body Type", "Model"),
    ("Seating Capacity", "Fuel Type"),
    ("Ground Clearance", "Seating Capacity"),
)

for feature_name, anchor_name in feature_columns:
    if feature_name in df.columns:
        # Nothing to add; the column is already present in the loaded frame.
        print(f"{feature_name} column already exists")
        continue
    col_index = df.columns.get_loc(anchor_name) + 1
    df.insert(col_index, feature_name, None)

#----------Modifying dataset----------
# Model names are compared in lower case everywhere below, so normalise once here.
lowered_model = df["Model"].str.lower()
df["Model"] = lowered_model.str.strip()

# Sort by manufacturer and model (A-Z), newest year first within each model.
sort_keys = ["Manufacturer", "Model", "Year"]
sort_directions = [True, True, False]
df = df.sort_values(by=sort_keys, ascending=sort_directions)

#----------Rearranging columns----------
# Each pair is (column to move, column it should directly follow).
# "Year" goes behind "Model" first, then "Fuel Type" goes behind "Year".
for moved_name, anchor_name in (("Year", "Model"), ("Fuel Type", "Year")):
    cols = df.columns.tolist()
    cols.remove(moved_name)
    target_position = cols.index(anchor_name) + 1
    cols.insert(target_position, moved_name)
    df = df[cols]

#----------removing noise words from Model column----------
# All noise phrases are folded into one compiled pattern:
#   * re.escape keeps entries such as "(brand new)" or "b/n" literal; passed
#     raw with regex=True the parentheses act as a group and "()" is left behind.
#   * IGNORECASE lets mixed-case entries ("Anniversary", "Moon Roof") match
#     the lower-cased Model values.
#   * Longest phrase first, so "1st owner" is consumed before "1st" / "owner".
#   * The (?<!\w) / (?!\w) guards remove whole words or phrases only, so a
#     short entry like "new" or "used" cannot cut a piece out of a longer word.
#     \b is not used because some phrases start or end with punctuation.
unique_phrases = sorted(dict.fromkeys(noise_words), key=len, reverse=True)
noise_pattern = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(phrase) for phrase in unique_phrases) + r")(?!\w)",
    flags=re.IGNORECASE,
)

df["Model"] = (
    df["Model"]
    .str.replace(noise_pattern, "", regex=True)
    # Collapse the gaps left behind by the removed phrases.
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

#----------removing duplicates----------
# One row per (Manufacturer, Model, Year); the first occurrence is kept.
df = df.drop_duplicates(subset=["Manufacturer", "Model", "Year"], keep="first")

#----------removing unwanted rows----------
manufacturer_lc = df["Manufacturer"].str.lower()
model_lc = df["Model"].str.lower()

# Missing values compare as False below, so rows with a missing
# Manufacturer or Model are never dropped by these rules.
is_unwanted_model = model_lc.eq("qute") | model_lc.eq("og v7")
is_chrysler = manufacturer_lc.eq("chrysler")
# Only the Dodge + "glory iauto 7 seater" combination is dropped.
# The previous form `~is_dodge & ~is_glory` removed every Dodge row and every
# "glory iauto 7 seater" row of any manufacturer.
# NOTE(review): confirm the pair-only intent; if all Dodge rows should go,
# add a separate manufacturer rule like the Chrysler one.
is_dodge_glory = manufacturer_lc.eq("dodge") & model_lc.eq("glory iauto 7 seater")

df = df[~(is_unwanted_model | is_chrysler | is_dodge_glory)]

#----------Replace words in Model column----------
# Ford rows whose model reads "raptor ranger" are rewritten to "ranger raptor".
is_ford = df["Manufacturer"].str.strip().str.lower() == "ford"
has_swapped_words = df["Model"].str.lower().str.contains("raptor ranger")
mask = is_ford & has_swapped_words

swapped_models = df.loc[mask, "Model"]
df.loc[mask, "Model"] = swapped_models.str.replace(
    r"raptor\s*ranger", "ranger raptor", regex=True, case=False
)

#----------Remove floating points----------
# "Int64" (capital I) is pandas' nullable integer dtype, so missing values
# survive the cast while whole numbers lose their ".0".
for integer_column in ("Year", "Seating Capacity", "Ground Clearance"):
    df[integer_column] = df[integer_column].astype("Int64")

#----------Remove columns----------
# Drop each column if it is present and report what happened either way.
for unwanted_column in ("Price", "Mileage", "Condition"):
    if unwanted_column not in df.columns:
        print(f"{unwanted_column} column does not exist")
        continue
    df = df.drop(columns=[unwanted_column])
    print(f"{unwanted_column} column removed")


# Zero-row preview: shows only the resulting column layout.
df.head(0)



Body Type column already exists
Seating Capacity column already exists
Ground Clearance column already exists
Price column does not exist
Mileage column does not exist
Condition column does not exist


Unnamed: 0,Manufacturer,Model,Year,Fuel Type,Body Type,Seating Capacity,Ground Clearance,Average Price (0-20k km),Average Price (20k-50k km),Average Price (50k-100k km),Average Price (100k+ km)


Adding features/Body Type

In [None]:
#----------Vehicle attribute lookup----------
# Manufacturer -> attribute -> {model key: value}.
# Model keys are lower-case fragments that are compared against the Model
# column when the map is applied, so key spelling and key order both matter.
#
# Corrections in this revision:
#   * "seden" -> "sedan" (Audi a4/a5/a6, Honda civic/city)
#   * Kia ev5 body type "uv" -> "suv"
#   * Jeep body_type listed "wrangler" twice and had no "compass" entry
#   * Honda seating listed "cr-v" twice and had no "cr-v 7 seater" entry
#   * Honda "zrv" / "zr-v" spellings are now present in both attribute dicts
#   * trailing space removed from the "fit shuttle gp7 navi" key
vehicle_map={
    "Audi":{
        "seating_capacity":{"a1":5,"a3":5,"a4":5,"a5":5,"a6":5,"a7":5,"a8":5,"q2":5,"q3":5,"q4":5,"q5":5,"q7":7,"q8":5,"e-tron":5},
        "body_type":{"sedan":"sedan","a1":"hatchback","a3":"hatchback","a3 sedan":"sedan","a3 tfsi sedan":"sedan","a3 sportback":"hatchback",
                     "a4":"sedan/wagon","a5":"sedan","a6":"sedan/wagon","a7":"hatchback/hatchback sportback","a8":"sedan","q2":"suv",
                     "q3":"suv/suv sportback","q4":"suv/suv sportback","q5":"suv/suv sportback","q7":"suv","q8":"suv","e-tron":"suv/suv sportback"},
        "ground_clearance":{"a1":125,"a3":140,"a4":140,"a5":140,"a6":140,"a7":140,"a8":140,"q2":200,"q3":200,"q5":200,
                            "q7":200,"q8":200,"e-tron":200}
    },
    "BMW":{
        "seating_capacity":{"218i":5,"316i":5,"318i":5,"320d":5,"430i gran":5,"520d":5,"520i":5,"523i":5,"530e":5,"530i":5,"535i":5,
            "725d":5,"730d":5,"730ld":5,"730le":5,"740li":5,"f10":5,"active 7":5,"i3":4,"i5":5,"i7":5,"i8":4,
            "i8 roadster":2,"ix":5,"m760":5,"mini cooper":4,"x1":5,"x2":5,"x3":5,"x5":5,"x6":5,"x7":7,"z4":2},
        "body_type":{"218i":"coupe","316i":"sedan","318i":"sedan","320d":"sedan","430i gran":"coupe/gran coupe","520d":"sedan/touring",
            "520d gt":"hatchback","520i":"sedan/touring","523i":"sedan","530e":"sedan","530i":"sedan","535i":"sedan","528i":"sedan",
            "725d":"sedan","730ld":"sedan","740le":"sedan","740li":"sedan","740e":"sedan","750i":"sedan","active 7":"sedan",
            "i3":"hatchback","i5":"sedan","i7":"sedan","i8":"coupe/roadster","i8 roadster":"roadster","ix":"suv","m760":"sedan",
            "mini cooper":"hatchback","x1":"suv","x2":"suv","x3":"suv","x5":"suv","x6":"suv","x7":"suv","z4":"roadster"},
    },
    "BYD":{
        "seating_capacity":{"e6":5,"seal":5,"sealion":5,"atto":5},
        "body_type":{"e6":"sedan/hatchback","seal":"sedan","sealion":"suv","atto":"suv"}
    },
    "Chery":{
        "seating_capacity":{"qq":4,"qq 308":5,"tiggo":5},
        "body_type":{"qq":"hatchback","qq 308":"hatchback","tiggo":"suv"}
    },
    "Chevrolet":{
        "seating_capacity":{"cruze":5},
        "body_type":{"cruze":"hatchback"}
    },
    "DFSK":{
        "seating_capacity":{"580":7,"glory":7},
        "body_type":{"580":"suv","glory":"suv"}
    },
    "Daihatsu":{
        "seating_capacity":{"mira":4,"move":4,"rocky":5,"taft":5,"tanto":4,"terios":7,"thor":5},
        "body_type":{"mira":"hatchback","move":"hatchback","rocky":"suv","taft":"suv","tanto":"hatchback","terios":"suv","thor":"suv"}
    },
    "Datsun":{
        "seating_capacity":{"redi-go":5},
        "body_type":{"redi-go":"hatchback"}
    },
    "Dongfeng":{
        "seating_capacity":{"box":5},
        "body_type":{"box":"suv"}
    },
    "Fiat":{
        "seating_capacity":{"punto":5},
        "body_type":{"punto":"hatchback"}
    },
    "Ford":{
        "seating_capacity":{"everest":7,"fiesta":5,"kuga":5,"mustang 5liter":4,"mustang mach-e":5,"ranger":5,"wildtrak":5,"raptor":5,},
        "body_type":{"everest":"suv","fiesta":"hatchback","kuga":"suv","mustang 5liter":"coupe/convertible","mustang mach-e":"suv",
            "ranger":"pickup","wildtrak":"pickup","raptor":"pickup"}
    },
    "Honda":{
        # The 7-seater variants come after the generic "cr-v" key so they can override it.
        "seating_capacity":{"civic":5,"cr-v":5,"cr-v 7 seater":7,"city":5,"cr-v australian 7 seater":7,"cr-v vti l 7 seater":7,"cr-v vti lx 7 seater":7,
                 "hr-v":5,"jazz":5,"vezel":5,"cr-z zf1":2,"fit":5,"freed":7,"grace":5,"insight":5,"n-box":4,"n-one":4,"n-wgn":4,
                 "step wagon":7,"wr-v":5,"zrv":5,"zr-v":5},
        "body_type":{"civic":"sedan","cr-v":"suv","city":"sedan","cr-v 7 seater":"suv","cr-v australian 7 seater":"suv",
            "cr-v vti l 7 seater":"suv","cr-v vti lx 7 seater":"suv","hr-v":"suv","jazz":"hatchback","vezel":"suv","wr-v":"suv",
            "zr-v":"suv","zrv":"suv","cr-z zf1":"coupe","fit":"hatchback","fit shuttle":"wagon","fit shuttle gp2":"wagon","fit shuttle gp7":"wagon",
            "fit shuttle gp7 navi":"wagon","fit shuttle gp2 navi":"wagon","freed":"minivan","grace":"sedan","insight":"sedan","n-box":"kei/hatchback",
            "n-one":"kei/hatchback","n-wgn":"kei/hatchback","step wagon":"minivan"}
    },
    "Hyundai":{
        "seating_capacity":{"accent":5,"creta":5,"eon":5,"elantra":5,"grand i10":5,"i20":5,"i30":5,"kona":5,"santa fe":7,"sonata":5,"tucson":5,
                            "ioniq":5,"terracan":5,"venue":5},
        "body_type":{"accent":"sedan/hatchback","creta":"suv","elantra":"sedan","eon":"hatchback","grand i10":"hatchback","i20":"hatchback",
                     "ioniq":"hatchback","i30":"hatchback/wagon","kona":"suv","santa fe":"suv","sonata":"sedan","tucson":"suv","terracan":"suv",
                     "venue":"suv"}
    },
    "Isuzu":{
        "seating_capacity":{"d-max":5,"mu-x":7},
        "body_type":{"d-max":"pickup","mu-x":"suv"}
    },
    "JAC":{
        "seating_capacity":{"t9":5},
        "body_type":{"t9":"pickup"}
    },
    "Jaecoo":{
        "seating_capacity":{"j7 4wd":5,"j6 ev":5,"j7 phev":5},
        "body_type":{"j7 4wd":"suv","j6 ev":"suv","j7 phev":"suv"}
    },
    "JMEV":{
        "seating_capacity":{"elight":5},
        "body_type":{"elight":"sedan/coupe"}
    },
    "Jaguar":{
        "seating_capacity":{"e-pace":5,"f-pace":5,"xf":5},
        "body_type":{"e-pace":"compact suv","f-pace":"suv","xf":"sedan"}
    },
    "Jeep":{
        "seating_capacity":{"compass":5,"grand cherokee":5,"renegade":5,"wrangler":4,"wrangler gladiator":5,"wrangler rubicon":5},
        "body_type":{"compass":"suv","grand cherokee":"suv","renegade":"suv","wrangler":"suv","wrangler gladiator":"pickup",
                     "wrangler rubicon":"suv"}
    },
    "Kia":{
        "seating_capacity":{"carens clavis":7,"carnival":7,"cerato":5,"ev5":5,"jeep caren":7,"picanto":5,"rio":5,"seltos":5,
                            "sonet":5,"sorento":7,"soul":5,"sportage":5,"stonic":5},
        "body_type":{"carens clavis":"muv","carnival":"muv","cerato":"sedan/hatchback","ev5":"suv","jeep caren":"muv",
                      "picanto":"hatchback","rio":"hatchback/sedan","seltos":"suv","sonet":"suv","sorento":"suv","soul":"hatchback",
                      "sportage":"suv","stonic":"suv"}
    }
}




Adding features/Seating Capacity and Body Type

In [152]:
# Fill "Seating Capacity" and "Body Type" from vehicle_map, brand by brand.
#
# Manufacturer and Model are normalised once, so the seating lookup, the
# body-type lookup and the printed count all select the same rows
# (previously only the seating lookup ignored case and whitespace).
# All substring tests are literal (regex=False) and treat a missing Model as
# "no match" (na=False).
# NOTE(review): "ground_clearance" entries in vehicle_map are not applied here.
manufacturer_norm = df["Manufacturer"].str.strip().str.lower()
model_norm = df["Model"].str.lower()

for brand, attributes in vehicle_map.items():
    brand_rows = manufacturer_norm == brand.strip().lower()

    # Seating capacity: every key contained in the model name is applied in
    # dict order, so a later (more specific) key overrides an earlier one.
    for model, seating_capacity in attributes.get("seating_capacity", {}).items():
        contains_key = model_norm.str.contains(model.lower(), regex=False, na=False)
        df.loc[brand_rows & contains_key, "Seating Capacity"] = seating_capacity

    body_types = attributes.get("body_type", {})

    # Body type, substring fallback: only fills cells that are still empty.
    # Keys are tried longest first so the most specific key claims the row;
    # in dict order a short key such as "a3" would win over "a3 sedan".
    for model in sorted(body_types, key=len, reverse=True):
        contains_key = model_norm.str.contains(model.lower(), regex=False, na=False)
        still_empty = df["Body Type"].isna()
        df.loc[brand_rows & contains_key & still_empty, "Body Type"] = body_types[model]

    # Body type, exact match: always wins, even over an existing value.
    for model, btype in body_types.items():
        df.loc[brand_rows & model_norm.eq(model.lower()), "Body Type"] = btype

    print(f"Total {brand} cars: {int(brand_rows.sum())}")
     


Total Audi cars: 86
Total BMW cars: 133
Total BYD cars: 6
Total Chery cars: 6
Total Chevrolet cars: 1
Total DFSK cars: 18
Total Daihatsu cars: 105
Total Datsun cars: 2
Total Dongfeng cars: 1
Total Fiat cars: 1
Total Ford cars: 55
Total Honda cars: 211
Total Hyundai cars: 40
Total Isuzu cars: 1
Total JAC cars: 2
Total Jaecoo cars: 3
Total JMEV cars: 1
Total Jaguar cars: 5
Total Jeep cars: 9
Total Kia cars: 76


In [153]:
# Persist the result without the index so no extra unnamed column is written.
output_path = "./datasets/cleaned_vehicles.csv"
df.to_csv(output_path, index=False)

record_count = len(df)
print(f"Saved {record_count} records to cleaned_vehicles.csv")

Saved 3302 records to cleaned_vehicles.csv
