In [170]:
import pandas as pd
import numpy as np
import regex as re

In [171]:
df = pd.read_csv("originalData.csv")
df.columns = df.columns.str.strip()
print(df.shape)
df.head()

(24198, 17)


Unnamed: 0,Column1,Year,Brand,Model,Distance (km),Body Type,Engine,Transmission,Drivetrain,Exterior Colour,Interior Colour,Passengers,Doors,Fuel Type,City,Highway,Purchase Price
0,0,2019,Acura,MDX,53052 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Majestic Black Pearl,Red,,,Gas,12.2L/100km,9.0L - 9.5L/100km,43880
1,1,2018,Acura,MDX,77127 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Modern Steel Metallic,Black,,,Gas,12.6L/100km,9.0L/100km,36486
2,2,2019,Acura,RDX,33032 km,SUV,2.0L 4cyl,10 Speed Automatic,AWD,White Diamond Pearl,Black,5.0,4.0,Premium Unleaded,11.0L/100km,8.6L/100km,40888
3,3,2020,Acura,RDX,50702 km,SUV,4 Cylinder Engine,,AWD,Platinum White Pearl,Black,,,Gas,11.0L/100km,8.6L/100km,44599
4,4,2021,Acura,RDX,67950 km,SUV,4 Cylinder Engine,,AWD,Apex Blue Pearl,Red,,,Gas,11.3L/100km,9.1L/100km,46989


## Dropping First Column:

In [172]:
df.drop(columns=['Column1'], inplace=True)
print(df.shape)
df.head()

(24198, 16)


Unnamed: 0,Year,Brand,Model,Distance (km),Body Type,Engine,Transmission,Drivetrain,Exterior Colour,Interior Colour,Passengers,Doors,Fuel Type,City,Highway,Purchase Price
0,2019,Acura,MDX,53052 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Majestic Black Pearl,Red,,,Gas,12.2L/100km,9.0L - 9.5L/100km,43880
1,2018,Acura,MDX,77127 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Modern Steel Metallic,Black,,,Gas,12.6L/100km,9.0L/100km,36486
2,2019,Acura,RDX,33032 km,SUV,2.0L 4cyl,10 Speed Automatic,AWD,White Diamond Pearl,Black,5.0,4.0,Premium Unleaded,11.0L/100km,8.6L/100km,40888
3,2020,Acura,RDX,50702 km,SUV,4 Cylinder Engine,,AWD,Platinum White Pearl,Black,,,Gas,11.0L/100km,8.6L/100km,44599
4,2021,Acura,RDX,67950 km,SUV,4 Cylinder Engine,,AWD,Apex Blue Pearl,Red,,,Gas,11.3L/100km,9.1L/100km,46989


## Dropping Duplicate Rows

In [173]:
df.drop_duplicates(inplace=True)
print(df.shape)
df.head()

(20050, 16)


Unnamed: 0,Year,Brand,Model,Distance (km),Body Type,Engine,Transmission,Drivetrain,Exterior Colour,Interior Colour,Passengers,Doors,Fuel Type,City,Highway,Purchase Price
0,2019,Acura,MDX,53052 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Majestic Black Pearl,Red,,,Gas,12.2L/100km,9.0L - 9.5L/100km,43880
1,2018,Acura,MDX,77127 km,SUV,V6 Cylinder Engine,9 Speed Automatic,AWD,Modern Steel Metallic,Black,,,Gas,12.6L/100km,9.0L/100km,36486
2,2019,Acura,RDX,33032 km,SUV,2.0L 4cyl,10 Speed Automatic,AWD,White Diamond Pearl,Black,5.0,4.0,Premium Unleaded,11.0L/100km,8.6L/100km,40888
3,2020,Acura,RDX,50702 km,SUV,4 Cylinder Engine,,AWD,Platinum White Pearl,Black,,,Gas,11.0L/100km,8.6L/100km,44599
4,2021,Acura,RDX,67950 km,SUV,4 Cylinder Engine,,AWD,Apex Blue Pearl,Red,,,Gas,11.3L/100km,9.1L/100km,46989


## Feature Engineering:

### 1. Exterior Car Colour:

In [174]:
df['Exterior Colour'].fillna('Unknown', inplace=True)
color_keywords = {
    'Black': ['black'],
    'Blue': ['blue', 'sapphire'],
    'Red': ['red'],
    'Green': ['green'],
    'Brown': ['brown', 'sandstone'],
    'White': ['white', 'silver'],
    'Grey': ['grey', 'charcoal'],
    'Yellow': ['yellow'],
    'Orange': ['orange'],
    'Purple': ['purple']
}

# Function to map complex color names to basic colors
def map_to_basic_color(exterior_color):
    for basic_color, keywords in color_keywords.items():
        for keyword in keywords:
            if keyword in exterior_color.lower():
                return basic_color
    return 'Other Colour'

# Apply the function to create the new column
df['Exterior Colour'] = df['Exterior Colour'].apply(map_to_basic_color)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Exterior Colour'].fillna('Unknown', inplace=True)


In [175]:
color_counts = df['Exterior Colour'].value_counts()
top_colors = color_counts.head(6)
print(top_colors)

Exterior Colour
White           6115
Black           5430
Other Colour    3067
Grey            2197
Blue            1836
Red              986
Name: count, dtype: int64


In [176]:
def fetch_color(text):
    # Define the patterns for the models you want to extract
    patterns = [
        (r'White', 'White'),
        (r'Black', 'Black'),
        (r'Grey', 'Grey'),
        (r'Blue', 'Blue'),
        (r'Red', 'Red'),
    ]
    
    # Loop through patterns and return the model if found
    for pattern, color in patterns:
        if re.search(pattern, text):
            return color
    return 'Other Colour'

# Apply the function to your DataFrame
df['Exterior Colour'] = df['Exterior Colour'].apply(fetch_color)
df['Exterior Colour'].value_counts()

Exterior Colour
White           6115
Black           5430
Other Colour    3486
Grey            2197
Blue            1836
Red              986
Name: count, dtype: int64

In [177]:
print(len(df['Exterior Colour'].unique()))
print(df['Exterior Colour'].isna().sum())
print(df['Exterior Colour'].value_counts())

6
0
Exterior Colour
White           6115
Black           5430
Other Colour    3486
Grey            2197
Blue            1836
Red              986
Name: count, dtype: int64


### 2. Transmission Type & Transmission Speed:

In [178]:
df['Transmission'].fillna('Unknown', inplace=True)
# Function to determine transmission type
def get_transmission_type(transmission):
    if 'Manual' in transmission:
        return 'Manual'
    elif 'Automatic' in transmission:
        return 'Automatic'
    else:
        return 'Other'  

# Function to extract transmission speed
def get_transmission_speed(transmission):
    import re
    match = re.search(r'(\d+) Speed', transmission)
    return int(match.group(1)) if match else None

# Apply the functions to create new columns
df['Transmission Type'] = df['Transmission'].apply(get_transmission_type)
df['Transmission Speed'] = df['Transmission'].apply(get_transmission_speed)

df.drop(columns='Transmission', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Transmission'].fillna('Unknown', inplace=True)


In [179]:
print("=================TRANSMISSION TYPE=============================")
print(df['Transmission Type'].unique())
print(df['Transmission Type'].isna().sum())
print(df['Transmission Type'].value_counts())

print("=================TRANSMISSION SPEED=============================")
print(df['Transmission Speed'].unique())
print(df['Transmission Speed'].isna().sum())
print(df['Transmission Speed'].value_counts())

df.shape

['Automatic' 'Other' 'Manual']
0
Transmission Type
Automatic    16413
Other         2630
Manual        1007
Name: count, dtype: int64
[ 9. 10. nan  6.  8.  5.  4.  7.  1.  3.]
13599
Transmission Speed
8.0     2639
6.0     1842
7.0      663
9.0      562
1.0      233
10.0     221
5.0      201
4.0       89
3.0        1
Name: count, dtype: int64


(20050, 17)

### 3. Enginer Displacement, Cylinder Count, Engine Configuration, Special Technology & Fuel Type:

In [180]:
df['Engine'].fillna('Unknown', inplace=True)

def parse_engine(engine):
    # Initialize components
    engine_displacement = ''
    cylinder_count = ''
    engine_config = ''
    special_tech = ''
    fuel_type = ''
    
    # Extract Engine Displacement (e.g., 2.0L)
    displacement_match = re.search(r'\d+\.\d+L|\d+L', engine)
    if displacement_match:
        engine_displacement = displacement_match.group()

    # Extract Cylinder Count (e.g., 4cyl)
    cylinder_match = re.search(r'\d+cyl|\d+\sCylinder|\d+\sCyl.', engine, re.IGNORECASE)
    if cylinder_match:
        cylinder_count = re.search(r'\d+', cylinder_match.group()).group()

    # Extract Engine Configuration (e.g., V6, Inline 4)
    config_match = re.search(r'V\d|Inline\s\d|Straight\s\d|Flat\s\d|I-\d', engine, re.IGNORECASE)
    if config_match:
        engine_config = config_match.group()

    # Extract Special Technology (e.g., Turbo, DOHC, EcoBoost)
    tech_match = re.search(r'TURBO|DOHC|ECOBOOST|SIDI|VVT', engine, re.IGNORECASE)
    if tech_match:
        special_tech = tech_match.group().upper()

    # Identify Fuel Type from specific keywords (assuming known terms)
    if 'DIESEL' in engine.upper():
        fuel_type = 'Diesel'
    elif 'ELECTRIC' in engine.upper():
        fuel_type = 'Electric'
    elif 'GAS' in engine.upper():
        fuel_type = 'Gasoline'
    else:
        fuel_type = 'Not specified'  # This could be inferred if needed
    
    return pd.Series([engine_displacement, cylinder_count, engine_config, special_tech, fuel_type])

# Apply the parsing function to the Engine column
df[['Engine Displacement (Lts.)', 'Cylinder Count', 'Engine Config', 'Special Tech', 'Fuel Type Derived']] = df['Engine'].apply(parse_engine)

df.drop(columns = 'Engine', inplace = True)

df['Engine Displacement (Lts.)'] = df['Engine Displacement (Lts.)'].replace('', np.nan).str.replace('L', '').astype(float)
df['Cylinder Count'] = df['Cylinder Count'].replace('', np.nan)
df['Engine Config'] = df['Engine Config'].replace('', np.nan)
df['Special Tech'] = df['Special Tech'].replace('', np.nan)
df['Fuel Type Derived'] = df['Fuel Type Derived'].replace('', np.nan)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine'].fillna('Unknown', inplace=True)


In [181]:
print("=============ENGINE DISPLACEMENT================")

print(df['Engine Displacement (Lts.)'].unique())
print(df['Engine Displacement (Lts.)'].isna().sum())
print(df['Engine Displacement (Lts.)'].value_counts())

print("==============FUEL TYPE DERIVED=========================")

print(df['Fuel Type Derived'].unique())
print(df['Fuel Type Derived'].isna().sum())
print(df['Fuel Type Derived'].value_counts())

print("==============CYLINDER COUNT====================")

print(df['Cylinder Count'].unique())
print(df['Cylinder Count'].isna().sum())
print(df['Cylinder Count'].value_counts())

print("==============ENGINE CONFIG=====================")

print(df['Engine Config'].unique())
print(df['Engine Config'].isna().sum())
print(df['Engine Config'].value_counts())

print("==============SPECIAL TECH=====================")

print(df['Special Tech'].unique())
print(df['Special Tech'].isna().sum())
print(df['Special Tech'].value_counts())


df.shape

[   nan   2.     3.5    2.4    3.     3.7    1.7    2.3    2.5    3.2
   2.9    2.2    4.     0.     5.2    4.2    1.4    6.     6.8    4.4
   1.5   39.5    4.7    3.6    1.3    6.2    2.7    5.3    4.3    7.
   6.6    3.4    1.8    5.7    7.4    3.8    4.8    4.6    5.5    1.6
   1.2    6.4    8.     8.4    3.3    6.3    4.5    3.9    6.7    5.
   1.     7.3    5.4    2.8    5.6  757.     6.5    6.75   1.9 ]
14278
Engine Displacement (Lts.)
2.00      1622
3.60       597
3.50       446
2.50       442
3.00       421
1.50       383
2.40       243
2.70       190
1.60       155
5.30       145
1.80       119
6.20       110
5.00        93
1.40        90
6.60        72
2.30        69
3.20        64
5.70        57
3.80        55
3.30        47
4.00        45
4.40        33
6.00        28
0.00        24
5.60        24
6.40        24
3.70        23
2.90        21
1.30        20
6.70        17
5.20        14
4.70        11
5.50         7
1.00         6
3.90         4
4.80         4
8.40         4

(20050, 21)

### 4. Total Fuel Consumed:

In [182]:
df['City'].fillna('NA', inplace=True)
df['Highway'].fillna('NA', inplace=True)

def clean_consumption_data(column):

    def process_value(value):
        value = value.replace('L/100km', '').replace('L', '').strip()
        if '-' in value:
            range_values = value.split('-')
            range_values = [float(v.strip()) for v in range_values]
            avg = np.mean(range_values)
            return avg
        else:
            return value

    return column.apply(process_value)

# Assuming df is your DataFrame
df['City'] = clean_consumption_data(df['City'])
df['Highway'] = clean_consumption_data(df['Highway'])

def calculate_combined_efficiency(row):
    if row['City'] and row['Highway'] != "NA":
        city_efficiency = float(row['City'])
        highway_efficiency = float(row['Highway'])
        combined_efficiency = round((city_efficiency * 0.55) + (highway_efficiency * 0.45), 2)

        return combined_efficiency
    else:
        return 0

df['Fuel Efficiency (Lt/100kms)'] = df.apply(calculate_combined_efficiency, axis=1)
df[['City','Highway','Fuel Efficiency (Lt/100kms)']]

df.drop(columns = ['City', 'Highway'], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Highway'].fillna('NA', inplace=True)


### 5. Mileage (km/L):

In [183]:
df['Distance (km)'] = df['Distance (km)'].str.replace('km', '').str.strip()
df.dropna(subset=['Distance (km)'], inplace=True)

df['Mileage (kms/Lt)'] = round((100 / df['Fuel Efficiency (Lt/100kms)']),1)

In [184]:
# Count the number of records where 'Mileage (kms/Lt)' is infinite
inf_count = df[df['Mileage (kms/Lt)'] == float('inf')].shape[0]
print(inf_count)

df = df[df['Mileage (kms/Lt)'] != float('inf')]


5279


### 6. Total Number of Years:

In [185]:
current_year = 2024
df['Years Used'] = round((current_year - df['Year']),1)
# df.drop(columns = 'Year', inplace = True)

### 7. Fuel Type:

In [186]:
df['Fuel Type'] = df['Fuel Type'].fillna(df['Fuel Type Derived'])
df.drop(columns = 'Fuel Type Derived', inplace = True)
print(df['Fuel Type'].isna().sum())

0


In [187]:
# Count the number of rows where 'Fuel Type' is 'Electric'
electric_count = (df['Fuel Type'] == 'Electric').sum()
print(electric_count)

df = df[df['Fuel Type'] != 'Electric']


69


### 8. Current Market Price:

Approach to Calculate Current Market Price:
We can derive the "Current Market Price" based on factors such as the original purchase price, years used, distance driven, and fuel efficiency. Here's a proposed formula:

#### 1. Depreciation Rate: 
Vehicles typically depreciate over time. A common heuristic is that a car loses about 5% of its value each year after purchase. This is why Years Used * 0.05 was used as a simple way to estimate the total depreciation percentage due to aging.

**NOTE - Customization:** The 5% per year rate is a general rule of thumb and may not apply to all vehicles. In reality, the depreciation rate can vary depending on the make, model, condition, and other market factors. The 5% annual depreciation is a starting point, but you can adjust this rate to better reflect the specific characteristics of the vehicles in your dataset or industry standards.

    Depreciation Rate = 0.05 × Years Used

#### 2. Conditional Factor: (Formula Breakdown) 
 - **Scaling Factor (0.0001)** is a small constant chosen to ensure that the condition factor decreases at a realistic rate as the mileage increases. It controls how sensitive the condition factor is to changes in distance. The specific value 0.0001 is chosen based on experience, data, or industry standards to ensure the depreciation is neither too steep nor too gradual. 
 - **Addition of 1** The 1 ensures that at 0 kilometers, the condition factor is exactly 1 (indicating no depreciation). As mileage increases, this addition ensures that the condition factor reduces smoothly, reflecting gradual wear and tear.
 - **Division by the Denominator** This is the key part of the formula that translates increasing mileage into a decreasing condition factor. It ensures that as the mileage (distance) increases, the overall condition factor decreases, showing a natural decline in the car's value.

        Condition Factor = 1 / (1+0.0001×Distance (km))

#### 3. Current Market Price: It is calculated by multiplying the original price by the depreciation rate and condition factor, giving you a realistic estimate of the vehicle's current value.

    Current Market Price = Purchase Price × (1−Depreciation Rate) × Condition Factor

- Purchased Price is the original cost of the vehicle.
- Depreciation Rate adjusts the price based on how many years the car has been used, reducing the price gradually over time.
- Condition Factor adjusts the price based on the car's mileage, with higher mileage leading to a lower current market value.

In [188]:
# Convert relevant columns to numeric, forcing errors to NaN
df['Distance (km)'] = pd.to_numeric(df['Distance (km)'], errors='coerce')
df['Purchase Price'] = pd.to_numeric(df['Purchase Price'], errors='coerce')

# Define the depreciation rate and condition factor
df['Depreciation Rate'] = 0.05 * df['Years Used']
df['Condition Factor'] = 1 / (1 + 0.0001 * df['Distance (km)'])

# Calculate the Current Market Price
df['Current Market Price'] = (df['Purchase Price'] * (1 - df['Depreciation Rate']) * df['Condition Factor']).round(2)

# df[['Purchase Price', 'Years Used', 'Distance (km)', 'Current Market Price']].head()

### 9. Car Model:

In [189]:
model_counts = df['Model'].value_counts()
top_models = model_counts.head(20)
print(top_models)

Model
Civic        516
Grand        424
Elantra      381
F-150        362
Wrangler     291
Santa        287
Rogue        284
CR-V         267
Jetta        262
Mazda3       247
Silverado    245
Sierra       221
Tucson       217
CX-5         209
Escape       204
Accord       182
X3           180
X5           172
Tiguan       167
Q5           157
Name: count, dtype: int64


In [190]:
def fetch_model(text):
    # Define the patterns for the models you want to extract
    patterns = [
        (r'Civic', 'Civic'),
        (r'Grand', 'Grand'),
        (r'Elantra', 'Elantra'),
        (r'F-150', 'F-150'),
        (r'Wrangler', 'Wrangler'),
        (r'Santa', 'Santa'),
        (r'Rogue', 'Rogue'),
        (r'CR-V', 'CR-V'),
        (r'Jetta', 'Jetta'),
        (r'Mazda3', 'Mazda3'),
        (r'Silverado', 'Silverado'),
        (r'Sierra', 'Sierra'),
        (r'Tucson', 'Tucson'),
        (r'CX-5', 'CX-5'),
        (r'Escape', 'Escape'),
        (r'Accord', 'Accord'),
        (r'X3', 'X3'),
        (r'X5', 'X5'),
        (r'Tiguan', 'Tiguan'),
        (r'Q5', 'Q5')
    ]
    
    # Loop through patterns and return the model if found
    for pattern, model in patterns:
        if re.search(pattern, text):
            return model
    return 'Other Model Type'

# Apply the function to your DataFrame
df['Model'] = df['Model'].apply(fetch_model)
df['Model'].value_counts()

Model
Other Model Type    9001
Civic                516
Grand                424
Elantra              381
F-150                362
Wrangler             291
Santa                287
Rogue                284
X5                   272
CR-V                 267
Q5                   265
Jetta                262
Mazda3               247
Silverado            245
Sierra               221
Tucson               217
CX-5                 215
Escape               204
X3                   192
Accord               182
Tiguan               167
Name: count, dtype: int64

### 10. Body Type:

In [191]:
body_counts = df['Body Type'].value_counts()
top_bodytype = body_counts.head(10)
print(top_bodytype)

Body Type
SUV            7045
Sedan          3629
Coupe           814
Hatchback       764
Truck           509
Minivan         474
Convertible     400
Crew Cab        396
Wagon           218
Compact          64
Name: count, dtype: int64


In [192]:
def fetch_body(text):
    # Ensure the input is a string
    if not isinstance(text, str):
        return 'Other Body Type'
    
    # Define the patterns for the body types you want to extract
    patterns = [
        (r'SUV', 'SUV'),
        (r'Sedan', 'Sedan'),
        (r'Coupe', 'Coupe'),
        (r'Hatchback', 'Hatchback'),
        (r'Truck', 'Truck'),
        (r'Minivan', 'Minivan'),
        (r'Convertible', 'Convertible'),
        (r'Crew Cab', 'Crew Cab'),
        (r'Wagon', 'Wagon'),
        (r'Compact', 'Compact'),
    ]
    
    # Loop through patterns and return the Body Type if found
    for pattern, body in patterns:
        if re.search(pattern, text):
            return body
    return 'Other Body Type'

# Apply the function to your DataFrame
df['Body Type'] = df['Body Type'].apply(fetch_body)

### 11. Brand:

In [193]:
brand_counts = df['Brand'].value_counts()
top_brands = brand_counts.head(20)
print(top_brands)

Brand
Honda            1283
Ford             1231
Hyundai          1223
BMW              1099
Nissan            952
Volkswagen        866
Jeep              866
Audi              783
Mazda             747
Mercedes-Benz     723
Chevrolet         713
Dodge             500
Kia               497
Porsche           431
Infiniti          393
GMC               347
Subaru            299
Acura             295
Cadillac          216
Chrysler          155
Name: count, dtype: int64


In [194]:
def fetch_brand(text):
    # Ensure the input is a string
    if not isinstance(text, str):
        return 'Other Brand'
    
    # Define the patterns for the body types you want to extract
    patterns = [
        (r'Honda', 'Honda'),
        (r'Ford', 'Ford'),
        (r'Hyundai', 'Hyundai'),
        (r'BMW', 'BMW'),
        (r'Nissan', 'Nissan'),
        (r'Volkswagen', 'Volkswagen'),
        (r'Jeep', 'Jeep'),
        (r'Audi', 'Audi'),
        (r'Mazda', 'Mazda'),
        (r'Mercedes[-\s]?Benz', 'Mercedes-Benz'), 
        (r'Chevrolet', 'Chevrolet'),
        (r'Dodge', 'Dodge'),
        (r'Kia', 'Kia'),
        (r'Porsche', 'Porsche'),
        (r'Infiniti', 'Infiniti'),
        (r'GMC', 'GMC'),
        (r'Subaru', 'Subaru'),
        (r'Acura', 'Acura'),
        (r'Cadillac', 'Cadillac'),
        (r'Chrysler', 'Chrysler'),
    ]
    
    # Loop through patterns and return the Body Type if found
    for pattern, brand in patterns:
        if re.search(pattern, text):
            return brand
    return 'Other Brand'

# Apply the function to your DataFrame
df['Brand'] = df['Brand'].apply(fetch_brand)
print(df['Brand'].value_counts())

Brand
Honda            1283
Ford             1231
Hyundai          1223
BMW              1099
Nissan            952
Other Brand       883
Jeep              866
Volkswagen        866
Audi              783
Mazda             747
Mercedes-Benz     723
Chevrolet         713
Dodge             500
Kia               497
Porsche           431
Infiniti          393
GMC               347
Subaru            299
Acura             295
Cadillac          216
Chrysler          155
Name: count, dtype: int64


### 12. Interior Colour

In [195]:
df['Interior Colour'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Interior Colour'].fillna('Unknown', inplace=True)


In [196]:
color_counts = df['Interior Colour'].value_counts()
top_colors = color_counts.head(12)
print(top_colors)

Interior Colour
Black         8410
Unknown       3808
Grey           694
Red            335
Charcoal       305
Brown          268
Beige          253
Tan            140
White           79
Dark Grey       79
Cream           40
Light Grey      38
Name: count, dtype: int64


In [197]:
def fetch_color(text):
    # Define the patterns for the models you want to extract
    patterns = [
        (r'Black', 'Black'),
        (r'Grey', 'Grey'),
        (r'Red', 'Red'),
        (r'Charcoal', 'Charcoal'),
        (r'Brown', 'Brown'),
        (r'Beige', 'Beige'),
        (r'Tan', 'Tan'),
        (r'White', 'White'),
        (r'Dark Grey', 'Dark Grey'),
        (r'Cream', 'Cream'),
    ]
    
    # Loop through patterns and return the model if found
    for pattern, color in patterns:
        if re.search(pattern, text):
            return color
    return 'Other Colour'

# Apply the function to your DataFrame
df['Interior Colour'] = df['Interior Colour'].apply(fetch_color)
df['Interior Colour'].value_counts()

Interior Colour
Black           8410
Other Colour    3861
Grey             811
Red              335
Charcoal         305
Brown            268
Beige            253
Tan              140
White             79
Cream             40
Name: count, dtype: int64

In [198]:
print(len(df['Interior Colour'].unique()))
print(df['Interior Colour'].isna().sum())
print(df['Interior Colour'].value_counts())

10
0
Interior Colour
Black           8410
Other Colour    3861
Grey             811
Red              335
Charcoal         305
Brown            268
Beige            253
Tan              140
White             79
Cream             40
Name: count, dtype: int64


## Findind % of missing values from each column to decide which column to retain in the df:

In [199]:
print("% of missing values for each columns:\n", round(df.isna().mean() * 100))
print("\n# of missing values for each columns:\n", df.isna().sum())


% of missing values for each columns:
 Year                            0.0
Brand                           0.0
Model                           0.0
Distance (km)                   0.0
Body Type                       0.0
Drivetrain                      2.0
Exterior Colour                 0.0
Interior Colour                 0.0
Passengers                     45.0
Doors                          20.0
Fuel Type                       0.0
Purchase Price                  0.0
Transmission Type               0.0
Transmission Speed             65.0
Engine Displacement (Lts.)     68.0
Cylinder Count                 34.0
Engine Config                  75.0
Special Tech                   90.0
Fuel Efficiency (Lt/100kms)     0.0
Mileage (kms/Lt)                0.0
Years Used                      0.0
Depreciation Rate               0.0
Condition Factor                0.0
Current Market Price            0.0
dtype: float64

# of missing values for each columns:
 Year                               0
Brand

## Dropping the columns that have the most Null values:

In [200]:
print(df.shape)
df.drop(columns = ['Transmission Speed','Engine Displacement (Lts.)','Engine Config','Special Tech','Passengers'], inplace = True)
print(df.shape)

(14502, 24)
(14502, 19)


## Dropping records from the columns that have missing value % between 1-20

In [201]:
df.dropna(subset=['Drivetrain','Doors'], inplace=True)

## Saving the Cleaned Dataframe to CSV:

In [202]:
cleaned_df = df[['Year','Brand','Model','Body Type','Drivetrain','Interior Colour','Exterior Colour','Fuel Type','Transmission Type','Doors','Cylinder Count','Fuel Efficiency (Lt/100kms)','Distance (km)','Mileage (kms/Lt)','Years Used','Purchase Price','Current Market Price']]
cleaned_df.to_csv("cleaned_df.csv", index = False)
cleaned_df

Unnamed: 0,Year,Brand,Model,Body Type,Drivetrain,Interior Colour,Exterior Colour,Fuel Type,Transmission Type,Doors,Cylinder Count,Fuel Efficiency (Lt/100kms),Distance (km),Mileage (kms/Lt),Years Used,Purchase Price,Current Market Price
2,2019,Acura,Other Model Type,SUV,AWD,Black,White,Premium Unleaded,Automatic,4,4,9.92,33032,10.1,5,40888,7126.32
5,2022,Acura,Other Model Type,SUV,AWD,Red,Black,Gas,Automatic,4,,11.16,31000,9.0,2,60899,13368.07
6,2020,Acura,Other Model Type,SUV,AWD,Black,Black,Premium Unleaded,Automatic,4,4,10.31,27800,9.7,4,49099,10391.32
7,2019,Acura,Other Model Type,Sedan,AWD,Tan,Black,Premium Unleaded,Automatic,4,6,9.74,34396,10.3,5,36499,6165.93
8,2020,Acura,Other Model Type,SUV,AWD,Black,Black,Gas,Automatic,4,,9.92,60892,10.1,4,38495,4344.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24020,2011,Other Brand,Other Model Type,Wagon,FWD,Other Colour,White,Gas,Automatic,4,5,8.53,206835,11.7,13,11495,185.54
24053,2021,Other Brand,Other Model Type,SUV,AWD,Black,White,Gas,Automatic,4,4,9.35,36519,10.7,3,46980,8584.23
24122,2022,Other Brand,Other Model Type,Wagon,AWD,Other Colour,White,Gas,Automatic,4,4,9.59,8460,10.4,2,64980,31680.39
24138,2023,Other Brand,Other Model Type,SUV,AWD,Charcoal,White,Gasoline Hybrid,Automatic,4,4,8.50,50,11.8,1,82479,77965.22
