# Data Ingestion and Cleaning
In this notebook we go through the data and clean it.

## Import the necessary libraries

In [1]:
# Set configuration for notebook
import os

os.chdir("c:\\Users\\Spectra\\kaggleX-challenge")
os.getcwd()

'c:\\Users\\Spectra\\kaggleX-challenge'

In [2]:
import pandas as pd

## Data Ingestion

In [3]:
# Load the data
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
submission_df = pd.read_csv("data/sample_submission.csv")

In [4]:
# View train 
train_df.head()


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


## Data exploration

In [5]:
# View info
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


There are no missing values

Let's check for duplicates

In [6]:
# Check for duplicates
train_df.duplicated().sum()

0

There are no duplicates. 


The next process is to clean up for the data types.

## Data Preprocessing

**Check Fuel Type**

In [7]:
train_df["fuel_type"].value_counts()

fuel_type
Gasoline          49439
Hybrid             1766
E85 Flex Fuel      1479
Diesel             1109
–                   294
Plug-In Hybrid      182
not supported         4
Name: count, dtype: int64

Let's check for the - and not supported and see what these vehicles are 

In [8]:
# Not supported and - cars
not_supported_df = train_df[train_df['fuel_type'].isin(['–', 'not supported'])]
not_supported_df.sample(10)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
41628,41628,Dodge,Dakota SLT Quad Cab,1999,130921,–,–,A/T,Black,Gray,At least 1 accident or damage reported,Yes,4500
15764,15764,Ford,Mustang EcoBoost Premium,2019,34000,–,–,A/T,Blue,Gray,At least 1 accident or damage reported,Yes,32500
33815,33815,Dodge,Challenger SRT8,2010,106500,–,–,6-Speed M/T,White,White,At least 1 accident or damage reported,Yes,22500
45467,45467,Cadillac,DeVille Base,1994,103000,–,–,A/T,Blue,Beige,None reported,Yes,12500
46268,46268,Ford,Mustang EcoBoost Premium,2017,34000,–,–,6-Speed M/T,Blue,Gray,At least 1 accident or damage reported,Yes,25000
3083,3083,Jaguar,XJ6 Vanden Plas,1995,70000,–,–,A/T,White,Black,None reported,Yes,14995
45716,45716,Mazda,MX-5 Miata Base,2008,46873,–,–,A/T,Green,Beige,None reported,Yes,11500
43158,43158,Chevrolet,1500 Cheyenne Extended Cab,1996,43000,–,–,A/T,White,Gray,None reported,Yes,20000
48616,48616,Dodge,Challenger R/T,2016,73000,–,–,A/T,Black,Black,None reported,Yes,17500
6671,6671,Porsche,911 Carrera Cabriolet,1995,94000,–,–,6-Speed M/T,Blue,Gray,None reported,Yes,22500


For the - values we substitute them with unknown. For the not supported we extract from the engine.


In [9]:
train_df[train_df['fuel_type'].isin(['not supported'])]

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
1069,1069,Toyota,Mirai Base,2016,40000,not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,14000
9621,9621,Toyota,Mirai Limited,2023,29553,not supported,182.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Gray,None reported,Yes,9995
11441,11441,Toyota,Mirai Limited,2018,40000,not supported,182.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Gray,None reported,Yes,7500
21771,21771,Nissan,Armada Platinum,2017,92000,not supported,390.0HP 5.6L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,20900


In [10]:
def extract_fuel_type(text):
    text = text.split()
    return text[-2]

def clean_dash_values(df, column):
    df[column] = df[column].replace('–', 'Unknown')
    return df



In [11]:
# Change the not supported
train_df['fuel_type'] = train_df.apply(
    lambda row: extract_fuel_type(row['engine']) if row['fuel_type'] == 'not supported' else row['fuel_type'],
    axis=1
)

In [12]:
# Deal with the dash values

train_df = clean_dash_values(train_df, 'fuel_type')

In [13]:
train_df["fuel_type"].value_counts()

fuel_type
Gasoline          49440
Hybrid             1766
E85 Flex Fuel      1479
Diesel             1109
Unknown             294
Plug-In Hybrid      182
Hydrogen              3
Name: count, dtype: int64

In [14]:
# Change the not supported
test_df['fuel_type'] = test_df.apply(
    lambda row: extract_fuel_type(row['engine']) if row['fuel_type'] == 'not supported' else row['fuel_type'],
    axis=1
)

test_df = clean_dash_values(test_df, 'fuel_type')

**Check Brand**

In [15]:
train_df["brand"].value_counts()

brand
BMW              7369
Ford             6706
Mercedes-Benz    5087
Chevrolet        4424
Audi             2922
Porsche          2627
Toyota           2315
Lexus            2257
Jeep             2247
Land             1995
Cadillac         1565
Nissan           1252
GMC              1076
Dodge            1011
RAM               966
INFINITI          957
Lincoln           767
Mazda             748
Subaru            739
Hyundai           694
Jaguar            661
Volkswagen        628
Honda             624
Acura             580
Kia               526
Volvo             452
MINI              364
Maserati          293
Bentley           284
Chrysler          258
Genesis           249
Buick             228
Mitsubishi        182
Hummer            176
Pontiac           149
Alfa              144
Rolls-Royce       142
Lamborghini       122
Tesla             110
Ferrari            87
Saturn             58
Scion              53
Aston              50
McLaren            43
Rivian             27
Merc

Brand looks clean

**Check accident**

In [16]:
train_df["accident"].value_counts()

accident
None reported                             39896
At least 1 accident or damage reported    14377
Name: count, dtype: int64

**Transmission**

In [17]:
# Check value counts
transmissions = train_df["transmission"].value_counts()

In [18]:
transmissions

transmission
A/T                                                   16757
8-Speed A/T                                            7287
Transmission w/Dual Shift Mode                         6454
6-Speed A/T                                            5944
6-Speed M/T                                            3618
7-Speed A/T                                            3415
10-Speed A/T                                           2187
8-Speed Automatic                                      1189
5-Speed A/T                                            1185
9-Speed A/T                                            1073
4-Speed A/T                                             764
5-Speed M/T                                             718
CVT Transmission                                        692
10-Speed Automatic                                      642
6-Speed Automatic                                       516
Automatic                                               441
M/T                        

Combine the transmissions into four features namely CVT, automatic, manual and unknown

In [19]:
# Normalize the text
train_df['transmission'] = train_df['transmission'].str.lower()

test_df['transmission'] = test_df['transmission'].str.lower()
# Mapping for standardization and reduction
transmission_map = {
    'a/t': 'automatic',
    '8-speed a/t': 'automatic',
    'transmission w/dual shift mode': 'automatic',
    '6-speed a/t': 'automatic',
    '6-speed m/t': 'manual',
    '7-speed a/t': 'automatic',
    '10-speed a/t': 'automatic',
    '8-speed automatic': 'automatic',
    '5-speed a/t': 'automatic',
    '9-speed a/t': 'automatic',
    '4-speed a/t': 'automatic',
    '5-speed m/t': 'manual',
    'cvt transmission': 'cvt',
    '10-speed automatic': 'automatic',
    '6-speed automatic': 'automatic',
    'automatic': 'automatic',
    'm/t': 'manual',
    '9-speed automatic': 'automatic',
    '7-speed automatic with auto-shift': 'automatic',
    '7-speed m/t': 'manual',
    'automatic cvt': 'cvt',
    '1-speed a/t': 'automatic',
    '8-speed automatic with auto-shift': 'automatic',
    'transmission overdrive switch': 'automatic',
    '7-speed automatic': 'automatic',
    '6-speed manual': 'manual',
    '5-speed automatic': 'automatic',
    '7-speed manual': 'manual',
    '6-speed automatic with auto-shift': 'automatic',
    '8-speed manual': 'manual',
    '4-speed automatic': 'automatic',
    '7-speed': 'automatic',
    '–': 'unknown',
    '7-speed dct automatic': 'automatic',
    '1-speed automatic': 'automatic',
    '6-speed electronically controlled automatic with o': 'automatic',
    '10-speed automatic with overdrive': 'automatic',
    '2-speed a/t': 'automatic',
    '6-speed': 'automatic',
    '9-speed automatic with auto-shift': 'automatic',
    'scheduled for or in production': 'unknown',
    '6 speed mt': 'manual',
    'cvt-f': 'cvt',
    'f': 'unknown',
    'variable': 'cvt',
    '6 speed at/mt': 'automatic'
}

# Apply mapping
train_df['transmission'] = train_df['transmission'].replace(transmission_map)
test_df['transmission'] = test_df['transmission'].replace(transmission_map)

**Check color**

In [20]:
# Check color value counts
train_df['ext_col'].unique()

array(['Blue', 'Black', 'Purple', 'Gray', 'White', 'Red', 'Silver',
       'Summit White', 'Platinum Quartz Metallic', 'Green', 'Orange',
       'Lunar Rock', 'Red Quartz Tintcoat', 'Beige', 'Gold',
       'Jet Black Mica', 'Delmonico Red Pearlcoat', 'Brown',
       'Rich Garnet Metallic', 'Stellar Black Metallic', 'Yellow',
       'Deep Black Pearl Effect', 'Metallic', 'Ice Silver Metallic',
       'Agate Black Metallic', 'Rosso Mars Metallic', 'White Clearcoat',
       'Santorini Black Metallic', 'DB Black Clearcoat',
       'Snowflake White Pearl', 'Glacial White Pearl',
       'Maximum Steel Metallic', 'Blue Caelum', 'Dark Matter Metallic',
       '–', 'Oxford White', 'Cobra Beige Metallic',
       'Velvet Red Pearlcoat', 'Python Green', 'Obsidian Black Metallic',
       'Beluga Black', 'Blue Reflex Mica', 'Sparkling Silver',
       'Black Clearcoat', 'Soul Red Crystal Metallic',
       'Bright White Clearcoat', 'Shimmering Silver',
       'Midnight Black Metallic', 'Cajun Red Tint

In [21]:
# Check interior color value counts
train_df['int_col'].unique()

array(['Gray', 'Black', 'Beige', 'Brown', 'Silver', 'Jet Black', 'Mesa',
       'White', '–', 'Red', 'Blue', 'Medium Stone', 'Ash', 'Ebony',
       'Shara Beige', 'Tan', 'Titan Black / Quarzit', 'Global Black',
       'Orange', 'Saddle Brown', 'Nero Ade', 'Beluga', 'Light Slate',
       'Gold', 'Black Onyx', 'Nougat Brown', 'Camel', 'Hotspur Hide',
       'Charcoal', 'Satin Black', 'Deep Chestnut', 'Diesel Gray / Black',
       'White / Brown', 'AMG Black', 'Parchment', 'Shale',
       'Canberra Beige', 'Sahara Tan', 'Ebony / Pimento', 'Rhapsody Blue',
       'Medium Dark Slate', 'Rioja Red', 'Black / Express Red',
       'Deep Garnet', 'Portland', 'Sandstone', 'Dark Ash', 'Deep Cypress',
       'Black / Stone Grey', 'Chestnut', 'Navy Pier', 'Green',
       'Giallo Taurus / Nero Ade', 'Mistral Gray / Raven', 'Dark Gray',
       'Amber', 'Charles Blue', 'Hotspur', 'Medium Earth Gray', 'Ceramic',
       'Kyalami Orange', 'Charcoal Black', 'Adrenaline Red', 'Walnut',
       'Brandy', 'Bla

In [22]:
# Get value from the color such as categorize the color among creating new features
# Function to categorize colors
def categorize_color(color):
    color = color.lower()
    if 'blue' in color:
        return 'Blue'
    elif 'black' in color or 'nero' in color or 'obsidian' in color:
        return 'Black'
    elif 'gray' in color or 'grey' in color or 'graphite' in color or 'pearl effect' in color:
        return 'Gray'
    elif 'white' in color or 'glacier' in color or 'snow' in color or 'chalk' in color or 'pearl' in color or 'diamond' in color:
        return 'White'
    elif 'red' in color or 'rosso' in color or 'garnet' in color or 'sangria' in color:
        return 'Red'
    elif 'silver' in color or 'metallic' in color or 'mist' in color:
        return 'Silver'
    elif 'green' in color or 'jungle' in color or 'moss' in color:
        return 'Green'
    elif 'yellow' in color:
        return 'Yellow'
    elif 'orange' in color:
        return 'Orange'
    elif 'brown' in color:
        return 'Brown'
    elif 'purple' in color or 'plum' in color:
        return 'Purple'
    elif 'gold' in color:
        return 'Gold'
    elif 'beige' in color or 'tan' in color or 'bronze' in color:
        return 'Beige'
    elif 'pink' in color:
        return 'Pink'
    else:
        return 'Other/Unknown'

# Additional feature extraction functions
def is_exotic_or_rare(color):
    rare_colors = [
        'python green', 'hellayella clearcoat', 'lizard green', 'go mango!',
        'gecko pearlcoat', 'liquid platinum', 'isle of man green metallic',
        'rapid red metallic tinted clearcoat', 'majestic plum metallic',
        'volcanic orange', 'chalk', 'sangria red', 'gentian blue metallic',
        'balloon white', 'remington red metallic'
    ]
    return 'Exotic/Rare' if color.lower() in rare_colors else 'Common'

def is_bright(color):
    bright_colors = ['white', 'yellow', 'silver', 'light', 'diamond', 'frost', 'crystal', 'snow']
    return 'Bright' if any(bright in color.lower() for bright in bright_colors) else 'Dark'

def has_metallic_finish(color):
    return 'Metallic' if 'metallic' in color.lower() else 'Non-Metallic'

def has_pearl_or_matte_finish(color):
    if 'pearl' in color.lower():
        return 'Pearlescent'
    elif 'matte' in color.lower():
        return 'Matte'
    else:
        return 'Standard'

# Apply categorizations to the DataFrame columns
train_df['categorized_ext_color'] = train_df['ext_col'].apply(categorize_color)
train_df['exotic_or_rare_ext'] = train_df['ext_col'].apply(is_exotic_or_rare)
train_df['brightness_ext'] = train_df['ext_col'].apply(is_bright)
train_df['metallic_finish_ext'] = train_df['ext_col'].apply(has_metallic_finish)
train_df['finish_ext'] = train_df['ext_col'].apply(has_pearl_or_matte_finish)


test_df['categorized_ext_color'] = test_df['ext_col'].apply(categorize_color)
test_df['exotic_or_rare_ext'] = test_df['ext_col'].apply(is_exotic_or_rare)
test_df['brightness_ext'] = test_df['ext_col'].apply(is_bright)
test_df['metallic_finish_ext'] = test_df['ext_col'].apply(has_metallic_finish)
test_df['finish_ext'] = test_df['ext_col'].apply(has_pearl_or_matte_finish)

In [23]:
# Function to categorize interior colors
def categorize_interior_color(color):
    color = color.lower()
    if 'black' in color or 'ebony' in color or 'nero' in color:
        return 'Black'
    elif 'gray' in color or 'grey' in color or 'ash' in color or 'slate' in color or 'pewter' in color or 'graphite' in color or 'quarzit' in color:
        return 'Gray'
    elif 'beige' in color or 'camel' in color or 'parchment' in color or 'sand' in color or 'tan' in color or 'almond' in color or 'cappuccino' in color:
        return 'Beige'
    elif 'brown' in color or 'chestnut' in color or 'saddle' in color or 'brandy' in color or 'mocha' in color or 'auburn' in color or 'espresso' in color:
        return 'Brown'
    elif 'white' in color or 'platinum' in color or 'light' in color or 'ivory' in color or 'linen' in color or 'macchiato' in color or 'whisper' in color:
        return 'White'
    elif 'red' in color or 'garnet' in color or 'hotspur' in color or 'adrenaline' in color or 'rioja' in color:
        return 'Red'
    elif 'blue' in color or 'navy' in color or 'cobalt' in color or 'rhapsody' in color or 'charles' in color:
        return 'Blue'
    elif 'green' in color or 'cypress' in color:
        return 'Green'
    elif 'gold' in color or 'amber' in color:
        return 'Gold'
    elif 'orange' in color or 'mesa' in color or 'kyalami' in color:
        return 'Orange'
    elif 'silver' in color:
        return 'Silver'
    elif 'yellow' in color:
        return 'Yellow'
    else:
        return 'Other/Unknown'

# Additional feature extraction functions
def is_luxurious(color):
    luxury_colors = [
        'jet black', 'nero ade', 'beluga', 'hotspur hide', 'amg black', 'saddle brown',
        'obsidian black', 'pimento red w/ebony', 'titan black / quarzit', 'whisper beige',
        'macchiato', 'beluga hide', 'silk beige/espresso brown', 'pimento red'
    ]
    return 'Luxurious' if color.lower() in luxury_colors else 'Standard'

def is_light_or_dark(color):
    light_colors = ['white', 'beige', 'ivory', 'light', 'platinum', 'linen', 'whisper']
    return 'Light' if any(light in color.lower() for light in light_colors) else 'Dark'

def is_two_tone(color):
    return 'Two-Tone' if '/' in color or '-' in color else 'Single-Tone'

def is_exotic_or_rare(color):
    rare_colors = [
        'nero ade', 'giallo taurus / nero ade', 'beluga', 'hotspur hide', 'adrenaline red',
        'pimento red', 'kyalami orange', 'titan black / quarzit', 'rioja red', 'pimento red w/ebony'
    ]
    return 'Exotic/Rare' if color.lower() in rare_colors else 'Common'

# Apply categorizations to the DataFrame columns
train_df['categorized_interior_color'] = train_df['int_col'].apply(categorize_interior_color)
train_df['luxurious_interior'] = train_df['int_col'].apply(is_luxurious)
train_df['light_or_dark_interior'] = train_df['int_col'].apply(is_light_or_dark)
train_df['two_tone_interior'] = train_df['int_col'].apply(is_two_tone)
train_df['exotic_or_rare_interior'] = train_df['int_col'].apply(is_exotic_or_rare)

test_df['categorized_interior_color'] = test_df['int_col'].apply(categorize_interior_color)
test_df['luxurious_interior'] = test_df['int_col'].apply(is_luxurious)
test_df['light_or_dark_interior'] = test_df['int_col'].apply(is_light_or_dark)
test_df['two_tone_interior'] = test_df['int_col'].apply(is_two_tone)
test_df['exotic_or_rare_interior'] = test_df['int_col'].apply(is_exotic_or_rare)

**Engine**

In [24]:
train_df["engine"].unique()

array(['375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel',
       '300.0HP 3.0L Straight 6 Cylinder Engine Gasoline Fuel',
       '300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel', ..., '3.0L',
       '3.0 Liter GTDI',
       '120.0HP 2.2L 4 Cylinder Engine Flex Fuel Capability'],
      dtype=object)

In [25]:
import re

def extract_hp(engine_str):
    match = re.search(r'(\d+(\.\d+)?)HP', engine_str)
    return match.group(1) if match else None

def extract_displacement(engine_str):
    match = re.search(r'(\d+(\.\d+)?)L', engine_str)
    return match.group(1) if match else None

def extract_cylinders(engine_str):
    match = re.search(r'(\d+) Cylinder', engine_str)
    return match.group(1) if match else None

train_df['horsepower'] = train_df['engine'].apply(extract_hp)
train_df['displacement'] = train_df['engine'].apply(extract_displacement)
train_df['cylinders'] = train_df['engine'].apply(extract_cylinders)

test_df['horsepower'] = test_df['engine'].apply(extract_hp)
test_df['displacement'] = test_df['engine'].apply(extract_displacement)
test_df['cylinders'] = test_df['engine'].apply(extract_cylinders)

In [26]:
train_df.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price', 'categorized_ext_color', 'exotic_or_rare_ext',
       'brightness_ext', 'metallic_finish_ext', 'finish_ext',
       'categorized_interior_color', 'luxurious_interior',
       'light_or_dark_interior', 'two_tone_interior',
       'exotic_or_rare_interior', 'horsepower', 'displacement', 'cylinders'],
      dtype='object')

**Get the new columns**

In [27]:
from datetime import datetime

# Assuming `df` is your DataFrame
# Current Year
current_year = datetime.now().year

# Calculate Car Age
train_df['car_age'] = current_year - train_df['model_year']
test_df['car_age'] = current_year - test_df['model_year']
# Calculate Mileage per Year
train_df['mileage_per_year'] = train_df['milage'] / train_df['car_age']
test_df['mileage_per_year'] = test_df['milage'] / test_df['car_age']


In [28]:
test_df.isnull().sum()

id                               0
brand                            0
model                            0
model_year                       0
milage                           0
fuel_type                        0
engine                           0
transmission                     0
ext_col                          0
int_col                          0
accident                         0
clean_title                      0
categorized_ext_color            0
exotic_or_rare_ext               0
brightness_ext                   0
metallic_finish_ext              0
finish_ext                       0
categorized_interior_color       0
luxurious_interior               0
light_or_dark_interior           0
two_tone_interior                0
exotic_or_rare_interior          0
horsepower                    2606
displacement                   405
cylinders                     2712
car_age                          0
mileage_per_year                 0
dtype: int64

In [29]:
# Fill missing values with the string 'unspecified'
train_df['horsepower'] = train_df['horsepower'].fillna('unspecified')
train_df['displacement'] = train_df['displacement'].fillna('unspecified')
train_df['cylinders'] = train_df['cylinders'].fillna('unspecified')

# Fill missing values with the string 'unspecified'
test_df['horsepower'] = test_df['horsepower'].fillna('unspecified')
test_df['displacement'] = test_df['displacement'].fillna('unspecified')
test_df['cylinders'] = test_df['cylinders'].fillna('unspecified')

## Feature selection

In [30]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [33]:
categorical_features = [column for column in train_df.columns if train_df[column].dtype == 'object']

# Define target and features
target = 'price'
features = [col for col in train_df.columns if col != target]

# Split the data into features and target
X = train_df[features]
y = train_df[target]

In [46]:

# Define a function to fit the CatBoost model
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=10,
    cat_features=categorical_features,
    verbose=100
)

# Train the model
model.fit(X, y)
# Get feature importances
feature_importances = model.get_feature_importance()
feature_names = X.columns

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importances
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importances from CatBoost')
plt.show()

# Display the top features
print(importance_df.head(20))

0:	learn: 71970.8855342	total: 766ms	remaining: 12m 44s


KeyboardInterrupt: 

In [47]:
importance_df

Unnamed: 0,Feature,Importance
4,milage,12.069158
26,mileage_per_year,11.176382
1,brand,9.12549
22,horsepower,8.769477
6,engine,8.498763
23,displacement,6.537089
3,model_year,5.606469
2,model,5.593159
0,id,4.748462
12,categorized_ext_color,4.70991


In [31]:
selected_features = ['milage', 'mileage_per_year', 'brand', 'horsepower', 'engine',
                     'displacement', 'model_year', 'model', 'categorized_ext_color',
                     'car_age', 'cylinders', 'ext_col', 'categorized_interior_color', 'int_col']

In [34]:
X_new = X[selected_features]
y_new = y

X_train, X_val, y_train, y_val = train_test_split(X_new, y_new, test_size=0.1, random_state=42)

In [35]:
cat_features = [column for column in X_new.columns if X_new[column].dtype == 'object']

In [55]:
# Define the CatBoost model with regularization
model = CatBoostRegressor(iterations=2000,
                          learning_rate=0.1,
                          depth=6,
                          cat_features=cat_features,
                          loss_function='RMSE',
                          l2_leaf_reg=3,  
                          verbose=-1)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_val)

# Evaluate the model
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_val, y_pred_test, squared=False)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

0:	learn: 74182.1389517	total: 156ms	remaining: 5m 12s
100:	learn: 66700.2305148	total: 17.8s	remaining: 5m 34s
200:	learn: 63900.6884392	total: 34s	remaining: 5m 4s
300:	learn: 61458.5100414	total: 51.7s	remaining: 4m 51s
400:	learn: 58991.8628095	total: 1m 9s	remaining: 4m 36s
500:	learn: 57546.7864312	total: 1m 27s	remaining: 4m 22s
600:	learn: 55752.3319789	total: 1m 44s	remaining: 4m 3s
700:	learn: 54597.6580196	total: 2m 2s	remaining: 3m 47s
800:	learn: 53648.3796503	total: 2m 20s	remaining: 3m 30s
900:	learn: 52824.0448935	total: 2m 38s	remaining: 3m 13s
1000:	learn: 51950.4896680	total: 2m 56s	remaining: 2m 56s
1100:	learn: 51019.0857105	total: 3m 25s	remaining: 2m 47s
1200:	learn: 50356.3433037	total: 3m 43s	remaining: 2m 28s
1300:	learn: 49592.6815651	total: 4m 1s	remaining: 2m 9s
1400:	learn: 48921.8309902	total: 4m 19s	remaining: 1m 50s
1500:	learn: 48360.8219833	total: 4m 37s	remaining: 1m 32s
1600:	learn: 47675.8908871	total: 4m 55s	remaining: 1m 13s
1700:	learn: 47101.05



In [37]:
# Define objective function for Optuna
import optuna

def objective(trial):
    # Define hyperparameters to be tuned
    params = {
        'iterations': trial.suggest_categorical('iterations', [1000, 2000, 3000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.05, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0)
    }

    # Create CatBoostRegressor
    model = CatBoostRegressor(loss_function='RMSE', **params, verbose=False)

    # Fit the model
    model.fit(X_train, y_train, cat_features=cat_features)

    # Make predictions
    y_pred = model.predict(X_val)

    # Calculate RMSE
    rmse = mean_squared_error(y_val, y_pred, squared=False)

    return rmse

# Perform hyperparameter optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)  

# Get the best hyperparameters
best_params = study.best_params

print("Best Parameters:", best_params)

# Create CatBoostRegressor with best hyperparameters
best_model = CatBoostRegressor(loss_function='RMSE', **best_params, verbose=False)

# Fit the best model
best_model.fit(X_train, y_train, cat_features=cat_features)

# Make predictions
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_val)

# Evaluate the best model
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_val, y_pred_test, squared=False)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

[I 2024-06-10 17:10:05,273] A new study created in memory with name: no-name-15bc2307-b62c-4642-9c63-d9f222a280a0


[I 2024-06-10 17:11:13,229] Trial 0 finished with value: 44414.18634133035 and parameters: {'iterations': 1000, 'learning_rate': 0.012847872513178014, 'depth': 3, 'l2_leaf_reg': 1.2566626866235684, 'min_data_in_leaf': 83, 'subsample': 0.9738093020789005, 'colsample_bylevel': 0.8378850965223392}. Best is trial 0 with value: 44414.18634133035.
[I 2024-06-10 17:17:43,546] Trial 1 finished with value: 45571.02725475555 and parameters: {'iterations': 3000, 'learning_rate': 0.11377893696209113, 'depth': 5, 'l2_leaf_reg': 6.298925843816557, 'min_data_in_leaf': 12, 'subsample': 0.9630846685579689, 'colsample_bylevel': 0.3013387016902427}. Best is trial 0 with value: 44414.18634133035.
[I 2024-06-10 17:19:14,631] Trial 2 finished with value: 46543.37427397544 and parameters: {'iterations': 1000, 'learning_rate': 0.22268429445239166, 'depth': 5, 'l2_leaf_reg': 8.265237652315765, 'min_data_in_leaf': 18, 'subsample': 0.4301964628438293, 'colsample_bylevel': 0.34068665358744593}. Best is trial 0 wi

KeyboardInterrupt: 

In [38]:
params = {'iterations': 2000, 'learning_rate': 0.023534866483893517, 'depth': 9, 'l2_leaf_reg': 4.028725000535012, 'min_data_in_leaf': 49, 'subsample': 0.8794404407705152, 'colsample_bylevel': 0.1363452246943897}
# Create CatBoostRegressor with best hyperparameters
best_model = CatBoostRegressor(loss_function='RMSE', **params, verbose=False)

# Fit the best model
best_model.fit(X_train, y_train, cat_features=cat_features)

# Make predictions
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_val)

# Evaluate the best model
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_val, y_pred_test, squared=False)
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)

Train RMSE: 61355.36772181953
Test RMSE: 43720.01302903373




In [44]:
import lightgbm as lgb

In [51]:
params = {
    'num_iterations': 2000,
    'learning_rate': 0.023534866483893517,
    'max_depth': 9,
    'lambda_l2': 4.028725000535012,
    'min_data_in_leaf': 49,
    'subsample': 0.8794404407705152,
    'colsample_bytree': 0.1363452246943897,
    'metric': 'rmse',
}
for c in X_train.columns:
    col_type = X_train[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X_train[c] = X_train[c].astype('category')
        X_val[c] = X_val[c].astype('category')
        
fit_params={ 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_val,y_val)],
            'categorical_feature': cat_features
           }
clf = lgb.LGBMRegressor(**params)

clf.fit(X_train, y_train, **fit_params)

preds = clf.predict(X_val)

train_rmse = mean_squared_error(y_train, clf.predict(X_train), squared=False)
test_rmse = mean_squared_error(y_val, preds, squared=False)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3678
[LightGBM] [Info] Number of data points in the train set: 48845, number of used features: 14
[LightGBM] [Info] Start training from score 39297.021005
Train RMSE: 61919.46791484741
Test RMSE: 44708.330712651696




In [53]:
test_df_new = test_df[selected_features]

In [54]:
for c in test_df_new.columns:
    col_type = test_df_new[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        test_df_new[c] = test_df_new[c].astype('category')
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_new[c] = test_df_new[c].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_new[c] = test_df_new[c].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_new[c] = test_df_new[c].astype('category')
A value is trying to be set on a copy of a slice fro

In [55]:
predictions = clf.predict(test_df_new)



In [56]:
rounded_predictions = [round(pred, 3) for pred in predictions]
submission_df["price"] = rounded_predictions

In [57]:
submission_df.to_csv("data/submission.csv", index=False)

In [61]:
test_df

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,...,categorized_interior_color,luxurious_interior,light_or_dark_interior,two_tone_interior,exotic_or_rare_interior,horsepower,displacement,cylinders,car_age,mileage_per_year
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,automatic,White,Beige,...,Beige,Standard,Light,Single-Tone,Common,302.0,3.5,6,10,7300.000000
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,automatic,Silver,Black,...,Black,Standard,Dark,Single-Tone,Common,275.0,3.5,6,9,14225.777778
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,automatic,Blue,White,...,White,Standard,Light,Single-Tone,Common,241.0,2.0,4,9,5775.888889
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,automatic,White,White,...,White,Standard,Light,Single-Tone,Common,518.0,5.0,8,6,4916.666667
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,automatic,White,Black,...,Black,Standard,Dark,Single-Tone,Common,335.0,3.0,6,4,22500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36178,90451,GMC,Yukon Denali,2019,4500,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,automatic,Gray,Black,...,Black,Standard,Dark,Single-Tone,Common,420.0,6.2,8,5,900.000000
36179,90452,Chevrolet,Silverado 1500 Z71 Extended Cab,2004,185000,Gasoline,295.0HP 5.3L 8 Cylinder Engine Gasoline Fuel,automatic,Red,Gray,...,Gray,Standard,Dark,Single-Tone,Common,295.0,5.3,8,20,9250.000000
36180,90453,Toyota,Corolla LE,2011,116000,Gasoline,132.0HP 1.8L 4 Cylinder Engine Gasoline Fuel,manual,Beige,Gray,...,Gray,Standard,Dark,Single-Tone,Common,132.0,1.8,4,13,8923.076923
36181,90454,Lincoln,Navigator Reserve,2019,39000,Gasoline,450.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,automatic,Black,Orange,...,Orange,Standard,Dark,Single-Tone,Common,450.0,3.5,6,5,7800.000000


In [None]:
import lightgbm as lgb

# Convert all object dtype columns to category dtype
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')

def objective(trial):
    # Define the parameter search space
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'verbose': -1
    }

    # Create the LightGBM dataset
    dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature="auto")
    dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature="auto", reference=dtrain)

    # Train the model
    gbm = lgb.train(param, dtrain, valid_sets=[dtrain, dvalid])

    # Predict and calculate the mean squared error
    preds = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    rmse = mean_squared_error(y_val, preds, squared=False)
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

print('Best trial:')
trial = study.best_trial
print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))
# Get the best hyperparameters
best_params = study.best_params
print("Best Parameters:", best_params)

In [37]:
import lightgbm as lgb
# Convert all object dtype columns to category dtype
for col in X_train.select_dtypes(include='object').columns:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')
    
lgb_params = {
    'boosting_type': 'gbdt',  
    'objective': 'regression', 
    'metric': 'rmse', 
    'learning_rate': 0.02,
    'verbose': -1
}

 # Create the LightGBM dataset
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature="auto")
dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature="auto", reference=dtrain)

# Train the model
gbm = lgb.train(lgb_params, dtrain, valid_sets=[dtrain, dvalid])

# Predict and calculate the mean squared error
preds = gbm.predict(X_val, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_val, preds, squared=False)
print("Validation RMSE:", rmse)