In [None]:
import numpy as np 
import pandas as pd 
import polars as pl
import os
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import seaborn as sns
#import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
#from matplotlib.colors import LinearSegmentedColormap
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Print the full path of each file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for path in full_paths:
        print(path)

In [None]:
%%time

# Competition train/test files plus the external used-cars dataset.
train = pd.read_csv('/kaggle/input/hackathon-qualification/archive/train.csv')
test = pd.read_csv('/kaggle/input/hackathon-qualification/archive/test.csv')
original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')

# In the external file 'milage' and 'price' are parsed as text: keep only
# the digit runs of each value and cast the concatenation to int.
for text_col in ('milage', 'price'):
    original[text_col] = original[text_col].map(
        lambda raw: int(''.join(re.findall(r'\d+', raw))))

# Stack the external rows underneath the competition rows.
train = pd.concat([train, original], ignore_index=True)

In [None]:
# original.sample(n=5, random_state=42)  

In [None]:
# Column dtypes and non-null counts of the combined training frame.
train.info() 

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
train.describe().T 

# EDA

In [None]:
# Bar chart of price aggregated per brand.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=train, x='brand', y='price', ax=ax)
ax.set_title('Average Price by Car Brand')
ax.set_xlabel('Car brand')
ax.set_ylabel('Average Price')
# Brand names are long; turn them vertical so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Horizontal box plot: price runs along the x-axis, one box per transmission
# string on the y-axis. The axis labels follow that orientation (they were
# previously swapped relative to the plotted variables).
plt.figure(figsize=(14, 8))
sns.boxplot(x='price', y='transmission', data= train) 
plt.title('Box Plot of Price by Transmission Type')
plt.xlabel('Price')
plt.ylabel('Transmission')
plt.xticks(rotation=90) 
plt.show() 

In [None]:
# Bar chart of price aggregated per accident-history value, without error bars.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=train, x='accident', y='price', errorbar=None, ax=ax)
ax.set(
    title='Average Price by Accident History',
    xlabel='Accident History',
    ylabel='Average Price',
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Number of rows per fuel type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='fuel_type', ax=ax)
ax.set(
    title='Count of Cars by Fuel Type',
    xlabel='Fuel Type',
    ylabel='Count',
)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Number of rows per raw transmission string.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='transmission', ax=ax)
ax.set_title('Count of Cars by Transmission Type')
ax.set_xlabel('Transmission')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Cardinality of every object-typed column in the training frame.
categorical_columns = train.select_dtypes(include=['object']).columns
unique_values = train[categorical_columns].nunique().to_dict()
for name, n_unique in unique_values.items():
    print(f"{name}: {n_unique} unique values")

In [None]:
# Bar chart of price aggregated per model year.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=train, x='model_year', y='price', ax=ax)
ax.set(
    title='Average Car Price by Model Year',
    xlabel='Model Year',
    ylabel='Average Price',
)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Distinct raw exterior-colour strings; these are what extract_color buckets.
train['ext_col'].unique() 

# Feature Engineering

In [None]:
def extract_age_features(df, current_year=2024):
    """
    Extracts age-related features from the dataset.

    Adds two columns:

    * ``Car_Age`` - ``current_year - model_year``, floored at 1.
    * ``Mileage_per_Year`` - ``milage / Car_Age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_year`` and ``milage``.
    current_year : int, default 2024
        Reference year used to compute the age.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two columns added; ``df`` itself is not
        modified.
    """
    df = df.copy()
    # Floor the age at one year. This avoids division by zero for cars of
    # the reference year and keeps model years later than the reference
    # year from producing a negative age / negative mileage per year.
    df['Car_Age'] = (current_year - df['model_year']).clip(lower=1)
    df['Mileage_per_Year'] = df['milage'] / df['Car_Age']
    return df

# remove 'model_year', 'milage' 

In [None]:
def extract_engine_trans(df):
    """
    Derive engine and transmission features from the free-text columns.

    Added / changed columns:

    * ``transmission`` - lowercased in the returned copy.
    * ``horsepower`` - decimal number directly followed by ``HP`` in ``engine``.
    * ``engine_size`` - decimal number directly followed by ``L`` in ``engine``.
    * ``cylinders`` - integer followed by whitespace and ``Cylinder``,
      ``V<digit>`` or ``Straight`` in ``engine``.
    * ``power_to_weight_ratio`` - ``horsepower / engine_size``; NaN when either
      operand is missing or the engine size is zero.
    * ``transmission_type`` - one of ``'manual'``, ``'automatic'``, ``'CVT'``,
      ``'Other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``engine`` and ``transmission``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns above; ``df`` is not modified.
    """
    df = df.copy()
    df['transmission'] = df['transmission'].str.lower()

    engine = df['engine']
    df['horsepower'] = engine.str.extract(r'(\d+\.\d+)(?=HP)', expand=False).astype(float)
    df['engine_size'] = engine.str.extract(r'(\d+\.\d+)(?=L)', expand=False).astype(float)
    df['cylinders'] = engine.str.extract(r'(\d+)\s(Cylinder|V\d|Straight)')[0].astype(float)

    # Vectorised ratio: a zero engine size is masked to NaN first so the
    # division yields NaN there instead of inf; missing operands propagate.
    nonzero_size = df['engine_size'].where(df['engine_size'] != 0)
    df['power_to_weight_ratio'] = df['horsepower'] / nonzero_size

    def _classify(value):
        # Missing transmissions are not strings and cannot be searched.
        if not isinstance(value, str):
            return 'Other'
        if 'm/t' in value or 'manual' in value or 'mt' in value:
            return 'manual'
        if 'a/t' in value or 'automatic' in value or 'at' in value:
            return 'automatic'
        # The column is lowercased above, so the needle must be lowercase
        # too; an uppercase 'CVT' could never match.
        if 'cvt' in value:
            return 'CVT'
        return 'Other'

    df['transmission_type'] = df['transmission'].apply(_classify)
    return df


# remove 'engine', 'transmission' 

In [None]:
# Distinct brand strings; luxury_feature matches against these values.
train['brand'].unique() 

In [None]:
def luxury_feature(df):
    """
    Flag rows whose brand belongs to a fixed list of luxury makes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``brand``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an integer column ``is_luxury_brand`` (1 when
        ``brand`` is in the list, 0 otherwise, including missing brands).
        ``df`` itself is not modified.
    """
    luxury_brands = ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land',
                     'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini',
                     'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']

    df = df.copy()
    # Exact, case-sensitive membership test, vectorised.
    df['is_luxury_brand'] = df['brand'].isin(luxury_brands).astype('int64')
    return df

# remove 'brand'

In [None]:
def extract_color(df):
    """
    Bucket the raw exterior and interior colour strings into coarse groups.

    ``ext_col`` is mapped to one of ``black``, ``white``, ``gray``, ``silver``,
    ``red``, ``blue`` or ``uncommon``; ``int_col`` to one of ``black``,
    ``white``, ``gray``, ``beige``, ``brown``, ``red`` or ``uncommon``.
    Matching is exact and case-sensitive. A string listed in more than one
    group resolves to the first group in the order given above. Anything not
    listed (including missing values) becomes ``uncommon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ext_col`` and ``int_col``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with both columns replaced by their group label;
        ``df`` itself is not modified.
    """
    # NOTE(review): a few entries look misplaced ('Black Forest Green' in
    # black, 'Aventurine Green Metallic' in gray, 'Cameron Green' and
    # 'Tim David' in red, 'Sardar Brown' in interior black). They are kept
    # as-is so the mapping is unchanged - confirm or clean up separately.
    black_colors = [
        'Black', 'Santorini Black Metallic', 'Black Clearcoat', 'Black Obsidian',
        'Mythos Black Metallic', 'Diamond Black', 'Shadow Black', 'Agate Black Metallic',
        'Midnight Black Metallic', 'Black Raven', 'Obsidian Black Metallic', 'Magnetite Black Metallic',
        'Onyx Black', 'Santorin Black', 'DB Black Clearcoat', 'Black Sapphire Metallic',
        'Ultra Black', 'Magnetic Black', 'Crystal Black Silica', 'Ebony Black',
        'Jet Black Mica', 'Twilight Black', 'Carbon Black Metallic', 'Beluga Black',
        'Super Black', 'Phantom Black', 'Black Forest Green', 'Mosaic Black Metallic',
        'Midnight Black', 'Crystal Black Pearl', 'BLACK', 'Nero Daytona',
        'Obsidian', 'Black Noir Pearl', 'Aurora Black', 'Vik Black',
        'Santorini Black', 'Brilliant Black', 'Crystal Black', 'Orca Black Metallic'
    ]

    white_colors = [
        'White', 'Summit White', 'designo Diamond White Metallic', 'Bright White Clearcoat',
        'Dazzling White', 'White Clearcoat', 'Alpine White', 'Snowflake White Pearl',
        'Pure White', 'Ibis White', 'Wind Chill Pearl', 'Platinum White Pearl',
        'Fuji White', 'White Frost Tri-Coat', 'Glacial White Pearl', 'Oxford White',
        'Eminent White Pearl', 'Ultra White', 'Snow White Pearl', 'designo Diamond White Bright',
        'Emin White', 'Super White', 'White Diamond Tri-Coat', 'Alta White',
        'Mineral White', 'Quartz White', 'White Platinum Tri-Coat Metallic', 'Balloon White',
        'Oryx White Prl', 'Matte White', 'White Knuckle Clearcoat', 'Diamond White',
        'Pearl White', 'Star White', 'Crystal White Pearl', 'Frozen White',
        'MANUFAKTUR Diamond White Bright', 'Glacier White'
    ]

    gray_colors = [
        'Gray', 'Granite Crystal Clearcoat Metallic', 'Magnetic Gray Clearcoat', 'Eiger Grey Metallic',
        'Nebula Gray Pearl', 'Daytona Gray', 'Shadow Gray Metallic', 'Typhoon Gray',
        'Magnetite Gray Metallic', 'Daytona Gray Pearl Effect', 'Magnetic Metallic', 'Dark Graphite Metallic',
        'Quartzite Grey Metallic', 'Eiger Grey', 'Graphite Grey Metallic', 'Donington Grey Metallic',
        'Daytona Gray Pearl Effect w/ Black Roof', 'Hampton Gray', 'Typhoon Gray Metallic', 'Stone Gray Metallic',
        'Nightfall Gray Metallic', 'Brands Hatch Gray Metallic', 'Machine Gray Metallic', 'Carbonized Gray Metallic',
        'Chronos Gray', 'Baltic Gray', 'Dark Slate Metallic', 'Arctic Gray Metallic',
        'Platinum Gray Metallic', 'Granite Crystal Metallic Clearcoat', 'Gun Metallic', 'Thunder Gray',
        'Dark Gray Metallic', 'Portofino Gray', 'Polymetal Gray Metallic', 'Graphite Grey', 'Arctic Gray Metallic', 'Platinum Gray Metallic', 'Volcano Grey Metallic', 'Ironman Silver',
        'Thunder Gray', 'Aventurine Green Metallic'
    ]

    silver_colors = [
        'Silver', 'Silver Ice Metallic', 'Sparkling Silver', 'Sonic Silver Metallic',
        'Atomic Silver', 'Titanium Silver', 'Brilliant Silver Metallic', 'Glacier Silver Metallic',
        'Silver Zynith', 'Ingot Silver Metallic', 'Tungsten Metallic', 'Selenite Gray Metallic',
        'Cirrus Silver Metallic', 'Silver Radiance', 'Iconic Silver Metallic', 'Billet Silver Metallic Clearcoat',
        'Rift Metallic', 'Florett Silver', 'Ice Silver Metallic', 'Iridium Silver Metallic',
        'Indus Silver', 'Silver Flare Metallic', 'Silver Mist', 'Quicksilver Metallic',
        'Silky Silver', 'Reflex Silver', 'Ironman Silver'
    ]

    red_colors = [
        'Red', 'Cameron Green', 'Tim David', 'Firecracker Red Clearcoat', 'Ruby Flare Pearl',
        'Firenze Red', 'Tango Red Metallic', 'Siren Red Tintcoat', 'Delmonico Red Pearlcoat',
        'Passion Red', 'Infrared Tintcoat', 'Red Quartz Tintcoat', 'Flame Red Clearcoat',
        'Velvet Red Pearlcoat', 'Matador Red Mica', 'Rosso Corsa', 'Octane Red Pearlcoat',
        'Remington Red Metallic', 'Hyper Red', 'Cayenne Red Tintcoat', 'Radiant Red Metallic II',
        'Redline Red', 'Scarlet Ember', 'Ruby Red Metallic Tinted Clearcoat', 'Crimson Red Tintcoat',
        'Red Multi', 'Rosso', 'Jupiter Red', 'Red Obsession', 'Ember Pearlcoat',
        'Sangria Red', 'Remington Red Metallic'
    ]

    blue_colors = [
        'Deep Crystal Blue Mica','Patriot Blue Pearlcoat','Blue','Antimatter Blue Metallic','Blu','Shoreline Blue Pearl',
        'Gentian Blue Metallic','Phytonic Blue Metallic','Horizon Blue','Northsky Blue Metallic','Caspian Blue',
        'Pacific Blue Metallic','Vega Blue','Electric Blue Metallic','Glacier Blue Metallic','Blue Metallic',
        'Twilight Blue Metallic','Anodized Blue Metallic','Blue Caelum', 'Midnight Blue Metallic'
    ]

    interior_black = [
        'Black', 'Jet Black', 'BLACK', 'Global Black', 'Black Onyx',
        'Sardar Brown', 'Black/Gun Metal', 'Charcoal Black',
        'Ebony Black', 'Carbon Black', 'Black w/Red Stitching',
        'Blk', 'Obsidian Black', 'Black/Graphite', 'Black/Saddle Brown',
        'Black / Brown', 'Titan Black', 'AMG Black', 'Black / Gray',
        'Black / Stone Grey', 'Black / Express Red', 'Black / Saddle',
        'Black / Pimento', 'Ebony', 'Ebony / Ebony Accents',
        'Ebony/Light Oyster Stitch'
    ]
    interior_white = [
        'White', 'Grace White', 'Ivory / Ebony', 'WHITE'
    ]
    interior_gray = [
        'Gray', 'Dark Gray', 'Light Gray', 'Medium Gray',
        'Medium Earth Gray', 'Slate', 'Charcoal', 'Graphite',
        'Ebony.', 'Medium Light Camel', 'Mistral Gray / Raven',
        'Medium Ash Gray', 'Light Platinum / Jet Black',
        'Parchment', 'Parchment.', 'Graystone', 'Ash'
    ]
    interior_beige = [
        'Beige', 'Canberra Beige', 'Macchiato Beige/Black',
        'Whisper Beige', 'Silk Beige/Espresso Brown', 'Saiga Beige',
        'Sand Beige', 'Silk Beige/Black', 'Cappuccino',
        'Pearl Beige', 'Tan/Ebony/Ebony', 'Tan/Ebony',
        'Camel', 'Light Titanium'
    ]
    interior_brown = [
        'Brown', 'Chestnut', 'Caramel', 'Saddle Brown',
        'Walnut', 'Espresso', 'Dark Galvanized', 'Cocoa / Dune',
        'Giallo Taurus / Nero Ade', 'Mocha', 'Mountain Brown',
        'Brandy', 'Roast', 'Sahara Tan', 'Nougat Brown',
        'Medium Dark Slate', 'Medium Stone', 'Tension'
    ]
    interior_red = [
        'Red', 'Rioja Red', 'Adrenaline Red', 'Magma Red',
        'Pimento Red w/Ebony', 'Classic Red', 'Hotspur',
        'Red/Black', 'Black / Express Red'
    ]

    def _build_lookup(groups):
        # One dict lookup per row instead of scanning every list. setdefault
        # keeps the FIRST label seen for a name, so a colour listed in two
        # groups (e.g. 'Ironman Silver', 'Black / Express Red') resolves to
        # the earlier group.
        lookup = {}
        for label, names in groups:
            for name in names:
                lookup.setdefault(name, label)
        return lookup

    ext_lookup = _build_lookup([
        ("black", black_colors),
        ("white", white_colors),
        ("gray", gray_colors),
        ("silver", silver_colors),
        ("red", red_colors),
        ("blue", blue_colors),
    ])
    int_lookup = _build_lookup([
        ("black", interior_black),
        ("white", interior_white),
        ("gray", interior_gray),
        ("beige", interior_beige),
        ("brown", interior_brown),
        ("red", interior_red),
    ])

    # Work on a copy so the caller's raw colour strings are left intact.
    df = df.copy()
    df['ext_col'] = df['ext_col'].apply(lambda color: ext_lookup.get(color, "uncommon"))
    df['int_col'] = df['int_col'].apply(lambda color: int_lookup.get(color, "uncommon"))

    return df

In [None]:
def update(df, threshold=20):
    """
    Cleans a DataFrame by handling missing values and grouping
    infrequent categorical values into a 'noise' category.

    Steps, applied to a copy of ``df``:

    1. Every object/category column: missing values become ``'missing'``,
       values occurring fewer than ``threshold`` times become ``'noise'``,
       and the column is cast to ``category``.
    2. Every numeric column: missing values are filled with that column's
       median.
    3. ``is_luxury_brand`` (when present) is cast to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    threshold : int, default 20
        Minimum number of occurrences a categorical value needs in ``df`` to
        keep its own level.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy; ``df`` itself is not modified.
    """
    df = df.copy()

    cat_c = df.select_dtypes(include=['category', 'object']).columns.tolist()
    # 'number' covers every integer/float width, not only the default ones.
    numeric = df.select_dtypes(include='number').columns.tolist()

    # Process categorical columns
    for col in cat_c:
        # Go through plain objects first: writing the new labels 'missing' /
        # 'noise' straight into a category-typed column raises, because
        # they are not existing categories.
        values = df[col].astype(object).fillna('missing')
        counts = values.value_counts()
        # Categories with fewer than `threshold` rows are collapsed.
        noise_categories = counts[counts < threshold].index
        values = values.where(~values.isin(noise_categories), "noise")
        df[col] = values.astype('category')

    # Process numeric columns
    for col in numeric:
        df[col] = df[col].fillna(df[col].median())

    if 'is_luxury_brand' in df.columns:
        df['is_luxury_brand'] = df['is_luxury_brand'].astype('category')

    return df

In [None]:
# Run the same feature-engineering steps, in the same order, on both frames.
train = extract_age_features(train)
train = extract_color(train)
train = luxury_feature(train)
train = extract_engine_trans(train)
train = update(train)

test = extract_age_features(test)
test = extract_color(test) 
test = luxury_feature(test) 
test = extract_engine_trans(test)
test = update(test)  

# `update` derives the category levels of each frame from that frame alone,
# so the same column can end up with different levels (and therefore
# different integer codes) in train and test. Re-express every categorical
# test column on the train levels so any model that consumes the codes sees
# a consistent encoding. Test values unknown to train fall back to 'noise'
# when train has that level, otherwise to missing.
for col in train.select_dtypes(include='category').columns.intersection(test.columns):
    known_levels = train[col].cat.categories
    fallback = 'noise' if 'noise' in known_levels else np.nan
    test_values = test[col].astype(object)
    test_values = test_values.where(test_values.isin(known_levels), fallback)
    test[col] = pd.Categorical(test_values, categories=known_levels)

In [None]:
# Dtypes and non-null counts of the test frame after the feature pipeline.
test.info() 

In [None]:
# Five reproducibly sampled rows of the processed training frame.
train.sample(n=5, random_state = 42)  

In [None]:
# Dtypes and non-null counts of the training frame after the feature pipeline.
train.info() 

In [None]:
# Row count per derived transmission_type level.
train['transmission_type'].value_counts() 

In [None]:
# dropping 'clean_title', 'model_year', 'engine', ('milage', 'brand', 'transmission')  

# Remove the same three columns from both frames, in place.
for frame in (train, test):
    frame.drop(columns=['clean_title', 'model_year', 'engine'], inplace=True)

In [None]:
# test.info() 

In [None]:
# Names of the object/category columns of the test frame, as a plain list.
cat_cols = list(test.select_dtypes(include=['object', 'category']).columns)

In [None]:
# Show the categorical column names collected from the test frame.
cat_cols

In [None]:
# train.sample(n=3, random_state=42)
# test.info() 

In [None]:
#cat_features = test.select_dtypes(include=['object', 'category']).columns

#for feature in cat_features:          
    #categories = sorted(list(set(train[feature].dropna())))
    #dtype = pd.CategoricalDtype(categories=categories, ordered=False)
    
    #train.loc[~train[feature].isin(categories), feature] = np.nan
    #test.loc[~test[feature].isin(categories), feature] = np.nan
    
    #train[feature] = train[feature].astype(dtype)
    #test[feature] = test[feature].astype(dtype) 

# Model Training  

In [None]:
#test['model'].value_counts() 
#test['model'] = test['model'].fillna('missing') 

In [None]:
# Target: the price column of the training frame.
y_train = train["price"]

# Features: everything except the row identifier (and the target for train).
X_train = train.drop(["id", "price"], axis=1)
X_test = test.drop(["id"], axis=1)

In [None]:
# X_train.info() 

# XGBoost

In [None]:
def cross_validate_model_x(model, X_train, y_train, params, n_splits=5, test_features=None):
    """
    Fit an XGBoost-style regressor with shuffled K-fold cross-validation and
    average the per-fold predictions on the test features.

    Parameters
    ----------
    model : callable
        Estimator class (e.g. ``XGBRegressor``); instantiated once per fold.
    X_train : pandas.DataFrame
        Training features; rows are selected positionally per fold.
    y_train : pandas.Series
        Training target aligned with ``X_train``.
    params : dict
        Keyword arguments for the estimator. ``enable_categorical=True`` and
        ``early_stopping_rounds=50`` are supplied as defaults and can be
        overridden by ``params``.
    n_splits : int, default 5
        Number of folds.
    test_features : pandas.DataFrame, optional
        Frame to predict on. When omitted, the notebook-level ``X_test`` is
        used, which keeps existing calls working.

    Returns
    -------
    tuple
        ``(clf, test_preds)`` - the estimator fitted on the last fold and a
        1-D array holding the mean of the per-fold test predictions.
    """
    if test_features is None:
        test_features = X_test  # notebook-level frame

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(test_features), n_splits), dtype=np.float32)

    # Early stopping is configured on the estimator rather than in fit();
    # the fit() keyword is deprecated in the scikit-learn wrapper.
    model_kwargs = {'enable_categorical': True, 'early_stopping_rounds': 50, **params}

    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        clf = model(**model_kwargs)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], verbose=500)

        # Held-out RMSE for this fold.
        val_pred = clf.predict(X_val)
        fold_rmse = float(np.sqrt(np.mean(np.square(np.asarray(y_val) - val_pred))))
        val_scores.append(fold_rmse)

        test_preds[:, fold] = clf.predict(test_features)

        print("-" * 50)
        print(f"Fold {fold + 1}/{n_splits} validation RMSE: {fold_rmse:.4f}")

    print(f"Mean validation RMSE: {np.mean(val_scores):.4f} (std {np.std(val_scores):.4f})")

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds

In [None]:
# Hyper-parameters for the XGBoost regressor.
# NOTE(review): 'gpu_hist' is reported as deprecated by recent XGBoost
# releases in favour of tree_method='hist' with device='cuda' - confirm
# against the installed version.
xgb_params = {
    'lambda': 0.17694956261235095,
    'alpha': 1.3889763984339085,
    'colsample_bytree': 0.7,
    'subsample': 1.0,
    'learning_rate': 0.008,
    'max_depth': 17,
    'random_state': 2020,
    'min_child_weight': 59,
    'n_estimators': 10_000,
    'tree_method': 'gpu_hist',
}

print('XGBoost Cross-Validation Results:\n')
xgb_model, test_preds_xgb = cross_validate_model_x(
    XGBRegressor, X_train, y_train, xgb_params
)

In [None]:
# Sample-submission layout with the price column replaced by the XGBoost
# fold-averaged predictions.
xgb_result = (
    pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')
    .assign(price=test_preds_xgb.astype(np.float32))
)
xgb_result

In [None]:
# X_test.info() 

# LGBM 

In [None]:
def cross_validate_model_l(model, X_train, y_train, params, n_splits=10, test_features=None):
    """
    Fit a LightGBM-style regressor with shuffled K-fold cross-validation and
    average the per-fold predictions on the test features.

    Each fold trains with evaluation logging every 150 rounds and early
    stopping after 200 rounds without improvement on the held-out part.

    Parameters
    ----------
    model : callable
        Estimator class (e.g. ``LGBMRegressor``); instantiated once per fold
        as ``model(**params)``.
    X_train : pandas.DataFrame
        Training features; rows are selected positionally per fold.
    y_train : pandas.Series
        Training target aligned with ``X_train``.
    params : dict
        Keyword arguments for the estimator.
    n_splits : int, default 10
        Number of folds.
    test_features : pandas.DataFrame, optional
        Frame to predict on. When omitted, the notebook-level ``X_test`` is
        used, which keeps existing calls working.

    Returns
    -------
    tuple
        ``(clf, test_preds)`` - the estimator fitted on the last fold and a
        1-D array holding the mean of the per-fold test predictions.
    """
    if test_features is None:
        test_features = X_test  # notebook-level frame

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(test_features), n_splits), dtype=np.float32)

    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        # Fresh callback objects per fold so no state is shared between fits.
        callbacks = [log_evaluation(period=150), early_stopping(stopping_rounds=200)]

        clf = model(**params)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], callbacks=callbacks)

        # Held-out RMSE for this fold.
        val_pred = clf.predict(X_val)
        fold_rmse = float(np.sqrt(np.mean(np.square(np.asarray(y_val) - val_pred))))
        val_scores.append(fold_rmse)

        test_preds[:, fold] = clf.predict(test_features)

        print("-" * 50)
        print(f"Fold {fold + 1}/{n_splits} validation RMSE: {fold_rmse:.4f}")

    print(f"Mean validation RMSE: {np.mean(val_scores):.4f} (std {np.std(val_scores):.4f})")

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds

In [None]:
from lightgbm import LGBMRegressor
# params are taken from this notebook: https://www.kaggle.com/code/noodl35/optuna-lgbm-tuning-used-cars

# 'subsample' and 'colsample_bytree' used to be listed here as well. They are
# aliases of 'bagging_fraction' and 'feature_fraction', which are also set,
# and LightGBM resolves such a conflict in favour of the canonical name - so
# the alias values (0.5772552201954328 and 0.9164865430101521) were never
# applied. They are removed so the dict shows only what is actually used.
lgb_params = {
    'num_leaves': 426,
    'max_depth': 20,
    'learning_rate': 0.011353178352988012,
    'n_estimators': 10000,
    'metric': 'rmse',
    'reg_alpha': 1.48699088003429e-06,
    'reg_lambda': 0.41539458543414265,
    'min_data_in_leaf': 73,
    'feature_fraction': 0.751673655170548,
    'bagging_fraction': 0.5120415391590843,
    'bagging_freq': 2,
    'random_state': 42,
    'min_child_weight': 0.017236362383443497,
    'cat_smooth': 54.81317407769262,
    'verbose' : -1             # Set to -1 for silent mode, no process information printed
}

print('LightGBM Cross-Validation Results:\n')
lgb_model,test_preds_lgbm = cross_validate_model_l(LGBMRegressor, X_train, y_train, lgb_params)

In [None]:
# Sample-submission layout with the price column replaced by the LightGBM
# fold-averaged predictions.
lgb_result = (
    pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')
    .assign(price=test_preds_lgbm.astype(np.float32))
)
lgb_result

# CatBoost 

In [None]:
def cross_validate_model_c(model, X_train, y_train, params, n_splits=10, test_features=None):
    """
    Fit a CatBoost-style regressor with shuffled K-fold cross-validation and
    average the per-fold predictions on the test features.

    Parameters
    ----------
    model : callable
        Estimator class (e.g. ``CatBoostRegressor``); instantiated once per
        fold as ``model(**params)``.
    X_train : pandas.DataFrame
        Training features; rows are selected positionally per fold.
    y_train : pandas.Series
        Training target aligned with ``X_train``.
    params : dict
        Keyword arguments for the estimator.
    n_splits : int, default 10
        Number of folds.
    test_features : pandas.DataFrame, optional
        Frame to predict on. When omitted, the notebook-level ``X_test`` is
        used, which keeps existing calls working.

    Returns
    -------
    tuple
        ``(clf, test_preds)`` - the estimator fitted on the last fold and a
        1-D array holding the mean of the per-fold test predictions.
    """
    if test_features is None:
        test_features = X_test  # notebook-level frame

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(test_features), n_splits), dtype=np.float32)

    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        clf = model(**params)
        # The held-out part is passed as eval_set; training output is silenced.
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], verbose=0)

        # Held-out RMSE for this fold.
        val_pred = clf.predict(X_val)
        fold_rmse = float(np.sqrt(np.mean(np.square(np.asarray(y_val) - val_pred))))
        val_scores.append(fold_rmse)

        test_preds[:, fold] = clf.predict(test_features)

        print("-" * 50)
        print(f"Fold {fold + 1}/{n_splits} validation RMSE: {fold_rmse:.4f}")

    print(f"Mean validation RMSE: {np.mean(val_scores):.4f} (std {np.std(val_scores):.4f})")

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds

In [None]:
# X_test.info() 

In [None]:
from catboost import CatBoostRegressor

# Hyper-parameters for the CatBoost regressor; the categorical feature names
# come from the cat_cols list built earlier.
# NOTE(review): 'max_leaves' is documented for the Lossguide grow policy -
# confirm it is accepted together with 'depth' under the default policy.
cat_params = {
    'cat_features': cat_cols,
    'task_type': 'GPU',
    'iterations': 5000,
    'learning_rate': 0.075,
    'depth': 9,
    'max_leaves': 512,
    'l2_leaf_reg': 0.5,
    'random_strength': 0,
    'fold_permutation_block': 64,
    'random_seed': 42,
    'verbose': False,
}

print('CatBoost Cross-Validation Results:\n')
cat_model, test_preds_cat = cross_validate_model_c(
    CatBoostRegressor, X_train, y_train, cat_params
)

In [None]:
# Sample-submission layout with the price column replaced by the CatBoost
# fold-averaged predictions.
cat_result = (
    pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')
    .assign(price=test_preds_cat.astype(np.float32))
)
cat_result

# Prediction Ensemble

In [None]:
# Side-by-side view of the three models' price columns
# (order: LightGBM, CatBoost, XGBoost).
side_by_side = pd.concat([lgb_result, cat_result, xgb_result], axis=1)
Pred = side_by_side['price']
print(Pred)

# Fixed-weight blend of the three prediction vectors.
W_LGBM, W_CAT, W_XGB = 0.7, 0.1, 0.2
test_preds = test_preds_lgbm * W_LGBM + test_preds_cat * W_CAT + test_preds_xgb * W_XGB

In [None]:
# Fill the sample-submission layout with the blended predictions and write
# the file that gets submitted.
ensemble_sub = (
    pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')
    .assign(price=test_preds)
)

# lgbm_sub =  pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')
# lgbm_sub['price'] = test_preds_2

ensemble_sub.to_csv('submission.csv', index=False)
# lgbm_sub.to_csv('submission_5.csv', index = False) 