In [1]:
import os 
import matplotlib
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import math
import re
import datetime

from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

currentDateTime = datetime.datetime.now()
date = currentDateTime.date()
year = int(date.strftime("%Y"))

matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = "#949494"

%matplotlib inline

In [2]:
od.download('https://www.kaggle.com/c/playground-series-s4e9')
os.listdir('playground-series-s4e9')
train = pd.read_csv('playground-series-s4e9/train.csv', low_memory=False)
test = pd.read_csv('playground-series-s4e9/test.csv', low_memory=False)
submision_df = pd.read_csv('playground-series-s4e9/sample_submission.csv')
train = train.copy()

Skipping, found downloaded files in "./playground-series-s4e9" (use force=True to force download)


Added values in fuel_type from engine

In [3]:
def convert_engine(val):
    d = {
        'engine_HP': np.nan,
        'engine_L': np.nan,
        'engine_cylinder': np.nan,
        'engine_cylinder_type': np.nan,
        'engine_fuel_type': np.nan,
        'engine_type': np.nan,
        'engine_v': np.nan,
        'engine_vtype': np.nan
    }
    spl = val.replace('-', '').split(' ')
    pos_v1 = -1
    pos_v2 = -1
    for i, v in enumerate(spl):
        if v.endswith('HP'):
            d['engine_HP'] = float(v[:-2])
        elif v.endswith('L'):
            d['engine_L'] = float(v[:-1])
        elif v == 'Engine' and spl[i-1] == 'Cylinder':
            d['engine_cylinder'] = abs(float(spl[i-2].replace('V', '')))
        elif v == 'Fuel':
            d['engine_fuel_type'] = spl[i - 1]
        elif v == 'Liter':
            d['engine_L'] = float(spl[i-1])
        elif v.startswith('V') or v.startswith('H') or v.startswith('I'):
            try:
                d['engine_cylinder'] = float(v[1:])
                d['engine_cylinder_type'] = v[0]
                pos_v1 = i
            except:
                pass
        elif v == 'DOHC' or v == 'OHV':
            d['engine_type'] = ' '.join(spl[i:])
            if pos_v2 != -1:
                d['engine_vtype'] = ' '.join(spl[pos_v2+1:i])
            elif pos_v1 != -1:
                d['engine_vtype'] = ' '.join(spl[pos_v1+1:i])
        elif v.endswith('V'):
            try:
                d['engine_v'] = float(v[:-1])
                pos_v2 = i
            except:
                pass
    return d

def test_params(ModelClass, **params):
    model = ModelClass(**params).fit(X_train, Y_train)
    train_rmse = rmse(model.predict(X_train), Y_train)
    val_rmse = rmse(model.predict(X_val), Y_val)
    return train_rmse, val_rmse


def test_param_and_plot(ModelClass, param_name, param_values, **other_params):
    train_errors, val_errors = [], [] 
    for value in param_values:
        params = dict(other_params)
        params[param_name] = value
        train_rmse, val_rmse = test_params(ModelClass, **params)
        train_errors.append(train_rmse)
        val_errors.append(val_rmse)
    
    plt.figure(figsize=(10,6))
    plt.title('Overfitting curve: ' + param_name)
    plt.plot(param_values, train_errors, 'b-o')
    plt.plot(param_values, val_errors, 'r-o')
    for x,y in zip(param_values,val_errors):
        label = "{:.2f}".format(y)
        plt.annotate(label,
                    (x,y),
                    textcoords="offset points",
                    xytext=(0,10),
                    ha='center')

    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])
    plt.show()

In [4]:
average_prices_by_brand = train.groupby('brand')['price'].mean()

train = train.fillna('dash')
train = train.replace('–', 'dash')

test = test.fillna('dash')
test = test.replace('–', 'dash')

accident_dict = {
    'None reported': 0,
    'At least 1 accident or damage reported': 1,
    'uknown': 2
}

clean_title_dict = {
    'Yes': 0,
    'uknown': 1
}

expensive_ext_color = ['Blue Caelum', 'Dark Sapphire', 'Bianco Monocerus', 'C / C', 'Ice',
       'Tempest', 'Beluga Black', 'Bianco Icarus Metallic', 
       'BLU ELEOS', 'Shadow Black', 'Nero Noctis', 'Sandstone Metallic',
       'Lizard Green', 'Balloon White', 'Onyx', 'Donington Grey Metallic',
       'China Blue', 'Diamond White', 'Rosso Corsa', 'Granite',
       'Rosso Mars Metallic', 'Carpathian Grey', 'Kemora Gray Metallic',
       'Grigio Nimbus', 'dash', 'Bianco Isis', 'Python Green', 'Fountain Blue',
       'Custom Color', 'Vega Blue', 'Designo Magno Matte',
       'Brands Hatch Gray Metallic', 'Rift Metallic', 'Gentian Blue Metallic',
       'Arancio Borealis', 'BLUE', 'Aventurine Green Metallic', 'Apex Blue',
       'Daytona Gray Pearl Effect', 'Daytona Gray Pearl Effect w/ Black Roof',
       'Matte White', 'Carpathian Grey Premium Metallic', 'Blue Metallic',
       'Santorini Black Metallic', 'Quartzite Grey Metallic',
       'Carrara White Metallic', 'BLACK', 'Kinetic Blue', 'Nero Daytona']

expensive_int_color = ['Dark Auburn', 'Hotspur', 'Cobalt Blue', 'Beluga Hide', 'Linen',
                       'Beluga', 'Black / Brown', 'Nero Ade', 'Sahara Tan', 'Portland']

low_end_brand = average_prices_by_brand[average_prices_by_brand < 25000].index.tolist()
mid_range_brand = average_prices_by_brand[(average_prices_by_brand > 25000) & (average_prices_by_brand < 40000)].index.tolist()
high_end_brand = average_prices_by_brand[(average_prices_by_brand > 40000) & (average_prices_by_brand < 80000)].index.tolist()
luxury_brand = average_prices_by_brand[average_prices_by_brand > 80000].index.tolist()

def encode_columns(df):
    gases = df.fuel_type.value_counts().index
    for gas in gases:
        df.loc[(df['fuel_type'] == '–') & df['engine'].str.contains(gas), 'fuel_type'] = gas

    milage_per_year = df['milage'] / (year - df['model_year'])
    df['milage_per_year'] = milage_per_year.astype(int)

    auto_transmission = df['transmission'].str.contains('A/T') + df['transmission'].str.contains(' At') + df['transmission'].str.contains('Automatic') + df['transmission'].str.contains('Dual Shift') + df['transmission'].str.contains('CVT Transmission') + df['transmission'].str.contains('CVT-F') + df['transmission'].str.contains('Overdrive Switch') + df['transmission'].str.contains('Variable')
    manual_transmission = df['transmission'].str.contains('M/T') + df['transmission'].str.contains(' Mt') + df['transmission'].str.contains('Manual')
    df['auto_transmission'] = auto_transmission.astype(int)
    df['manual_transmission'] = manual_transmission.astype(int)

    df['turbo_engine'] = df['engine'].str.contains('Turbo').astype(int)
    df['supercharged_engine'] = df['engine'].str.contains('Supercharged').astype(int)
    df['dohc_engine'] = df['engine'].str.contains('DOHC').astype(int)
    df['ohv_engine'] = df['engine'].str.contains('OHV').astype(int)


    df['low_end_brand'] = df.brand.isin(low_end_brand).astype(int)
    df['mid_range_brand'] = df.brand.isin(mid_range_brand).astype(int)
    df['high_end_brand'] = df.brand.isin(high_end_brand).astype(int)
    df['luxury_brand'] = df.brand.isin(luxury_brand).astype(int)

    df['accident_encoded'] = df['accident'].map(accident_dict)
    df['clean_title_encoded'] = df['clean_title'].map(clean_title_dict)
    df['expensive_color_ext_encoded'] = df.ext_col.isin(expensive_ext_color).astype(int)
    df['expensive_color_int_encoded'] = df.int_col.isin(expensive_int_color).astype(int)

    df['engine_fuel_type'].replace('dash', np.nan)
    df['engine_fuel_type'] = df.engine_fuel_type.fillna(df['fuel_type'])
    df['engine_fuel_type'] = df.engine_fuel_type.replace('Flexible', 'Flex').replace('E85 Flex Fuel', 'Flex')

    return df

train = pd.concat([train, pd.DataFrame.from_records(train['engine'].map(convert_engine).values)], axis = 1)
test = pd.concat([test, pd.DataFrame.from_records(test['engine'].map(convert_engine).values)], axis = 1)


train = encode_columns(train)
test = encode_columns(test)


In [5]:
input_cols = ['model_year', 'milage', 'milage_per_year', 'fuel_type', 'auto_transmission', 'manual_transmission',
       'low_end_brand', 'mid_range_brand', 'high_end_brand',
       'luxury_brand', 'turbo_engine', 'fuel_type_encoded', 'accident_encoded',
       'clean_title_encoded', 'expensive_color_ext_encoded',
       'expensive_color_int_encoded', 'engine_HP', 'engine_L', 'engine_cylinder',
       'engine_cylinder_type','engine_fuel_type', 'engine_type', 'engine_v', 
       'engine_vtype']

input_cols = ['model_year', 'milage', 'milage_per_year', 'engine_fuel_type', 
       'low_end_brand', 'mid_range_brand', 'high_end_brand', 'luxury_brand', 
       'turbo_engine', 'dohc_engine', 'supercharged_engine', 'ohv_engine',
       'accident_encoded', 'clean_title_encoded', 
       'expensive_color_ext_encoded', 'expensive_color_int_encoded', 
       'auto_transmission', 'manual_transmission', 'engine_HP', 'engine_L',
       'engine_cylinder', 'engine_cylinder_type', 'engine_v', 'engine_vtype']

target_col = 'price'

train_df, val_df = train_test_split(train, test_size=0.25, random_state=42)

X_train = train_df[input_cols].copy()
Y_train = train_df[target_col].copy()

X_val = val_df[input_cols].copy()
Y_val = val_df[target_col].copy()

test_inputs = test[input_cols].copy()

numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes('object').columns.tolist()

In [6]:
imputer = SimpleImputer(strategy='mean').fit(X_train[numeric_cols])

X_train[numeric_cols] = imputer.transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

scaler = MinMaxScaler().fit(X_train[numeric_cols])

X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

X_train[encoded_cols] = encoder.transform(X_train[categorical_cols])
X_val[encoded_cols] = encoder.transform(X_val[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

train_df = X_train[numeric_cols + encoded_cols]
val_df = X_val[numeric_cols + encoded_cols]
test = test_inputs[numeric_cols + encoded_cols]

numeric_cols_df = train_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols_df = train_df.select_dtypes('object').columns.tolist()

In [7]:
X_train, Y_train = train_df, Y_train
X_val, Y_val = val_df, Y_val

model = LinearRegression().fit(X_train, Y_train)

train_predictions = model.predict(X_train)
val_predictions = model.predict(X_val)

print(rmse(Y_train, train_predictions))
print(rmse(Y_val, val_predictions))

75897.6675774169
66479.52791598205


In [8]:
test_preds = model.predict(test)
test_preds

array([25628.84587558, 70565.53680129, 59607.89981226, ...,
       35032.85168901, 26273.34223745, 45637.81076415])