In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#display everything 
pd.set_option('display.max_columns', None)

In [3]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')
submission = pd.read_csv('sample_submission.csv')

In [4]:
train.shape, test.shape, submission.shape

((54273, 13), (36183, 12), (36183, 2))

In [5]:
display(train.isna().sum())
display(test.isna().sum())

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
dtype: int64

From the above, we can see that there are no missing values in both the training and testing datasets

### Working on the engine variable

In [6]:
import re
def extract_horsepower(engine_description):
  match = re.search(r"(\d+)\.0HP", engine_description)
  if match:
    return float(match.group(1))  
  else:
    return None  # Handle missing values

train["horsepower"] = train["engine"].apply(extract_horsepower)
test["horsepower"] = test["engine"].apply(extract_horsepower)


In [7]:
import re

def extract_displacement(engine_description):
    match = re.search(r"(\d+\.\d+L)", engine_description)
    if match:
        # Extract the numeric part and convert it to a float
        return float(match.group(1).replace('L', ''))
    else:
        return None  # Handle missing values

# Assuming 'train' is your DataFrame and 'engine' is the column containing engine descriptions
train["displacement_value"] = train["engine"].apply(extract_displacement)
test["displacement_value"] = test["engine"].apply(extract_displacement)

In [8]:

def extract_engine_type(engine_description):
    # Regex to match engine type after displacement and before fuel type
    match = re.search(r"\b\d\.\dL\s(.*?Engine)\b", engine_description)
    if match:
        return match.group(1)
    else:
        return None  # Handle missing values
train["engine_type"] = train["engine"].apply(extract_engine_type)
test["engine_type"] = test["engine"].apply(extract_engine_type)

In [9]:

def extract_fuel_type(engine_description):
    # Regex to match the fuel type at the end of the description
    match = re.search(r"(Gasoline Fuel|Diesel Fuel|Electric Fuel System|Hybrid|Gasoline/Mild Electric Hybrid|Flex Fuel Capability)", engine_description)
    if match:
        return match.group(1)
    else:
        return None  # Handle missing values
train["fuel_type"] = train["engine"].apply(extract_fuel_type)
test["fuel_type"] = test["engine"].apply(extract_fuel_type)

In [10]:
# Grouping the car brands
brand_categories = {
    'performance_sports': [
        'Chevrolet', 'BMW', 'Porsche', 'Lamborghini',
        'Pontiac', 'Lotus'
    ],
    
    'luxury_luxury_EV': [
        'Mercedes-Benz', 'Audi', 'Maserati', 'Cadillac',
        'Volvo', 'Tesla', 'Jaguar', 'Rolls-Royce', 'Aston',
        'Ferrari', 'Bentley', 'Acura', 'Lexus', 'Lincoln',
        'Hummer', 'Genesis', 'Land', 'Buick', 'INFINITI',
        'McLaren', 'Alfa', 'Lucid', 'Maybach', 'Bugatti'
    ],

    'mid_range': [
        'Mitsubishi', 'Ford', 'Nissan', 'Toyota', 'Dodge',
        'Volkswagen', 'Mazda', 'Chrysler', 'Rivian', 'GMC'
    ],

    'affordable': [
        'Honda', 'Hyundai', 'Subaru', 'Kia',
        'Scion', 'Saturn', 'Mercury',
        'FIAT', 'Plymouth', 'Suzuki'
    ],

    'offroad': [
        'Jeep', 'RAM', 'MINI'
    ]
}

def assign_brand_group(brand):
    for group_name, brands in brand_categories.items():
        if brand in brands:
            return group_name
    return "Others"  # Assign to "Others" if not found in any defined groups

train["brand_group"] = train["brand"].apply(assign_brand_group)
test["brand_group"] = test["brand"].apply(assign_brand_group)


### Handling transmission variable

In [11]:
# Define the mapping for each category
transmission_mapping = {
    'automatic': ['A/T', 'Automatic', 'Automatic CVT', 'Transmission w/Dual Shift Mode', 'Transmission Overdrive Switch',
                  'Electronically Controlled Automatic', 'with Overdrive', 'with Auto-Shift', 'DCT Automatic'],
    'manual': ['M/T', 'Manual'],
    'cvt': ['CVT Transmission', 'CVT-F', 'Variable'],
    'dct': ['DCT Automatic'],
    'other': ['SCHEDULED FOR OR IN PRODUCTION', '–', 'F']
}

# Function to categorize the transmission
def categorize_transmission(transmission):
    transmission = transmission.lower()
    if any(keyword.lower() in transmission for keyword in transmission_mapping['automatic']):
        return 'automatic'
    elif any(keyword.lower() in transmission for keyword in transmission_mapping['manual']):
        return 'manual'
    elif any(keyword.lower() in transmission for keyword in transmission_mapping['cvt']):
        return 'cvt'
    elif any(keyword.lower() in transmission for keyword in transmission_mapping['dct']):
        return 'dct'
    else:
        return 'other'

# Apply the categorization
train['transmission_category'] = train['transmission'].apply(categorize_transmission)
test['transmission_category'] = test['transmission'].apply(categorize_transmission)

In [12]:
# Create boolean features for special characteristics on the training set  
train['dual_shift_mode'] = train['transmission'].str.contains('Dual Shift Mode', case=False, na=False).astype(int)
train['overdrive'] = train['transmission'].str.contains('Overdrive', case=False, na=False).astype(int)
train['auto_shift'] = train['transmission'].str.contains('Auto-Shift', case=False, na=False).astype(int)

# Create boolean features for special characteristics on the testing set
test['dual_shift_mode'] = test['transmission'].str.contains('Dual Shift Mode', case=False, na=False).astype(int)
test['overdrive'] = test['transmission'].str.contains('Overdrive', case=False, na=False).astype(int)
test['auto_shift'] = test['transmission'].str.contains('Auto-Shift', case=False, na=False).astype(int)


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     54273 non-null  int64  
 1   brand                  54273 non-null  object 
 2   model                  54273 non-null  object 
 3   model_year             54273 non-null  int64  
 4   milage                 54273 non-null  int64  
 5   fuel_type              50120 non-null  object 
 6   engine                 54273 non-null  object 
 7   transmission           54273 non-null  object 
 8   ext_col                54273 non-null  object 
 9   int_col                54273 non-null  object 
 10  accident               54273 non-null  object 
 11  clean_title            54273 non-null  object 
 12  price                  54273 non-null  int64  
 13  horsepower             50216 non-null  float64
 14  displacement_value     53667 non-null  float64
 15  en

In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36183 entries, 0 to 36182
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     36183 non-null  int64  
 1   brand                  36183 non-null  object 
 2   model                  36183 non-null  object 
 3   model_year             36183 non-null  int64  
 4   milage                 36183 non-null  int64  
 5   fuel_type              33485 non-null  object 
 6   engine                 36183 non-null  object 
 7   transmission           36183 non-null  object 
 8   ext_col                36183 non-null  object 
 9   int_col                36183 non-null  object 
 10  accident               36183 non-null  object 
 11  clean_title            36183 non-null  object 
 12  horsepower             33577 non-null  float64
 13  displacement_value     35778 non-null  float64
 14  engine_type            33451 non-null  object 
 15  br

In [15]:
cols_drop = ['brand', 'model', 'engine','clean_title', 'id', 'transmission']

In [16]:
train.drop(columns=cols_drop, axis=1, inplace=True)
test.drop(columns=cols_drop, axis=1, inplace=True)

In [17]:
display(train.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   model_year             54273 non-null  int64  
 1   milage                 54273 non-null  int64  
 2   fuel_type              50120 non-null  object 
 3   ext_col                54273 non-null  object 
 4   int_col                54273 non-null  object 
 5   accident               54273 non-null  object 
 6   price                  54273 non-null  int64  
 7   horsepower             50216 non-null  float64
 8   displacement_value     53667 non-null  float64
 9   engine_type            50065 non-null  object 
 10  brand_group            54273 non-null  object 
 11  transmission_category  54273 non-null  object 
 12  dual_shift_mode        54273 non-null  int32  
 13  overdrive              54273 non-null  int32  
 14  auto_shift             54273 non-null  int32  
dtypes:

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36183 entries, 0 to 36182
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   model_year             36183 non-null  int64  
 1   milage                 36183 non-null  int64  
 2   fuel_type              33485 non-null  object 
 3   ext_col                36183 non-null  object 
 4   int_col                36183 non-null  object 
 5   accident               36183 non-null  object 
 6   horsepower             33577 non-null  float64
 7   displacement_value     35778 non-null  float64
 8   engine_type            33451 non-null  object 
 9   brand_group            36183 non-null  object 
 10  transmission_category  36183 non-null  object 
 11  dual_shift_mode        36183 non-null  int32  
 12  overdrive              36183 non-null  int32  
 13  auto_shift             36183 non-null  int32  
dtypes: float64(2), int32(3), int64(2), object(7)
memory us

None

### Imputing the missing values in both the test and train sets 

In [18]:
from sklearn.impute import KNNImputer

# Imputing categorical variables 
train['fuel_type'] = train['fuel_type'].ffill().bfill()
test['fuel_type'] =  test['fuel_type'].ffill().bfill()
train['engine_type'] = train['engine_type'].bfill().ffill()
test['engine_type'] = test['engine_type'].bfill().ffill()


In [19]:
# imputing the numerical variables 
train['horsepower'] = train['horsepower'].fillna(train['horsepower'].mean())
test['horsepower'] = test['horsepower'].fillna(test['horsepower'].median())
train['displacement_value'] = train['displacement_value'].fillna(train['displacement_value'].mean())
test['displacement_value'] = test['displacement_value'].fillna(test['displacement_value'].median())

### Encoding the categorical variables 

In [20]:
## Encoding training data
from sklearn.preprocessing import LabelEncoder
Le_fuel = LabelEncoder()
Le_ext = LabelEncoder()
Le_int = LabelEncoder()
Le_acc = LabelEncoder()
Le_engine = LabelEncoder()
Le_brand = LabelEncoder()
Le_trans_cat = LabelEncoder()

train['ext_col'] = Le_ext.fit_transform(train['ext_col'])
train['int_col'] = Le_int.fit_transform(train['int_col'])
train['accident'] = Le_acc.fit_transform(train['accident'])
train['engine_type'] = Le_engine.fit_transform(train['engine_type'])
train['brand_group'] = Le_brand.fit_transform(train['brand_group'])
train['transmission_category'] = Le_trans_cat.fit_transform (train['transmission_category'])
train['fuel_type'] = Le_fuel.fit_transform(train['fuel_type'])

In [21]:
#Encoding testing data

test['ext_col'] = Le_ext.fit_transform(test['ext_col'])
test['int_col'] = Le_int.fit_transform(test['int_col'])
test['accident'] = Le_acc.fit_transform(test['accident'])
test['engine_type'] = Le_engine.fit_transform(test['engine_type'])
test['brand_group'] = Le_brand.fit_transform(test['brand_group'])
test['transmission_category'] = Le_trans_cat.fit_transform(test['transmission_category'])
test['fuel_type'] = Le_fuel.fit_transform(test['fuel_type'])

### Scaling the numerical data 

In [24]:
from sklearn.preprocessing import MinMaxScaler
# Combinin the train and test data for the scaling
combined_data = pd.concat([train, test], axis=0)

# Scaling all columns using MinMaxScaler
scaler = MinMaxScaler()
scaled_combined_data = scaler.fit_transform(combined_data)

# Convert back to DataFrame
scaled_combined_data = pd.DataFrame(scaled_combined_data, columns=combined_data.columns)

# Split back into train and test sets
scaled_train = scaled_combined_data.iloc[:len(train), :]
scaled_test = scaled_combined_data.iloc[len(train):, :]


In [25]:
scaled_train.head()

Unnamed: 0,model_year,milage,fuel_type,ext_col,int_col,accident,price,horsepower,displacement_value,engine_type,brand_group,transmission_category,dual_shift_mode,overdrive,auto_shift
0,0.88,0.183376,0.6,0.100386,0.463415,1.0,0.003049,0.321053,0.367742,1.0,0.4,0.0,0.0,0.0,0.0
1,0.66,0.197333,0.6,0.065637,0.073171,1.0,0.002117,0.242105,0.303226,0.875,0.8,0.666667,0.0,0.0,0.0
2,0.7,0.225713,0.6,0.698842,0.04878,1.0,0.004404,0.242105,0.458065,0.625,0.2,0.0,0.0,0.0,0.0
3,0.96,0.005772,0.8,0.3861,0.195122,1.0,0.020833,0.278947,0.303226,0.875,0.8,0.0,1.0,0.0,0.0
4,0.54,0.273895,0.6,0.96139,0.073171,1.0,0.001982,0.136842,0.406452,1.0,0.8,0.0,0.0,0.0,0.0


In [26]:
scaled_test.head()

Unnamed: 0,model_year,milage,fuel_type,ext_col,int_col,accident,price,horsepower,displacement_value,engine_type,brand_group,transmission_category,dual_shift_mode,overdrive,auto_shift
54273,0.8,0.180044,0.6,0.849421,0.065041,1.0,,0.244211,0.367742,1.0,0.4,0.0,0.0,0.0,0.0
54274,0.82,0.315959,0.6,0.722008,0.081301,1.0,,0.215789,0.367742,1.0,0.4,0.0,0.0,0.0,0.0
54275,0.82,0.128138,0.6,0.100386,0.869919,1.0,,0.18,0.174194,0.375,0.4,0.0,0.0,0.0,0.0
54276,0.88,0.072611,0.6,0.849421,0.869919,0.0,,0.471579,0.56129,0.625,0.4,0.0,1.0,0.0,0.0
54277,0.92,0.22203,0.6,0.849421,0.081301,0.0,,0.278947,0.303226,0.875,1.0,0.0,0.0,0.0,0.0


In [None]:
scaled_test.head()

Unnamed: 0,model_year,milage,fuel_type,ext_col,int_col,accident,horsepower,displacement_value,engine_type,brand_group,transmission_category,dual_shift_mode,overdrive,auto_shift
54273,0.8,0.180044,0.6,0.849421,0.065041,1.0,0.244211,0.367742,1.0,0.4,0.0,0.0,0.0,0.0
54274,0.82,0.315959,0.6,0.722008,0.081301,1.0,0.215789,0.367742,1.0,0.4,0.0,0.0,0.0,0.0
54275,0.82,0.128138,0.6,0.100386,0.869919,1.0,0.18,0.174194,0.375,0.4,0.0,0.0,0.0,0.0
54276,0.88,0.072611,0.6,0.849421,0.869919,0.0,0.471579,0.56129,0.625,0.4,0.0,1.0,0.0,0.0
54277,0.92,0.22203,0.6,0.849421,0.081301,0.0,0.278947,0.303226,0.875,1.0,0.0,0.0,0.0,0.0


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [28]:
X = scaled_train.drop(columns='price', axis=1, inplace=False) 
y = scaled_train['price']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state=42 )


In [31]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)


In [32]:
# Make predictions on the testing set
predictions = model.predict(X_test)

# Calculate RMSE using the imported function
from sklearn.metrics import mean_squared_error  # Import RMSE function

rmse = mean_squared_error(y_test, predictions, squared=False)  # Calculate RMSE
print(f"CatBoost Model RMSE: {rmse}")

CatBoost Model RMSE: 0.03846107935439743




## Tuning the model

In [33]:
parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] }

In [34]:
from sklearn.model_selection import GridSearchCV
tuning_model=GridSearchCV(model,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)

In [35]:
# best hyperparameters 
tuning_model.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
#cat_predictions = model.predict(scaled_test)

In [None]:
# Create submission file
#sub = pd.DataFrame({'id': submission['id'], 'price': cat_predictions})

# Preview sub file
#sub.head()

Unnamed: 0,id,price
0,54273,0.000151
1,54274,0.004636
2,54275,0.007117
3,54276,0.017343
4,54277,0.011346


In [None]:
# Create a csv file
#sub.to_csv('submission.csv', index = False)

## Tuning the catboost regressor model

In [None]:
import catboost as cb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2024-06-21 10:10:08,992] A new study created in memory with name: no-name-ecd5b551-3418-4c69-b6eb-62171d987ad4
[I 2024-06-21 10:10:42,935] Trial 0 finished with value: 0.016858731951802257 and parameters: {'learning_rate': 0.05648886342676039, 'depth': 10, 'subsample': 0.9410584695505052, 'colsample_bylevel': 0.2352414404604335, 'min_data_in_leaf': 9}. Best is trial 0 with value: 0.016858731951802257.
[I 2024-06-21 10:11:09,121] Trial 1 finished with value: 0.016160124387081372 and parameters: {'learning_rate': 0.002550260994108929, 'depth': 10, 'subsample': 0.16072203915690203, 'colsample_bylevel': 0.449948980006006, 'min_data_in_leaf': 63}. Best is trial 1 with value: 0.016160124387081372.
[I 2024-06-21 10:11:20,822] Trial 2 finished with value: 0.016138508965196113 and parameters: {'learning_rate': 0.0030375454710057826, 'depth': 6, 'subsample': 0.6026258161038592, 'colsample_bylevel': 0.8583418730895376, 'min_data_in_leaf': 21}. Best is trial 2 with value: 0.016138508965196113.


In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

Best hyperparameters: {'learning_rate': 0.006010746289448281, 'depth': 8, 'subsample': 0.45389486020016206, 'colsample_bylevel': 0.6409166252012644, 'min_data_in_leaf': 98}
Best RMSE: 0.016010797412101666


In [None]:
# Fit the final model with the best parameters
best_model = CatBoostRegressor(**study.best_params)
best_model.fit(X_train, y_train)

# Make predictions and calculate RMSE on the test set
y_pred = best_model.predict(X_test)
final_score = mean_squared_error(y_test, y_pred, squared=False)

print("Final RMSE on the test set: ", final_score)


0:	learn: 0.0259923	total: 16.4ms	remaining: 16.4s
1:	learn: 0.0259758	total: 31.8ms	remaining: 15.9s
2:	learn: 0.0259610	total: 48.2ms	remaining: 16s
3:	learn: 0.0259435	total: 63.9ms	remaining: 15.9s
4:	learn: 0.0259275	total: 78.7ms	remaining: 15.7s
5:	learn: 0.0259132	total: 93.4ms	remaining: 15.5s
6:	learn: 0.0258981	total: 112ms	remaining: 15.9s
7:	learn: 0.0258835	total: 133ms	remaining: 16.5s
8:	learn: 0.0258691	total: 159ms	remaining: 17.5s
9:	learn: 0.0258526	total: 174ms	remaining: 17.2s
10:	learn: 0.0258384	total: 191ms	remaining: 17.2s
11:	learn: 0.0258255	total: 204ms	remaining: 16.8s
12:	learn: 0.0258078	total: 220ms	remaining: 16.7s
13:	learn: 0.0257936	total: 238ms	remaining: 16.7s
14:	learn: 0.0257801	total: 253ms	remaining: 16.6s
15:	learn: 0.0257640	total: 264ms	remaining: 16.2s
16:	learn: 0.0257515	total: 278ms	remaining: 16.1s
17:	learn: 0.0257380	total: 293ms	remaining: 16s
18:	learn: 0.0257217	total: 308ms	remaining: 15.9s
19:	learn: 0.0257080	total: 322ms	remai



In [None]:
boost_predictions = best_model.predict(scaled_test)

In [None]:
# Create submission file
sub = pd.DataFrame({'id': submission['id'], 'price': boost_predictions})

# Preview sub file
sub.head()

Unnamed: 0,id,price
0,54273,0.007077
1,54274,0.005938
2,54275,0.00782
3,54276,0.019559
4,54277,0.011912


In [None]:
# Create a csv file
#sub.to_csv('submission5.csv', index = False)