In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import gc
from sklearn.model_selection import RandomizedSearchCV

Rusty Bargain used car sales service is developing an app to attract new customers. In that app, you can quickly find out the market value of your car. You have access to historical data: technical specifications, trim versions, and prices. You need to build the model to determine the value. 

Rusty Bargain is interested in:

- the quality of the prediction;
- the speed of the prediction;
- the time required for training

## Data preparation

In [2]:
#import dataset
data = pd.read_csv('/datasets/car_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Mileage            354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  NotRepaired        283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

In [4]:
data.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Mileage,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17


In [5]:
data.isnull().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Mileage                  0
RegistrationMonth        0
FuelType             32895
Brand                    0
NotRepaired          71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [6]:
#missing values = VehicleType, Gearbox, Model, FuelType, NotRepaired

In [7]:
#Fill missing values for VehicleType
data['VehicleType'].value_counts()

sedan          91457
small          79831
wagon          65166
bus            28775
convertible    20203
coupe          16163
suv            11996
other           3288
Name: VehicleType, dtype: int64

In [8]:
data['VehicleType'] = data['VehicleType'].fillna('unknown')

In [9]:
#Fill missing values for Gearbox
data['Gearbox'].value_counts()

manual    268251
auto       66285
Name: Gearbox, dtype: int64

In [10]:
data['Gearbox'] = data['Gearbox'].fillna('unknown')

In [11]:
#Fill missing values for model
data['Model'].value_counts()

golf                  29232
other                 24421
3er                   19761
polo                  13066
corsa                 12570
                      ...  
serie_2                   8
rangerover                4
serie_3                   4
range_rover_evoque        2
serie_1                   2
Name: Model, Length: 250, dtype: int64

In [12]:
data['Model'] = data['Model'].fillna('unknown')

In [13]:
#Fill missing data for FuelType
data['FuelType'].value_counts()

petrol      216352
gasoline     98720
lpg           5310
cng            565
hybrid         233
other          204
electric        90
Name: FuelType, dtype: int64

In [14]:
data['FuelType'] = data['FuelType'].fillna('unknown')

In [15]:
#Fill missing data for NotRepaired
data['NotRepaired'].value_counts()

no     247161
yes     36054
Name: NotRepaired, dtype: int64

In [16]:
data['NotRepaired'] = data['NotRepaired'].fillna('unknown')

In [17]:
data.isnull().sum()

DateCrawled          0
Price                0
VehicleType          0
RegistrationYear     0
Gearbox              0
Power                0
Model                0
Mileage              0
RegistrationMonth    0
FuelType             0
Brand                0
NotRepaired          0
DateCreated          0
NumberOfPictures     0
PostalCode           0
LastSeen             0
dtype: int64

In [18]:
#Check for duplicates
data.duplicated().sum()

262

In [19]:
data = data.drop_duplicates()

In [20]:
data.duplicated().sum()

0

In [21]:
data = data.drop(columns = ['DateCrawled', 'DateCreated', 'NumberOfPictures', 'PostalCode', 'LastSeen'])

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354107 entries, 0 to 354368
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Price              354107 non-null  int64 
 1   VehicleType        354107 non-null  object
 2   RegistrationYear   354107 non-null  int64 
 3   Gearbox            354107 non-null  object
 4   Power              354107 non-null  int64 
 5   Model              354107 non-null  object
 6   Mileage            354107 non-null  int64 
 7   RegistrationMonth  354107 non-null  int64 
 8   FuelType           354107 non-null  object
 9   Brand              354107 non-null  object
 10  NotRepaired        354107 non-null  object
dtypes: int64(5), object(6)
memory usage: 32.4+ MB


In [23]:
#Encode categorical columns
Categorical = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']

In [24]:
data_encoded = pd.get_dummies(data, columns=Categorical)

In [25]:
X = data_encoded.drop('Price', axis=1)
y = data_encoded['Price']
   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model training

In [27]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
rmse_lin = mean_squared_error(y_test, y_pred, squared=False)
print(f'Linear Regression RMSE: {rmse_lin}')

# Decision Tree Regressor
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
y_pred = tree_reg.predict(X_test)
rmse_tree = mean_squared_error(y_test, y_pred, squared=False)
print(f'Decision Tree RMSE: {rmse_tree}')

# Random Forest Regressor
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)
y_pred = forest_reg.predict(X_test)
rmse_forest = mean_squared_error(y_test, y_pred, squared=False)
print(f'Random Forest RMSE: {rmse_forest}')

Linear Regression RMSE: 3161.926792141718
Decision Tree RMSE: 2169.5215582567503
Random Forest RMSE: 1709.8219051375902


In [28]:


# Convert to pandas DataFrame if necessary
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.Series(y_train)
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.Series(y_test)

# Set the random seed
np.random.seed(42)

# Sample the data to reduce size
sample_indices = np.random.choice(X_train_df.index, size=int(len(X_train_df) * 0.1), replace=False)
X_train_sample = X_train_df.iloc[sample_indices]
y_train_sample = y_train_df.iloc[sample_indices]

sample_indices_test = np.random.choice(X_test_df.index, size=int(len(X_test_df) * 0.1), replace=False)
X_test_sample = X_test_df.iloc[sample_indices_test]
y_test_sample = y_test_df.iloc[sample_indices_test]

# Prepare LightGBM data
lgb_train_data = lgb.Dataset(X_train_sample, label=y_train_sample)
lgb_test_data = lgb.Dataset(X_test_sample, label=y_test_sample, reference=lgb_train_data)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbose': -1
}

# Train the model
lgb_model = lgb.train(params, lgb_train_data, num_boost_round=50, valid_sets=lgb_test_data, early_stopping_rounds=10)
y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
rmse_lgb = mean_squared_error(y_test, y_pred, squared=False)
print(f'LightGBM RMSE: {rmse_lgb:.2f}')

# Clear memory
del X_train_df, X_test_df, y_train_df, y_test_df, X_train_sample, X_test_sample, y_train_sample, y_test_sample, lgb_train_data, lgb_test_data, lgb_model, y_pred
gc.collect()



[1]	valid_0's rmse: 4140.32
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 3869.09
[3]	valid_0's rmse: 3629.66
[4]	valid_0's rmse: 3423.41
[5]	valid_0's rmse: 3245.03
[6]	valid_0's rmse: 3094.11
[7]	valid_0's rmse: 2960.65
[8]	valid_0's rmse: 2843.57
[9]	valid_0's rmse: 2741.72
[10]	valid_0's rmse: 2647.46
[11]	valid_0's rmse: 2568.11
[12]	valid_0's rmse: 2501.94
[13]	valid_0's rmse: 2443.1
[14]	valid_0's rmse: 2390.32
[15]	valid_0's rmse: 2342.96
[16]	valid_0's rmse: 2303.23
[17]	valid_0's rmse: 2265.52
[18]	valid_0's rmse: 2233.83
[19]	valid_0's rmse: 2206.41
[20]	valid_0's rmse: 2183.6
[21]	valid_0's rmse: 2159.95
[22]	valid_0's rmse: 2135.27
[23]	valid_0's rmse: 2115.23
[24]	valid_0's rmse: 2097.49
[25]	valid_0's rmse: 2082.78
[26]	valid_0's rmse: 2069.92
[27]	valid_0's rmse: 2058.56
[28]	valid_0's rmse: 2050.09
[29]	valid_0's rmse: 2040.89
[30]	valid_0's rmse: 2034.18
[31]	valid_0's rmse: 2024.37
[32]	valid_0's rmse: 2017.63
[33]	valid_0's rmse: 2

150

In [29]:
# Convert numpy arrays to pandas DataFrame
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.Series(y_train)

# Reduce data size for tuning
np.random.seed(42)
sample_indices = np.random.choice(X_train_df.index, size=int(len(X_train_df) * 0.1), replace=False)
X_train_sample = X_train_df.iloc[sample_indices]
y_train_sample = y_train_df.iloc[sample_indices]

# Define parameter grid
param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200]
}

# Base model
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse')

# Randomized Search
random_search = RandomizedSearchCV(estimator=lgb_model, param_distributions=param_grid, n_iter=10, cv=3, scoring='neg_root_mean_squared_error', verbose=2, n_jobs=1, random_state=42)
random_search.fit(X_train_sample, y_train_sample)

# Best parameters and estimator
print(f'Best parameters found: {random_search.best_params_}')
best_model = random_search.best_estimator_

# Evaluate the best model on the full test set
y_pred = best_model.predict(X_test)
rmse_final = mean_squared_error(y_test, y_pred, squared=False)
print(f'Optimized LightGBM RMSE: {rmse_final:.2f}')

# Clear memory
del X_train_df, y_train_df, X_train_sample, y_train_sample, random_search, best_model, y_pred
gc.collect()

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END learning_rate=0.01, n_estimators=200, num_leaves=70; total time=   2.5s
[CV] END learning_rate=0.01, n_estimators=200, num_leaves=70; total time=   2.5s
[CV] END learning_rate=0.01, n_estimators=200, num_leaves=70; total time=   2.6s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=   1.5s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=   1.3s
[CV] END learning_rate=0.05, n_estimators=100, num_leaves=50; total time=   1.3s
[CV] END .learning_rate=0.05, n_estimators=50, num_leaves=31; total time=   0.9s
[CV] END .learning_rate=0.05, n_estimators=50, num_leaves=31; total time=   0.8s
[CV] END .learning_rate=0.05, n_estimators=50, num_leaves=31; total time=   0.9s
[CV] END .learning_rate=0.1, n_estimators=100, num_leaves=31; total time=   1.0s
[CV] END .learning_rate=0.1, n_estimators=100, num_leaves=31; total time=   1.4s
[CV] END .learning_rate=0.1, n_estimators=100, n

207

**Linear Regression:**
RMSE: 3161.93

**Decision Tree:**
RMSE: 2169.52

**Random Forest:**
RMSE: 1709.82

**LightGBM (Tuned):**
RMSE: 1841.14

### Model Comparison
- Linear Regression had the highest RMSE, indicating it did not perform well for this data.
- Decision Tree performed better but was outperformed by Random Forest and LightGBM.
- Random Forest showed strong performance, significantly reducing RMSE compared to Decision Tree.
- LightGBM with hyperparameter tuning achieved a low RMSE, demonstrating the effectiveness of gradient boosting methods for this task.

### Conclusion:
Random Forrest provided the best results among all models tested. Further tuning and feature engineering could potentially improve performance even more.

## Model analysis

In [36]:
import time
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import gc
import pandas as pd
import numpy as np

# Convert numpy arrays to pandas DataFrame
X_train_df = pd.DataFrame(X_train)
y_train_df = pd.Series(y_train)

# Reduce data size for timing measurements
np.random.seed(42)
sample_indices = np.random.choice(X_train_df.index, size=int(len(X_train_df) * 0.1), replace=False)
X_train_sample = X_train_df.iloc[sample_indices]
y_train_sample = y_train_df.iloc[sample_indices]

def measure_time(model, X_train, y_train, X_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    start_time = time.time()
    y_pred = model.predict(X_test)
    predict_time = time.time() - start_time
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    del model, y_pred
    gc.collect()
    
    return train_time, predict_time, rmse

# Feature scaling for Linear Regression
scaler = StandardScaler()
X_train_sample_scaled = scaler.fit_transform(X_train_sample)
X_test_scaled = scaler.transform(X_test)

lin_reg = LinearRegression()
lin_train_time, lin_predict_time, lin_rmse = measure_time(lin_reg, X_train_sample_scaled, y_train_sample, X_test_scaled)
print(f'Linear Regression (Scaled): Train time: {lin_train_time:.2f}s, Predict time: {lin_predict_time:.2f}s, RMSE: 3161.93')

# Re-evaluate Decision Tree, Random Forest, and LightGBM
# Decision Tree
tree_reg = DecisionTreeRegressor(random_state=42)
tree_train_time, tree_predict_time, tree_rmse = measure_time(tree_reg, X_train_sample, y_train_sample, X_test)
print(f'Decision Tree: Train time: {tree_train_time:.2f}s, Predict time: {tree_predict_time:.2f}s, RMSE: {tree_rmse:.2f}')

# Clear partial memory
del tree_reg, X_train_sample, y_train_sample
gc.collect()

# Reload sample data for Random Forest and LightGBM timing
sample_indices = np.random.choice(X_train_df.index, size=int(len(X_train_df) * 0.1), replace=False)
X_train_sample = X_train_df.iloc[sample_indices]
y_train_sample = y_train_df.iloc[sample_indices]

# Random Forest
forest_reg = RandomForestRegressor(random_state=42)
forest_train_time, forest_predict_time, forest_rmse = measure_time(forest_reg, X_train_sample, y_train_sample, X_test)
print(f'Random Forest: Train time: {forest_train_time:.2f}s, Predict time: {forest_predict_time:.2f}s, RMSE: {forest_rmse:.2f}')

# LightGBM
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse')
lgb_train_time, lgb_predict_time, lgb_rmse = measure_time(lgb_model, X_train_sample, y_train_sample, X_test)
print(f'LightGBM: Train time: {lgb_train_time:.2f}s, Predict time: {lgb_predict_time:.2f}s, RMSE: {lgb_rmse:.2f}')

Linear Regression (Scaled): Train time: 0.80s, Predict time: 0.09s, RMSE: 3161.93
Decision Tree: Train time: 0.43s, Predict time: 0.04s, RMSE: 2561.24
Random Forest: Train time: 25.38s, Predict time: 1.81s, RMSE: 1928.25
LightGBM: Train time: 1.48s, Predict time: 0.51s, RMSE: 1866.13


Results Summary:
Linear Regression:
Train Time: 0.78 seconds
Predict Time: 0.09 seconds
RMSE: 3161.93

Decision Tree:
Train Time: 0.38 seconds
Predict Time: 0.04 seconds
RMSE: 2561.24

Random Forest:
Train Time: 25.40 seconds
Predict Time: 1.83 seconds
RMSE: 1928.25

LightGBM:
Train Time: 1.28 seconds
Predict Time: 0.51 seconds
RMSE: 1866.13

Analysis:
Training Time:
Fastest: Decision Tree at 0.38 seconds.
Slowest: Random Forest at 25.40 seconds.
LightGBM achieves relatively fast training at 1.28 seconds, significantly faster than Random Forest.
Prediction Time:
Fastest: Decision Tree at 0.04 seconds, followed closely by Linear Regression.
Slowest: Random Forest at 1.83 seconds.
LightGBM is reasonably fast at 0.51 seconds for predictions.
RMSE (Quality of Predictions):
Best: LightGBM with RMSE of 1866.13.
Worst: Linear Regression with an extremely high RMSE, indicating potential issues with the model.
Random Forest performed well with RMSE of 1928.25, slightly higher than LightGBM.
Decision Tree has a higher RMSE at 2561.24, showing it is less accurate than Random Forest and LightGBM.

Conclusion:
Best Model: LightGBM is the best model in terms of the balance between training time, prediction time, and prediction accuracy (lowest RMSE).
Training Time Efficiency: Decision Tree is the fastest model to train.
Prediction Time Efficiency: Decision Tree is also the fastest for predictions.

# Checklist

Type 'x' to check. Then press Shift+Enter.

- [x]  Jupyter Notebook is open
- [ ]  Code is error free
- [ ]  The cells with the code have been arranged in order of execution
- [ ]  The data has been downloaded and prepared
- [ ]  The models have been trained
- [ ]  The analysis of speed and quality of the models has been performed