# Regression Used Cars Dataset

https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data 


#### **Dependency**

In [2]:
#! pip install lightgbm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from itertools import combinations  # For creating combinations of elements
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
# import lightgbm as lgb 
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [3]:
from itertools import combinations  # For creating combinations of elements
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

#### **Utility Function**

In [4]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a DataFrame in place to reduce memory usage.

    * Signed-integer columns are cast to the narrowest of int8 / int16 / int32
      whose range (bounds inclusive) contains the column's min and max.
    * float64 (or wider) columns are cast to float32, unless a finite value
      lies outside the float32 range and would overflow to +/-inf.
    * Every other column (object/string, bool, unsigned, datetime,
      categorical, pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before/after and the saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Memory footprint (MB) before any conversion, for the verbose report.
    start_mem = df.memory_usage().sum() / 1024**2

    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy dtypes are considered; extension dtypes (nullable
        # ints, categoricals, strings, tz-aware datetimes) are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                # Never "downcast" to a type that is not actually smaller.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds: e.g. a max of exactly 127 still fits int8.
                # An empty column yields NaN here, so nothing matches.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif np.issubdtype(col_type, np.floating) and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # min()/max() skip NaN; they are NaN only for empty/all-NaN
            # columns, which are safe to cast.
            would_overflow = (
                (np.isfinite(c_min) and c_min < float32_limits.min)
                or (np.isfinite(c_max) and c_max > float32_limits.max)
            )
            if not would_overflow:
                df[col] = df[col].astype(np.float32)

    # Report the saving when requested.
    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Decreased by {decrease:.2f}%")

    return df

#### **Load Data**

Load `vehicles.csv` from the local `kaggle/input/vehicles/` data folder and downcast its numeric columns.

In [6]:
# Read the raw listings and immediately downcast the numeric columns.
df = reduce_mem_usage(
    pd.read_csv("kaggle/input/vehicles/vehicles.csv"),
    verbose=True,
)
df.shape
# Preview the first four rows.
df[0:4]

Memory usage of dataframe is 84.68 MB
Memory usage after optimization is: 76.54 MB
Decreased by 9.62%


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,


#### **Split dataset** 

In [7]:
# 80/20 hold-out split. The seed is fixed so the same rows land in the
# train and test sets on every kernel restart.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(df_train.shape, df_test.shape)

(341504, 26) (85376, 26)


#### **Data Function**

data preparation

feature engineering

In [8]:
def data_preparation(df:pd.DataFrame)->tuple:
    """
    Clean the raw listings and split them into features and target.

    Steps, in order:
      1. Keep the 30 most frequent manufacturers and the 100 most frequent
         regions; everything else (including NaN) becomes ``'others'``.
      2. Fill missing ``odometer`` with the mean odometer of the same ``year``
         (means computed after dropping one KMeans cluster), then with the
         mean of those yearly means.
      3. Drop rows whose odometer value belongs to KMeans cluster 1.
      4. Keep rows whose price lies strictly between the 10% and 99.9%
         quantiles.
      5. Predict missing ``year`` from price and odometer with a linear
         regression.
      6. Fill remaining gaps with the column modes; sample missing
         ``condition`` and ``'other'`` cylinders from the observed frequencies.

    The input frame is not modified; all work happens on a copy. KMeans and
    ``np.random.choice`` are unseeded here, so results can differ between runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings including the ``price`` column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the cleaned frame without ``price`` and
        ``y`` is the ``price`` Series of the surviving rows.
    """
    # Work on a copy: the caller usually passes a train_test_split slice, and
    # writing into it would mutate the caller's data and trigger
    # SettingWithCopyWarning.
    df = df.copy()

    # There are too many categories for manufacturer and region; to reduce the
    # risk of overfitting, keep only the most frequent ones.
    top_manufacturers = df['manufacturer'].value_counts().index[:30]
    df['manufacturer'] = df['manufacturer'].where(df['manufacturer'].isin(top_manufacturers), 'others')

    top_regions = df['region'].value_counts().index[:100]
    df['region'] = df['region'].where(df['region'].isin(top_regions), 'others')

    # Per-year mean odometer, computed on the rows that are neither NaN nor
    # flagged as outliers, used to impute missing odometer readings.
    odometer_data_cleaned = df['odometer'].dropna()
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(odometer_data_cleaned.values.reshape(-1, 1))
    # NOTE(review): KMeans cluster ids are arbitrary, so "label 1" is not
    # guaranteed to be the extreme-mileage cluster on every run — confirm the
    # intended outlier rule (e.g. pick the cluster by its centre or size).
    outliers = odometer_data_cleaned[kmeans.labels_ == 1]
    mean_odometer = df[~df['odometer'].isin(outliers)].groupby('year')['odometer'].mean()

    # Vectorised equivalent of a row-wise lookup: rows with a missing odometer
    # take the mean of their year when that year has one.
    df['odometer'] = df['odometer'].fillna(df['year'].map(mean_odometer))

    # Remaining gaps (year missing or without a mean) get the mean of the
    # yearly means.
    mean_avg_odometer = mean_odometer.mean()
    df['odometer'] = df['odometer'].fillna(mean_avg_odometer)

    # Second pass: drop the outlier cluster from the now-complete column.
    odometer_data_new = df['odometer']
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(odometer_data_new.values.reshape(-1, 1))
    outliers = odometer_data_new[kmeans.labels_ == 1]
    df = df[~df['odometer'].isin(outliers)]

    # Keep only the 10% ~ 99.9% price range to discard implausible prices.
    # .copy() so the assignments below write into an independent frame.
    df = df[(df['price']<df['price'].quantile(0.999)) & (df['price']>df['price'].quantile(0.1))].copy()

    # Use linear regression on price and odometer to predict missing years.
    features = ["price",'odometer']
    year_missing = df["year"].isnull()
    if year_missing.any():
        df_complete = df.dropna(subset=["year"] + features)
        regression_model_year = LinearRegression()
        regression_model_year.fit(df_complete[features], df_complete["year"])
        predicted_year = regression_model_year.predict(df.loc[year_missing, features])
        df.loc[year_missing, "year"] = predicted_year.round().astype(int)

    # Fill null values with the mode of each listed column.
    columns_to_fillna = ['manufacturer', 'cylinders', 'fuel', 'title_status', 'transmission',
             'drive', 'type', 'paint_color', 'lat', 'long','posting_date']
    modes = df[columns_to_fillna].mode().iloc[0]
    df.loc[:, columns_to_fillna] = df.loc[:, columns_to_fillna].fillna(value=modes)

    # "condition": fill NaN by sampling from the observed value frequencies so
    # the imputed values follow the same distribution.
    prob_values = df[df['condition'].notna()]['condition'].value_counts(normalize=True)
    df.loc[df['condition'].isna(), 'condition'] = np.random.choice(prob_values.index, size=df['condition'].isna().sum(), p=prob_values.values)
    df['condition'] = df['condition'].astype(str)

    # "cylinders": replace 'other' by sampling from the frequencies of the
    # remaining values, then keep only the numeric part.
    prob_values = df[df['cylinders'] != 'other']['cylinders'].value_counts(normalize=True)
    df.loc[df['cylinders'] == 'other', 'cylinders'] = np.random.choice(prob_values.index, size=df['cylinders'].eq('other').sum(), p=prob_values.values)
    # Raw string avoids the invalid "\d" escape warning.
    df['cylinders'] = df['cylinders'].str.extract(r'(\d+)', expand=False).astype(int)

    y = df['price']
    x = df.drop(columns=['price'])

    return x,y

def feature_engineering(df:pd.DataFrame)->pd.DataFrame:
    """
    Derive ``vehicle_age`` and strip the columns that are not used as features.

    The frame is modified in place and the same object is returned.
    """
    # Columns discarded once vehicle_age has been derived.
    unused_columns = [
        'id', 'url', 'region', 'region_url', 'model', 'title_status',
        'image_url', 'VIN', 'size', 'county', 'description', 'state',
        'lat', 'long', 'posting_date', 'posting_year',
    ]

    # vehicle_age = posting year - model year, i.e. how long the car has been in use.
    parsed_dates = pd.to_datetime(df["posting_date"], utc=True)
    df["posting_date"] = parsed_dates
    df["posting_year"] = parsed_dates.dt.year
    df["vehicle_age"] = df["posting_year"].sub(df["year"])

    df.drop(columns=unused_columns, inplace=True)
    return df

def encoding(df:pd.DataFrame,encoders=None)->tuple:
    """
    Label-encode every non-numeric column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-numeric columns are replaced by integer codes.
    encoders : list or None
        ``None`` fits one new ``LabelEncoder`` per non-numeric column (in
        column order). Otherwise a list of already fitted encoders, one per
        non-numeric column and in the same order, which is only applied.

    Returns
    -------
    tuple
        ``(df, encoders)`` — the encoded frame and the list of encoders
        (newly fitted, or the list that was passed in).

    Raises
    ------
    ValueError
        If the number of supplied encoders does not match the number of
        non-numeric columns.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    categorical_cols = [col for col in df.columns if df[col].dtype not in numerics]

    if encoders is None:
        # Fit mode: one encoder per categorical column, stored in column order.
        encoders = []
        for col in categorical_cols:
            values = list(df[col].astype(str).values)
            le = preprocessing.LabelEncoder()
            le.fit(values)
            df[col] = le.transform(values)
            encoders.append(le)
    else:
        # Apply mode: encoders are matched to columns by position, so a count
        # mismatch means they were fitted on a differently shaped frame.
        if len(encoders) != len(categorical_cols):
            raise ValueError(
                "Expected {} encoders (one per non-numeric column), got {}".format(
                    len(categorical_cols), len(encoders)
                )
            )
        for col, le in zip(categorical_cols, encoders):
            df[col] = le.transform(list(df[col].astype(str).values))

    return df,encoders



def data_preprocessing(df:pd.DataFrame ,datasetType="train", labelEncoders=None)->tuple:
    """
    Run the full preprocessing chain: preparation, feature engineering, encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings for one split.
    datasetType : {"train", "test"}, default "train"
        ``"train"`` fits new label encoders when ``labelEncoders`` is None;
        ``"test"`` requires encoders fitted beforehand.
    labelEncoders : list or None
        Encoders to apply, as returned by a previous call.

    Returns
    -------
    tuple
        ``(x, y, encoders)`` — encoded features, target, and the encoders used.

    Raises
    ------
    ValueError
        If ``datasetType`` is neither ``"train"`` nor ``"test"``.
    RuntimeError
        If ``datasetType`` is ``"test"`` and no encoders are supplied.
    """
    # Validate arguments first so a bad call fails before the expensive
    # preparation step runs.
    if datasetType not in ("train", "test"):
        raise ValueError(
            "datasetType must be 'train' or 'test', got {!r}".format(datasetType)
        )
    if datasetType == "test" and labelEncoders is None:
        raise RuntimeError("Please give a encoder")

    x, y = data_preparation(df)
    x = feature_engineering(x)
    # encoding() fits new encoders when given None, otherwise applies the
    # supplied ones and hands the same list back.
    x, encoders = encoding(x, encoders=labelEncoders)
    return x,y,encoders

In [9]:
# Fit the label encoders on the training split, then reuse them on the test split.
df_train_x, df_train_y, encoders = data_preprocessing(df_train, "train")
df_test_x, df_test_y, _ = data_preprocessing(df_test, "test", encoders)

# Sanity checks: remaining NaNs per feature, raw 'type' categories, final shapes.
print(df_train_x.isna().sum())
print(df['type'].unique())
print(df_train_x.shape, df_train_y.shape)

year            0
manufacturer    0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
drive           0
type            0
paint_color     0
vehicle_age     0
dtype: int64
[nan 'pickup' 'truck' 'other' 'coupe' 'SUV' 'hatchback' 'mini-van' 'sedan'
 'offroad' 'bus' 'van' 'convertible' 'wagon']
(306495, 11) (306495,)


In [10]:
# First four rows of the encoded training features.
df_train_x.iloc[:4]

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,vehicle_age
187408,2016.0,12,2,6,4,38145.0,0,2,9,10,5.0
70526,2007.0,7,0,4,2,77236.0,0,0,11,10,14.0
293302,2010.0,3,0,6,2,88112.0,0,2,9,8,11.0
33138,2018.0,30,2,4,2,18774.0,0,0,0,10,3.0


In [11]:
# First four rows of the encoded test features.
df_test_x.iloc[:4]

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,vehicle_age
35189,2018.0,28,2,6,4,14835.0,2,1,4,10,3.0
69361,2016.0,8,3,8,4,79630.0,0,0,8,0,5.0
19229,2015.0,21,0,6,2,61000.0,0,0,9,10,6.0
125443,2019.0,4,2,6,4,5897.0,2,2,9,0,2.0


#### **Training and Evaluation**

In [None]:
# LightGBM baseline: grid-search an LGBMRegressor, then evaluate on the test split.
# NOTE(review): `lgb` is not defined in this notebook as shown — the
# `import lightgbm as lgb` line in the first import cell is commented out — so
# this cell raises NameError on a fresh kernel until that import is restored.
# It also requests device="gpu", which presumably needs a GPU-enabled
# LightGBM build — TODO confirm.
numJobs=3
cv=5
# Search space; only n_estimators has more than one candidate. During the
# search these values take precedence over the ones given to the constructor
# below (e.g. learning_rate 0.001 instead of 0.05).
paramgrid={
                "objective": ["mae"],
                "n_estimators": [500,750],
                "num_leaves": [256],
                "subsample": [1],
                "colsample_bytree":[0.6],
                "learning_rate": [0.001],
            }

model = lgb.LGBMRegressor(n_estimators= 500,random_state=4487,objective='mae',learning_rate=0.05,importance_type= "gain",device="gpu")
modelCV= GridSearchCV(model,param_grid=paramgrid,n_jobs=numJobs,verbose=True,cv=cv)
modelCV.fit(df_train_x, df_train_y)

# Predict on the held-out test features.
y_pred = modelCV.predict(df_test_x)

# Evaluation on the test split.

# RMSE
rmse = np.sqrt(mean_squared_error(df_test_y, y_pred))
print(f"Root Mean Squared Error: {rmse}")

# MAE
mae = mean_absolute_error(df_test_y, y_pred)
print(f"Mean Absolute Error: {mae}")

# R2
r2 = r2_score(df_test_y, y_pred)
print(f"R-squared: {r2}")

# Residual analysis: residuals (actual - predicted) against predictions; the
# dashed red line marks zero error.
residuals = df_test_y - y_pred

plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

In [12]:
# Column-wise preprocessing shared by every model pipeline below:
#   * one-hot encode the (label-encoded) categorical columns;
#   * standardise the numeric columns;
#   * pass any other column through unchanged.
# handle_unknown="ignore": a category that is absent from a cross-validation
# training fold is encoded as all zeros instead of raising at transform time.
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ["manufacturer",'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'type', 'paint_color' ]),
    (StandardScaler(), ["year","odometer","vehicle_age"]),
    remainder="passthrough"
)

In [13]:
def random_search(_model,_preprocessor,_param_dist,n_iter=100,cv=5,n_jobs=16,random_state=None):
    """
    Randomised hyperparameter search for ``_preprocessor`` + ``_model``.

    The pipeline is fitted on the notebook-level ``df_train_x`` / ``df_train_y``.

    Parameters
    ----------
    _model : estimator
        Final estimator of the pipeline.
    _preprocessor : transformer
        First pipeline step (e.g. a ColumnTransformer).
    _param_dist : dict
        Distributions/lists keyed by ``<step name>__<parameter>``.
    n_iter : int, default 100
        Number of sampled parameter settings.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 16
        Parallel workers used by the search.
    random_state : int or None, default None
        Seed for the parameter sampling; set it for a reproducible search.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    pipeline = make_pipeline(_preprocessor, _model)
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=_param_dist,
        n_iter=n_iter,
        cv=cv,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    search.fit(df_train_x, df_train_y)
    # Print the best hyperparameters and the corresponding score
    print("Best parameters: {}".format(search.best_params_))
    print("Best cross-validation score: {:.2f}".format(search.best_score_))
    return search


def grid_search(_model,_preprocessor,_param_grid,cv=5,n_jobs=16):
    """
    Exhaustive hyperparameter search for ``_preprocessor`` + ``_model``.

    The pipeline is fitted on the notebook-level ``df_train_x`` / ``df_train_y``.
    An empty ``_param_grid`` evaluates the model once with its current
    parameters, which gives a plain cross-validated baseline.

    Parameters
    ----------
    _model : estimator
        Final estimator of the pipeline (any regressor, not only boosting).
    _preprocessor : transformer
        First pipeline step (e.g. a ColumnTransformer).
    _param_grid : dict
        Candidate values keyed by ``<step name>__<parameter>``.
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 16
        Parallel workers used by the search.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    pipeline = make_pipeline(_preprocessor, _model)
    grid = GridSearchCV(pipeline, param_grid=_param_grid, cv=cv, n_jobs=n_jobs)
    grid.fit(df_train_x, df_train_y)
    # Print the best hyperparameters and the corresponding score
    print("Best parameters: {}".format(grid.best_params_))
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    return grid
    
def _print_test_metrics(y_true, y_pred):
    """Print R2, RMSE and MAE of ``y_pred`` against ``y_true``."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print("R2 score: {}".format(r2))
    print("RMSE: {}".format(rmse))
    print("MAE: {}".format(mae))

def test_score(_model,_preprocessor):
    """
    Fit ``_preprocessor`` + ``_model`` on the training split and print test metrics.

    Uses the notebook-level ``df_train_x`` / ``df_train_y`` for fitting and
    ``df_test_x`` / ``df_test_y`` for evaluation. Returns nothing.
    """
    best_pipeline = make_pipeline(_preprocessor, _model)
    best_pipeline.fit(df_train_x, df_train_y)
    _print_test_metrics(df_test_y, best_pipeline.predict(df_test_x))

def test_score_param(_grid):
    """
    Print test metrics for the best estimator of a fitted search object.

    ``_grid`` must expose ``best_estimator_`` (as the objects returned by
    ``grid_search`` / ``random_search`` do). Evaluation uses the notebook-level
    ``df_test_x`` / ``df_test_y``. Returns nothing.
    """
    _print_test_metrics(df_test_y, _grid.best_estimator_.predict(df_test_x))

In [24]:
# Search space for the randomised search over GradientBoostingRegressor.
# Keys carry the pipeline step prefix that make_pipeline derives from the
# estimator's class name.
param_dist = {
    f"gradientboostingregressor__{name}": space
    for name, space in [
        ("n_estimators", sp_randint(50, 200)),
        ("max_depth", sp_randint(5, 10)),
        ("learning_rate", uniform(0.0, 1.0)),
        ("subsample", uniform(0.0, 1.0)),
        ("loss", ['squared_error', 'quantile', 'huber', 'absolute_error']),
        ("min_samples_split", sp_randint(2, 10)),
        ("min_samples_leaf", sp_randint(1, 10)),
        ("max_features", ['sqrt', 'log2', None]),
        ("min_impurity_decrease", uniform(0.0, 0.1)),
    ]
}

In [25]:
# Refinement grid centred on the best values found by the randomised search
# (the long decimals are those sampled optima).
param_grid = {
        "gradientboostingregressor__n_estimators": [120,128,135,145],
        "gradientboostingregressor__max_depth": [6, 7, 8],
        "gradientboostingregressor__learning_rate": [0.55,0.6366443216222826,0.7,0.75],
        "gradientboostingregressor__subsample": [0.3, 0.39654278232127016, 0.5],
        "gradientboostingregressor__loss": ['huber'],
        # An integer min_samples_split must be >= 2; 1 is rejected by
        # scikit-learn, so it is not a valid candidate.
        "gradientboostingregressor__min_samples_split": [2,3],
        "gradientboostingregressor__min_samples_leaf": [3,4,5],
        "gradientboostingregressor__max_features": ['sqrt', 'log2', None],
        # 0.08 (not 0.8): like the other lists, this one brackets the sampled optimum.
        "gradientboostingregressor__min_impurity_decrease": [0.08,0.09041586944937485, 0.1]
    }

### GradientBoostingRegressor

In [14]:
# Two hand-configured candidates. They are defined here but not passed to the
# search below, which evaluates a default GradientBoostingRegressor.
best_model = GradientBoostingRegressor(
    loss='huber',
    learning_rate=1,
    n_estimators=100,
    max_depth=7,
    subsample=0.8,
    random_state=42,
)
best_model_2 = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.6366443216222826,
    n_estimators=128,
    max_depth=7,
    subsample=0.39654278232127016,
    min_samples_split=2,
    min_samples_leaf=4,
    min_impurity_decrease=0.09041586944937485,
)

# Empty grid: a single cross-validated fit with default hyperparameters.
param_grid_G = dict()
grid_G = grid_search(GradientBoostingRegressor(), preprocessor, param_grid_G)
test_score_param(grid_G)

Best parameters: {}
Best cross-validation score: 0.69
R2 score: 0.6929723535773081
RMSE: 8036.188145187117
MAE: 5089.240008987108


### XGBRegressor

In [15]:
from xgboost import XGBRegressor

# Empty grid: a single cross-validated fit of XGBRegressor with its defaults,
# followed by the test-split metrics.
param_grid_X = dict()
grid_X = grid_search(XGBRegressor(), preprocessor, param_grid_X)
test_score_param(grid_X)

Best parameters: {}
Best cross-validation score: 0.78
R2 score: 0.7794553473377556
RMSE: 6810.980241943497
MAE: 4143.559798296579


### LinearRegression

In [16]:
# Empty grid: a single cross-validated fit of plain LinearRegression,
# followed by the test-split metrics.
param_grid_L = dict()
grid_L = grid_search(LinearRegression(), preprocessor, param_grid_L)
test_score_param(grid_L)

Best parameters: {}
Best cross-validation score: 0.48
R2 score: 0.47880117486908236
RMSE: 10470.395057481466
MAE: 6949.226828693397


### DecisionTree Regressor

In [18]:
from sklearn.tree import DecisionTreeRegressor

# Empty grid: a single cross-validated fit of DecisionTreeRegressor with its
# defaults, followed by the test-split metrics.
param_grid_D = dict()
grid_D = grid_search(DecisionTreeRegressor(), preprocessor, param_grid_D)
test_score_param(grid_D)

Best parameters: {}
Best cross-validation score: 0.76
R2 score: 0.7796164362180659
RMSE: 6808.492370239102
MAE: 2718.284856456684


### RandomForestRegressor

In [19]:
from sklearn.ensemble import RandomForestRegressor

# Empty grid: a single cross-validated fit of RandomForestRegressor with its
# defaults, followed by the test-split metrics.
param_grid_R = dict()
grid_R = grid_search(RandomForestRegressor(), preprocessor, param_grid_R)
test_score_param(grid_R)