# Regression Used Cars Dataset

https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data 


#### **Dependency**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from itertools import combinations  # For creating combinations of elements
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

#### **Utility Function**

In [2]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    # Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    # 🔄 Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype
        # Check if the column's data type is not 'object' (i.e., numeric)
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    # ℹ️ Provide memory optimization information if 'verbose' is True
    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    # Return the DataFrame with optimized memory usage
    return df

#### **Load Data**

load csv inside data folder

In [3]:
df = pd.read_csv("kaggle/input/vehicles/vehicles.csv")
df=reduce_mem_usage(df,verbose=True)
df.shape
df[0:4]

Memory usage of dataframe is 84.68 MB
Memory usage after optimization is: 76.54 MB
Decreased by 9.62%


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,


#### **Split dataset** 

In [4]:
df_train, df_test = train_test_split(df, test_size=0.2)
print(df_train.shape, df_test.shape)

(341504, 26) (85376, 26)


#### **Data Function**

data preperation

feature engineering

In [6]:
# implement the function
def data_preparation(df:pd.DataFrame)->pd.DataFrame:
    """"
    make data valid
    """
    #Create df1, get the average value of deleting NAN and outliers, then assign it to the NAN in df, 
    #and then delete the outliers in df to get the latest data set
    df1 = df.copy()
    odometer_data = df1['odometer']
    odometer_data_cleaned = odometer_data.dropna()
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(odometer_data_cleaned.values.reshape(-1, 1))

    outliers = odometer_data_cleaned[kmeans.labels_ == 1]
    df1 = df1[~df1['odometer'].isin(outliers)]
    average_value = df1['odometer'].mean()
    df['odometer'] = df['odometer'].fillna(average_value)
    
    #Use Kmeans to calculate outliers and delete them together with data with a price less than 1,000
    #($1,000 is about ￥7,500. Even the price of a second-hand car is too low, so it is also regarded as an outlier and deleted)
    price_data = df['price']
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(price_data.values.reshape(-1, 1))
    outliers = price_data[kmeans.labels_ == 1]

    df = df[~((price_data.isin(outliers)) | (price_data < 1000))]
    
    #use linear regression to prediect values for null in year col.
    features = ["price",'odometer']
    df_complete = df.dropna(subset=["year"] + features)
    X_train = df_complete[features]
    y_train = df_complete["year"]
    regression_model_year = LinearRegression()
    regression_model_year.fit(X_train, y_train)
    X_missing = df[df["year"].isnull()][features]
    predicted_year = regression_model_year.predict(X_missing)

    df.loc[df["year"].isnull(), "year"] = predicted_year.round().astype(int)
    
    #Fill null values using the modes of "manufacturer", "cylinders", 'fuel',"title_status", "transmission","drive", "type", "paint_color", "lat", "long"
    columns_to_fillna = ['manufacturer', 'cylinders', 'fuel', 'title_status', 'transmission',
                     'drive', 'type', 'paint_color', 'lat', 'long','posting_date']
    modes = df[columns_to_fillna].mode().iloc[0]
    df.loc[:, columns_to_fillna] = df.loc[:, columns_to_fillna].fillna(value=modes)
    
    #for "condition" col: Fill NAN with randomly selected data from that column to manipulate the more possible results.
    condition_options = df["condition"].unique()
    df.loc[df["condition"].isnull(), "condition"] = np.random.choice(condition_options)

    # turn nominal data into number
    y = df['price']
    x =df.drop(columns=['price'])


    return x,y

def feature_engineering(df:pd.DataFrame)->pd.DataFrame:
    """"
    select useful feature
    """
    #vehicle_age=posting_year - year, get the usage time of this vehicle
    df["posting_date"] = pd.to_datetime(df["posting_date"], utc=True)
    df["posting_year"] = df["posting_date"].dt.year
    df["vehicle_age"] = df["posting_year"] - df["year"]
    
    #Remove meaningless columns
    df.drop(['id','url','region','region_url','model','title_status',
         'image_url','VIN','size','county','description',
         'state','lat','long','posting_date','posting_year'], 
        axis=1, inplace=True)
    
    return df

def data_preprocessing(df:pd.DataFrame)->pd.DataFrame:
    x , y =data_preparation(df)
    x=feature_engineering(x)
    return x,y


In [7]:
df_train_x,df_train_y = data_preprocessing(df_train)
df_test_x, df_test_y = data_preprocessing(df_test)

print(df_train_x.shape, df_train_y.shape)
df_train_x[0:10]

(304375, 11) (304375,)


Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,vehicle_age
173190,1968.0,ford,fair,8 cylinders,gas,9000.0,automatic,rwd,coupe,custom,53.0
217477,2012.0,chevrolet,good,6 cylinders,gas,120795.0,automatic,4wd,SUV,silver,9.0
277451,2008.0,nissan,excellent,6 cylinders,gas,145000.0,automatic,4wd,SUV,brown,13.0
56055,1951.0,gmc,fair,6 cylinders,gas,34000.0,manual,rwd,pickup,white,70.0
96833,2015.0,cadillac,like new,6 cylinders,gas,50809.0,automatic,4wd,other,silver,6.0
30997,2016.0,chevrolet,like new,6 cylinders,gas,56000.0,automatic,fwd,SUV,white,5.0
121764,2009.0,honda,excellent,3 cylinders,gas,1.0,automatic,fwd,sedan,silver,12.0
267037,2015.0,chevrolet,good,8 cylinders,other,51677.0,other,4wd,other,black,6.0
113401,2011.0,hyundai,good,4 cylinders,gas,137000.0,automatic,fwd,hatchback,black,10.0
20898,2015.0,ford,like new,6 cylinders,gas,78379.0,automatic,4wd,sedan,white,6.0


#### **Training and Evaluation**

In [8]:
numJobs=3
cv=5
paramgrid={}

model = LinearRegression().fit(df_train_x, df_train_y)
#cross validation 
modelCV= GridSearchCV(model,paramgrid,n_jobs=numJobs,cv=cv)

#predict
y_pred = modelCV.predict(df_test_x)

#evaluation

# RMSE
rmse = np.sqrt(mean_squared_error(df_test_y, y_pred))
print(f"Root Mean Squared Error: {rmse}")

# MAE
mae = mean_absolute_error(df_test_y, y_pred)
print(f"Mean Absolute Error: {mae}")

# R2
r2 = r2_score(df_test_y, y_pred)
print(f"R-squared: {r2}")

#Residual Analysis: 
residuals = df_test_y - y_pred

plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

ValueError: could not convert string to float: 'ford'

#### **Training and Evaluation by Hessia**

### Label and Normalization


In [9]:
preprocessor = make_column_transformer(
    (OneHotEncoder(), ["manufacturer",'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'type', 'paint_color' ]),
    (StandardScaler(), ["year","odometer","vehicle_age"]),
    remainder="passthrough"
)

## Regressor


#### RandomSearch, GridSearch and Test Function 

In [10]:
def random_search(_model,_preprocessor,_param_dist):
    pipeline = make_pipeline(_preprocessor, _model)
    random = RandomizedSearchCV(pipeline, param_distributions=_param_dist, n_iter=100, cv=5, n_jobs=16)
    random.fit(df_train_x, df_train_y)
    # Print the best hyperparameters and the corresponding score
    print("Best parameters: {}".format(random.best_params_))
    print("Best cross-validation score: {:.2f}".format(random.best_score_))
    return random


def grid_search(_model,_preprocessor,_param_grid):
    # Define the Gradient Boosting regression model
    # Define the pipeline to preprocess the features and fit the model
    pipeline = make_pipeline(_preprocessor, _model)
    grid = GridSearchCV(pipeline, param_grid=_param_grid, cv=5, n_jobs=16)
    grid.fit(df_train_x, df_train_y)
    # Print the best hyperparameters and the corresponding score
    print("Best parameters: {}".format(grid.best_params_))
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    return grid
    
def test_score(_model,_preprocessor):
    best_pipeline = make_pipeline(_preprocessor, _model)
    best_pipeline.fit(df_train_x, df_train_y)
    df_test_pred = best_pipeline.predict(df_test_x)
    rmse = np.sqrt(mean_squared_error(df_test_y, df_test_pred))
    mae = mean_absolute_error(df_test_y, df_test_pred)
    r2 = r2_score(df_test_y, df_test_pred)
    print("R2 score: {}".format(r2))
    print("RMSE: {}".format(rmse))
    print("MAE: {}".format(mae))
def test_score_param(_grid):
    df_test_pred = _grid.best_estimator_.predict(df_test_x)
    rmse = np.sqrt(mean_squared_error(df_test_y, df_test_pred))
    mae = mean_absolute_error(df_test_y, df_test_pred)
    r2 = r2_score(df_test_y, df_test_pred)
    print("R2 score: {}".format(r2))
    print("RMSE: {}".format(rmse))
    print("MAE: {}".format(mae))



#### run RandomSearch

In [11]:
param_dist = {
        "gradientboostingregressor__n_estimators": sp_randint(50, 200),
        "gradientboostingregressor__max_depth": sp_randint(5, 10),
        "gradientboostingregressor__learning_rate": uniform(0.0, 1.0),
        "gradientboostingregressor__subsample": uniform(0.0, 1.0),
        "gradientboostingregressor__loss": ['squared_error', 'quantile', 'huber', 'absolute_error'],
        "gradientboostingregressor__min_samples_split": sp_randint(2, 10),
        "gradientboostingregressor__min_samples_leaf": sp_randint(1, 10),
        "gradientboostingregressor__max_features": ['sqrt', 'log2', None],
        "gradientboostingregressor__min_impurity_decrease": uniform(0.0, 0.1)
    }


In [31]:
param_grid = {
        "gradientboostingregressor__n_estimators": [120,128,135,145],
        "gradientboostingregressor__max_depth": [6, 7, 8],
        "gradientboostingregressor__learning_rate": [0.55,0.6366443216222826,0.7,0.75],
        "gradientboostingregressor__subsample": [0.3, 0.39654278232127016, 0.5],
        "gradientboostingregressor__loss": ['huber'],
        "gradientboostingregressor__min_samples_split": [1,2,3],
        "gradientboostingregressor__min_samples_leaf": [3,4,5],
        "gradientboostingregressor__max_features": ['sqrt', 'log2', None],
        "gradientboostingregressor__min_impurity_decrease": [0.8,0.09041586944937485, 0.1]
    }


## GradientBoostingRegressor

In [13]:
best_model = GradientBoostingRegressor(
        n_estimators=100,
        max_depth=7,
        learning_rate=1,
        subsample=0.8,
        loss='huber',
        random_state=42
    )
best_model_2=GradientBoostingRegressor(
        n_estimators=128,
        max_depth=7,
        learning_rate=0.6366443216222826,
        subsample=0.39654278232127016,
        loss='huber',
        min_samples_leaf=4,
        min_impurity_decrease=0.09041586944937485,
        min_samples_split=2
)
param_grid_G={}
grid_G=grid_search(GradientBoostingRegressor(),preprocessor,param_grid_G)
test_score_param(grid_G)

Best parameters: {}
Best cross-validation score: -0.31
R2 score: 0.16094345662558918
RMSE: 3303833.9760006675
MAE: 46915.341121454476


## XGBRegressor

In [14]:
from xgboost import XGBRegressor
param_grid_X={}
grid_X=grid_search(XGBRegressor(),preprocessor,param_grid_X)
test_score_param(grid_X)

R2 score: 0.3429775335073124
RMSE: 4681957.902403922
MAE: 61732.29532345271


## LinearRegression

In [12]:

param_grid_L={}
grid_L=grid_search(LinearRegression(),preprocessor,param_grid_L)
test_score_param(grid_L)

R2 score: -0.0002456337743594883
RMSE: 5762778.706001976
MAE: 105652.0382011966


### Random Forest Regression

In [15]:
param_grid_R={}
grid_R=grid_search(LinearRegression(),preprocessor,param_grid_R)
test_score_param(grid_R)

### DecisionTreeRegressor


In [13]:
from sklearn.tree import DecisionTreeRegressor

param_grid_D={}
grid_D=grid_search(DecisionTreeRegressor(),preprocessor,param_grid_D)
test_score_param(grid_D)

R2 score: -0.7986807042236448
RMSE: 7727795.990752676
MAE: 58473.15672608925
