# Regression Used Cars Dataset

https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data 


#### **Dependency**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from itertools import combinations  # For creating combinations of elements
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

#### **Utility Function**

In [2]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a dataframe to narrower dtypes to save memory.

    Signed and unsigned integer columns are narrowed to the smallest integer
    width (8/16/32 bit) that holds their observed min and max. 64-bit float
    columns are narrowed to float32 when their values fit in the float32 range;
    float16 is deliberately never used. Columns of any other dtype (object,
    bool, datetime, category and other extension dtypes) are left untouched.

    Note that float32 keeps only about 7 significant decimal digits, so the
    float downcast trades precision for memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    verbose : bool or int, default 0
        When truthy, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the narrowed columns.
    """
    # Memory footprint before any conversion, in MB
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate widths, narrowest first; int64/uint64 is the "no change" case
    signed_targets = (np.int8, np.int16, np.int32)
    unsigned_targets = (np.uint8, np.uint16, np.uint32)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, tz-aware dates)
        # are not plain numpy numbers, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in "iu":
            c_min = df[col].min()
            c_max = df[col].max()
            targets = signed_targets if col_type.kind == "i" else unsigned_targets
            for target in targets:
                # Never widen a column that is already this narrow
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break
                bounds = np.iinfo(target)
                # Inclusive bounds: a column whose min is -128 still fits int8.
                # An empty column yields NaN here, which fails both comparisons.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.kind == "f" and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            bounds = np.finfo(np.float32)
            # min() is NaN only when the column has no values at all, in which
            # case nothing can overflow.
            all_missing = pd.isna(c_min)
            # Values outside the float32 range would become inf, so such
            # columns stay float64.
            if all_missing or (bounds.min <= c_min and c_max <= bounds.max):
                df[col] = df[col].astype(np.float32)
        # bool, datetime, timedelta, complex and object columns fall through

    # Report the effect of the optimisation when requested
    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        # Guard against a frame that reports zero bytes
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Decreased by {decrease:.2f}%")

    return df

#### **Load Data**

Load the vehicles CSV from the data folder and downcast its numeric columns.

In [3]:
# Read the raw listings and downcast the numeric columns in one step;
# reduce_mem_usage returns the frame it was given.
df = reduce_mem_usage(
    pd.read_csv("/kaggle/input/vehicles/vehicles.csv"),
    verbose=True,
)
df.shape
# Preview the first four rows
df.head(4)

Memory usage of dataframe is 84.68 MB
Memory usage after optimization is: 76.54 MB
Decreased by 9.62%


Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,


#### **Split dataset** 

In [4]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(df_train.shape, df_test.shape)

(341504, 26) (85376, 26)


#### **Data Function**

Data preparation

feature engineering

In [5]:
# implement the function
def _largest_centroid_mask(values: pd.Series, n_clusters: int) -> np.ndarray:
    """Flag the entries that k-means puts in the cluster with the largest centre.

    Missing values are never flagged. Returns a positional boolean array of
    ``len(values)``; it is all False when fewer than two distinct values exist.
    """
    flagged = np.zeros(len(values), dtype=bool)
    known = values.notna().to_numpy()
    points = values.to_numpy(dtype=float)[known].reshape(-1, 1)
    n_distinct = np.unique(points).size
    if n_distinct < 2:
        return flagged
    # Fixed seed and explicit n_init keep the clustering repeatable
    kmeans = KMeans(n_clusters=min(n_clusters, n_distinct), n_init=10, random_state=0)
    labels = kmeans.fit_predict(points)
    # Cluster label numbers are arbitrary, so the extreme cluster has to be
    # located through its centre rather than assumed to be label 1.
    flagged[known] = labels == np.argmax(kmeans.cluster_centers_.ravel())
    return flagged


def data_preparation(df:pd.DataFrame)->"tuple[pd.DataFrame, pd.Series]":
    """
    Clean the raw listings and split them into features and target.

    Steps, in order:

    1. Fill missing ``odometer`` values with the mean of the readings that are
       not in the highest-valued k-means cluster (4 clusters).
    2. Drop rows whose ``price`` is below 1000 or falls in the higher of two
       k-means price clusters.
    3. Fill missing ``year`` values with the rounded prediction of a linear
       regression on ``price`` and ``odometer``.
    4. Fill missing values of several categorical/location columns with the
       column mode.
    5. Fill missing ``condition`` values with independent random draws from
       the observed conditions.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings; must contain the columns referenced above.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``x`` - every column except ``price``; ``y`` - the ``price`` column.
        Categorical columns in ``x`` are still strings; they are not encoded
        here.
    """
    # Work on a private copy so the caller's frame (often a slice of a larger
    # frame) is never written to.
    df = df.copy()

    # Mean odometer reading, ignoring NaN and the extreme-high cluster, is
    # used to fill the gaps. The extreme rows themselves are kept.
    odometer_outliers = _largest_centroid_mask(df["odometer"], n_clusters=4)
    average_value = df.loc[~odometer_outliers, "odometer"].mean()
    df["odometer"] = df["odometer"].fillna(average_value)

    # Remove price outliers together with listings priced below 1,000
    # ($1,000 is about 7,500 yuan; even for a second-hand car that is too low,
    # so such listings are treated as outliers as well).
    price_outliers = _largest_centroid_mask(df["price"], n_clusters=2)
    too_cheap = (df["price"] < 1000).to_numpy()
    df = df.loc[~(price_outliers | too_cheap)].copy()

    # Use linear regression to predict values for nulls in the year column
    features = ["price", "odometer"]
    df_complete = df.dropna(subset=["year"] + features)
    # Only rows with both predictors present can be predicted
    fillable = df["year"].isnull() & df[features].notna().all(axis=1)
    # predict() rejects an empty input, so skip the step when nothing is missing
    if fillable.any() and not df_complete.empty:
        regression_model_year = LinearRegression()
        regression_model_year.fit(df_complete[features], df_complete["year"])
        predicted_year = regression_model_year.predict(df.loc[fillable, features])
        df.loc[fillable, "year"] = predicted_year.round().astype(int)

    # Fill nulls with the column mode
    columns_to_fillna = ['manufacturer', 'cylinders', 'fuel', 'title_status', 'transmission',
                         'drive', 'type', 'paint_color', 'lat', 'long']
    mode_rows = df[columns_to_fillna].mode()
    if not mode_rows.empty:
        df[columns_to_fillna] = df[columns_to_fillna].fillna(mode_rows.iloc[0])

    # "condition": draw one replacement per missing row from the observed
    # (non-null) values, so common conditions are proportionally more likely
    # and NaN can never be chosen as a fill value.
    condition_missing = df["condition"].isnull()
    observed_conditions = df.loc[~condition_missing, "condition"].to_numpy()
    if condition_missing.any() and observed_conditions.size:
        df.loc[condition_missing, "condition"] = np.random.choice(
            observed_conditions, size=int(condition_missing.sum())
        )

    # Separate the target from the features
    y = df['price']
    x = df.drop(columns=['price'])

    return x, y

def feature_engineering(df:pd.DataFrame)->pd.DataFrame:
    """
    Derive ``vehicle_age`` and drop the columns that are not used as features.

    ``vehicle_age`` is the posting year minus the vehicle's ``year``. Rows
    with no ``posting_date`` take the most common posting year found in the
    frame, so that ``vehicle_age`` is not left as NaN for them; if no row has
    a posting date the age stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared listings. Must contain ``posting_date``, ``year`` and every
        column named in the drop list below. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``vehicle_age`` added and the unused
        columns removed.
    """
    # vehicle_age = posting_year - year, i.e. how long the vehicle has been in use
    posted = pd.to_datetime(df["posting_date"], utc=True)
    posting_year = posted.dt.year
    if posting_year.isna().any():
        known_years = posting_year.dropna()
        if not known_years.empty:
            # Impute with the mode of the known posting years
            posting_year = posting_year.fillna(known_years.mode().iloc[0])
    df["posting_date"] = posted
    df["posting_year"] = posting_year
    df["vehicle_age"] = df["posting_year"] - df["year"]

    # Remove the columns that are not used as model features
    df.drop(['id','url','region','region_url','model','title_status',
         'VIN','size','image_url','description','county',
         'state','lat','long','posting_date','posting_year'],
        axis=1, inplace=True)

    return df

def data_preprocessing(df:pd.DataFrame)->pd.DataFrame:
    """
    Run the full pipeline: data preparation followed by feature engineering.

    Returns a pair: the engineered feature frame and the target that
    ``data_preparation`` split off.
    """
    features, target = data_preparation(df)
    return feature_engineering(features), target


In [6]:
# Each split goes through data_preprocessing separately, so the fill values
# and outlier clusters are computed from that split's own rows.
df_train_x,df_train_y = data_preprocessing(df_train)
df_test_x, df_test_y = data_preprocessing(df_test)

# Sanity check: feature rows and target rows must line up
print(df_train_x.shape, df_train_y.shape)



(304302, 11) (304302,)


#### **Training and Evaluation**

In [None]:
numJobs=3
cv=5
paramgrid={}

# LinearRegression needs numeric input, but the prepared frames still hold
# string categories. One-hot encode them, then give the test frame exactly the
# training columns: categories unseen in training are dropped, and categories
# absent from the test split become all-zero columns.
train_features = pd.get_dummies(df_train_x)
test_features = pd.get_dummies(df_test_x).reindex(
    columns=train_features.columns, fill_value=0
)

# Cross-validation. The search object itself must be fitted before it can
# predict; with an empty grid it scores the default model on each fold and
# then refits it on the full training set.
modelCV = GridSearchCV(LinearRegression(), paramgrid, n_jobs=numJobs, cv=cv)
modelCV.fit(train_features, df_train_y)
model = modelCV.best_estimator_

# Predict on the held-out split
y_pred = modelCV.predict(test_features)

# Evaluation

# RMSE
rmse = np.sqrt(mean_squared_error(df_test_y, y_pred))
print(f"Root Mean Squared Error: {rmse}")

# MAE
mae = mean_absolute_error(df_test_y, y_pred)
print(f"Mean Absolute Error: {mae}")

# R2
r2 = r2_score(df_test_y, y_pred)
print(f"R-squared: {r2}")

# Residual analysis: residuals should scatter evenly around the zero line
residuals = df_test_y - y_pred

plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()