In [1]:
import pandas as pd

df = pd.read_csv('used_cars_data_v3.csv')
df['back_legroom'] = pd.to_numeric(df['back_legroom'].str.replace(' in', ''), errors='coerce')
df['front_legroom'] = pd.to_numeric(df['front_legroom'].str.replace(' in', ''), errors='coerce')
df['fuel_tank_volume'] = pd.to_numeric(df['fuel_tank_volume'].str.replace(' gal', ''), errors='coerce')
df['height'] = pd.to_numeric(df['height'].str.replace(' in', ''), errors='coerce')
df['length'] = pd.to_numeric(df['length'].str.replace(' in', ''), errors='coerce')
df['wheelbase'] = pd.to_numeric(df['wheelbase'].str.replace(' in', ''), errors='coerce')
df['width'] = pd.to_numeric(df['width'].str.replace(' in', ''), errors='coerce')
df['power'] = df['power'].str.extract('(\d+)').fillna(0).astype(int)
df['torque'] = df['torque'].str.extract('(\d+)').fillna(0).astype(int)

In [2]:
# replace NaN values in fleet, frame_damaged, isCab, salvage, theft_title with false
bool_cols = ['fleet', 'frame_damaged', 'isCab', 'salvage', 'theft_title']
for col in bool_cols:
  df[col] = df[col].fillna(False)

  df[col] = df[col].fillna(False)


In [3]:
df = df.drop(columns=['horsepower', 'engine_type', 'transmission', 'franchise_make', 'fuel_type', 'fleet', 'frame_damaged', 'isCab', 'salvage', 'theft_title', 'is_new', 'maximum_seating'])

# Baseline Model - Linear Regression

In [4]:
df.dtypes
# body_type                object
# listing_color            object
# make_name                object
# wheel_system             object


back_legroom            float64
body_type                object
city_fuel_economy       float64
daysonmarket              int64
engine_displacement     float64
franchise_dealer           bool
front_legroom           float64
fuel_tank_volume        float64
height                  float64
highway_fuel_economy    float64
latitude                float64
length                  float64
listing_color            object
longitude               float64
make_name                object
mileage                 float64
owner_count             float64
power                     int64
price                   float64
torque                    int64
wheel_system             object
wheelbase               float64
width                   float64
year                      int64
dtype: object

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


## One-hot encoding

In [6]:
# Ref: https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/
selected_columns = df.select_dtypes(include=['bool', 'object'])
# Get the column names
column_names = selected_columns.columns.tolist()
# Perform one-hot encoding
one_hot_encoded_data = pd.get_dummies(df, columns=column_names)
print(one_hot_encoded_data.columns.tolist())
print(one_hot_encoded_data.dtypes)

['back_legroom', 'city_fuel_economy', 'daysonmarket', 'engine_displacement', 'front_legroom', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'latitude', 'length', 'longitude', 'mileage', 'owner_count', 'power', 'price', 'torque', 'wheelbase', 'width', 'year', 'body_type_Convertible', 'body_type_Coupe', 'body_type_Hatchback', 'body_type_Minivan', 'body_type_Pickup Truck', 'body_type_SUV / Crossover', 'body_type_Sedan', 'body_type_Van', 'body_type_Wagon', 'franchise_dealer_False', 'franchise_dealer_True', 'listing_color_BLACK', 'listing_color_BLUE', 'listing_color_BROWN', 'listing_color_GOLD', 'listing_color_GRAY', 'listing_color_GREEN', 'listing_color_ORANGE', 'listing_color_PINK', 'listing_color_PURPLE', 'listing_color_RED', 'listing_color_SILVER', 'listing_color_TEAL', 'listing_color_UNKNOWN', 'listing_color_WHITE', 'listing_color_YELLOW', 'make_name_AM General', 'make_name_AMC', 'make_name_Acura', 'make_name_Alfa Romeo', 'make_name_Allard', 'make_name_Ariel', 'make_name_Aston 

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [8]:
one_hot_encoded_data.columns[one_hot_encoded_data.isna().any()].tolist()

['back_legroom',
 'city_fuel_economy',
 'engine_displacement',
 'front_legroom',
 'fuel_tank_volume',
 'height',
 'highway_fuel_economy',
 'length',
 'mileage',
 'owner_count',
 'wheelbase',
 'width']

In [9]:
specific_columns = ['back_legroom',
 'city_fuel_economy',
 'engine_displacement',
 'front_legroom',
 'fuel_tank_volume',
 'height',
 'highway_fuel_economy',
 'length',
 'mileage',
 'owner_count',
 'wheelbase',
 'width']
data_types = one_hot_encoded_data[specific_columns].dtypes
data_types

back_legroom            float64
city_fuel_economy       float64
engine_displacement     float64
front_legroom           float64
fuel_tank_volume        float64
height                  float64
highway_fuel_economy    float64
length                  float64
mileage                 float64
owner_count             float64
wheelbase               float64
width                   float64
dtype: object

In [10]:
# Define the specific columns
specific_columns = ['back_legroom',
 'city_fuel_economy',
 'engine_displacement',
 'front_legroom',
 'fuel_tank_volume',
 'height',
 'highway_fuel_economy',
 'length',
 'mileage',
 'owner_count',
 'wheelbase',
 'width']

# Calculate the mean of each selected column
column_means = one_hot_encoded_data[specific_columns].mean()

# Replace null values in each selected column with its respective mean
one_hot_encoded_data[specific_columns] = one_hot_encoded_data[specific_columns].fillna(column_means)

In [11]:
# Check if there are any null values left
print(one_hot_encoded_data.isnull().any().tolist())

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False

### Simple Approach - only using numeric

In [12]:
# # Create a new dataframe with only the training data
# # Drop non-numeric columns (temporary approach for simplicity)
# df_numeric = df.select_dtypes(include = ['float64', 'int64']).fillna(0)

# X = df_numeric.drop("price", axis=1)
# # Extracting y (dependent variable)
# y = df_numeric["price"]

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train the model on the training data
# model = LinearRegression().fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)
# y_pred

### Combining the numeric and categorical - not working?

In [13]:
# Create a new dataframe with only the training data
# Drop non-numeric columns (temporary approach for simplicity)
df_numeric = df.select_dtypes(include = ['float64', 'int64'], exclude=['object'])
# print(df_numeric.dtypes)
print(df_numeric.columns.to_list())
# Extracting y (dependent variable)
y = df_numeric["price"]

df_numeric = df.drop('price', axis=1)
one_hot_encoded_data = one_hot_encoded_data.drop('price', axis=1)
# print(df_numeric)
print(one_hot_encoded_data.columns.to_list())
# numeric_columns = list(df_numeric.columns.tolist())
one_hot_columns = list(one_hot_encoded_data.columns.tolist())
X = one_hot_encoded_data

print(X.columns.tolist())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

print(X_train.head())

# # Train the model on the training data
model = LinearRegression().fit(X_train, y_train)

# # Make predictions on the test set
y_pred = model.predict(X_test)
y_pred

['back_legroom', 'city_fuel_economy', 'daysonmarket', 'engine_displacement', 'front_legroom', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'latitude', 'length', 'longitude', 'mileage', 'owner_count', 'power', 'price', 'torque', 'wheelbase', 'width', 'year']
['back_legroom', 'city_fuel_economy', 'daysonmarket', 'engine_displacement', 'front_legroom', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'latitude', 'length', 'longitude', 'mileage', 'owner_count', 'power', 'torque', 'wheelbase', 'width', 'year', 'body_type_Convertible', 'body_type_Coupe', 'body_type_Hatchback', 'body_type_Minivan', 'body_type_Pickup Truck', 'body_type_SUV / Crossover', 'body_type_Sedan', 'body_type_Van', 'body_type_Wagon', 'franchise_dealer_False', 'franchise_dealer_True', 'listing_color_BLACK', 'listing_color_BLUE', 'listing_color_BROWN', 'listing_color_GOLD', 'listing_color_GRAY', 'listing_color_GREEN', 'listing_color_ORANGE', 'listing_color_PINK', 'listing_color_PURPLE', 'listing_color_RED', 

array([21361.49781703, 37661.56576754, 11996.04510154, ...,
        1039.42321445, 53145.24851545, 10422.86292539])

## Evaluations

### Intercept and Coefficients

In [14]:
# Ref: https://realpython.com/linear-regression-in-python/
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}")


intercept: -1389550.4049814704
slope: [ 4.91432294e+02  3.72433765e+02  6.58791252e+00  2.63702883e+00
  2.20904411e+02  2.63858258e+02  1.10098460e+02 -4.70066404e+02
 -5.55193716e+01  2.60393539e+02 -3.37063852e+01 -1.39510153e-01
  3.46439727e+02 -7.62156882e+00  9.71373969e+00 -2.38828381e+02
  2.91929363e+01  7.10233193e+02  1.07233805e+04  1.01400916e+04
  2.30625650e+03 -1.87986163e+03 -2.68996720e+03 -4.88376984e+02
 -1.37443654e+03 -6.88525638e+03 -4.50489988e+02 -1.61687696e+03
  1.61687696e+03 -5.18899342e+02 -8.74484799e+02 -3.01713329e+02
 -1.19961854e+03 -1.09477301e+03  1.47116480e+03 -3.09211829e+01
  4.77990987e+03  6.76970242e+02 -5.52679245e+02 -1.61003526e+03
 -1.99528138e+03 -1.84340878e+01 -8.13767157e+02  2.08256242e+03
 -1.60135588e+04 -4.25586954e+04 -6.82431432e+04 -6.18911409e+04
 -6.54836185e-11 -6.40534644e+04  5.05711359e+04 -5.84213196e+04
 -4.79485607e-09 -2.48118410e+04 -5.83433812e+04  6.70951475e+03
 -1.05501385e-09  1.10991076e+06 -7.42469902e+04 -6.

### Coefficient of determination (R^2)

In [15]:
# Ref: https://realpython.com/linear-regression-in-python/
r_sq = model.score(X_test, y_test)
print(f"Coefficient of determination: {r_sq}")

Coefficient of determination: 0.33815353818342864


### Mean Squared Error (MSE)

In [16]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

MSE = metrics.mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error: {MSE}")

Mean Squared Error: 256894726.68472502


### Mean Absolute Error (MAE)

In [17]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

MAE = metrics.mean_absolute_error(y_test,y_pred)
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 6029.5757984776865


### Root Mean Squared Error (RMSE)

In [18]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

RMSE = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
print(f"Root Mean Squared Error: {RMSE}")

Root Mean Squared Error: 16027.935821082046
