In [1]:
import pandas as pd

df = pd.read_csv('used_cars_data_v3.csv')
df['back_legroom'] = pd.to_numeric(df['back_legroom'].str.replace(' in', ''), errors='coerce')
df['front_legroom'] = pd.to_numeric(df['front_legroom'].str.replace(' in', ''), errors='coerce')
df['fuel_tank_volume'] = pd.to_numeric(df['fuel_tank_volume'].str.replace(' gal', ''), errors='coerce')
df['height'] = pd.to_numeric(df['height'].str.replace(' in', ''), errors='coerce')
df['length'] = pd.to_numeric(df['length'].str.replace(' in', ''), errors='coerce')
df['wheelbase'] = pd.to_numeric(df['wheelbase'].str.replace(' in', ''), errors='coerce')
df['width'] = pd.to_numeric(df['width'].str.replace(' in', ''), errors='coerce')
df['power'] = df['power'].str.extract('(\d+)').fillna(0).astype(int)
df['torque'] = df['torque'].str.extract('(\d+)').fillna(0).astype(int)

In [2]:
# replace NaN values in fleet, frame_damaged, isCab, salvage, theft_title with false
bool_cols = ['fleet', 'frame_damaged', 'isCab', 'salvage', 'theft_title']
for col in bool_cols:
  df[col] = df[col].fillna(False)

In [3]:
df = df.drop(columns=['horsepower', 'engine_type', 'transmission', 'franchise_make', 'fuel_type', 'fleet', 'frame_damaged', 'isCab', 'salvage', 'theft_title', 'is_new', 'maximum_seating'])

# XGBoost

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


## One-hot encoding

In [5]:
# # Ref: https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/
# selected_columns = df.select_dtypes(include=['bool', 'object'])
# # Get the column names
# column_names = selected_columns.columns.tolist()
# # Perform one-hot encoding
# one_hot_encoded_data = pd.get_dummies(df, columns=column_names)
# # print(one_hot_encoded_data.columns.tolist())
# # print(one_hot_encoded_data.dtypes)


from math import ceil

# Define batch size
batch_size = 1000

# Get the total number of batches
num_batches = ceil(len(df) / batch_size)

# Process each batch
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size if (i + 1) * batch_size < len(df) else len(df)
    
    # Extract the current batch
    current_batch = df.iloc[start_idx:end_idx, :]
    
    # Perform one-hot encoding on the current batch
    one_hot_encoded_batch = pd.get_dummies(current_batch, columns=current_batch.select_dtypes(include=['bool', 'object']).columns)

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


In [7]:
# one_hot_encoded_data.columns[one_hot_encoded_data.isna().any()].tolist()
one_hot_encoded_data = one_hot_encoded_batch

In [8]:
specific_columns = ['back_legroom',
 'city_fuel_economy',
 'engine_displacement',
 'front_legroom',
 'fuel_tank_volume',
 'height',
 'highway_fuel_economy',
 'length',
 'mileage',
 'owner_count',
 'wheelbase',
 'width']
data_types = one_hot_encoded_data[specific_columns].dtypes
data_types

back_legroom            float64
city_fuel_economy       float64
engine_displacement     float64
front_legroom           float64
fuel_tank_volume        float64
height                  float64
highway_fuel_economy    float64
length                  float64
mileage                 float64
owner_count             float64
wheelbase               float64
width                   float64
dtype: object

In [9]:
# Define the specific columns
specific_columns = ['back_legroom',
 'city_fuel_economy',
 'engine_displacement',
 'front_legroom',
 'fuel_tank_volume',
 'height',
 'highway_fuel_economy',
 'length',
 'mileage',
 'owner_count',
 'wheelbase',
 'width']

# Calculate the mean of each selected column
column_means = one_hot_encoded_data[specific_columns].mean()

# Replace null values in each selected column with its respective mean
one_hot_encoded_data[specific_columns] = one_hot_encoded_data[specific_columns].fillna(column_means)

# Check if there are any null values left
print(one_hot_encoded_data.isnull().any().tolist())

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]


### Combining the numeric and categorical

In [10]:
# Create a new dataframe with only the training data
# Drop non-numeric columns (temporary approach for simplicity)
df_numeric = df.select_dtypes(include = ['float64', 'int64'], exclude=['object'])
# print(df_numeric.dtypes)
# print(df_numeric.columns.to_list())
# Extracting y (dependent variable)
y = df_numeric["price"]

df_numeric = df.drop('price', axis=1)
one_hot_encoded_data = one_hot_encoded_data.drop('price', axis=1)
# print(df_numeric)
print(one_hot_encoded_data.columns.to_list())
# numeric_columns = list(df_numeric.columns.tolist())
one_hot_columns = list(one_hot_encoded_data.columns.tolist())
X = one_hot_encoded_data

# print(X.columns.tolist())

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

# # print(X_train.head())

# # # Train the model on the training data
# model = LinearRegression().fit(X_train, y_train)

# # # Make predictions on the test set
# y_pred = model.predict(X_test)
# y_pred

['back_legroom', 'city_fuel_economy', 'daysonmarket', 'engine_displacement', 'front_legroom', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'latitude', 'length', 'longitude', 'mileage', 'owner_count', 'power', 'savings_amount', 'seller_rating', 'sp_id', 'torque', 'wheelbase', 'width', 'year', 'body_type_Coupe', 'body_type_Hatchback', 'body_type_Minivan', 'body_type_Pickup Truck', 'body_type_SUV / Crossover', 'body_type_Sedan', 'body_type_Van', 'body_type_Wagon', 'franchise_dealer_False', 'franchise_dealer_True', 'listing_color_BLACK', 'listing_color_BLUE', 'listing_color_GOLD', 'listing_color_GRAY', 'listing_color_GREEN', 'listing_color_RED', 'listing_color_SILVER', 'listing_color_UNKNOWN', 'listing_color_WHITE', 'make_name_BMW', 'make_name_Buick', 'make_name_Chevrolet', 'make_name_Dodge', 'make_name_FIAT', 'make_name_Ford', 'make_name_Jaguar', 'make_name_Jeep', 'make_name_MINI', 'make_name_Nissan', 'make_name_Scion', 'make_name_Subaru', 'make_name_Toyota', 'make_name_Volkswage

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming df is your original DataFrame
df_numeric = df.select_dtypes(include=['float64', 'int64'], exclude=['object'])

# Extracting y (dependent variable)
y = df_numeric["price"]

# Drop the target variable from df_numeric
df_numeric = df_numeric.drop('price', axis=1)

# One-hot encode the remaining categorical columns
one_hot_encoded_data = pd.get_dummies(df_numeric)

# Extract the column names for later reference
one_hot_columns = list(one_hot_encoded_data.columns)

# Assign one-hot encoded data to X
X = one_hot_encoded_data.values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.4, random_state=42)

# Specify XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # for regression tasks
    'colsample_bytree': 0.8,
    'learning_rate': 0.01,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100
}

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train the model on the training data
num_round = 50  # The num_round parameter controls the number of boosting rounds or iterations. This training process is performed in batches, with each batch represented by a subset of the training data.
model = xgb.train(params, dtrain, num_round)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Parameters: { "n_estimators" } are not used.



Mean Squared Error: 235469359.82729602


## Evaluations

### Feature Importance

In [16]:
# Extract the column names for later reference
one_hot_columns = list(one_hot_encoded_data.columns)

# Get feature importances
feature_importances = model.get_score(importance_type='weight')

# Map feature indices to column names
feature_importances = {one_hot_columns[int(k[1:])]: v for k, v in feature_importances.items()}

# Sort the features by importance
sorted_feature_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Output the most important features with column names
print("Most Important Features:")
for feature, importance in sorted_feature_importances:
    print(f"{feature}: {importance}")

Most Important Features:
power: 279.0
mileage: 153.0
engine_displacement: 136.0
fuel_tank_volume: 135.0
width: 133.0
torque: 120.0
year: 118.0
height: 104.0
back_legroom: 83.0
front_legroom: 80.0
wheelbase: 57.0
length: 51.0
owner_count: 35.0
highway_fuel_economy: 22.0
city_fuel_economy: 14.0
longitude: 8.0
savings_amount: 8.0
sp_id: 4.0
daysonmarket: 1.0


### Coefficient of determination (R^2)

In [18]:
from sklearn.metrics import r2_score
# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print(f"Coefficient of determination (R-squared): {r_squared}")

Coefficient of determination (R-squared): 0.3933524261898541


### Mean Squared Error (MSE)

In [19]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

MSE = metrics.mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error: {MSE}")

Mean Squared Error: 235469359.82729602


### Mean Absolute Error (MAE)

In [20]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

MAE = metrics.mean_absolute_error(y_test,y_pred)
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 9047.317431377149


### Root Mean Squared Error (RMSE)

In [21]:
# Ref: https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606
from sklearn import metrics

RMSE = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
print(f"Root Mean Squared Error: {RMSE}")

Root Mean Squared Error: 15345.010909976441
