Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Load the used-car listings from the local CSV export.
df = pd.read_csv("car_data.csv")

# Define features (X) and target (y): the price is predicted from the other columns.
X = df.drop(columns=["car_price"])
y = df["car_price"]

# Identify categorical and numeric columns
categorical = ["car_brand", "car_model", "car_city", "car_fuel", "car_transmission", "car_drive", "car_country"]
numeric = ["car_mileage", "car_engine_capacity", "car_engine_hp", "car_age"]

# Define preprocessor. One-hot encode the categorical data and forward the
# numeric columns unchanged. The numeric columns are selected explicitly and
# everything else is dropped, so an unexpected extra column in the CSV (e.g. a
# saved row index) can neither leak into the model nor break fitting.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numeric),
], remainder="drop")

# Split into train/test sets, can't use stratification because the target is continuous
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Random Forest Regressor to build the model; preprocessing lives inside the
# pipeline so the saved artifact accepts raw feature columns at predict time.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

model.fit(X_train, y_train)

# Predict prices for the test set
y_pred = model.predict(X_test)

# Evaluate the MAE which is the average value of how far away the predicted price is from the real one
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.10f}")

# Reference numbers noted from an earlier run (n_estimators=100, unlimited max depth):
#   Mean Absolute Error: 204540.00, R² Score: 0.9049410372
# They depend on which CSV was loaded, so the values printed above may differ.

# Save the trained model to a file so that it doesn't need to be trained again
joblib.dump(model, "car_price_rf.pkl")

Mean Absolute Error: 288.46
R² Score: 0.9985139782


['car_price_rf.pkl']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import joblib
import numpy as np
import kagglehub
import os

# Download (or reuse the cached copy of) the Kaggle dataset and locate its CSV.
path = kagglehub.dataset_download("volkanastasia/dataset-of-used-cars")
# os.listdir returns entries in arbitrary order; sorting makes the pick deterministic.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of an IndexError on csv_files[0].
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
csv_file = csv_files[0]
# Load dataset
df = pd.read_csv(os.path.join(path, csv_file))
print(f"✓ Dataset loaded: {df.shape[0]} rows")

# Define features (X) and target (y): the price is predicted from the other columns.
X = df.drop(columns=["car_price"])
y = df["car_price"]

# Identify categorical and numeric columns
categorical = ["car_brand", "car_model", "car_city", "car_fuel", "car_transmission", "car_drive", "car_country"]
numeric = ["car_mileage", "car_engine_capacity", "car_engine_hp", "car_age"]

# Define preprocessor. One-hot encode the categorical data and forward the
# numeric columns unchanged. The numeric columns are selected explicitly and
# everything else is dropped, so an unexpected extra column in the CSV (e.g. a
# saved row index) can neither leak into the model nor break fitting.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numeric),
], remainder="drop")

# Split into train/test sets, can't use stratification because the target is continuous
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Random Forest Regressor to build the model; preprocessing lives inside the
# pipeline so the saved artifact accepts raw feature columns at predict time.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

model.fit(X_train, y_train)

# Predict prices for the test set
y_pred = model.predict(X_test)

# Evaluate the model: MAE is the average distance between predicted and real
# price; RMSE penalises large misses more heavily; R² is the explained variance.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.10f}")

# Reference numbers noted from an earlier run (n_estimators=100, unlimited max depth):
#   Mean Absolute Error: 204540.00, R² Score: 0.9049410372
# The values printed above are the authoritative ones for the current run.

# Save the trained model to a file so that it doesn't need to be trained again
joblib.dump(model, "car_price_rf.pkl")

Using Colab cache for faster access to the 'dataset-of-used-cars' dataset.
✓ Dataset loaded: 42089 rows
Mean Absolute Error: 199218.40
Root Mean Squared Error: 442819.80
R² Score: 0.9377565064


['car_price_rf.pkl']

In [None]:
# https://scikit-learn.org/stable/modules/neural_networks_supervised.html

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.neural_network import MLPRegressor
import joblib
import kagglehub
import os

# Download (or reuse the cached copy of) the Kaggle dataset and locate its CSV.
path = kagglehub.dataset_download("volkanastasia/dataset-of-used-cars")
# os.listdir returns entries in arbitrary order; sorting makes the pick deterministic.
csv_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of an IndexError on csv_files[0].
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
csv_file = csv_files[0]
df = pd.read_csv(os.path.join(path, csv_file))
print(f"✓ Dataset loaded: {df.shape[0]} rows")

# Define features (X) and target (y): the price is predicted from the other columns.
X = df.drop(columns=["car_price"])
y = df["car_price"]

# Identify categorical and numeric columns
categorical = ["car_brand", "car_model", "car_city", "car_fuel", "car_transmission", "car_drive", "car_country"]
numeric = ["car_mileage", "car_engine_capacity", "car_engine_hp", "car_age"]

# Define preprocessor. One-Hot encoding because there are categorical features.
# Columns not listed in either group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", StandardScaler(), numeric)  # Scale numeric features for NN
])

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Neural Network Regressor
# NOTE(review): the target is fed to the network unscaled; if training emits
# convergence warnings, scaling y (e.g. TransformedTargetRegressor) may help — not verified here.
nn_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", MLPRegressor(
        hidden_layer_sizes=(100, 50),  # Two hidden layers: 100 neurons -> 50 neurons
        activation='relu',
        solver='adam',
        alpha=0.001,  # L2 regularization parameter
        learning_rate='adaptive',
        max_iter=1000,
        random_state=42
    ))
])

# Train the model
nn_model.fit(X_train, y_train)

# Predict prices for the test set
y_pred = nn_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.10f}")

# Save the trained model
joblib.dump(nn_model, "car_price_nn.pkl")

Using Colab cache for faster access to the 'dataset-of-used-cars' dataset.
✓ Dataset loaded: 42089 rows




Mean Absolute Error: 196505.75
Root Mean Squared Error: 463982.88
R² Score: 0.9316649049


['car_price_nn.pkl']

Neural Network MLPRegressor() (Multi-layer Perceptron)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor
import joblib

# Read the car listings from the local CSV export.
df = pd.read_csv("car_data.csv")

# The price is the regression target; all remaining columns are candidate features.
y = df["car_price"]
X = df.drop(columns=["car_price"])

# Column groups that receive different preprocessing.
categorical = [
    "car_brand", "car_model", "car_city", "car_fuel",
    "car_transmission", "car_drive", "car_country",
]
numeric = ["car_mileage", "car_engine_capacity", "car_engine_hp", "car_age"]

# Categorical columns are one-hot encoded; numeric columns are standardised
# because the network is sensitive to feature scale.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", StandardScaler(), numeric),
])

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Multi-layer perceptron: two hidden layers (100 -> 50 neurons), ReLU
# activations, Adam optimiser, L2 penalty of 0.001.
mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
)

# Bundle preprocessing and the regressor so the saved artifact takes raw columns.
nn_model = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", mlp)])
nn_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = nn_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.10f}")

# Persist the fitted pipeline for later reuse.
joblib.dump(nn_model, "car_price_nn.pkl")

# Numbers noted from a previous run:
# Mean Absolute Error: 196505.75
# R² Score: 0.9316649049

Run a sample prediction with the saved model

In [None]:
import joblib
import pandas as pd

# Load the saved model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files you created or otherwise trust.
try:
    model = joblib.load("car_price_rf.pkl")
except (EOFError, ValueError) as exc:
    # A partially written or partially uploaded pickle surfaces as an opaque
    # "EOF: reading array data" error; point at the actual remedy instead.
    raise RuntimeError(
        "car_price_rf.pkl could not be read; the file looks truncated or corrupted. "
        "Re-run the training cell (or re-upload the file) to regenerate it."
    ) from exc

# Example input. The column names must match the features the pipeline was
# trained on (the categorical and numeric lists in the training cells).
# NOTE(review): the values below are illustrative placeholders — check category
# spellings and units against the dataset. Categories unseen during training are
# encoded as all zeros because the encoder uses handle_unknown="ignore".
# NOTE(review): if the CSV used for training had columns beyond these eleven and
# the pipeline was fitted with remainder="passthrough", those must be supplied too.
new_car = pd.DataFrame([{
    "car_brand": "Toyota",
    "car_model": "Corolla",
    "car_city": "Moscow",
    "car_fuel": "Petrol",
    "car_transmission": "automatic",
    "car_drive": "FWD",
    "car_country": "JP",
    "car_mileage": 45000,
    "car_engine_capacity": 1.8,
    "car_engine_hp": 140,
    "car_age": 5
}])

# Predict the price
predicted_price = model.predict(new_car)
print(f"Predicted Price: ${predicted_price[0]:,.2f}")

# Previously recorded predictions (presumably from an earlier model trained on a
# different feature schema — not comparable with the current pipelines):
# Predicted Price for rf: $38,596.89
# Predicted Price for nn: $34,668.41

ValueError: EOF: reading array data, expected 262144 bytes got 137456