# BMW Car Sales â€” Modeling and Prediction
### INFO-523 Final Project | Min Set Khant (Solo)

In [125]:
# 1. SETUP AND IMPORTS
# pandas for data manipulation and DataFrame operations
import pandas as pd
# numpy for numerical operations, especially array handling
import numpy as np
# os for operating system interaction (e.g., creating directories)
import os 

# Scikit-learn modules for modeling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error # <-- All necessary metrics
# Matplotlib and Seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [126]:
# Paths
DATA_PATH = "../data/cleaned/bmw_modeling_ready.csv"
# Load the dataset
df = pd.read_csv(DATA_PATH)
df.head()
OUTPUT_DIR = "outputs" # Using a local 'outputs' folder

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created/verified: {OUTPUT_DIR}")

Output directory created/verified: outputs


In [127]:
# Load the dataset
try:
    df = pd.read_csv(DATA_PATH)
    print(f"Data loaded successfully from {DATA_PATH}. Shape: {df.shape}")
except FileNotFoundError:
    print(f"ERROR: File not found at {DATA_PATH}. Please check your file path.")
    exit()

# Display initial rows to confirm load
print("\nFirst 5 rows of the dataset:")
print(df.head())

Data loaded successfully from ../data/cleaned/bmw_modeling_ready.csv. Shape: (50000, 38)

First 5 rows of the dataset:
   Year  Engine_Size_L  Mileage_KM  Price_USD  Sales_Volume  \
0  2016            3.5      151748      98740          8300   
1  2013            1.6      121671      79219          3428   
2  2022            4.5       10991     113265          6994   
3  2024            1.7       27255      60971          4047   
4  2020            2.1      122131      49898          3080   

  Sales_Classification  Car_Age  Log_Price_USD  Price_per_KM Engine_Bin  \
0                 High        9      11.500256      0.650684    3.1-4.0   
1                  Low       12      11.279984      0.651092      <=1.6   
2                  Low        3      11.637494     10.305250       >4.0   
3                  Low        1      11.018170      2.237057    1.7-2.0   
4                  Low        5      10.817756      0.408561    2.1-3.0   

  Age_Bin  Total_Sales_Model  Region_Asia  Region_E

In [128]:
# 2. FEATURE AND TARGET DEFINITION

# The price in USD is the target variable
TARGET_COL = 'Price_USD' 

# Numeric and continuous features
NUMERIC_FEATURES = ['Engine_Size_L', 'Mileage_KM', 'Sales_Volume', 'Car_Age'] 

# One-Hot Encoded (OHE) categorical features
OHE_FEATURES = [
    'Region_Asia', 'Region_Europe', 'Region_Middle East', 'Region_North America', 'Region_South America',
    'Model_5 Series', 'Model_7 Series', 'Model_I3', 'Model_I8', 'Model_M3', 'Model_M5', 'Model_X1', 'Model_X3', 'Model_X5', 'Model_X6',
    'Fuel_Type_Electric', 'Fuel_Type_Hybrid', 'Fuel_Type_Petrol',
    'Transmission_Manual',
    'Color_Blue', 'Color_Grey', 'Color_Red', 'Color_Silver', 'Color_White'
]
FEATURES = NUMERIC_FEATURES + OHE_FEATURES

In [129]:
# Separate features (X) and target (y)
X = df[FEATURES]
y = df[TARGET_COL]

print(f"\nModeling Target: {TARGET_COL}")
print(f"Number of Features: {len(FEATURES)}")


Modeling Target: Price_USD
Number of Features: 28


## Part 2: Data Splitting (Standard and Temporal)

In [130]:
# 3. DATA SPLITTING
# 3.1. Standard Random Split (Baseline for Q2: General Accuracy)
# Splits 80% for training and 20% for testing, selected randomly
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("\n--- Standard Random Split ---")
print(f"Train samples: {len(X_train_std)}")
print(f"Test samples: {len(X_test_std)}")


--- Standard Random Split ---
Train samples: 40000
Test samples: 10000


In [131]:
# 3.2. Temporal Split (For Q4: Next Year Prediction)
# Find the maximum year to use as the "next year" test set
max_year = df['Year'].max()

# Training data: all years EXCEPT the maximum year (historical data)
X_train_temp = df[df['Year'] < max_year][FEATURES]
y_train_temp = df[df['Year'] < max_year][TARGET_COL]

# Testing data: only the maximum year ('next year' data)
X_test_temp = df[df['Year'] == max_year][FEATURES]
y_test_temp = df[df['Year'] == max_year][TARGET_COL]

print("\n Temporal Split (Q4)")
print(f"Training on years < {max_year} ({len(X_train_temp)} samples)")
print(f"Testing on year {max_year} ({len(X_test_temp)} samples)")


 Temporal Split (Q4)
Training on years < 2024 (46573 samples)
Testing on year 2024 (3427 samples)


In [132]:
# Model Training and Evaluation
def evaluate_model(y_true, y_pred, model_name, split_type):
    """Calculates and prints R2 and MAE for a model."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    
    print(f"\n[{model_name} - {split_type}]")
    print(f"R-squared (R2): {r2:.4f}")
    print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
    print(f"Mean Squared Error (MSE): ${mse:,.2f}")
    return r2, mae

In [133]:
# 4.1. Model 1: Linear Regression (Simple Baseline)
lr_model = LinearRegression()
lr_model.fit(X_train_std, y_train_std)
y_pred_lr = lr_model.predict(X_test_std)
evaluate_model(y_test_std, y_pred_lr, "Linear Regression", "Standard Split")


[Linear Regression - Standard Split]
R-squared (R2): -0.0012
Mean Absolute Error (MAE): $22,573.38
Mean Squared Error (MSE): $677,724,206.47


(-0.001193794483200561, 22573.38111085739)

In [134]:
# 4.2. Model 2: Random Forest Regressor (Best Simple Real-World Model)
# Initialize Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15)
# A. Train and Evaluate on Standard Split (Q2: General Accuracy)
rf_model_std = rf_model.fit(X_train_std, y_train_std)
y_pred_std = rf_model_std.predict(X_test_std)
r2_std, mae_std = evaluate_model(y_test_std, y_pred_std, "Random Forest", "Standard Split (Q2)")
# B. Train and Evaluate on Temporal Split (Q3, Q4: Stability & Next Year)
rf_model_temp = rf_model.fit(X_train_temp, y_train_temp) # Train model on historical data
y_pred_temp = rf_model_temp.predict(X_test_temp) # Predict 'next year'
r2_temp, mae_temp = evaluate_model(y_test_temp, y_pred_temp, "Random Forest", f"Temporal Split {max_year} (Q4)")


[Random Forest - Standard Split (Q2)]
R-squared (R2): -0.0033
Mean Absolute Error (MAE): $22,582.55
Mean Squared Error (MSE): $679,132,324.70

[Random Forest - Temporal Split 2024 (Q4)]
R-squared (R2): -0.0041
Mean Absolute Error (MAE): $22,656.62
Mean Squared Error (MSE): $682,702,074.15


In [135]:
# 5. ANALYSIS AND CONCLUSION (Q1, Q3)
# 5.1. Feature Importance (Q1: Key Factors)
# Extract feature importance from the temporal model (most realistic influence)
feature_importances = pd.Series(
    rf_model_temp.feature_importances_, index=FEATURES
).sort_values(ascending=False)

print("\n" + "="*50)
print("Q1: KEY FACTORS (FEATURE IMPORTANCE)")
print("="*50)
print("The most influential features for predicting price are (based on Random Forest):")
print(feature_importances.head(10).to_string())

# Save feature importances to CSV for reporting
feature_importances.to_csv(os.path.join(OUTPUT_DIR, "rf_feature_importances.csv"), header=['Importance'])


Q1: KEY FACTORS (FEATURE IMPORTANCE)
The most influential features for predicting price are (based on Random Forest):
Mileage_KM             0.245955
Sales_Volume           0.232423
Engine_Size_L          0.125108
Car_Age                0.092495
Transmission_Manual    0.019023
Fuel_Type_Petrol       0.015503
Fuel_Type_Hybrid       0.014968
Region_Asia            0.014649
Fuel_Type_Electric     0.014592
Region_Europe          0.014563


In [136]:
# 5.2. Stability Analysis (Q3: Prediction Stability Over Time)
print("\n" + "="*50)
print("Q3: PREDICTION STABILITY ANALYSIS")
print("="*50)
print(f"Accuracy on General Market (Standard Split): R2={r2_std:.4f}, MAE=${mae_std:,.2f}")
print(f"Accuracy on Future Market (Temporal Split {max_year}): R2={r2_temp:.4f}, MAE=${mae_temp:,.2f}")

if abs(mae_std - mae_temp) < 1000: # Simple threshold to check for stability
    print("Conclusion: The prediction performance is relatively STABLE over time (MAE difference is minor), but the overall high MAE indicates the model is fundamentally INACCURATE for this target (Price_USD).")
else:
    print("Conclusion: The prediction performance shows significant instability over time.")


Q3: PREDICTION STABILITY ANALYSIS
Accuracy on General Market (Standard Split): R2=-0.0033, MAE=$22,582.55
Accuracy on Future Market (Temporal Split 2024): R2=-0.0041, MAE=$22,656.62
Conclusion: The prediction performance is relatively STABLE over time (MAE difference is minor), but the overall high MAE indicates the model is fundamentally INACCURATE for this target (Price_USD).


## VISUALIZATION

In [137]:
# 6.1. Feature Importance Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importances.head(10).values, y=feature_importances.head(10).index, palette="viridis")
plt.title('Top 10 Feature Importances (Random Forest)')
plt.xlabel('Relative Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "feature_importance_plot.png"))
plt.close()
print(f"\nVisualization saved: {os.path.join(OUTPUT_DIR, 'feature_importance_plot.png')}")


Visualization saved: outputs/feature_importance_plot.png


In [138]:
# 6.2. Predicted vs. Actual Plot (Temporal Split)
plt.figure(figsize=(8, 8))
plt.scatter(y_test_temp, y_pred_temp, alpha=0.6)
plt.plot([y_test_temp.min(), y_test_temp.max()], [y_test_temp.min(), y_test_temp.max()], 'r--', lw=2)
plt.title(f'Actual vs. Predicted Price ({max_year} Temporal Test)')
plt.xlabel('Actual Price (USD)')
plt.ylabel('Predicted Price (USD)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "actual_vs_predicted_temporal.png"))
plt.close()
print(f"Visualization saved: {os.path.join(OUTPUT_DIR, 'actual_vs_predicted_temporal.png')}")

Visualization saved: outputs/actual_vs_predicted_temporal.png


In [139]:
#  FUTURE PRICE PREDICTION FUNCTION
def predict_future_price(car_features_dict):
    """
    Predicts car price for a new car given features as a dictionary.
    Example input:
    car_features_dict = {
        'Engine_Size_L': 3.0,
        'Mileage_KM': 10000,
        'Sales_Volume': 50,
        'Car_Age': 1,
        'Region_Asia': 1, 'Region_Europe':0, ..., 
        'Model_5 Series':1, ..., 'Color_Blue':0, ..., 'Fuel_Type_Petrol':1, ..., 'Transmission_Manual':1
    }
    """
    df_new = pd.DataFrame([car_features_dict])
    price_pred = rf_model_temp.predict(df_new)[0]
    return price_pred


# Example usage
example_car = {
    'Engine_Size_L': 3.0, 'Mileage_KM': 10000, 'Sales_Volume': 50, 'Car_Age': 1,
    'Region_Asia':1, 'Region_Europe':0, 'Region_Middle East':0, 'Region_North America':0, 'Region_South America':0,
    'Model_5 Series':1, 'Model_7 Series':0, 'Model_I3':0, 'Model_I8':0, 'Model_M3':0, 'Model_M5':0, 'Model_X1':0,
    'Model_X3':0, 'Model_X5':0, 'Model_X6':0,
    'Fuel_Type_Electric':0, 'Fuel_Type_Hybrid':0, 'Fuel_Type_Petrol':1,
    'Transmission_Manual':1,
    'Color_Blue':0, 'Color_Grey':0, 'Color_Red':0, 'Color_Silver':0, 'Color_White':1
}

predicted_price = predict_future_price(example_car)
print(f"\nPredicted Price for Example Car: ${predicted_price:,.2f}")


Predicted Price for Example Car: $65,110.41
