# 🤖 Stage 4: Statistical Modeling & Forecasting

This notebook builds and evaluates a linear regression model that predicts used-vehicle prices from three features: odometer reading, vehicle age, and clean-title status.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the cleaned dataset.
# NOTE: this is an absolute path; adjust it when running somewhere that does
# not provide /content/used_cars_cleaned.csv.
df = pd.read_csv('/content/used_cars_cleaned.csv')

# Drop rows with a missing target or a missing value in any column the model
# is fitted on. 'is_clean_title' is included because it is used as a feature
# further down and LinearRegression.fit raises on NaN inputs.
df = df.dropna(subset=['price', 'odometer', 'vehicle_age', 'is_clean_title'])

# Preview data (last expression of the cell, so the notebook renders it)
df[['price', 'odometer', 'vehicle_age', 'manufacturer', 'condition']].head()


## 🧠 Feature Selection & Train-Test Split

In [None]:
# Predictor columns fed to the model, and the column being predicted
features = ['odometer', 'vehicle_age', 'is_clean_title']
X, y = df[features], df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 📈 Linear Regression Model

In [None]:
# Fit a linear regression on the training split (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out test rows
y_pred = model.predict(X_test)


## 📊 Model Evaluation

In [None]:
# Metrics on the held-out test split.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so passing it raises
# TypeError on current releases. np.sqrt works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# RMSE and MAE are in the same units as the target (price), hence the "$"
print(f"RMSE: ${rmse:,.2f}")
print(f"MAE: ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")


## 🔍 Predicted vs Actual Price

In [None]:
# Scatter of predicted against actual prices for the test split
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.3)

# Dashed red diagonal spanning the actual-price range: points on this line
# are predictions that equal the actual price
price_min, price_max = y_test.min(), y_test.max()
plt.plot([price_min, price_max], [price_min, price_max], '--', color='red')

plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted vs. Actual Vehicle Prices")
plt.show()

## 📌 Feature Coefficients

In [None]:
# Pair each feature column with its fitted coefficient (one row per feature).
# The model intercept is not included in this table.
coefficient_columns = {'Feature': X.columns, 'Coefficient': model.coef_}
coefficients = pd.DataFrame(coefficient_columns)
print(coefficients)