In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor  # Assuming you want to use MLP
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset from ./Data/bmw.csv. Nothing below can run without it, so
# a missing file ends the script with a non-zero exit status.
try:
    bmw_data = pd.read_csv('./Data/bmw.csv')
except FileNotFoundError:
    print("File not found")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive sessions and is not guaranteed to exist; raising
    # SystemExit is the dependable way to stop with status 1.
    raise SystemExit(1)

# Target selection: use the 'price' column when the dataset has one,
# otherwise fall back to whatever the last column is (adjust if needed).
if 'price' in bmw_data.columns:
    y = bmw_data['price']
else:
    y = bmw_data.iloc[:, -1]

# Features are every column other than the chosen target.
X = bmw_data.drop(columns=y.name)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows/columns ended up in each split.
print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Columns that are one-hot encoded; every remaining feature column is
# treated as numerical and standardized.
categorical_cols = ['model', 'transmission', 'fuelType', 'year']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Per-column preprocessing: scale the numerical features and one-hot encode
# the categorical ones. Categories not seen during fitting are ignored at
# transform time instead of raising an error.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Regressor: a random forest of 100 trees with a fixed seed.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Full pipeline: preprocess the data, then apply the random forest model.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("random_forest", random_forest),
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Predict on the held-out split once and reuse the predictions for every
# metric below.
y_pred = pipe.predict(X_test)

# R² of the test predictions. For a regressor this is the same quantity that
# `pipe.score(X_test, y_test)` reports, but computing it from `y_pred`
# avoids a second prediction pass over the test set.
score = r2_score(y_test, y_pred)
print(f"\nR² score on test data: {score:.4f}")

# Remaining performance metrics (RMSE is the square root of the MSE, so it is
# in the same units as the target).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = score

print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# If you want to use the pipeline for new data:
# predictions = pipe.predict(new_data)


Training set shape: (8624, 8)
Testing set shape: (2157, 8)

R² score on test data: 0.9415
Mean Squared Error: 7570151.4129
Root Mean Squared Error: 2751.3908
Mean Absolute Error: 1610.9943
R² Score: 0.9415
