# Model Training
This notebook trains a machine learning model to predict house prices using the processed dataset.



In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle
import os


## Load Processed Dataset
We load the processed dataset containing cleaned and engineered features.


In [None]:
# Read the processed training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/processed_train.csv")

# Announce the load, then leave the preview as the cell's last expression so
# the notebook renders the first rows
print("Loaded processed dataset:")
data.head()




## Define Features and Target Variable
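We separate the dataset into features (`X`) and the target variable (`y`), applying a natural-log transformation to the target (`SalePrice`) to stabilize predictions. The data is then split into training (80%) and testing (20%) sets.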


In [None]:
# Column the model learns to predict
target = "SalePrice"

# Features (X): every column except the target.
# Target (y): natural log of the target column.
X = data.drop(target, axis=1)
y = np.log(data[target])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Dataset split into training and testing sets successfully.")


## Apply Feature Scaling
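We standardize the feature set using `StandardScaler`. The scaler is fitted on the training set only and then applied to both the training and test sets, so no test-set statistics leak into training. Note that tree-based models such as random forests are largely insensitive to feature scaling, so this step is not expected to change accuracy much; the fitted scaler is saved later alongside the model.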


In [None]:
# Create the scaler and learn the scaling statistics from the training
# features only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the fitted scaling to the training and test features alike
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



## Train the Model
We use a `RandomForestRegressor` to train the model on the training data.


In [None]:
# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training features; fit() returns the estimator itself, so the fitted model
# is bound directly to `model`
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

print("Model training completed.")



## Evaluate the Model
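We assess model performance on the held-out test set using Mean Absolute Error (MAE), Mean Squared Error (MSE), and the R-squared score. Because the model was trained on the log of the sale price, predictions and true values are converted back to the original price scale with `np.exp` before the metrics are computed.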


In [None]:
# Predict on the scaled test features
y_pred = model.predict(X_test_scaled)

# Reverse the log transformation with np.exp so that both the predictions and
# the true values are back on the original price scale before scoring
y_pred_actual = np.exp(y_pred)
y_test_actual = np.exp(y_test)

# Calculate performance metrics on the original price scale
mae = mean_absolute_error(y_test_actual, y_pred_actual)
mse = mean_squared_error(y_test_actual, y_pred_actual)
r2 = r2_score(y_test_actual, y_pred_actual)

# Display evaluation results (the header has no placeholders, so it is a
# plain string rather than an f-string)
print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared Score: {r2}")



## Save Trained Model
The trained model, feature names, and scaler are saved as `.pkl` files for future use.


In [None]:
def _pickle_to(path, obj):
    """Serialize ``obj`` to ``path`` with pickle, overwriting any existing file."""
    with open(path, "wb") as out:
        pickle.dump(obj, out)


# Create the output directory if it is not already there
os.makedirs("../models", exist_ok=True)

# Persist, in order: the fitted model, the training feature column names,
# and the fitted scaler
_pickle_to("../models/trained_model.pkl", model)
_pickle_to("../models/feature_names.pkl", X_train.columns.tolist())
_pickle_to("../models/scaler.pkl", scaler)

print("Trained model, feature names, and scaler saved successfully!")


## Summary
- Loaded the processed dataset.
- Applied a log transformation to the target variable.
- Scaled feature set using `StandardScaler`.
- Trained a `RandomForestRegressor` model on scaled data.
- Evaluated model performance using MAE, MSE, and R² score.
- Saved the trained model, feature names, and scaler.
