# Model Training
This notebook trains a machine learning model to predict house prices using the processed dataset.



In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import os


## Load Processed Dataset
We load the processed dataset containing cleaned and engineered features.



In [None]:
# Read the processed training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/processed_train.csv")

# Announce the load, then preview the leading rows (5 is head()'s default)
print("Loaded processed dataset:")
data.head(n=5)




## Define Features and Target Variable
We separate the dataset into features (`X`) and the target variable (`y`), then split both into training (80%) and test (20%) sets.


In [None]:
# Name of the column the model learns to predict
target = "SalePrice"

# Target series (y) first, then every remaining column as the features (X)
y = data[target]
X = data.drop(columns=target)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Dataset split into training and testing sets successfully.")



## Train the Model
We train a `RandomForestRegressor` with 100 trees on the training data.


In [None]:
# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the training split in one step; fit() returns the estimator itself
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

print("Model training completed.")



## Evaluate the Model
We assess model performance on the held-out test set using Mean Absolute Error (MAE), Mean Squared Error (MSE), and the R-squared (R²) score.


In [None]:
# Generate predictions for the held-out test features
y_pred = model.predict(X_test)

# Score the predictions against the true test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics; the header has no placeholders, so it is a plain
# string rather than an f-string
print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared Score: {r2}")



## Save Trained Model
The trained model is saved as a `.pkl` file so that it can be used in later stages of the pipeline.


In [None]:
# Destination of the pickled model; its parent directory is created if missing
model_path = "../models/trained_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialize the fitted model to disk in binary mode
with open(model_path, mode="wb") as model_file:
    pickle.dump(model, model_file)

print("Trained model saved successfully!")


## Summary
- Loaded the processed dataset.
- Split the data into training and testing sets.
- Trained a RandomForestRegressor model.
- Evaluated the model's performance.
- Saved the trained model for future use.
