In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib
import os

# ---------------------------------------------------------------------------
# Output location: the fitted pipeline is serialized into this folder, which
# is created on first run if it is not already there.
# ---------------------------------------------------------------------------
model_dir = "C:/Users/hp/Desktop/MLOps Assignment-01/house-price-predictor"
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "house_price_predictor_model.joblib")

# ---------------------------------------------------------------------------
# Data: read the CSV, separate the "price" target from the remaining columns,
# and hold out 20% of the rows for evaluation (fixed seed for repeatability).
# ---------------------------------------------------------------------------
data = pd.read_csv(
    "C:/Users/hp/Desktop/MLOps Assignment-01/house-price-predictor/HousePriceDataset/Housing.csv"
)
y = data["price"]
X = data.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Pipeline: the listed columns are one-hot encoded; every other column is
# passed through unchanged and the result feeds a 100-tree random forest.
# ---------------------------------------------------------------------------
categorical_features = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
model = RandomForestRegressor(n_estimators=100, random_state=42)
pipe = Pipeline([('transformer', transformer), ('model', model)])

# ---------------------------------------------------------------------------
# Train on the training split, report the error on the held-out split, then
# persist the whole pipeline (preprocessing + model) as one artifact.
# ---------------------------------------------------------------------------
pipe.fit(X_train, y_train)
y_preds = pipe.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
print(f"Mean Squared Error: {mse}")

joblib.dump(pipe, model_path)

# Predict house prices with the saved pipeline (or a caller-supplied model)
def predict_house_price(input_data, model=None):
    """Return the model's price prediction for the given house features.

    Parameters
    ----------
    input_data : sequence, mapping or pandas.DataFrame
        * A sequence of 12 values in the order of ``columns`` below, or a
          mapping keyed by those column names, describing a single house.
        * A ``DataFrame`` containing (at least) those columns; the columns
          are selected and reordered by name, and every row is predicted.
    model : object with a ``predict`` method, optional
        An already-loaded estimator. When omitted, the pipeline stored at the
        module-level ``model_path`` is loaded with ``joblib`` on each call;
        pass a loaded model to avoid that repeated disk read.

    Returns
    -------
    Whatever ``model.predict`` returns for the assembled feature frame.

    Raises
    ------
    ValueError
        If a ``DataFrame`` input lacks one of the expected columns, or (from
        pandas) if a sequence input does not have exactly 12 values.
    """
    # Feature names expected by the model.
    # NOTE(review): presumably mirrors the non-target columns of the training
    # CSV - confirm against the dataset header.
    columns = ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement',
               'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

    if isinstance(input_data, pd.DataFrame):
        # Wrapping a DataFrame in a list would not yield a usable feature row,
        # so select the expected columns by name instead.
        missing = [name for name in columns if name not in input_data.columns]
        if missing:
            raise ValueError(f"input_data is missing columns: {missing}")
        input_df = input_data[columns]
    else:
        # A single record (list/tuple/dict) becomes a one-row frame.
        input_df = pd.DataFrame([input_data], columns=columns)

    # Only touch the disk when the caller did not hand us a model.
    if model is None:
        model = joblib.load(model_path)

    return model.predict(input_df)

# Example usage: describe one house field by field, then flatten it into the
# positional list (same order, same types) that predict_house_price expects.
example_house = {
    'area': 7420,
    'bedrooms': 4,
    'bathrooms': 2,
    'stories': 3,
    'mainroad': 'yes',
    'guestroom': 'no',
    'basement': 'no',
    'hotwaterheating': 'no',
    'airconditioning': 'yes',
    'parking': 2,
    'prefarea': 'yes',
    'furnishingstatus': 'furnished',
}
input_data = list(example_house.values())

predicted_price = predict_house_price(input_data)
print(f"Predicted Price: {predicted_price}")



Mean Squared Error: 322415859354.65436
Predicted Price: [12711259.4]
