### Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib

### Step 2: Load and Explore Datasets

In [None]:
# Read the primary training/test tables plus the two per-property lookup
# tables (distance from the city centre and average rent).
trained_data = pd.read_excel('/mnt/data/Processed2.xlsx')
test_data = pd.read_csv('/path/to/test.csv')  # Replace with the actual path
dist_data = pd.read_csv('/mnt/data/dist_from_city_centre.csv')
rent_data = pd.read_csv('/mnt/data/avg_rent.csv')

# Show the first rows of every table, one after another.
print(
    trained_data.head(),
    test_data.head(),
    dist_data.head(),
    rent_data.head(),
    sep='\n',
)

# Attach the lookup columns to the training rows. Left joins keep every
# training row even when a lookup table has no entry for its property_id.
train_data = (
    trained_data
    .merge(dist_data, on='property_id', how='left')
    .merge(rent_data, on='property_id', how='left')
)

# Attach the same lookup columns to the test rows.
test_data = (
    test_data
    .merge(dist_data, on='property_id', how='left')
    .merge(rent_data, on='property_id', how='left')
)


### Step 3: Data Preprocessing

In [None]:
# Preprocess training data
# Rows without a target cannot be used for fitting or scoring; imputing
# 'Price' would fabricate labels, so such rows are dropped instead.
train_data = train_data.dropna(subset=['Price'])
test_data = test_data.dropna(subset=['Price'])

# Imputation statistics come from the training features only, so that no
# information from the test set (or from the target) flows into preprocessing.
# numeric_only=True skips text columns; without it pandas >= 2.0 raises
# TypeError when the frame contains non-numeric columns.
feature_means = (
    train_data
    .drop(columns=['property_id', 'Price'])
    .mean(numeric_only=True)
)
train_data = train_data.fillna(feature_means)
train_data = train_data.drop(columns=['property_id'])

# NOTE: 'price_per_sqft' and 'rent_to_price_ratio' are intentionally NOT
# created. Both are computed from 'Price', the prediction target, so using
# them as features leaks the answer into the model and they cannot be
# computed for properties whose price is unknown.

# Separate features and target in training data
X_train = train_data.drop(columns=['Price'])
y_train = train_data['Price']

# Preprocess test data with the statistics learned from the training set
test_data = test_data.fillna(feature_means)
test_ids = test_data['property_id']  # Save property IDs if needed for submission
X_test = test_data.drop(columns=['Price', 'property_id'])
y_test = test_data['Price']

# Encode categorical variables.
# Only the training frame uses drop_first=True. The test frame is fully
# expanded and then reduced to the training columns; dropping the "first"
# category independently on the test set could remove a different category
# than was removed for training and silently mis-encode rows.
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test)

# Align test columns with the training columns: categories unseen in the test
# set become all-zero columns, columns unknown to the training set are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scale features (fit on training data only, then apply to the test data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Step 4: Model Selection

In [None]:
# Baseline model: a Random Forest with default hyperparameters and a fixed
# seed for reproducibility, fitted on the scaled training features.
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)


### Step 5: Evaluate the Model

In [None]:
# Score the baseline model on the scaled test features
y_pred = rf_model.predict(X_test_scaled)

# Root mean squared error: square root of the mean squared difference
# between the true and the predicted prices
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"RMSE on test data: {rmse}")


### Step 6: Hyperparameter Tuning

In [None]:
# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

# Use GridSearchCV to find the best parameters (5-fold CV, scored by negative
# MSE). GridSearchCV clones the estimator, so the already-fitted baseline
# rf_model is left untouched.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=2,
)

# Fit on the SCALED features. The baseline model and the persisted scaler both
# operate on scaled inputs; tuning on the raw frame would yield a model that
# receives inputs on a different scale once it is used together with the
# scaler at prediction time.
grid_search.fit(X_train_scaled, y_train)

# Best model (refit on the full training set with the best parameters)
best_model = grid_search.best_estimator_

# Evaluate best model on the same scaled test features as the baseline so the
# two RMSE values are directly comparable
y_pred_best = best_model.predict(X_test_scaled)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"Optimized RMSE: {rmse_best}")


### Step 7: Save the Model

In [None]:
# Persist the tuned estimator to disk
joblib.dump(value=best_model, filename='house_price_model.pkl')

# Persist the fitted scaler as well, so new inputs can be transformed the
# same way at deployment time
joblib.dump(value=scaler, filename='scaler.pkl')


In [None]:
### Step 8: Load and Use the Model for Predictions

In [None]:
import warnings

# Load the model and scaler.
# NOTE: joblib files are pickles; only load artifacts from a trusted source.
loaded_model = joblib.load('house_price_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Predict for a new dataset
new_data = pd.DataFrame({
    'Area': [1200],
    'Bedrooms': [3],
    'Distance_from_City_Center': [5],
    'Average_Rent': [1500]
})

# Preprocess the new data.
# The scaler only accepts the exact columns (and column order) it was fitted
# on. When it was fitted on a DataFrame it records them in feature_names_in_,
# so the new frame is aligned to that list: extra columns are dropped and
# missing ones are added as 0 (the "absent" value for one-hot columns).
expected_columns = getattr(loaded_scaler, 'feature_names_in_', None)
if expected_columns is not None:
    missing_columns = [col for col in expected_columns if col not in new_data.columns]
    if missing_columns:
        # Zero is only a sensible default for one-hot columns; make the
        # substitution visible instead of silently predicting from it.
        warnings.warn(
            f"new_data lacks columns seen during training; filled with 0: {missing_columns}"
        )
    new_data = new_data.reindex(columns=expected_columns, fill_value=0)

new_data_scaled = loaded_scaler.transform(new_data)
predicted_price = loaded_model.predict(new_data_scaled)
print(f"Predicted House Price: {predicted_price[0]}")
