In [7]:
# Import necessary libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load dataset
df = pd.read_csv('data/bengaluru_prices_preprocessed-100.csv', encoding='utf-8')

# Select features and target
X = df[['Area_type', 'Location', 'Size', 'Total_sqft', 'Bathroom', 'Balcony']]
y = df['Price in lakhs']

# Clean target variable: remove currency symbol and thousands separators,
# then convert to numeric. regex=False makes both replacements literal
# regardless of the installed pandas version's default.
y = (
    y.str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Define preprocessing steps
numeric_features = ['Total_sqft', 'Bathroom', 'Balcony']
categorical_features = ['Area_type', 'Location', 'Size']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numeric values with mean
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values with mode
    # handle_unknown='ignore' lets prediction proceed on categories that were
    # not present in the training split (they encode as all zeros).
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Build Pipeline for preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print('R^2 score:', r2_score(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

# Create a DataFrame with Total_sqft, actual and predicted prices.
# X_test and y_test share the same index, so the columns align row by row;
# y_pred is a plain array in the same row order as X_test.
results_df = pd.DataFrame({
    'Total_sqft': X_test['Total_sqft'],
    'Actual Price': y_test,
    'Predicted Price': y_pred
})

# Print the first few rows of the results
print(results_df.head())

# Make sure the output directories exist before writing, so a missing
# folder does not raise after the model has already been trained.
Path('data').mkdir(parents=True, exist_ok=True)
Path('models').mkdir(parents=True, exist_ok=True)

# Save the results to a CSV file
results_df.to_csv('data/results.csv', index=False)

# Save the model (the full pipeline, preprocessing included)
joblib.dump(model, 'models/model.joblib')

# Calculate sum of prices by area type.
# 'Price in lakhs' is a text column in df, so it is converted to float here
# with the same cleaning used for the model target. Summing the raw strings
# would concatenate them instead of adding the prices. The conversion is done
# on a separate Series so df itself is left unchanged.
price_in_lakhs = (
    df['Price in lakhs']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
sum_price_by_area = price_in_lakhs.groupby(df['Area_type']).sum().reset_index()

# Sort by the sum of prices to ensure correct ordering in the chart
sum_price_by_area = sum_price_by_area.sort_values(by='Price in lakhs', ascending=False)

# Create a bar chart
fig = px.bar(sum_price_by_area, x='Area_type', y='Price in lakhs', title='Sum of Prices by Area Types')

# Hide y-axis tick labels
fig.update_yaxes(showticklabels=False)

# Show the figure
fig.show()


R^2 score: 0.7245813356228716
RMSE: 58.45149432376845
      Total_sqft  Actual Price  Predicted Price
747         1279          77.0        74.245357
4148        1650         135.0       205.320000
4454        1600          64.0        72.350000
9043        1268         127.0       103.543000
8810        1100          49.0        43.003429
