 **Step 1: Import Libraries**

We import pandas for loading and processing the data. The visualization, preprocessing, and model-building libraries are imported later, in the steps where they are first used.


In [None]:
import pandas as pd

**Step 2: Load Dataset**

We load the dataset from the given path and preview its structure.


In [None]:
# Load the Bangalore dataset; the path is relative to the current working directory.
dataset_path = r"../datasets/Bangalore.csv"
df = pd.read_csv(dataset_path)



**Step 3: Explore Data**

We check dataset shape, info, descriptive statistics, column names, and missing values.



In [None]:
# Preview the first rows (head() returns 5 by default) to sanity-check the load.
df.head()

In [None]:
# Jupyter renders only the last bare expression of a cell, so the shape and the
# describe() table must be printed explicitly or they are silently discarded.
print("Shape (rows, columns):", df.shape)

# info() writes its own report (dtypes, non-null counts, memory usage) to stdout.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

# Left as the final expression so the notebook still displays the column index.
df.columns


In [None]:
# Count the missing values in each column.
df.isnull().sum()

**Step 4: Handle Missing Values**

We remove rows with missing data and drop duplicate entries.


In [None]:
# Drop every row containing at least one missing value; dropna() returns a new
# frame by default, so df itself is left untouched.
df_cleaned = df.dropna()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df_cleaned = df_cleaned.drop_duplicates()

In [None]:
# Inspect the dtype of every column after cleaning, to decide which columns
# need converting before plotting and modelling.
df_cleaned.dtypes

In [None]:
# Work on a copy so df_cleaned keeps its original dtypes, then store
# 'Location' as a pandas categorical.
df_typed = df_cleaned.copy()
location_as_category = df_typed['Location'].astype('category')
df_typed['Location'] = location_as_category


**Step 5: Outlier Visualization**

We visualize potential outliers in key numeric features using boxplots.


In [None]:
#Detecting Outliers
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set(style="whitegrid")

In [None]:
# Boxplot of Price: points beyond the whiskers are potential outliers.
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_typed, x='Price')
plt.title("Boxplot of Price")
plt.show()

In [None]:
# Boxplot of Area: points beyond the whiskers are potential outliers.
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_typed, x='Area')
plt.title("Boxplot of Area")
plt.show()

In [None]:
#def remove_outliers_iqr(df, column):
   # Q1 = df[column].quantile(0.25)
   # Q3 = df[column].quantile(0.75)
    #IQR = Q3 - Q1
    #lower_bound = Q1 - 1.5 * IQR
    #upper_bound = Q3 + 1.5 * IQR
    #return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [None]:
#df_no_outliers = remove_outliers_iqr(df_typed, 'Price')
#df_no_outliers = remove_outliers_iqr(df_no_outliers, 'Area')

In [None]:
#Performing the EDA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# The IQR outlier-removal cells are commented out, so df_no_outliers may not
# exist yet; in that case fall back to a copy of the typed frame.
try:
    df_no_outliers
except NameError:
    df_no_outliers = df_typed.copy()


In [None]:
# Distribution of Price: 30-bin histogram with a KDE curve overlaid.
plt.figure(figsize=(8,5))
sns.histplot(data=df_no_outliers, x='Price', bins=30, kde=True)
plt.title("Distribution of Price")
plt.xlabel("Price")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Area: 30-bin histogram with a KDE curve overlaid.
plt.figure(figsize=(8,5))
sns.histplot(data=df_no_outliers, x='Area', bins=30, kde=True)
plt.title("Distribution of Area")
plt.xlabel("Area")
plt.ylabel("Count")
plt.show()

In [None]:
# Bar chart of the 15 locations with the most listings
plt.figure(figsize=(12,6))
top_locations = df_no_outliers['Location'].value_counts().head(15)

# 'Location' is a categorical column, so the value_counts() index still carries
# every category. Without an explicit order seaborn lays out one row per
# category (mostly empty); passing the 15 labels restricts the axis to them,
# sorted by listing count.
sns.barplot(
    x=top_locations.values,
    y=top_locations.index,
    order=list(top_locations.index),
    palette="viridis",
)
plt.title("Top 15 Locations by Number of Listings")
plt.xlabel("Number of Properties")
plt.ylabel("Location")
plt.show()


In [None]:
# Price distribution for the 10 locations with the most listings.
plt.figure(figsize=(14, 6))
top_loc = df_no_outliers['Location'].value_counts().head(10).index

# Filter only top locations to make it readable
top_loc_rows = df_no_outliers[df_no_outliers['Location'].isin(top_loc)]

# Filtering rows does not drop unused categories from the categorical
# 'Location' column, so seaborn would still reserve an x-axis slot for every
# location. An explicit order keeps only the top 10, sorted by listing count.
sns.boxplot(x='Location', y='Price', data=top_loc_rows, order=list(top_loc))
plt.title("Price Distribution Across Top 10 Locations")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Scatter of Area against Price, one colour per Location, to see how area
# relates to price.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df_no_outliers,
    x='Area',
    y='Price',
    hue='Location',
    alpha=0.6,
    palette='cool',
)
plt.title("Area vs Price (Colored by Location)")
plt.xlabel("Area (sq ft)")
plt.ylabel("Price")
# Anchor the legend just past the right edge of the axes.
plt.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
plt.show()


In [None]:
# Correlation needs numeric input, so keep only the int64/float64 columns
# (this leaves out the categorical 'Location').
numeric_df = df_no_outliers.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numeric_df.corr()

# Draw the matrix as a heatmap; cell annotations are off.
plt.figure(figsize=(18, 15))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, linewidths=0.5)
plt.title("Correlation Heatmap of Numerical Features", fontsize=16)
plt.show()


In [None]:
# Target is Price; every remaining column is a feature.
y = df_no_outliers['Price']
X = df_no_outliers.drop(columns='Price')


**Step 6: Scale Numeric Columns**

We identify continuous numeric columns and prepare a scaler to normalize them.  
Scaling ensures all features have a similar range, which improves model performance.


In [None]:
from sklearn.preprocessing import StandardScaler

# Name of the column the models predict; it must never be scaled as a feature.
target_col = 'Price'

# Candidate features: every int64/float64 column except the target.
num_cols_all = [
    col
    for col in df_no_outliers.select_dtypes(include=['int64', 'float64']).columns
    if col != target_col
]

# Treat a column as continuous only when it has more than 10 distinct values.
# This excludes binary/dummy flags and also other low-cardinality numeric
# columns (the earlier comment said "nunique <= 2", which did not match the
# threshold actually applied).
cont_cols = [c for c in num_cols_all if df_no_outliers[c].nunique() > 10]

print("Continuous numeric columns to scale:", cont_cols)

# Helper to scale AFTER we split (so we fit only on training data)
def fit_transform_scaler(X_train, X_test, cols):
    """Standardize the given columns using statistics from the training set only.

    Args:
        X_train: Training feature frame; it is copied, not modified.
        X_test: Test feature frame; it is copied, not modified.
        cols: Names of the columns to scale. If empty, nothing is scaled.

    Returns:
        A ``(scaler, X_train_scaled, X_test_scaled)`` tuple. The scaler is
        fitted on ``X_train[cols]`` only; when ``cols`` is empty it is
        returned unfitted and both frames are plain copies of the inputs.
    """
    scaler = StandardScaler()
    train_out, test_out = X_train.copy(), X_test.copy()

    # Nothing to scale: hand back the untouched copies.
    if not cols:
        return scaler, train_out, test_out

    # Learn mean/std from the training rows, then apply them to both splits.
    train_out[cols] = scaler.fit_transform(X_train[cols])
    test_out[cols] = scaler.transform(X_test[cols])
    return scaler, train_out, test_out


**Step 7: Train–Test Split**

We one-hot encode `Location` and split the dataset into training and testing sets. Scaling follows in Step 8: the scaler is fitted only on the training data and then applied to both sets, which avoids data leakage.


In [None]:
# Step 7: Encode Categorical Variables & Train–Test Split

from sklearn.model_selection import train_test_split
import pandas as pd

# Expand 'Location' into indicator columns; drop_first removes one level so
# the remaining dummies are not perfectly collinear.
df_encoded = pd.get_dummies(df_no_outliers, columns=['Location'], drop_first=True)

# Target is Price; every other column is a feature.
y = df_encoded['Price']
X = df_encoded.drop(columns='Price')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


## Step 8: Feature Scaling (Avoiding Data Leakage)
We scale the features so that all variables contribute equally to the model and avoid bias from different value ranges.  
Scaling is fitted only on training data to prevent data leakage.


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean/std from the training split only, so no
# information from the test split leaks into the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived statistics to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Step 9: Model Training (Linear Regression)
We train the Linear Regression model using the scaled training data so it can learn the relationship between features and target.


In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares linear regression on the scaled training
# features. fit() is the cell's last expression, so the notebook also displays
# the fitted estimator.
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)


## Step 10: Model Evaluation
We evaluate the trained Linear Regression model on the test data to check its performance.


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict prices for the held-out test rows (scaled with the training statistics).
y_pred = lr_model.predict(X_test_scaled)

# Score the predictions against the true test prices.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


## Step 11: Model Improvement

In this step, we improve our model by using a **Random Forest Regressor**.  
Random Forest is an ensemble learning method that builds multiple decision trees and combines their results to improve accuracy and reduce overfitting.




In [None]:
# ## Step 11: Model Improvement

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline Random Forest: 200 trees with a fixed seed for reproducibility.
# It is trained on the unscaled features.
rf_model = RandomForestRegressor(random_state=42, n_estimators=200)
rf_model.fit(X_train, y_train)

# Predict on the held-out test rows and score against the true prices.
y_pred_rf = rf_model.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)

print("Random Forest Mean Squared Error:", mse_rf)
print("Random Forest R² Score:", r2_rf)


### Step 11 Output:

- **Random Forest Mean Squared Error:** 220,024,586,218,197.84  
- **Random Forest R² Score:** 0.25936823463030556  

The R² score improved compared to the previous model (0.24 → 0.26), showing that the Random Forest is capturing more patterns in the data.  
While the MSE is still high, this is expected given the large scale of the target values.


### Step 12: Hyperparameter Tuning for Random Forest

We use `GridSearchCV` to find the optimal combination of hyperparameters for the Random Forest model, aiming to improve the R² score and reduce the Mean Squared Error.


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid explored by the search.
# NOTE: max_features='auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3, where candidates using it fail parameter validation. For a regressor
# 'auto' meant "consider all features", which is spelled 1.0 and is accepted
# by both old and new releases.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt'],
}

# Base estimator; the fixed seed keeps every candidate reproducible.
rf = RandomForestRegressor(random_state=42)

# Exhaustive search with 5-fold cross-validation on the training split only,
# scored by R² and parallelised across all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, scoring='r2', verbose=2)

grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated R² of the winning combination.
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# Evaluate the winning model on the held-out test set.
best_rf = grid_search.best_estimator_
y_pred_rf_best = best_rf.predict(X_test)

mse_rf_best = mean_squared_error(y_test, y_pred_rf_best)
r2_rf_best = r2_score(y_test, y_pred_rf_best)

print(f"Tuned Random Forest Mean Squared Error: {mse_rf_best}")
print(f"Tuned Random Forest R² Score: {r2_rf_best}")


### Hyperparameter Tuning Results
- **Best Parameters**:  
  - `max_depth`: 30  
  - `max_features`: 'sqrt'  
  - `min_samples_leaf`: 1  
  - `min_samples_split`: 10  
  - `n_estimators`: 100  

- **Performance**:  
  - **Best R² Score** (mean 5-fold cross-validated R² on the training set): 0.25165614871803543  
  - **Tuned Random Forest Mean Squared Error** (held-out test set): 200471448148561.44  
  - **Tuned Random Forest R² Score** (held-out test set): 0.3251866752688929  


In [None]:
import pandas as pd

# Columns worth reporting: the searched hyperparameters plus the mean CV score.
summary_columns = [
    "param_max_depth",
    "param_max_features",
    "param_min_samples_leaf",
    "param_min_samples_split",
    "param_n_estimators",
    "mean_test_score",
]

# Tabulate every candidate from the grid search, best score first.
results_df = (
    pd.DataFrame(grid_search.cv_results_)[summary_columns]
    .sort_values(by="mean_test_score", ascending=False)
)

# Emit the table as Markdown so it can be pasted into a text cell.
print(results_df.to_markdown(index=False))
