## Importing the necessary libraries and modules

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


## load the csv

In [2]:
# Load the processed listings; the path is relative to the notebook's directory.
df0 = pd.read_csv("../data/processed/processed_data_location.csv")
# Keep only rows whose BHK value is at most 11.
df0 = df0[df0.BHK <= 11]
# Last expression of the cell, so this is what gets displayed: rows per BHK value.
df0.BHK.value_counts()


BHK
2     5517
3     4818
4     1390
1      640
5      349
6      220
7      100
8       88
9       52
10      14
11       4
Name: count, dtype: int64

# Filter the data using domain knowledge: in Bangalore, no property sells for a per-sqft price below 3k or above 20k, so only rows inside that range are kept

In [3]:
# Keep only rows whose per-sqft price lies strictly between 3000 and 20000.
df0 = df0.loc[(df0["price_1_sqft"] > 3000) & (df0["price_1_sqft"] < 20000)]
# Summary statistics of the retained per-sqft prices (the cell's displayed output).
df0["price_1_sqft"].describe()

count    12354.000000
mean      6455.005097
std       3138.176745
min       3000.000000
25%       4394.693201
50%       5500.000000
75%       7256.966452
max      19965.277778
Name: price_1_sqft, dtype: float64

In [4]:
import numpy as np

def remove_bhk_outliers(df):
    """Drop listings that are cheaper per sqft than smaller homes in the same location.

    Within each ``location`` group, a row with ``BHK == n`` is removed when its
    ``price_1_sqft`` is below the mean ``price_1_sqft`` of that location's
    ``BHK == n - 1`` rows, provided the smaller group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``BHK`` and ``price_1_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows. ``df`` itself is not modified and
        the surviving rows keep their original index labels.
    """
    # Labels are gathered in a plain list: repeated np.append re-copies the whole
    # array on every call and, when seeded with np.array([]), turns the index
    # labels into floats.
    labels_to_drop = []

    for _, location_df in df.groupby('location'):
        # Materialise the per-BHK groups once; both passes below need them.
        bhk_groups = list(location_df.groupby('BHK'))

        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_1_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Only use the smaller-BHK mean when it is backed by more than 5 rows.
            if stats and stats['count'] > 5:
                below_mean = bhk_df.price_1_sqft < stats['mean']
                labels_to_drop.extend(bhk_df[below_mean].index)

    return df.drop(index=labels_to_drop)
# Rebinds df0 to the filtered frame, so re-running this cell filters the
# already-filtered data again.
df0=remove_bhk_outliers(df0)
# Displayed as the cell output: (rows, columns) after the filter.
df0.shape

(9579, 528)

In [5]:
import numpy as np

def remove_location_pps_outliers(df):
    """Keep, per location, only rows whose price per sqft is within one std of the mean.

    For every ``location`` group the mean ``m`` and standard deviation ``s`` of
    ``price_1_sqft`` are computed with ``np.mean`` / ``np.std``; rows with
    ``m - s < price_1_sqft < m + s`` (both bounds exclusive) are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_1_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The kept rows of all locations stacked in group order, with a fresh
        0..n-1 index. If ``df`` yields no groups, an empty frame with the same
        columns as ``df`` is returned.
    """
    # Collect the per-location pieces and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies all accumulated rows on every pass.
    kept_parts = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_1_sqft)
        s = np.std(subdf.price_1_sqft)
        within_one_std = (subdf.price_1_sqft > (m - s)) & (subdf.price_1_sqft < (m + s))
        kept_parts.append(subdf[within_one_std])

    if not kept_parts:
        # pd.concat([]) raises ValueError; hand back an empty frame that still
        # carries the input's columns so downstream column access keeps working.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(kept_parts, ignore_index=True)
# Rebinds df0 to the filtered frame, so re-running this cell filters the
# already-filtered data again.
df0=remove_location_pps_outliers(df0)
# Displayed as the cell output: (rows, columns) after the filter.
df0.shape

(7197, 528)

In [6]:
# Feature matrix: every column of df0 except the three named here.
X = df0.drop(columns=["price", "price_1_sqft", "location"])
# Target vector: the "price" column.
y = df0.price

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Baseline: plain linear regression fitted on the training split.
model = LinearRegression()
model.fit(X_train, Y_train)

# Predict the held-out split once and derive both metrics from it. r2_score here
# is the same R^2 that model.score(X_test, Y_test) would return.
y_pred = model.predict(X_test)

print("R2 Score:", r2_score(Y_test, y_pred))
print("MAE:", mean_absolute_error(Y_test, y_pred))


R2 Score: 0.904645294024639
MAE: 16.20049382891534


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Candidate estimators and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
models = {
    'Linear': {'model': LinearRegression(), 'params': {'fit_intercept': [True, False]}},
    'Ridge': {'model': Ridge(), 'params': {'alpha': [0.1, 1, 10, 50, 100]}},
    'Lasso': {'model': Lasso(max_iter=5000), 'params': {'alpha': [0.001, 0.01, 0.1, 1, 10]}},
}


# Using GridSearchCV to search for the best hyperparameters for each model

In [None]:
# One dict of test-set metrics per tuned model.
results = []

for name, mp in models.items():
    # 5-fold cross-validated search over this model's grid, ranked by R^2.
    grid = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, Y_train)

    # Evaluate the best estimator found by the search on the held-out split.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'R2 Score': r2_score(Y_test, y_pred),
        'MAE': mean_absolute_error(Y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(Y_test, y_pred)),
    })


In [None]:
# Collect the per-model metric dicts into one comparison table.
results_df = pd.DataFrame(results)
# print() emits the plain-text rendering of the table.
print(results_df)


    Model              Best Params  R2 Score        MAE       RMSE
0  Linear  {'fit_intercept': True}  0.908471  16.131659  26.127281
1   Ridge             {'alpha': 1}  0.906585  16.222435  26.395127
2   Lasso         {'alpha': 0.001}  0.909269  15.969553  26.013156


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters as a non-linear comparison point.
# random_state is pinned (same seed as the train/test split): without it the
# fitted tree, and therefore the score below, can change from run to run.
model_des = DecisionTreeRegressor(random_state=2)
model_des.fit(X_train, Y_train)
# R^2 on the held-out split, shown as the cell output.
model_des.score(X_test, Y_test)

0.772576698186296

# Train the final model using the best parameters found for this dataset

In [8]:
# fit_intercept=True is the value the grid search reported as best for the
# 'Linear' entry.
model_linear=LinearRegression(fit_intercept= True)
model_linear.fit(X_train,Y_train)
# R^2 on the held-out split, shown as the cell output.
model_linear.score(X_test,Y_test)

0.904645294024639

In [18]:
# NOTE(review): X_columns is reassigned to a sliced list before it is written to
# JSON, so this binding looks redundant.
X_columns=X.columns

In [19]:
import json

# NOTE(review): [1:] drops the first column of X from the saved list, yet the
# model is fitted on a split of the full X, so the JSON holds one name fewer than
# the model has features. Confirm that whatever reads X_columns.json expects
# this (presumably X's first column is a leftover index column; verify);
# otherwise the saved names will not line up with the model's inputs.
X_columns = X.columns.tolist()[1:]

# Write the column names as a JSON array into the same directory as the pickled model.
with open("../models/X_columns.json", "w") as f:
    json.dump(X_columns, f)


## Saving the Model


In [10]:
import pickle

# Serialise the fitted linear model to disk in binary mode.
with open("../models/model.pickle", "wb") as model_file:
    pickle.dump(model_linear, model_file)