## Importing Libraries

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit, train_test_split, RandomizedSearchCV

## Data Loading

In [2]:
# Load the housing table from a CSV path relative to the working directory.
csv_path = "data/Bengaluru_House_Data_cleaned.csv"
df = pd.read_csv(csv_path)

## Choose Relevant columns 

In [3]:
# Show the available column names before picking the modelling features.
df.columns

Index(['area_type', 'location', 'total_sqft', 'bath', 'balcony', 'price',
       'bedroom', 'ready_to_move'],
      dtype='object')

In [4]:
# Keep the target ('price') plus the predictors used by the models below.
model_columns = ['location', 'total_sqft', 'price', 'bedroom', 'ready_to_move']
df_model = df[model_columns]

- 'bath' is strongly correlated with 'bedroom', so we drop it to avoid multicollinearity
- 'balcony' has low correlation with 'price', so we drop it

## One-hot encode categorical features (dummy variables)

In [5]:
# One-hot encode the categorical columns of df_model into indicator columns.
# dtype=float keeps the indicators numeric on every pandas version: pandas >= 2.0
# emits bool dummies by default, and a frame mixing bool and float columns
# converts to an object array, which statsmodels' OLS refuses to fit.
df_dum = pd.get_dummies(df_model, dtype=float)

## Train Test Split

In [6]:
# Split the encoded frame into the target vector and the feature matrix.
y = df_dum['price']
X = df_dum.drop(columns=['price'])

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Ordinary least squares on the full data set, with an explicit intercept column,
# fitted only to inspect the coefficient table.
X_sm = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_sm)
ols_results = model.fit()
ols_results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.761
Method:,Least Squares,F-statistic:,104.6
Date:,"Sun, 19 Mar 2023",Prob (F-statistic):,0.0
Time:,23:22:32,Log-Likelihood:,-39326.0
No. Observations:,7692,AIC:,79130.0
Df Residuals:,7455,BIC:,80770.0
Df Model:,236,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-28.3058,1.755,-16.126,0.000,-31.747,-24.865
total_sqft,0.0849,0.001,72.673,0.000,0.083,0.087
bedroom,2.4293,0.648,3.747,0.000,1.158,3.700
ready_to_move,0.4217,1.302,0.324,0.746,-2.131,2.975
location_1st Block Jayanagar,81.4836,10.934,7.452,0.000,60.049,102.918
location_1st Phase JP Nagar,24.6430,10.187,2.419,0.016,4.673,44.613
location_2nd Stage Nagarbhavi,131.4728,9.698,13.556,0.000,112.462,150.484
location_5th Block Hbr Layout,26.6603,12.297,2.168,0.030,2.555,50.765
location_5th Phase JP Nagar,-22.4382,7.853,-2.857,0.004,-37.832,-7.045

0,1,2,3
Omnibus:,2830.63,Durbin-Watson:,2.048
Prob(Omnibus):,0.0,Jarque-Bera (JB):,158979.854
Skew:,0.967,Prob(JB):,0.0
Kurtosis:,25.188,Cond. No.,5.57e+16


In [9]:
# Baseline model: plain linear regression fitted on the training split.
# The displayed value is the R^2 score on that same training data.
lm = LinearRegression().fit(X_train, y_train)
lm.score(X_train, y_train)

0.783335007796705

In [10]:
# Five random 80/20 re-splits of the training data; the fixed seed makes them repeatable.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Scores are negated MAE, so values closer to zero are better.
lm_cv_scores = cross_val_score(
    lm,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
)
np.mean(lm_cv_scores)

-23.128249765599143

In [11]:
# Lasso with its default penalty strength, scored with the same splitter as the
# linear baseline (negated MAE: closer to zero is better).
lm_l = Lasso().fit(X_train, y_train)
lasso_cv_scores = cross_val_score(
    lm_l,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
)
np.mean(lasso_cv_scores)

-28.82802646084074

In [None]:
# Sweep the Lasso penalty over 0.001 .. 0.019 in steps of 0.001 and record the
# mean cross-validated score (negated MAE) for each value.
alpha, error = [], []

for i in range(1, 20):
    penalty = i / 1000
    lml = Lasso(alpha=penalty)
    fold_scores = cross_val_score(
        lml,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=cv,
    )
    alpha.append(penalty)
    error.append(np.mean(fold_scores))

# Higher on this curve means lower MAE.
plt.plot(alpha, error)

In [None]:
# Tabulate the sweep and show the row(s) with the highest (least negative) score.
err = tuple(zip(alpha, error))
df_err = pd.DataFrame(err, columns=['alpha', 'error'])
best_score = max(df_err['error'])
df_err.loc[df_err['error'] == best_score]

In [None]:
# Refit Lasso with a fixed penalty and re-score it with the shared splitter.
# NOTE(review): 0.009 is presumably the best alpha found by the sweep in df_err;
# confirm against that table after re-running it.
lm_l = Lasso(alpha=0.009).fit(X_train, y_train)
tuned_lasso_scores = cross_val_score(
    lm_l,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
)
np.mean(tuned_lasso_scores)

In [None]:
# Random forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split and the CV splitter)
# so the cross-validated score and the later test-set MAE are reproducible;
# without it every run grows a different forest.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Mean negated MAE over the shared CV splits (closer to zero is better).
np.mean(cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv))

In [None]:
from xgboost import XGBRegressor

# Early stopping needs a validation set. Carve it out of the training data so
# the hold-out test set is never seen during model selection; monitoring
# (X_test, y_test) would make the final test MAE optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# early_stopping_rounds is set on the estimator (supported from xgboost 1.6);
# passing it to fit() was deprecated in 1.6 and removed in 2.0.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    early_stopping_rounds=5,
)
xgb.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

# cross_val_score refits clones without an eval_set, so early stopping cannot
# run inside it. Score an equivalent estimator capped at the number of boosting
# rounds that early stopping selected (best_iteration is zero-based).
xgb_cv = XGBRegressor(
    n_estimators=xgb.best_iteration + 1,
    learning_rate=0.05,
    n_jobs=4,
)
np.mean(cross_val_score(xgb_cv, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv))

In [None]:
# Hyper-parameter grid for the random-forest search.
# - criterion: 'squared_error' / 'absolute_error' are the current names of the
#   former 'mse' / 'mae' options (renamed in scikit-learn 1.0, old names removed in 1.2).
# - max_features: 1.0 (use all features) is what 'auto' meant for regressors;
#   'auto' itself was deprecated in 1.1 and removed in 1.3.
parameters = {
    'n_estimators': range(10, 300, 10),
    'criterion': ('squared_error', 'absolute_error'),
    'max_features': (1.0, 'sqrt', 'log2'),
}

In [None]:
# Exhaustive search over `parameters` for the random forest, scored by negated
# MAE on the shared CV splits.
gs = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

In [None]:
# Run the grid search on the training split (one fit per parameter combination per CV split).
gs.fit(X_train,y_train)

In [None]:
# Best mean cross-validated score found by the search (negated MAE, so closer to zero is better).
gs.best_score_

In [None]:
# The estimator configured with the best-scoring parameter combination.
gs.best_estimator_

In [None]:
# test ensembles 
tpred_lm = lm.predict(X_test)
tpred_lml = lm_l.predict(X_test)
tpred_rf = rf.predict(X_test)
tpred_xgb = xgb.predict(X_test)

In [None]:
# Report the test-set mean absolute error of every model, one line per model.
test_predictions = {
    'Linear Regression': tpred_lm,
    'LASSO Regression': tpred_lml,
    'Random Forest Regression': tpred_rf,
    'XGB Forest Regression': tpred_xgb,
}
for label, predicted in test_predictions.items():
    print(f'{label}: {mean_absolute_error(y_test, predicted)}')

In [None]:
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``mean(|actual - predicted| / |actual|) * 100`` element by element.

    Parameters
    ----------
    Y_actual : array-like
        Observed values (list, NumPy array, or pandas Series).
    Y_Predicted : array-like
        Predicted values, in the same order as ``Y_actual``.

    Returns
    -------
    float
        The error as a percentage. MAPE is undefined where an actual value
        is 0; such entries make the result ``inf`` (or ``nan`` for 0/0).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Convert to flat float arrays so values are paired by position. Subtracting
    # two pandas Series instead aligns them by index label, which silently
    # produces NaN when the indexes differ (e.g. a shuffled test split versus
    # freshly built predictions).
    actual = np.ravel(np.asarray(Y_actual, dtype=float))
    predicted = np.ravel(np.asarray(Y_Predicted, dtype=float))

    if actual.shape != predicted.shape:
        raise ValueError(
            f"Y_actual and Y_Predicted must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )

    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

In [None]:
# Percentage error of the linear-regression predictions on the hold-out test set.
mape = MAPE(Y_actual=y_test, Y_Predicted=tpred_lm)
print('MAPE:', mape)