In [67]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

### In the insights module, we want to explain to the user which features affect the price, and by what factor they affect it

#### That is, we have to train a model again, this time for inference. Some models, such as deep learning models, are not well suited to inference, while others, such as Linear Regression, are.

#### Therefore, for our insights module, we will use Linear Regression for inference

In [68]:
# Load the post-feature-selection dataset and discard 'store room',
# 'floor_category' and 'balcony': the r2 score did not change significantly
# with or without these three columns.
df = (
    pd.read_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv')
    .drop(columns=['store room', 'floor_category', 'balcony'])
)

In [69]:
# new_df = pd.read_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv')
# new_df

In [70]:
# new_df['sector'] = new_df['sector'].str.replace('sector 17a','sector 17')
# new_df['sector'] = new_df['sector'].str.replace('sector 17b','sector 17')
# new_df['sector'] = new_df['sector'].str.replace('sector 37c','sector 37')
# new_df['sector'] = new_df['sector'].str.replace('sector 88b','sector 88')
# new_df['sector'] = new_df['sector'].str.replace('sector 99a','sector 99')
# new_df['sector'] = new_df['sector'].str.replace('sector 10a','sector 10')

# new_df.to_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv',index=False)

In [71]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,price,luxury_category
0,flat,sector 78,2,2,Moderately Old,1239.0,0,0,0.75,Low
1,flat,sector 60,2,3,Relatively New,1250.0,1,1,2.15,Low
2,flat,sector 90,3,3,Relatively New,1578.0,0,0,1.23,Low
3,house,sector 2,6,6,Moderately Old,3611.0,0,0,5.0,Low
4,flat,sector 63,4,4,New Property,3956.0,0,0,7.52,Medium


In [72]:
# df['sector'].unique().tolist()

In [73]:
# df['sector'].value_counts()

#### We will train a Linear Regression model on this data. Our goal is to get the best r2 score, because we want to understand how much of the variance in the target column our features explain.

In [74]:
# df = pd.read_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv')

# df['sector'] = df['sector'].str.replace('sector 36a','sector 36')
# df['sector'] = df['sector'].str.replace('sohna road road','sohna road')
# df['sector'] = df['sector'].str.replace('sector 3 phase 2','sector 3')
# df['sector'] = df['sector'].str.replace('sector 3 phase 3 extension','sector 3')


In [75]:
# df.to_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv',index=False)


In [76]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

In [77]:
# Numerical = bedRoom, bathroom, built_up_area, servant room   --> here we don't have to apply transformations
# Ordinal = property_type, furnishing_type, luxury_category    --> ordinal encoding is used where there is a natural order, like low < medium < high
# OHE = sector, agePossession                                  --> there is no natural order here, so apply OHE

In [78]:
# Simplify agePossession: collapse its 5 categories into 3 buckets
# ('new', 'old', 'under construction').
age_bucket_by_label = {
    **dict.fromkeys(['Relatively New', 'New Property'], 'new'),
    **dict.fromkeys(['Moderately Old', 'Old Property'], 'old'),
    'Under Construction': 'under construction',
}

df['agePossession'] = df['agePossession'].replace(age_bucket_by_label)

In [79]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,price,luxury_category
0,flat,sector 78,2,2,old,1239.0,0,0,0.75,Low
1,flat,sector 60,2,3,new,1250.0,1,1,2.15,Low
2,flat,sector 90,3,3,new,1578.0,0,0,1.23,Low
3,house,sector 2,6,6,old,3611.0,0,0,5.0,Low
4,flat,sector 63,4,4,new,3956.0,0,0,7.52,Medium


In [80]:
# Encode property_type as an integer: flat -> 0, house -> 1.
#
# Series.replace with a str -> int mapping depends on pandas silently
# downcasting the object column to int, which is deprecated (FutureWarning in
# pandas 2.2). An explicit lookup produces the same values without relying on
# that behaviour; anything not in the mapping is passed through unchanged, so
# re-running this cell is still a no-op.
property_type_codes = {'flat': 0, 'house': 1}

df['property_type'] = df['property_type'].map(
    lambda value: property_type_codes.get(value, value)
)

  df['property_type'] = df['property_type'].replace({'flat':0,'house':1})


In [81]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,price,luxury_category
0,0,sector 78,2,2,old,1239.0,0,0,0.75,Low
1,0,sector 60,2,3,new,1250.0,1,1,2.15,Low
2,0,sector 90,3,3,new,1578.0,0,0,1.23,Low
3,1,sector 2,6,6,old,3611.0,0,0,5.0,Low
4,0,sector 63,4,4,new,3956.0,0,0,7.52,Medium


In [82]:
# Ordinal-encode luxury_category following its natural order:
# Low -> 0, Medium -> 1, High -> 2.
#
# Series.replace with a str -> int mapping depends on pandas silently
# downcasting the object column to int, which is deprecated (FutureWarning in
# pandas 2.2). An explicit lookup produces the same values without relying on
# that behaviour; anything not in the mapping is passed through unchanged, so
# re-running this cell is still a no-op.
luxury_category_codes = {'Low': 0, 'Medium': 1, 'High': 2}

df['luxury_category'] = df['luxury_category'].map(
    lambda value: luxury_category_codes.get(value, value)
)

  df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})


In [83]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,price,luxury_category
0,0,sector 78,2,2,old,1239.0,0,0,0.75,0
1,0,sector 60,2,3,new,1250.0,1,1,2.15,0
2,0,sector 90,3,3,new,1578.0,0,0,1.23,0
3,1,sector 2,6,6,old,3611.0,0,0,5.0,0
4,0,sector 63,4,4,new,3956.0,0,0,7.52,1


In [84]:
# One-hot encode the two nominal columns (sector, agePossession).
# drop_first=True drops the first level of each column, which then acts as
# the reference category.
new_df = pd.get_dummies(
    df,
    columns=['sector', 'agePossession'],
    drop_first=True,
)

In [85]:
new_df

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,price,luxury_category,sector_gwal pahari,sector_manesar,...,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 9a,sector_sohna road,agePossession_old,agePossession_under construction
0,0,2,2,1239.0,0,0,0.75,0,False,False,...,False,False,False,False,False,False,False,False,True,False
1,0,2,3,1250.0,1,1,2.15,0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,3,3,1578.0,0,0,1.23,0,False,False,...,True,False,False,False,False,False,False,False,False,False
3,1,6,6,3611.0,0,0,5.00,0,False,False,...,False,False,False,False,False,False,False,False,True,False
4,0,4,4,3956.0,0,0,7.52,1,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,0,5,4,2956.0,0,2,3.72,0,False,False,...,False,False,False,False,False,False,False,False,False,False
3550,0,3,3,1267.0,0,0,1.30,0,False,False,...,False,False,False,False,False,False,False,False,False,True
3551,0,3,4,2470.0,1,1,2.65,2,False,False,...,False,False,False,False,False,False,False,False,True,False
3552,0,3,3,1747.0,1,0,1.30,1,False,False,...,False,False,False,False,False,False,False,False,True,False


In [86]:
# Separate the target (price) from the feature matrix (every other column).
y = new_df['price']
X = new_df.drop(columns='price')

In [87]:
# Model log1p(price) instead of the raw price: per the original author's note,
# the log-transformed target was close to a normal distribution.
# Predictions and effects on this scale are mapped back with np.expm1.
y_log = np.log1p(y)

In [88]:
y_log

0       0.559616
1       1.147402
2       0.802002
3       1.791759
4       2.142416
          ...   
3549    1.551809
3550    0.832909
3551    1.294727
3552    0.832909
3553    0.936093
Name: price, Length: 3554, dtype: float64

In [89]:
# Standardise every feature column so the fitted coefficients are expressed
# per standard deviation of the feature and can be compared with one another.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [90]:
# Wrap the scaled array back into a DataFrame so the feature names are kept.
# Reusing X.index (rather than a fresh RangeIndex) keeps the rows aligned with
# y_log for index-aware consumers such as statsmodels.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [91]:
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 9a,sector_sohna road,agePossession_old,agePossession_under construction
0,-0.517180,-0.877269,-0.874300,-0.511829,-0.747968,-0.658035,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,1.654424,-0.290738
1,-0.517180,-0.877269,-0.184564,-0.502784,1.336956,1.055266,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,-0.604440,-0.290738
2,-0.517180,-0.074329,-0.184564,-0.233105,-0.747968,-0.658035,-0.984642,-0.071348,-0.093805,-0.041123,...,6.239598,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,-0.604440,-0.290738
3,1.933563,2.334488,1.884645,1.438415,-0.747968,-0.658035,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,1.654424,-0.290738
4,-0.517180,0.728610,0.505173,1.722072,-0.747968,-0.658035,0.440783,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,-0.604440,-0.290738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,-0.517180,1.531549,0.505173,0.899878,-0.747968,2.768566,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,-0.604440,-0.290738
3550,-0.517180,-0.074329,-0.184564,-0.488807,-0.747968,-0.658035,-0.984642,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,-0.604440,3.439524
3551,-0.517180,-0.074329,0.505173,0.500292,1.336956,1.055266,1.866207,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,1.654424,-0.290738
3552,-0.517180,-0.074329,-0.184564,-0.094154,1.336956,-0.658035,0.440783,-0.071348,-0.093805,-0.041123,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.053119,-0.219245,1.654424,-0.290738


In [92]:
# 10-fold cross-validated r2 of plain linear regression on the log-price target.
#
# The scaler sits inside the pipeline and the raw feature matrix X is passed
# in, so scaling statistics are re-estimated on each training fold only.
# Cross-validating on the pre-scaled X_scaled would let every validation fold
# contribute to the mean/std used for preprocessing.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_model = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])
scores = cross_val_score(cv_model, X, y_log, cv=kfold, scoring='r2')

In [93]:
scores.mean(),scores.std()

(0.8522471718971687, 0.022363208444418456)

In [94]:
# Two estimators for the coefficient analysis: ordinary least squares and a
# ridge regression with a very small penalty (alpha = 0.0001).
lr, ridge = LinearRegression(), Ridge(alpha=0.0001)

In [95]:
# Fit ordinary least squares on the full standardised feature matrix
# against the log-transformed price.
lr.fit(X_scaled,y_log)

In [96]:
# Fit the ridge model on the same data; its coefficients are the ones
# collected into coef_df below.
ridge.fit(X_scaled,y_log)

In [97]:
lr.coef_.shape

(113,)

In [98]:
# Pair every feature name with its fitted ridge coefficient.
# Built directly from X.columns and ridge.coef_, so it follows the actual
# number of features instead of a hard-coded count of 113.
coef_df = pd.DataFrame({'feature': X.columns, 'coef': ridge.coef_})

In [99]:
coef_df

Unnamed: 0,feature,coef
0,property_type,0.120977
1,bedRoom,0.053679
2,bathroom,0.065013
3,built_up_area,0.210413
4,servant room,0.050877
...,...,...
108,sector_sector 99,-0.010226
109,sector_sector 9a,-0.005074
110,sector_sohna road,-0.029466
111,agePossession_old,-0.007465


### We now have the coefficient weights after training. Below is the regression analysis.

In [100]:
# Regression analysis: refit the same specification with statsmodels to get
# standard errors, t-statistics and p-values for every coefficient.

# 1. statsmodels' OLS does not add an intercept by itself, so append a
#    constant column to the standardised features.
X_with_const = sm.add_constant(X_scaled)

# 2. Ordinary least squares of the log-transformed price on those features.
model = sm.OLS(y_log, X_with_const).fit()

# 3. Print the full summary table.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.861
Method:                 Least Squares   F-statistic:                     195.1
Date:                Sat, 07 Sep 2024   Prob (F-statistic):               0.00
Time:                        15:59:38   Log-Likelihood:                 590.14
No. Observations:                3554   AIC:                            -952.3
Df Residuals:                    3440   BIC:                            -248.2
Df Model:                         113                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [101]:
# Sample standard deviation of the log-transformed target.
y_log.std()

0.5579613263072811

In [102]:
# Sanity check: a standardised column should have a standard deviation of ~1.
# The value is marginally above 1 because pandas' std() uses ddof=1 while
# StandardScaler scales by the population standard deviation (ddof=0).
X_scaled['bedRoom'].std()

1.000140716246387

In [103]:
# NOTE(review): hard-coded scratch calculation. 0.21 looks like the
# built_up_area coefficient from coef_df and 0.557 like y_log.std(), with 1 as
# the std of a standardised feature — TODO confirm.
# y_log itself was never standardised, so it is unclear that multiplying by
# its standard deviation is needed to interpret the coefficient; verify the
# intended formula and derive the inputs from the fitted objects instead of
# retyping them.
0.21 * (0.557/1)

0.11697

In [104]:
# np.expm1 is the inverse of the np.log1p transform applied to the target.
# NOTE(review): 0.030 is typed in by hand — presumably a coefficient or effect
# on the log scale; TODO confirm which one and read it from the model.
np.expm1(0.030)

0.030454533953516858

In [653]:
# NOTE(review): converts a hand-pasted value to a percentage. The constant is
# not produced by any cell visible in this notebook, and this cell's execution
# count is out of sequence with the others, so it will not be reproduced by
# Restart & Run All — TODO confirm where the number comes from or remove it.
2.4726962617564903e-05 * 100

0.0024726962617564905