In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [3]:
# Load the post-feature-selection properties CSV, then discard the three
# columns that this notebook does not model.
df = pd.read_csv('gurgaon_properties_post_feature_selection_2.csv')
df = df.drop(columns=['store room','floor_category','balcony'])

In [5]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 63,7.9,4.0,4.0,Under Construction,3950.0,0.0,1.0,Medium
1,flat,sector 63a,3.7,3.0,3.0,Under Construction,2667.0,0.0,1.0,Medium
2,flat,manesar,1.2,3.0,3.0,Moderately Old,2944.0,1.0,1.0,Low
3,house,sector 33,11.5,5.0,6.0,Relatively New,4680.0,1.0,1.0,Medium
4,flat,sector 67a,1.85,3.0,4.0,New Property,1941.0,0.0,0.0,High


In [6]:
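# Meaning of the numeric codes in furnishing_type
# (presumably assigned upstream -- confirm against the feature-selection notebook):
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished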

In [7]:
# Feature groups and how each one is handled below:
# Numerical = bedRoom, bathroom, built_up_area, servant room
# Ordinal (integer-coded) = property_type, furnishing_type, luxury_category
# One-hot encoded (OHE) = sector, agePossession

In [8]:
# Collapse the five raw possession-age labels into three coarser buckets:
# 'new', 'old' and 'under construction'.
df['agePossession'] = df['agePossession'].replace({
    **dict.fromkeys(['Relatively New', 'New Property'], 'new'),
    **dict.fromkeys(['Moderately Old', 'Old Property'], 'old'),
    'Under Construction': 'under construction',
})

In [9]:
# Check that agePossession now shows the collapsed labels.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 63,7.9,4.0,4.0,under construction,3950.0,0.0,1.0,Medium
1,flat,sector 63a,3.7,3.0,3.0,under construction,2667.0,0.0,1.0,Medium
2,flat,manesar,1.2,3.0,3.0,old,2944.0,1.0,1.0,Low
3,house,sector 33,11.5,5.0,6.0,new,4680.0,1.0,1.0,Medium
4,flat,sector 67a,1.85,3.0,4.0,new,1941.0,0.0,0.0,High


In [10]:
# Encode property_type as an integer flag: flat -> 0, house -> 1.
# The explicit cast pins the dtype instead of relying on replace() silently
# downcasting the object column (behavior deprecated since pandas 2.2), and it
# raises immediately if a non-numeric label outside the mapping is still present.
# replace() leaves already-encoded values untouched, so re-running the cell is safe.
df['property_type'] = df['property_type'].replace({'flat':0,'house':1}).astype('int64')

In [11]:
# Check that property_type is now numeric.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 63,7.9,4.0,4.0,under construction,3950.0,0.0,1.0,Medium
1,0,sector 63a,3.7,3.0,3.0,under construction,2667.0,0.0,1.0,Medium
2,0,manesar,1.2,3.0,3.0,old,2944.0,1.0,1.0,Low
3,1,sector 33,11.5,5.0,6.0,new,4680.0,1.0,1.0,Medium
4,0,sector 67a,1.85,3.0,4.0,new,1941.0,0.0,0.0,High


In [12]:
# Encode luxury_category ordinally: Low -> 0, Medium -> 1, High -> 2.
# The explicit cast pins the dtype instead of relying on replace() silently
# downcasting the object column (behavior deprecated since pandas 2.2), and it
# raises immediately if a non-numeric label outside the mapping is still present.
# replace() leaves already-encoded values untouched, so re-running the cell is safe.
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2}).astype('int64')

In [13]:
# Check that luxury_category is now numeric.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 63,7.9,4.0,4.0,under construction,3950.0,0.0,1.0,1
1,0,sector 63a,3.7,3.0,3.0,under construction,2667.0,0.0,1.0,1
2,0,manesar,1.2,3.0,3.0,old,2944.0,1.0,1.0,0
3,1,sector 33,11.5,5.0,6.0,new,4680.0,1.0,1.0,1
4,0,sector 67a,1.85,3.0,4.0,new,1941.0,0.0,0.0,2


In [14]:
# One-hot encode the two nominal columns. drop_first=True omits one indicator
# per column, so the dropped level acts as the reference category.
new_df = pd.get_dummies(
    data=df,
    columns=['sector', 'agePossession'],
    drop_first=True,
)

In [15]:
# Target column first, then every remaining column as the feature matrix.
y = new_df.loc[:, 'price']
X = new_df.drop('price', axis=1)

In [16]:
# Model the target on a log scale: log1p(price) = log(1 + price).
# Values on this scale are mapped back to prices with np.expm1.
y_log = np.log1p(y)

In [17]:
# Display the log-transformed target.
y_log

0       2.186051
1       1.547563
2       0.788457
3       2.525729
4       1.047319
          ...   
3549    2.261763
3550    0.854415
3551    1.667707
3552    0.741937
3553    0.932164
Name: price, Length: 3554, dtype: float64

In [18]:
# Standardize every feature column. Fitting and transforming are written as two
# explicit steps; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)

X_scaled = scaler.transform(X)

In [19]:
# Wrap the scaled values back into a DataFrame with X's column names.
# X's index is carried over as well so the rows stay label-aligned with y_log;
# without it the frame gets a fresh 0..n-1 index, which only matches by accident
# when X happens to have a default index.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [20]:
# Display the standardized feature matrix.
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sohna road,agePossession_old,agePossession_under construction
0,-0.517180,0.729037,0.505525,1.661341,-0.747510,0.438920,0.441136,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,3.432808
1,-0.517180,-0.073875,-0.184162,0.638390,-0.747510,0.438920,0.441136,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,3.432808
2,-0.517180,-0.073875,-0.184162,0.859245,1.337774,0.438920,-0.984134,-0.071348,10.660448,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,1.655612,-0.291307
3,1.933563,1.531950,1.884899,2.243378,1.337774,0.438920,0.441136,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,-0.291307
4,-0.517180,-0.073875,0.505525,0.059542,-0.747510,-1.440504,1.866405,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,-0.291307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,1.933563,0.729037,1.195212,4.093141,-0.747510,-1.440504,-0.984134,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,-0.291307
3550,-0.517180,-0.876788,-0.873849,-0.603822,-0.747510,0.438920,-0.984134,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,1.655612,-0.291307
3551,-0.517180,-0.073875,0.505525,1.009937,1.337774,0.438920,-0.984134,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,1.655612,-0.291307
3552,-0.517180,-0.876788,-0.873849,-0.512928,-0.747510,-1.440504,-0.984134,-0.071348,-0.093805,-0.041123,...,-0.069328,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.219245,-0.604006,-0.291307


In [21]:
# Estimate out-of-sample R^2 with shuffled 10-fold cross-validation on the
# log-price target.
# Scaling runs inside a Pipeline on the raw X so that, within each split, the
# StandardScaler is fitted on the training folds only. Scoring the pre-scaled
# X_scaled would let statistics from the held-out fold leak into preprocessing.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    Pipeline([('scaler', StandardScaler()), ('regressor', LinearRegression())]),
    X,
    y_log,
    cv=kfold,
    scoring='r2',
)

In [22]:
# Mean and standard deviation of the per-fold R^2 scores.
np.mean(scores), np.std(scores)

(0.8454150924795399, 0.02391920131684731)

In [23]:
# Two estimators to compare: plain least squares and a ridge regression with a
# very small penalty (alpha = 1e-4).
lr, ridge = LinearRegression(), Ridge(alpha=1e-4)

In [24]:
# Fit ordinary least squares on all rows of the standardized features.
lr.fit(X=X_scaled, y=y_log)

In [25]:
# Fit the ridge model on the same standardized features and log-price target.
ridge.fit(X=X_scaled, y=y_log)

In [27]:
# Pair each feature name with its fitted ridge coefficient.
# Built directly from X.columns and ridge.coef_ so the cell keeps working when
# the number of features changes (a hard-coded reshape to a fixed width would
# raise as soon as a sector column is added or removed).
coef_df = pd.DataFrame({'feature': X.columns, 'coef': ridge.coef_.ravel()})

In [28]:
# Display the feature/coefficient table.
coef_df

Unnamed: 0,feature,coef
0,property_type,0.123647
1,bedRoom,0.061607
2,bathroom,0.068861
3,built_up_area,0.192533
4,servant room,0.054812
...,...,...
113,sector_sector 99a,-0.014511
114,sector_sector 9a,-0.005254
115,sector_sohna road,-0.032042
116,agePossession_old,-0.009074


In [29]:
# Refit the log-price regression with statsmodels (imported as `sm` in the
# notebook's import cell) to get a full regression summary table.

# 1. Add an intercept column; sm.OLS does not include one by default.
X_with_const = sm.add_constant(X_scaled)

# 2. Fit ordinary least squares on the standardized features.
model = sm.OLS(y_log, X_with_const).fit()

# 3. Print the summary statistics.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.859
Model:                            OLS   Adj. R-squared:                  0.854
Method:                 Least Squares   F-statistic:                     177.3
Date:                Wed, 07 Feb 2024   Prob (F-statistic):               0.00
Time:                        19:17:40   Log-Likelihood:                 511.75
No. Observations:                3554   AIC:                            -785.5
Df Residuals:                    3435   BIC:                            -50.59
Df Model:                         118                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [30]:
# Sample standard deviation (ddof=1, the pandas default) of the log-price target.
y_log.std(ddof=1)

0.5580384141316777

In [32]:
# Sample standard deviation (ddof=1, the pandas default) of the unscaled bedRoom feature.
X.loc[:, 'bedRoom'].std(ddof=1)

1.2456412104814263