In [70]:
import numpy as np
import pandas as pd
import os, sys, pickle

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [71]:
file_path = os.path.join(os.path.dirname(sys.path[0]), 'clean_data/gurgaon_properties_imputed.csv')
df = pd.read_csv(file_path)

df.drop(columns=['pooja room', 'study room', 'others', 'society', 'price_per_sqft', 'store room', 'floorNum', 'balcony'], inplace=True)

def cat_luxury_score(score):
    if score < 50:
        return 'Low'
    elif score < 150:
        return 'Medium'
    elif score >= 150:
        return 'High'
    else:
        return np.nan
    
df['luxury_score'] = df['luxury_score'].apply(cat_luxury_score)

df.head()



Unnamed: 0,property_type,sector,price,bedroom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_score
0,flat,sector 68,0.9,2,2,New Property,1311.111111,0,0,Low
1,flat,sector 92,0.45,2,2,New Property,646.0,0,2,Low
2,flat,sector 43,8.44,4,5,Moderately Old,5000.0,1,1,Medium
3,flat,sector 81,1.35,3,3,Old Property,1988.888889,0,0,Medium
4,house,sector 33,0.85,3,2,Relatively New,900.0,0,0,Low


In [72]:
#Ordinal - property_type, furnishing_type, luxury_score
#Ohe - sector, 	agePossession

In [73]:
df['agePossession'].replace(
    {
        'Relatively New':2,
        'Moderately Old':1,
        'New Property':2,
        'Old Property':1,
        'Under Construction':0
    }, inplace=True
)

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['agePossession'].replace(
  df['agePossession'].replace(


Unnamed: 0,property_type,sector,price,bedroom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_score
0,flat,sector 68,0.9,2,2,2,1311.111111,0,0,Low
1,flat,sector 92,0.45,2,2,2,646.0,0,2,Low
2,flat,sector 43,8.44,4,5,1,5000.0,1,1,Medium
3,flat,sector 81,1.35,3,3,1,1988.888889,0,0,Medium
4,house,sector 33,0.85,3,2,2,900.0,0,0,Low


In [74]:
df['property_type'].replace({'flat':0, 'house':1}, inplace=True)
df['luxury_score'].replace({'Low':0, 'Medium':1, 'High':2}, inplace=True)

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['property_type'].replace({'flat':0, 'house':1}, inplace=True)
  df['property_type'].replace({'flat':0, 'house':1}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['luxury_score'].replace({'Low':0, 'Medium':1, 'High':2}, inplace=True)
  df['luxury_score'].repla

Unnamed: 0,property_type,sector,price,bedroom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_score
0,0,sector 68,0.9,2,2,2,1311.111111,0,0,0
1,0,sector 92,0.45,2,2,2,646.0,0,2,0
2,0,sector 43,8.44,4,5,1,5000.0,1,1,1
3,0,sector 81,1.35,3,3,1,1988.888889,0,0,1
4,1,sector 33,0.85,3,2,2,900.0,0,0,0


In [75]:
new_df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first=True)

new_df.head()

Unnamed: 0,property_type,price,bedroom,bathroom,built_up_area,servant room,furnishing_type,luxury_score,sector_gwal pahari,sector_manesar,...,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sohna road,sector_sohna road road,agePossession_1,agePossession_2
0,0,0.9,2,2,1311.111111,0,0,0,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0,0.45,2,2,646.0,0,2,0,False,False,...,True,False,False,False,False,False,False,False,False,True
2,0,8.44,4,5,5000.0,1,1,1,False,False,...,False,False,False,False,False,False,False,False,True,False
3,0,1.35,3,3,1988.888889,0,0,1,False,False,...,False,False,False,False,False,False,False,False,True,False
4,1,0.85,3,2,900.0,0,0,0,False,False,...,False,False,False,False,False,False,False,False,False,True


In [76]:
x = new_df.drop(columns=['price'])
y = np.log1p(new_df['price'])

In [77]:
scaler = StandardScaler()

x_scaled = scaler.fit_transform(x)

x_scaled = pd.DataFrame(x_scaled, columns=x.columns)

x_scaled

Unnamed: 0,property_type,bedroom,bathroom,built_up_area,servant room,furnishing_type,luxury_score,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sohna road,sector_sohna road road,agePossession_1,agePossession_2
0,-0.517180,-0.877275,-0.873966,-0.450204,-0.747968,-0.685783,-1.019641,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,0.726146
1,-0.517180,-0.877275,-0.873966,-0.991567,-0.747968,1.553327,-1.019641,-0.071348,-0.093805,-0.041123,...,5.877074,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,0.726146
2,-0.517180,0.727375,1.192159,2.552347,1.336956,0.433772,0.344230,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,1.655612,-1.377133
3,-0.517180,-0.074950,-0.185258,0.101470,-0.747968,-0.685783,0.344230,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,1.655612,-1.377133
4,1.933563,-0.074950,-0.873966,-0.784825,-0.747968,-0.685783,-1.019641,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,0.726146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,-0.517180,-0.877275,-0.873966,-0.803487,-0.747968,-0.685783,-1.019641,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,-1.377133
3550,-0.517180,-0.877275,-0.873966,-0.269327,1.336956,-0.685783,0.344230,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,0.726146
3551,-0.517180,-0.877275,-0.873966,-0.460604,-0.747968,-0.685783,0.344230,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,0.726146
3552,-0.517180,-0.074950,-0.185258,-0.029485,-0.747968,-0.685783,0.344230,-0.071348,-0.093805,-0.041123,...,-0.170153,-0.050386,-0.125375,-0.058206,-0.092266,-0.053119,-0.211375,-0.05572,-0.604006,-1.377133


In [78]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

scores = cross_val_score(LinearRegression(), x_scaled, y, cv = kfold, scoring='r2')

scores.mean(), scores.std()

(0.8489692014320477, 0.028660600050648063)

In [79]:
lr = LinearRegression()

lr.fit(x_scaled, y)

In [87]:
def un_std_coeff(row):
    
    return row[1]

In [88]:
coef_df = pd.DataFrame(lr.coef_.reshape(1, 121), columns=x.columns).stack().reset_index().drop(columns=['level_0'])

coef_df.rename(columns={'level_1':'feature', 0:'std_coef'}, inplace=True)

#std linear coef = ustd linear cof * (std dev of x / std dev of y)

coef_df['un_std_coeff'] = coef_df.apply(un_std_coeff, axis=1)

coef_df

  return row[1]


Unnamed: 0,feature,std_coef,un_std_coeff
0,property_type,0.121866,0.121866
1,bedroom,0.055891,0.055891
2,bathroom,0.065642,0.065642
3,built_up_area,0.207543,0.207543
4,servant room,0.051111,0.051111
...,...,...,...
116,sector_sector 9a,-0.005125,-0.005125
117,sector_sohna road,-0.028496,-0.028496
118,sector_sohna road road,-0.009413,-0.009413
119,agePossession_1,-0.032640,-0.032640


# Regression Analysis

In [82]:
import statsmodels.api as sm

x_c = sm.add_constant(x)

model = sm.OLS(y, x_c).fit()

print(model.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).