In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('../data/gurgaon_properties_post_feature_selection.csv')
data.sample(5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,built_up_area,study room,servant room,store room,price
596,1.0,105.0,4.0,4,1743.0,0,0,0,0.85
1006,0.0,17.0,3.0,4,1940.0,0,1,0,2.4
105,0.0,82.0,2.0,2,1084.0,0,0,0,0.79
2277,0.0,4.0,3.0,4,1945.0,1,1,0,2.15
1928,0.0,4.0,3.0,3,1653.0,0,0,0,2.5


## Baseline model : Linear regression

- Linear regression would be used as the baseline model to evaluate other model performances
- The Ordinal Encoded features ( sector, etc) would be One hot encoded (to avoid issues with ordinal features for linear models)
- Scaling the features (StandardScaler)
- log transform on the price values to prevent issues from the right-skewed distribution


In [3]:
data.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'built_up_area',
       'study room', 'servant room', 'store room', 'price'],
      dtype='object')

In [4]:
data['sector'] = data['sector'].astype(int)

In [5]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = data.drop(columns=['price'])
Y = data['price']

y_transformed = np.log1p(Y)

#Documented example : https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

numerical_feats = ['property_type', 'built_up_area', 'bedRoom','bathroom', 'study room', 'servant room', 'store room']
categorical_feats = ['sector']

#Create a Column transformer which has transformer parameter
#This transformer parameter takes a list of tuples which are the operations on columns to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_feats),
        ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_feats)
    ], 
    remainder='passthrough'
)



In [6]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',LinearRegression())
    ]
)

In [7]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')



In [8]:
scores.mean()

0.8520962947008845

In [9]:
scores.std()

0.016011081968678984