In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('../data/gurgaon_properties_post_feature_selection.csv')
data.sample(5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,built_up_area,study room,servant room,store room,price
3432,0.0,96.0,3.0,3,1833.0,0,0,0,0.89
380,0.0,98.0,3.0,3,1700.0,0,0,0,1.59
692,0.0,74.0,3.0,4,2200.0,0,0,0,3.9
210,0.0,92.0,4.0,5,3070.0,1,1,0,3.3
2580,0.0,105.0,3.0,2,1141.0,0,0,0,0.61


## Baseline model : Linear regression

- Linear regression would be used as the baseline model to evaluate other model performances
- The Ordinal Encoded features ( sector, etc) would be One hot encoded (to avoid issues with ordinal features for linear models)
- Scaling the features (StandardScaler)
- log transform on the price values to prevent issues from the right-skewed distribution


In [4]:
data.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'built_up_area',
       'study room', 'servant room', 'store room', 'price'],
      dtype='object')

In [24]:
data['sector'] = data['sector'].astype(int)

In [25]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X = data.drop(columns=['price'])
Y = data['price']

y_transformed = np.log1p(Y)

#Documented example : https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

numerical_feats = ['property_type', 'built_up_area', 'bedRoom','bathroom', 'study room', 'servant room', 'store room']
categorical_feats = ['sector']

#Create a Column transformer which has transformer parameter
#This transformer parameter takes a list of tuples which are the operations on columns to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_feats),
        ("cat", OneHotEncoder(drop="first", handle_unknown='ignore'), categorical_feats)
    ], 
    remainder='passthrough'
)



In [26]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',LinearRegression())
    ]
)

In [27]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')

Traceback (most recent call last):
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/utils/_response.py", line 109, in _get_response_values
    y_pred, pos_label = estimator.predict(X), None
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/pipeline.py", line 507, in predict
    Xt = transform.transform(Xt)
  File "/home/siddesh/Downloads/enter/envs/siddesh/lib/python3.9/site-packages/sklearn/utils/_set_output.py",

In [18]:
scores.mean()

nan

In [None]:
scores.std()