In [8]:
import numpy as np
import pandas as pd

In [9]:
df=pd.read_csv('gurgaon_property_model_selction.csv')

In [10]:
x= df.drop(columns=['price'])
y=df['price']

In [11]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,flat,170.0,3.0,2,2.0,new property,850.0,0,0,medium,1.0,mid floor,0.82
1,flat,235.0,2.0,2,2.0,new property,1226.0,1,0,medium,1.0,high floor,0.95
2,flat,262.0,2.0,2,1.0,new property,1000.0,0,0,medium,1.0,low floor,0.32
3,flat,240.0,3.0,4,4.0,Relatively New,1615.0,1,0,high,0.0,high floor,1.6
4,flat,127.0,2.0,2,1.0,Relatively New,582.0,0,1,medium,0.0,high floor,0.48


In [12]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [13]:
df.shape

(3705, 13)

In [14]:
df.isnull().sum()

property_type      0
sector             0
bedRoom            0
bathroom           0
balcony            0
agePossession      0
built_up_area      0
servant room       0
store room         0
furnishing_type    0
luxury_category    0
floor_category     0
price              0
dtype: int64

In [15]:
pip install xgboost




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\Kishlay Kumar\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip





In [16]:
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler,OrdinalEncoder
from sklearn .compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


In [17]:
x=df.drop(columns=['price'])
y = df['price']

In [18]:
# apply log transformed to target columns
y_transformed = np.log1p(y)

# ordinal encoding

In [19]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [20]:
# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ], 
    remainder='passthrough'
)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# K-Fold Cross-Validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

In [21]:
print(scores.mean())

0.600234365044823


In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)

In [23]:
pipeline.fit(x_train,y_train)

In [24]:
print(type(x))
print(x.shape)  # This will print the number of rows and columns
print(x.columns)

<class 'pandas.core.frame.DataFrame'>
(3705, 12)
Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')


In [25]:
y_pred = pipeline.predict(x_test)

In [26]:
y_pred = np.exp(y_pred)

In [27]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.5634322771067355

In [28]:
def scorer(model_name,model):
    output = []
    output.append(model_name)
    pipeline= Pipeline([
        ('preprocessor', preprocessor),
        ('regressor',model)
    ])

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    output.append(scores.mean())
    X_train,x_test,y_train,y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    pipeline.fit(x_train,y_train)
    y_pred= pipeline.predict(x_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    return output

In [29]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [30]:
model_output=[]
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))



In [31]:
model_output

[['linear_reg', np.float64(0.600234365044823), 1.085470827529458],
 ['svr', np.float64(0.37756512178537543), 1.141322573373813],
 ['ridge', np.float64(0.6002439292714156), 1.085432072703867],
 ['LASSO', np.float64(0.03538141482375706), 1.4147917978483662],
 ['decision tree', np.float64(0.6987582789503575), 0.9430700834209311],
 ['random forest', np.float64(0.8337584873595482), 0.6525961131174948],
 ['extra trees', np.float64(0.815497800842191), 0.6979182717034184],
 ['gradient boosting', np.float64(0.8185866166709191), 0.6936534860744762],
 ['adaboost', np.float64(0.5878420028541724), 1.0302929463114576],
 ['mlp', np.float64(0.6673308282200698), 1.0203276625569206],
 ['xgboost', np.float64(0.8428950087748526), 0.6296439940525613]]

In [32]:
model_df = pd.DataFrame(model_output,columns=['name','r2','mae'])

In [33]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.842895,0.629644
5,random forest,0.833758,0.652596
7,gradient boosting,0.818587,0.693653
6,extra trees,0.815498,0.697918
4,decision tree,0.698758,0.94307
9,mlp,0.667331,1.020328
8,adaboost,0.587842,1.030293
2,ridge,0.600244,1.085432
0,linear_reg,0.600234,1.085471
1,svr,0.377565,1.141323


# one hot encoding with pca

In [34]:
OneHotEncoder(handle_unknown='infrequent_if_exist', drop='first', sparse_output=False)

In [35]:
# Convert categorical columns to strings
x['sector'] = x['sector'].astype(str)
x['agePossession'] = x['agePossession'].astype(str)

In [36]:
# Group rare categories in 'sector' and 'agePossession'
top_sectors = x['sector'].value_counts().nlargest(10).index  # Keep top 10 categories
x['sector'] = x['sector'].apply(lambda x: x if x in top_sectors else 'other')

top_agePossession = x['agePossession'].value_counts().nlargest(5).index  # Keep top 5 categories
x['agePossession'] = x['agePossession'].apply(lambda x: x if x in top_agePossession else 'other')

# Now apply the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), columns_to_encode), 
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), ['sector', 'agePossession'])
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

kfold = KFold(n_splits=10, shuffle=True, random_state=45)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')

print("Cross-validation R^2 scores:", scores)
print("Mean R^2 score:", scores.mean())

Cross-validation R^2 scores: [0.6317342  0.64923937 0.66345918 0.6328179  0.5637858  0.64398486
 0.68637503 0.64694744 0.63063669 0.32636542]
Mean R^2 score: 0.607534590022936




In [37]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
       ('preprocessor',preprocessor),
       ('pca',PCA(n_components=0.95)),
       ('regressor',LinearRegression())
    ])
    kfold= KFold(n_splits=10, shuffle=True, random_state=45)
    scores = cross_val_score(pipeline,x,y_transformed,cv=kfold, scoring='r2')
    output.append(scores.mean())
    x_train,x_test,y_train,y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=45)
    pipeline.fit(x_train,y_train)
    y_pred = pipeline.predict(x_test)
    y_pred = np.expm1(y_pred)
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    return output

In [38]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [39]:
model_output=[]
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name,model))



In [40]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [41]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
0,linear_reg,0.607535,0.966296
1,svr,0.607535,0.966296
2,ridge,0.607535,0.966296
3,LASSO,0.607535,0.966296
4,decision tree,0.607535,0.966296
5,random forest,0.607535,0.966296
6,extra trees,0.607535,0.966296
7,gradient boosting,0.607535,0.966296
8,adaboost,0.607535,0.966296
9,mlp,0.607535,0.966296


# target encoder

In [42]:
pip install category_encoders





[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: C:\Users\Kishlay Kumar\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [43]:
import category_encoders as ce

columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [44]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [45]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')



In [46]:
scores.mean(),scores.std()

(np.float64(0.6226845525933269), np.float64(0.08639776461240664))

In [47]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    x_train, x_test, y_train, y_test = train_test_split(x,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(x_train,y_train)
    
    y_pred = pipeline.predict(x_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [48]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}


In [49]:

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [50]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [51]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.796158,0.734064
10,xgboost,0.77722,0.769345
6,extra trees,0.778904,0.784804
7,gradient boosting,0.777896,0.800751
9,mlp,0.743387,0.835299
1,svr,0.728354,0.858018
4,decision tree,0.624368,0.965629
8,adaboost,0.59011,1.013375
2,ridge,0.622686,1.056901
0,linear_reg,0.622685,1.056938


# hyperparameter tuning

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
param_grid = {
   'regressor__n_estimators': [50, 100, 150],  # Fewer trees to save time
    'regressor__max_depth': [None, 10, 20],     # Avoid too deep trees for small datasets
    'regressor__max_samples': [0.5, 0.75, 1.0], # Use larger sample fractions for small datasets
    'regressor__max_features': ['sqrt', 0.5, 0.8]
}

In [54]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [55]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor()) 
])


In [56]:
kfold = KFold(n_splits=10,shuffle=True,random_state=42)

In [57]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [58]:
# Fit GridSearchCV
search.fit(x, y_transformed)

Fitting 10 folds for each of 81 candidates, totalling 810 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan]


In [59]:
final_pipe = search.best_estimator_

In [60]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 0.5,
 'regressor__n_estimators': 50}

In [61]:
search.best_score_

np.float64(nan)

In [65]:
final_pipe.fit(x,y_transformed)

# exporting model

In [66]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [67]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',RandomForestRegressor(n_estimators=500))
])

In [68]:
pipeline.fit(x,y_transformed)

In [69]:
import pickle
with open('pipeline.pkl','wb')as file:
    pickle.dump(pipeline,file)

In [70]:
with open('df.pkl','wb')as file:
    pickle.dump(x,file)