In [43]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,price,luxury_category,floor_category
0,flat,sector 78,2,2,1,Moderately Old,1239.0,0,0,0,0.75,Low,Mid Floor
1,flat,sector 60,2,3,2,Relatively New,1250.0,1,0,1,2.15,Low,Mid Floor
2,flat,sector 90,3,3,2,Relatively New,1578.0,0,0,0,1.23,Low,High Floor
3,house,sector 2,6,6,2,Moderately Old,3611.0,0,0,0,5.0,Low,Low Floor
4,flat,sector 63,4,4,3,New Property,3956.0,0,0,0,7.52,Medium,High Floor


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0    2374
1     995
2     185
Name: count, dtype: int64

In [5]:
df.shape

(3554, 13)

In [6]:
# Replace the numeric furnishing codes with readable labels:
#   0 -> unfurnished, 1 -> semifurnished, 2 -> furnished
furnishing_labels = {0.0: 'unfurnished', 1.0: 'semifurnished', 2.0: 'furnished'}
df['furnishing_type'] = df['furnishing_type'].replace(to_replace=furnishing_labels)

In [7]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,price,luxury_category,floor_category
0,flat,sector 78,2,2,1,Moderately Old,1239.0,0,0,unfurnished,0.75,Low,Mid Floor
1,flat,sector 60,2,3,2,Relatively New,1250.0,1,0,semifurnished,2.15,Low,Mid Floor
2,flat,sector 90,3,3,2,Relatively New,1578.0,0,0,unfurnished,1.23,Low,High Floor
3,house,sector 2,6,6,2,Moderately Old,3611.0,0,0,unfurnished,5.0,Low,Low Floor
4,flat,sector 63,4,4,3,New Property,3956.0,0,0,unfurnished,7.52,Medium,High Floor


In [8]:
# Separate the target column (price) from the predictor columns.
y = df['price']
X = df.drop(columns='price')

In [10]:
# Models are trained on log1p(price); predictions are mapped back to price with np.expm1.

# Apply the log1p transformation to the target variable
y_transformed = np.log1p(y)

### We will train different models using 3 types of encoding: 1. Ordinal Encoding, 2. One Hot Encoding, 3. Target Encoding
##### Then we will calculate the r2 score of every model under each encoding, and keep the encoding technique whose best model has the largest r2 score

### 1. Ordinal Encoding - it simply assigns an integer to each category; this works well for tree-based models but not for linear models, which treat the arbitrary integers as magnitudes

In [11]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [22]:
# Preprocessing for the ordinal-encoding experiment: the numeric columns are standardised
# and every column listed in columns_to_encode is mapped to an integer code.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']), # standard scaling for the numeric columns
        # handle_unknown='use_encoded_value' with unknown_value=-1: categories not seen during fit are encoded as -1
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)  # ordinal encoding for every column in columns_to_encode
    ], 
    # Any column not named above is passed through unchanged (the two lists appear to cover all 12 columns of X).
    remainder='passthrough'
)

In [23]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),   # first all columns will be applied with preprocessing
    ('regressor', LinearRegression()) # then after preprocessing is done, we apply LinearRegression
])

In [24]:
# X['sector'].sum()

In [25]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [27]:
scores.mean(),scores.std()

# the result is not very satisfactory, but this is due to ordinal encoding used on Linear model

(0.737354589135114, 0.01768761234524335)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [30]:
pipeline.fit(X_train,y_train)

In [31]:
y_pred = pipeline.predict(X_test)

In [32]:
y_pred = np.expm1(y_pred)

In [34]:
mean_absolute_error(np.expm1(y_test),y_pred)
# The MAE is about 0.87 crores: on average the prediction is off by that much, e.g. a flat priced at 1 crore
# could be predicted at roughly 1.87 crores or 0.13 crores, which is far too inaccurate.

0.8695203108694209

In [35]:
# The whole evaluation flow wrapped in one helper. It takes a display label and an
# estimator, and evaluates the estimator behind the current global `preprocessor`.

def scorer(model_name, model):
    """Benchmark ``model`` and return ``[model_name, mean_cv_r2, holdout_mae]``.

    model_name -- label stored as the first element of the result.
    model      -- unfitted regressor placed after the global ``preprocessor``.

    The R2 is the mean over 10 shuffled folds on the log1p-transformed target;
    the MAE comes from one 80/20 hold-out split and is computed after mapping
    both predictions and targets back with ``np.expm1``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R2 on the full data set (log space).
    cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=cv_splitter, scoring='r2').mean()

    # Hold-out MAE in original price units.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    model_pipeline.fit(features_train, target_train)
    predicted_price = np.expm1(model_pipeline.predict(features_test))
    holdout_mae = mean_absolute_error(np.expm1(target_test), predicted_price)

    return [model_name, mean_r2, holdout_mae]
    

In [44]:
# Candidate regressors to benchmark, keyed by the label shown in the results table.
# Every estimator with a stochastic component is seeded with 42 (the same seed used for
# KFold and train_test_split) so that the comparison is repeatable from run to run.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [45]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [46]:
model_output

[['linear_reg', 0.737354589135114, 0.8695203108694209],
 ['svr', 0.7589027158375528, 0.817259184353498],
 ['ridge', 0.7373573016885551, 0.8695576003525612],
 ['LASSO', 0.056059015180325314, 1.4955989362859337],
 ['decision tree', 0.7815484686442261, 0.6210509585032764],
 ['random forest', 0.8825446413381263, 0.4958556477466499],
 ['extra trees', 0.8702804745358119, 0.527976605222991],
 ['gradient boosting', 0.8750941441814495, 0.5379184665333824],
 ['adaboost', 0.7557740007504916, 0.7799275882334782],
 ['mlp', 0.8077834246154458, 0.685976283286071],
 ['xgboost', 0.8926933610469353, 0.4745154129360631]]

In [47]:
# converting it to a dataframe with columns as name, r2 score, and mae

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [48]:
# values are sorted according to mae as we want the least mae
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.892693,0.474515
5,random forest,0.882545,0.495856
6,extra trees,0.87028,0.527977
7,gradient boosting,0.875094,0.537918
4,decision tree,0.781548,0.621051
9,mlp,0.807783,0.685976
8,adaboost,0.755774,0.779928
1,svr,0.758903,0.817259
0,linear_reg,0.737355,0.86952
2,ridge,0.737357,0.869558


### Observation - the best R2 score using Ordinal Encoding is given by xgboost algorithm ie 0.89 and mae is 0.47 crores

### OneHotEncoding - 

We will apply it to columns whose categories have no natural order. For example, balcony has an order (1 balcony < 2 balconies < 3 balconies, and so on), and luxury also has an order (low < medium < high). But in agePossession or sector there is no order like this. So we will apply OHE only to those columns where the order of the categories does not matter.

In [57]:
# Preprocessing for the one-hot-encoding experiment.
# NOTE(review): 'sector', 'agePossession' and 'furnishing_type' are listed in columns_to_encode
# as well, so each of them reaches the model twice: once as an ordinal code (from 'cat') and
# once as one-hot columns (from 'cat1'). If only OHE was intended for them, they should be
# removed from the list given to 'cat' - confirm the intent.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),  # ordinal encoding applied to all the categorical columns
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','agePossession','furnishing_type'])  # but OHE is additionally applied to these 3 of them
    ], 
    remainder='passthrough'
)

In [58]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [59]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [60]:
scores.mean()

0.8554608717625344

In [61]:
scores.std()

0.02136608294802365

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [63]:
pipeline.fit(X_train,y_train)

In [64]:
y_pred = pipeline.predict(X_test)

In [65]:
y_pred = np.expm1(y_pred)

In [68]:
mean_absolute_error(np.expm1(y_test),y_pred)

# mae got reduced for linear regression when trained for one hot encoding as compared to ordinal encoding

0.6226735861119063

In [69]:
# The whole evaluation flow wrapped in one helper. It takes a display label and an
# estimator, and evaluates the estimator behind the current global `preprocessor`.

def scorer(model_name, model):
    """Benchmark ``model`` and return ``[model_name, mean_cv_r2, holdout_mae]``.

    model_name -- label stored as the first element of the result.
    model      -- unfitted regressor placed after the global ``preprocessor``.

    The R2 is the mean over 10 shuffled folds on the log1p-transformed target;
    the MAE comes from one 80/20 hold-out split and is computed after mapping
    both predictions and targets back with ``np.expm1``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R2 on the full data set (log space).
    cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=cv_splitter, scoring='r2').mean()

    # Hold-out MAE in original price units.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    model_pipeline.fit(features_train, target_train)
    predicted_price = np.expm1(model_pipeline.predict(features_test))
    holdout_mae = mean_absolute_error(np.expm1(target_test), predicted_price)

    return [model_name, mean_r2, holdout_mae]
    

In [70]:
# Candidate regressors to benchmark, keyed by the label shown in the results table.
# Every estimator with a stochastic component is seeded with 42 (the same seed used for
# KFold and train_test_split) so that the comparison is repeatable from run to run.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [71]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [72]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [73]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894094,0.444655
5,random forest,0.89122,0.451727
10,xgboost,0.898389,0.476891
7,gradient boosting,0.877285,0.532809
9,mlp,0.870673,0.595102
0,linear_reg,0.855461,0.622674
4,decision tree,0.815294,0.62499
2,ridge,0.85558,0.626658
8,adaboost,0.756708,0.766071
1,svr,0.763092,0.806241


### Observation: Using OHE, the linear models performed better. The best model is extra trees with 0.89 r2 score and the mae got reduced to 0.44 crores

### Problem:
With OHE, the number of dimensions increases. We saw that applying OHE to the sector column alone increases the number of dimensions by about 100. So for a very large dataset that ends up with a very high number of dimensions after OHE, model training will be very slow.

### Solution:
to solve this, we can do dimensionality reduction using PCA

### OneHotEncoding With PCA

In [77]:
# Preprocessing for the OHE + PCA experiment.
# sparse_output=False makes the one-hot encoder return a dense array (presumably because the
# PCA step that follows needs dense input - confirm).
# Unlike the earlier OHE experiment, 'furnishing_type' is not one-hot encoded here.
# NOTE(review): 'sector' and 'agePossession' are also in columns_to_encode, so they are
# emitted both as ordinal codes ('cat') and as one-hot columns ('cat1').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [78]:
# Pipeline for the PCA experiment: preprocess, keep enough principal components to explain
# 95% of the variance (n_components=0.95), then fit a linear regression on those components.
# NOTE(review): only the 'num' columns are standardised by the preprocessor; the ordinal codes
# produced by 'cat' are not scaled before PCA. PCA is variance-based, so large-valued codes
# (e.g. for 'sector') may dominate the retained components - consider scaling every feature
# before the 'pca' step and confirm whether that explains the low score.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [79]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [80]:
scores.mean()

0.05905955923173253

In [81]:
scores.std()

0.02558389900172419

In [82]:
def scorer(model_name, model):
    """Benchmark ``model`` behind preprocessing + PCA; return ``[model_name, mean_cv_r2, holdout_mae]``.

    model_name -- label stored as the first element of the result.
    model      -- unfitted regressor placed after the global ``preprocessor``
                  and a PCA step that keeps 95% of the variance.

    The R2 is the mean over 10 shuffled folds on the log1p-transformed target;
    the MAE comes from one 80/20 hold-out split and is computed after mapping
    both predictions and targets back with ``np.expm1``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model),
    ])

    # Cross-validated R2 on the full data set (log space).
    cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=cv_splitter, scoring='r2').mean()

    # Hold-out MAE in original price units.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    model_pipeline.fit(features_train, target_train)
    predicted_price = np.expm1(model_pipeline.predict(features_test))
    holdout_mae = mean_absolute_error(np.expm1(target_test), predicted_price)

    return [model_name, mean_r2, holdout_mae]
    

In [83]:
# Candidate regressors to benchmark, keyed by the label shown in the results table.
# Every estimator with a stochastic component is seeded with 42 (the same seed used for
# KFold and train_test_split) so that the comparison is repeatable from run to run.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [84]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [85]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [87]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762525,0.69303
6,extra trees,0.732446,0.704002
4,decision tree,0.69171,0.765081
10,xgboost,0.618997,0.993971
7,gradient boosting,0.613364,1.014445
8,adaboost,0.317436,1.314192
1,svr,0.228118,1.331702
9,mlp,0.214716,1.385667
3,LASSO,0.056256,1.495525
2,ridge,0.05906,1.502639


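### Observation: the results were bad after applying PCA on OHE

A likely cause: the ordinal-encoded columns (for example sector, whose codes span roughly a hundred values) are not scaled before PCA, so they dominate the variance and `n_components=0.95` keeps very few components. Standardising all features before PCA should be tried before discarding this approach.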

### Target Encoder
It is used for categorical columns with high cardinality. For example, sector has around 100 categories, so instead of creating about 100 one-hot columns we can use target encoding for such columns.

In [99]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Preprocessing for the target-encoding experiment.
# NOTE(review): 'sector' and 'agePossession' are also in columns_to_encode, so 'sector' is
# emitted both as an ordinal code ('cat') and as a target-encoded value ('target_enc'), and
# 'agePossession' both as an ordinal code and as one-hot columns ('cat1') - confirm the intent.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])    # target encoding for the sector column because it has many categories
    ], 
    remainder='passthrough'
)

In [100]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [101]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [103]:
scores.mean(),scores.std()
# the results using linear regression and target encoding seems to be ok

(0.8274280472406664, 0.02373407753358409)

In [104]:
def scorer(model_name, model):
    """Benchmark ``model`` and return ``[model_name, mean_cv_r2, holdout_mae]``.

    model_name -- label stored as the first element of the result.
    model      -- unfitted regressor placed after the global ``preprocessor``.

    The R2 is the mean over 10 shuffled folds on the log1p-transformed target;
    the MAE comes from one 80/20 hold-out split and is computed after mapping
    both predictions and targets back with ``np.expm1``.
    """
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # Cross-validated R2 on the full data set (log space).
    cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=cv_splitter, scoring='r2').mean()

    # Hold-out MAE in original price units.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    model_pipeline.fit(features_train, target_train)
    predicted_price = np.expm1(model_pipeline.predict(features_test))
    holdout_mae = mean_absolute_error(np.expm1(target_test), predicted_price)

    return [model_name, mean_r2, holdout_mae]
    

In [105]:
# Candidate regressors to benchmark, keyed by the label shown in the results table.
# Every estimator with a stochastic component is seeded with 42 (the same seed used for
# KFold and train_test_split) so that the comparison is repeatable from run to run.
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'extra trees': ExtraTreesRegressor(random_state=42),
    'gradient boosting': GradientBoostingRegressor(random_state=42),
    'adaboost': AdaBoostRegressor(random_state=42),
    'mlp': MLPRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42)
}

In [106]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [107]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [108]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.900469,0.444206
6,extra trees,0.90098,0.481326
10,xgboost,0.900226,0.503479
7,gradient boosting,0.889883,0.519803
4,decision tree,0.824321,0.590122
9,mlp,0.850556,0.650888
8,adaboost,0.815874,0.670446
0,linear_reg,0.827428,0.69016
2,ridge,0.827454,0.690451
1,svr,0.776029,0.795119


### Observation: random forest is performing best using Target Encoding and mae also reduced to 0.44 crores and r2 score increased to 0.90

# Final Conclusion:
## Random forests and XGBoost were giving the best results using Target Encoding on the sector column. So now we will use the same setup and perform hyperparameter tuning

### Hyperparameter Tuning on Random Forest

In [135]:
from sklearn.model_selection import GridSearchCV

In [136]:
# Hyperparameter grid for the random forest. The 'regressor__' prefix addresses the
# pipeline step named 'regressor'.
# 4 * 4 * 4 * 3 = 192 candidate combinations.
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['sqrt', 'log2', None]
}

In [137]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Preprocessor used for the grid search: the same transformers as the target-encoding experiment.
# NOTE(review): 'sector' and 'agePossession' are also in columns_to_encode, so 'sector' is
# emitted both as an ordinal code ('cat') and as a target-encoded value ('target_enc'), and
# 'agePossession' both as an ordinal code and as one-hot columns ('cat1') - confirm the intent.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore'),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [138]:
# Pipeline tuned by the grid search: the current preprocessor followed by a random forest.
# The forest is seeded (42, like the CV splitter) so that score differences between grid
# candidates come from the hyperparameters rather than from run-to-run randomness.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [139]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [140]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [141]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


In [142]:
final_pipe = search.best_estimator_

In [143]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': None,
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [144]:
search.best_score_

0.9018667869212422

## Conclusion: we could get maximum r2 score as 0.901, yet we can try some other methods too. But for now we will deploy the model

In [146]:
# Fit the best pipeline found by the grid search on the full data set.
# NOTE(review): GridSearchCV was created without overriding `refit`, so best_estimator_ is
# presumably already fitted on all of X and this call looks redundant - confirm.
# Also note that the object pickled further down is `pipeline`, not `final_pipe`.
final_pipe.fit(X,y_transformed)

### Exporting the model

In [147]:
# Preprocessor for the exported model.
# NOTE(review): this one-hot encodes 'sector' and has no target encoder, unlike the preprocessor
# used for the grid search; confirm that exporting the OHE variant is intentional (e.g. to keep
# category_encoders out of the inference environment).
# 'sector' and 'agePossession' are also in columns_to_encode, so they are emitted both as
# ordinal codes ('cat') and as one-hot columns ('cat1').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False, handle_unknown='ignore'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [148]:
# Pipeline that gets exported: the current preprocessor followed by a 500-tree random forest.
# The forest is seeded so that re-running the notebook reproduces the same pickled model
# (and therefore the same predictions) instead of a slightly different one each time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500, random_state=42))
])

In [149]:
pipeline.fit(X,y_transformed)

In [150]:
import pickle

# Serialise the fitted pipeline (preprocessing + model) to disk.
with open('mandar_pipeline.pkl', mode='wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

In [151]:
# Serialise the feature frame X alongside the model.
with open('mandar_df.pkl', mode='wb') as frame_file:
    pickle.dump(X, frame_file)

In [152]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 78,2,2,1,Moderately Old,1239.0,0,0,unfurnished,Low,Mid Floor
1,flat,sector 60,2,3,2,Relatively New,1250.0,1,0,semifurnished,Low,Mid Floor
2,flat,sector 90,3,3,2,Relatively New,1578.0,0,0,unfurnished,Low,High Floor
3,house,sector 2,6,6,2,Moderately Old,3611.0,0,0,unfurnished,Low,Low Floor
4,flat,sector 63,4,4,3,New Property,3956.0,0,0,unfurnished,Medium,High Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,New Property,2956.0,0,0,furnished,Low,Low Floor
3550,flat,dwarka expressway,3,3,2,Under Construction,1267.0,0,0,unfurnished,Low,Mid Floor
3551,flat,sector 50,3,4,3+,Moderately Old,2470.0,1,0,semifurnished,High,Mid Floor
3552,flat,sector 86,3,3,3,Moderately Old,1747.0,1,0,unfurnished,Medium,High Floor


### Trying out the predictions

In [153]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [154]:
X.iloc[0].values

array(['flat', 'sector 78', 2, 2, '1', 'Moderately Old', 1239.0, 0, 0,
       'unfurnished', 'Low', 'Mid Floor'], dtype=object)

In [166]:
# Column names in the same order as the training frame X.
columns = [
    'property_type', 'sector', 'bedRoom', 'bathroom', 'balcony', 'agePossession',
    'built_up_area', 'servant room', 'store room',
    'furnishing_type', 'luxury_category', 'floor_category',
]

# One hand-written listing to try the exported pipeline on.
data = [
    ['house', 'sector 102', 4, 3, '3+', 'New Property', 1750, 0, 0, 'unfurnished', 'Low', 'Low Floor'],
]

# Single-row frame in the layout the pipeline was trained on.
one_df = pd.DataFrame(data=data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,1750,0,0,unfurnished,Low,Low Floor


In [167]:
np.expm1(pipeline.predict(one_df))

array([2.90011341])

In [157]:
X.dtypes

property_type       object
sector              object
bedRoom              int64
bathroom             int64
balcony             object
agePossession       object
built_up_area      float64
servant room         int64
store room           int64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [306]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 10',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'sector 60',
 'sector 61',
 'sector 62',
 'sector 63',
 'sector 63a',
 'sector 65',
 'sector 66',
 'sector 67',
 'se