In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df=pd.read_csv('States.csv')

In [3]:
df.head()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,locality,price,area,furnish_type,bathroom,city
0,OWNER,2.0,BHK,Apartment,Bodakdev,20000.0,1450.0,Furnished,2.0,Ahmedabad
1,OWNER,1.0,RK,Studio Apartment,CG Road,7350.0,210.0,Semi-Furnished,1.0,Ahmedabad
2,OWNER,3.0,BHK,Apartment,Jodhpur,22000.0,1900.0,Unfurnished,3.0,Ahmedabad
3,OWNER,2.0,BHK,Independent House,Sanand,13000.0,1285.0,Semi-Furnished,2.0,Ahmedabad
4,OWNER,2.0,BHK,Independent House,Navrangpura,18000.0,1600.0,Furnished,2.0,Ahmedabad


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193011 entries, 0 to 193010
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   seller_type    193011 non-null  object 
 1   bedroom        193011 non-null  float64
 2   layout_type    193011 non-null  object 
 3   property_type  193011 non-null  object 
 4   locality       193011 non-null  object 
 5   price          193011 non-null  float64
 6   area           193011 non-null  float64
 7   furnish_type   193011 non-null  object 
 8   bathroom       193011 non-null  float64
 9   city           193011 non-null  object 
dtypes: float64(4), object(6)
memory usage: 14.7+ MB


In [5]:
df.describe()

Unnamed: 0,bedroom,price,area,bathroom
count,193011.0,193011.0,193011.0,193011.0
mean,2.081679,44336.54,1264.604468,2.059883
std,0.956901,91951.99,1043.725561,0.934805
min,1.0,1200.0,3.0,1.0
25%,1.0,13000.0,650.0,1.0
50%,2.0,21000.0,1000.0,2.0
75%,3.0,36000.0,1440.0,3.0
max,15.0,5885000.0,19800.0,19.0


In [6]:
df.shape

(193011, 10)

In [7]:
df.isnull().sum()

seller_type      0
bedroom          0
layout_type      0
property_type    0
locality         0
price            0
area             0
furnish_type     0
bathroom         0
city             0
dtype: int64

In [8]:
!pip install category_encoders




In [9]:
import category_encoders as ce

In [10]:
encoder = ce.TargetEncoder(cols=['locality'])

encoder.fit(df['locality'], df['price'])
df['locality_encoded'] = encoder.transform(df['locality'], df['price'])


In [11]:
df.drop(columns=['locality'], inplace=True)

In [12]:
encoder = ce.TargetEncoder(cols=['city'])

encoder.fit(df['city'], df['price'])
df['city_encoded'] = encoder.transform(df['city'], df['price'])
df.drop(columns=['city'], inplace=True)

In [13]:
df.head()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,price,area,furnish_type,bathroom,locality_encoded,city_encoded
0,OWNER,2.0,BHK,Apartment,20000.0,1450.0,Furnished,2.0,38505.388764,24328.495367
1,OWNER,1.0,RK,Studio Apartment,7350.0,210.0,Semi-Furnished,1.0,41116.446295,24328.495367
2,OWNER,3.0,BHK,Apartment,22000.0,1900.0,Unfurnished,3.0,25073.559563,24328.495367
3,OWNER,2.0,BHK,Independent House,13000.0,1285.0,Semi-Furnished,2.0,23187.914514,24328.495367
4,OWNER,2.0,BHK,Independent House,18000.0,1600.0,Furnished,2.0,28556.61328,24328.495367


In [14]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
X=df.drop(labels=["price"],axis=1)
y=df["price"]

In [16]:
X.head()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,area,furnish_type,bathroom,locality_encoded,city_encoded
0,OWNER,2.0,BHK,Apartment,1450.0,Furnished,2.0,38505.388764,24328.495367
1,OWNER,1.0,RK,Studio Apartment,210.0,Semi-Furnished,1.0,41116.446295,24328.495367
2,OWNER,3.0,BHK,Apartment,1900.0,Unfurnished,3.0,25073.559563,24328.495367
3,OWNER,2.0,BHK,Independent House,1285.0,Semi-Furnished,2.0,23187.914514,24328.495367
4,OWNER,2.0,BHK,Independent House,1600.0,Furnished,2.0,28556.61328,24328.495367


In [17]:
y.head()

0    20000.0
1     7350.0
2    22000.0
3    13000.0
4    18000.0
Name: price, dtype: float64

In [18]:
cat_cols=X.select_dtypes(include="object").columns

num_cols=X.select_dtypes(exclude="object").columns

In [19]:
cat_cols

Index(['seller_type', 'layout_type', 'property_type', 'furnish_type'], dtype='object')

In [20]:
num_cols

Index(['bedroom', 'area', 'bathroom', 'locality_encoded', 'city_encoded'], dtype='object')

In [21]:
sell_cat=['OWNER', 'AGENT', 'BUILDER']
layout_cat=['BHK', 'RK']
property_cat=['Apartment', 'Studio Apartment', 'Independent House', 'Villa','Independent Floor', 'Penthouse']
furnished_cat=['Furnished', 'Semi-Furnished', 'Unfurnished']

In [22]:
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy='median')),
        ("scaler",StandardScaler())
    ]
)

In [23]:
cat_pipeline=Pipeline(

    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder",OrdinalEncoder(categories=[sell_cat,layout_cat,property_cat,furnished_cat]))

    ]
)

In [24]:
preprocessor=ColumnTransformer(

    [
        ("num_pipeline",num_pipeline,num_cols),
        ("cat_pipeline",cat_pipeline,cat_cols)
    ]
)

In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=42)

In [26]:
X_train.head()

Unnamed: 0,seller_type,bedroom,layout_type,property_type,area,furnish_type,bathroom,locality_encoded,city_encoded
100515,AGENT,3.0,BHK,Apartment,2010.0,Furnished,3.0,22811.34696,20658.327297
67379,AGENT,15.0,BHK,Independent House,15461.0,Unfurnished,16.0,279371.473354,109204.482305
86996,AGENT,2.0,BHK,Apartment,1150.0,Semi-Furnished,2.0,19429.688188,19171.829287
6767,AGENT,3.0,BHK,Apartment,1385.0,Furnished,3.0,40475.747557,24328.495367
130574,AGENT,2.0,BHK,Apartment,1150.0,Semi-Furnished,2.0,43876.129614,43937.807108


In [27]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(135107, 9)
(57904, 9)
(135107,)
(57904,)


In [28]:
preprocessor.fit_transform(X_train)

array([[ 0.95975728,  0.71276079,  1.00221358, ...,  0.        ,
         0.        ,  0.        ],
       [13.51018051, 13.60106632, 14.88694845, ...,  0.        ,
         2.        ,  2.        ],
       [-0.08611132, -0.11126291, -0.06584295, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.08611132, -0.1610876 , -0.06584295, ...,  0.        ,
         0.        ,  2.        ],
       [-0.08611132, -0.06335456, -0.06584295, ...,  0.        ,
         0.        ,  1.        ],
       [-0.08611132, -0.4753664 , -0.06584295, ...,  0.        ,
         0.        ,  2.        ]])

In [29]:
preprocessor.transform(X_test)

array([[-1.13197992, -0.63825481, -1.13389947, ...,  0.        ,
         0.        ,  2.        ],
       [ 2.00562588,  4.17270215,  2.07027011, ...,  0.        ,
         4.        ,  2.        ],
       [ 2.00562588,  5.01493102,  2.07027011, ...,  0.        ,
         2.        ,  1.        ],
       ...,
       [-0.08611132, -0.25498797, -0.06584295, ...,  0.        ,
         0.        ,  1.        ],
       [-0.08611132, -0.68616316, -0.06584295, ...,  0.        ,
         0.        ,  1.        ],
       [-1.13197992, -0.49452975, -1.13389947, ...,  0.        ,
         2.        ,  2.        ]])

In [30]:
preprocessor.get_feature_names_out()

array(['num_pipeline__bedroom', 'num_pipeline__area',
       'num_pipeline__bathroom', 'num_pipeline__locality_encoded',
       'num_pipeline__city_encoded', 'cat_pipeline__seller_type',
       'cat_pipeline__layout_type', 'cat_pipeline__property_type',
       'cat_pipeline__furnish_type'], dtype=object)

In [31]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [32]:
X_train

Unnamed: 0,num_pipeline__bedroom,num_pipeline__area,num_pipeline__bathroom,num_pipeline__locality_encoded,num_pipeline__city_encoded,cat_pipeline__seller_type,cat_pipeline__layout_type,cat_pipeline__property_type,cat_pipeline__furnish_type
0,0.959757,0.712761,1.002214,-0.351646,-0.766403,1.0,0.0,0.0,0.0
1,13.510181,13.601066,14.886948,3.688067,2.092232,1.0,0.0,2.0,2.0
2,-0.086111,-0.111263,-0.065843,-0.404893,-0.814393,1.0,0.0,0.0,1.0
3,0.959757,0.113906,1.002214,-0.073508,-0.647914,1.0,0.0,0.0,0.0
4,-0.086111,-0.111263,-0.065843,-0.019967,-0.014845,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
135102,-1.131980,-0.571183,-0.065843,-0.367311,-0.014845,1.0,0.0,0.0,0.0
135103,-1.131980,-0.638255,-1.133899,-0.477875,-0.014845,1.0,0.0,0.0,2.0
135104,-0.086111,-0.161088,-0.065843,-0.535103,-0.014845,1.0,0.0,0.0,2.0
135105,-0.086111,-0.063355,-0.065843,-0.019967,-0.014845,1.0,0.0,0.0,1.0


In [33]:
X_test

Unnamed: 0,num_pipeline__bedroom,num_pipeline__area,num_pipeline__bathroom,num_pipeline__locality_encoded,num_pipeline__city_encoded,cat_pipeline__seller_type,cat_pipeline__layout_type,cat_pipeline__property_type,cat_pipeline__furnish_type
0,-1.131980,-0.638255,-1.133899,-0.370417,-0.766247,1.0,0.0,0.0,2.0
1,2.005626,4.172702,2.070270,4.296481,2.092232,1.0,0.0,4.0,2.0
2,2.005626,5.014931,2.070270,1.090602,-0.625775,1.0,0.0,2.0,1.0
3,-0.086111,-0.590346,-0.065843,-0.496357,-0.625775,1.0,0.0,0.0,2.0
4,3.051494,5.035053,2.070270,7.905952,2.092232,1.0,0.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...
57899,2.005626,0.511546,2.070270,-0.017757,2.092232,1.0,0.0,4.0,1.0
57900,-1.131980,-0.542438,-1.133899,-0.426732,-0.766403,1.0,0.0,0.0,0.0
57901,-0.086111,-0.254988,-0.065843,-0.354040,-0.014845,1.0,0.0,0.0,1.0
57902,-0.086111,-0.686163,-0.065843,-0.521668,2.092232,1.0,0.0,0.0,1.0


# Model Training

In [34]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [35]:
models={

    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'Randomforest':RandomForestRegressor(),
    'xgboost':XGBRegressor(),
    'KNN_R':KNeighborsRegressor(n_neighbors=6,algorithm='auto')

}

In [36]:
trained_model_list=[]
model_list=[]
r2_list=[]

In [37]:
list(models)

['LinearRegression',
 'Lasso',
 'Ridge',
 'Elasticnet',
 'Randomforest',
 'xgboost',
 'KNN_R']

In [38]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    print(model)

LinearRegression()
Lasso()
Ridge()
ElasticNet()
RandomForestRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
KNeighborsRegressor(n_neighbors=6)


In [39]:
def evaluate_model(true,pred):
    r2=r2_score(true,pred)
    mae=mean_absolute_error(true,pred)
    mse=mean_squared_error(true,pred)

    return mae, mse,r2

In [40]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    
    model.fit(X_train,y_train)

    #make_prediction
    y_pred=model.predict(X_test)

    #this is for the validaiton
    MAE,MSE,R2=evaluate_model(y_test,y_pred)


    print("model training performance",model)
    print("MSE:", MSE)
    print("MAE:",MAE)
    print("R2 SCORE:",R2)

    r2_list.append(R2)

    print("="*40)
    print("\n")

model training performance LinearRegression()
MSE: 3081810100.9460673
MAE: 21203.000924597844
R2 SCORE: 0.6658985298620749


model training performance Lasso()
MSE: 3081827968.875893
MAE: 21202.583395713627
R2 SCORE: 0.6658965927856726


model training performance Ridge()
MSE: 3081813822.5069036
MAE: 21202.922230938264
R2 SCORE: 0.6658981264047217


model training performance ElasticNet()
MSE: 3606503074.5126333
MAE: 22041.960792684768
R2 SCORE: 0.6090161496707014


model training performance RandomForestRegressor()
MSE: 1346426505.163923
MAE: 7616.709377406294
R2 SCORE: 0.8540328378215645


model training performance XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraint

In [41]:
r2_list

[0.6658985298620749,
 0.6658965927856726,
 0.6658981264047217,
 0.6090161496707014,
 0.8540328378215645,
 0.8335815373456342,
 0.8292123092674379]

In [42]:
print(f"RandomForest: {max(r2_list)*100:.2f}")

RandomForest: 85.40


# Hyperparameter Tuning KNN

In [47]:
regressor=KNeighborsRegressor()

In [48]:
param_grid={
    'n_neighbors':[1,2,3,4,5,6,7,8,9,10],
    'algorithm':['ball_tree','kd_tree'],
    'p':[1,2]
}

In [49]:
from sklearn.model_selection import RandomizedSearchCV

In [50]:
cv=RandomizedSearchCV(regressor,param_distributions=param_grid,scoring='r2',cv=3,verbose=2)
cv.fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=2; total time=   2.7s
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=2; total time=   2.5s
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=2; total time=   2.4s
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=1; total time=   2.6s
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=1; total time=   2.7s
[CV] END .............algorithm=kd_tree, n_neighbors=10, p=1; total time=   2.5s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time=  34.3s
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time= 1.2min
[CV] END ............algorithm=ball_tree, n_neighbors=5, p=1; total time=  28.0s
[CV] END ..............algorithm=kd_tree, n_neighbors=2, p=1; total time=   2.1s
[CV] END ..............algorithm=kd_tree, n_neighbors=2, p=1; total time=   1.8s
[CV] END ..............algorithm=kd_tree, n_neig

In [52]:
best_params = cv.best_params_
best_params

{'p': 1, 'n_neighbors': 4, 'algorithm': 'ball_tree'}

In [53]:
regressor=KNeighborsRegressor(n_neighbors=4,algorithm='ball_tree',p=1)
regressor.fit(X_train,y_train)

In [55]:
y_pred=regressor.predict(X_test)

In [56]:
MAE,MSE,R2=evaluate_model(y_test,y_pred)

print("model training performance",model)
print("MSE:", MSE)
print("MAE:",MAE)
print("R2 SCORE:",R2)

model training performance KNeighborsRegressor(n_neighbors=6)
MSE: 1385600149.9344778
MAE: 8881.820685962974
R2 SCORE: 0.8497859920134839
