In [1]:
import pandas as pd
import numpy as np

In [22]:
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder,OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import xgboost as xg
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('outlier_removed.csv')

In [4]:
test_df = df.copy()

In [5]:
test_df.drop(columns=['price_persqft','index'],inplace=True)

In [6]:
test_df

Unnamed: 0,BHK,sector,price,SuperArea,floor,Furnishing,facing,Car_Parking,Bathroom,Balcony,overlooking,city
0,3,Panathur,2.25,1611.0,10+,Unfurnished,East,available,2,1,Garden/Park,Bangalore
1,3,Panathur,2.13,1650.0,10+,Unfurnished,East,available,3,2,Garden/Park,Bangalore
2,3,Thanisandra,1.37,1420.0,8,Unfurnished,North - East,available,2,1,"Garden/Park, Pool",Bangalore
3,3,Panathur,1.62,1473.0,5,Unfurnished,North - East,available,2,1,"Garden/Park, Main Road",Bangalore
4,4,Whitefield,3.93,2616.0,10+,Unfurnished,East,available,3,2,Garden/Park,Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...
7435,1,Andheri,0.85,429.0,1,Semi-Furnished,East,not sure,1,1,Main Road,Mumbai
7436,1,Andheri,1.29,435.0,10+,Unfurnished,East,available,2,1,Main Road,Mumbai
7437,2,others,2.75,1095.0,10+,Unfurnished,East,not sure,2,1,Main Road,Mumbai
7438,4,Andheri,4.15,1300.0,8,Semi-Furnished,East,not sure,2,1,"Garden/Park, Main Road",Mumbai


In [7]:
# pipeline
col = df.select_dtypes(include=object).columns.tolist()

In [8]:
col

['BHK',
 'sector',
 'floor',
 'Furnishing',
 'facing',
 'Car_Parking',
 'Bathroom',
 'Balcony',
 'overlooking',
 'city']

In [9]:
len(test_df['sector'].unique())

106

In [10]:
X = test_df.drop(columns=['price'])
y = test_df['price']

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['SuperArea']),
        ('cat',OneHotEncoder(drop='first'),col)
    ],
    remainder='passthrough'
)

In [24]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',LinearRegression())
    ]
)

In [25]:
# using K-fold cross val socre
kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')

In [26]:
score.mean()

0.6144426126815613

In [27]:
score.std()

0.04361148683634919

In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
pipeline.fit(X_train,y_train)

In [30]:
y_pred = pipeline.predict(X_test)

In [31]:
y_pred = np.expm1(y_pred)

In [32]:
mean_squared_error(y_test,y_pred)

18108873924.100792

In [33]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',Lasso())
    ]
)

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.1680209283015579

In [34]:
score

array([0.18709297, 0.1490089 , 0.13061319, 0.13896385, 0.1772815 ,
       0.19247135, 0.1750397 , 0.1720097 , 0.17455219, 0.18317593])

In [35]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',Ridge())
    ]
)

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.6150494861597451

In [36]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',DecisionTreeRegressor())
    ]
)

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.4621802250798718

In [37]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',KNeighborsRegressor())
    ]
)

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.6091462837043043

In [38]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',RandomForestRegressor())
    ]
)

kfold = KFold(n_splits=10,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.7003749697117402

In [39]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',xg.XGBRegressor(objective ='reg:linear',
                  n_estimators = 100, seed = 123) )
    ]
)

kfold = KFold(n_splits=15,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.7044752521433877

In [40]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',SVR() )
    ]
)

kfold = KFold(n_splits=15,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.6307106611872247

In [41]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',AdaBoostRegressor() )
    ]
)

kfold = KFold(n_splits=15,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.08660201019127418

In [42]:
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',GradientBoostingRegressor() )
    ]
)

kfold = KFold(n_splits=15,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.7115475401455947

In [43]:
from sklearn.neural_network import MLPRegressor
pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',MLPRegressor(random_state=1, max_iter=50))
    ]
)

kfold = KFold(n_splits=15,shuffle=True,random_state=42)
score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
score.mean()

0.7342462341073114

In [59]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),['SuperArea']),
        ('cat',OneHotEncoder(),col)
    ],
    remainder='passthrough'
)

In [45]:
def modelsscore(model_name,model):
  score={}
  pipeline= Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model',model)
    ]
  )
  kfold = KFold(n_splits=15,shuffle=True,random_state=42)
  score = cross_val_score(pipeline,X,y,cv=kfold,scoring='r2')
  score={f'{model_name}': score.mean()}
  return score

In [23]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'knn':KNeighborsRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':xg.XGBRegressor(objective ='reg:linear',
                  n_estimators = 100, seed = 123)
}

In [49]:
for model_name,model in model_dict.items():
  score = modelsscore(model_name,model)
  print(score)

{'LinearRegression()': 0.6129028339192523}
{'SVR()': 0.6339072385477464}
{'Ridge()': 0.6138088755618186}
{'Lasso()': 0.1689010906515867}
{'DecisionTreeRegressor()': 0.44105125320824123}
{'RandomForestRegressor()': 0.6962608099602109}
{'KNeighborsRegressor()': 0.6146878213908491}
{'GradientBoostingRegressor()': 0.7128830243291622}
{'AdaBoostRegressor()': 0.07678160905753553}
{'MLPRegressor()': 0.6599702748567461}
{"XGBRegressor(base_score=None, booster=None, callbacks=None,\n             colsample_bylevel=None, colsample_bynode=None,\n             colsample_bytree=None, device=None, early_stopping_rounds=None,\n             enable_categorical=False, eval_metric=None, feature_types=None,\n             gamma=None, grow_policy=None, importance_type=None,\n             interaction_constraints=None, learning_rate=None, max_bin=None,\n             max_cat_threshold=None, max_cat_to_onehot=None,\n             max_delta_step=None, max_depth=None, max_leaves=None,\n             min_child_weight=