In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor , GradientBoostingRegressor

In [2]:
def rmsle(y_true, y_pred):
    return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())

In [3]:
train = pd.read_csv('train_preprocessed.csv')

In [4]:
train.head()

Unnamed: 0,availability,location,size,society,total_sqft,bath,balcony,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,price
0,-1.959852,-0.696718,-0.656027,-1.670517,-0.105648,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,39.07
1,0.510243,-0.974577,0.997617,0.612839,0.009227,1.718585,1.73776,-0.471113,-0.081226,2.343851,-1.386504,120.0
2,0.510243,1.391193,0.170795,0.90251,-0.077078,-0.518099,1.73776,2.122635,-0.081226,-0.426648,-1.386504,62.0
3,0.510243,0.224186,0.170795,0.320973,-0.071052,0.227462,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,95.0
4,0.510243,0.113042,-0.656027,0.90251,-0.094934,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,51.0


In [5]:
train.isna().sum()

availability                      0
location                          0
size                              0
society                           0
total_sqft                        0
bath                              0
balcony                           0
area_type_Built-up  Area          0
area_type_Carpet  Area            0
area_type_Plot  Area              0
area_type_Super built-up  Area    0
price                             0
dtype: int64

In [7]:
test = pd.read_csv('test_preprocessed.csv')

In [8]:
test.head()

Unnamed: 0,availability,location,size,society,total_sqft,bath,balcony,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,0.510243,-1.083074,-0.656027,-0.147548,-0.093074,-0.518099,0.495387,-0.471113,-0.081226,-0.426648,0.721238
1,0.510243,-1.56205,5.131728,0.90251,-0.005653,4.70083,0.495387,-0.471113,-0.081226,2.343851,-1.386504
2,-1.959852,-0.421505,0.997617,0.021427,-0.061454,1.718585,0.495387,-0.471113,-0.081226,2.343851,-1.386504
3,0.510243,0.073348,0.170795,0.884954,-0.085857,0.227462,-0.746985,-0.471113,-0.081226,-0.426648,0.721238
4,0.510243,0.089226,-0.656027,-2.056746,-0.097836,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238


In [9]:
X =train.drop(columns='price')

In [10]:
y =train['price']

In [11]:
y.shape

(13319,)

In [31]:
train_X, test_X,train_y, test_y  =train_test_split(X,y,random_state=102,test_size=0.25)

In [13]:
train_X.isna().sum()

availability                      0
location                          0
size                              0
society                           0
total_sqft                        0
bath                              0
balcony                           0
area_type_Built-up  Area          0
area_type_Carpet  Area            0
area_type_Plot  Area              0
area_type_Super built-up  Area    0
dtype: int64

In [32]:
def run_model(model, train_X , train_y , test_X , test_y, test ,X, y,  filename):
    print(model)
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    print('train_rmsle:', rmsle(train_y , train_pred))
    test_pred = model.predict(test_X)
    print('test_rmsle:', rmsle(test_y , test_pred))
    model.fit(X,y)
    pred = model.predict(test)
    res =pd.DataFrame(pred, columns=['price'])
    res.to_csv(filename, index=False)

In [15]:
linearreg = LinearRegression()
run_model(linearreg, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_linearreg.csv')

LinearRegression()
train_rmsle: 0.7607703330910434
test_rmsle: 0.7537030486005594


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


In [16]:
lasso = Lasso()
run_model(lasso, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_lasso.csv')

Lasso()
train_rmsle: 0.7641806497209663
test_rmsle: 0.7578027106428507


In [17]:
ridge = Ridge()
run_model(ridge, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_ridge.csv')

Ridge()
train_rmsle: 0.7607737487477286
test_rmsle: 0.7537066769043927


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


In [18]:
knn =KNeighborsRegressor()
run_model(knn, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_decisiontree.csv')

KNeighborsRegressor()
train_rmsle: 0.8366819472499762
test_rmsle: 0.7825143021086259


In [19]:
DecisionTree = DecisionTreeRegressor()
run_model(DecisionTree, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_decisiontree.csv')

DecisionTreeRegressor()
train_rmsle: 0.9862827469325843
test_rmsle: 0.8028363605904483


In [34]:
Randomforest = RandomForestRegressor(min_samples_leaf=2, n_estimators=200)
run_model(Randomforest, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_randomforest.csv')

RandomForestRegressor(min_samples_leaf=2, n_estimators=200)
train_rmsle: 0.9138256021711614
test_rmsle: 0.8534870642414528


In [None]:
Gradient =GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200)
run_model(Gradient, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_gradientboost.csv')

In [None]:
Adaboost =AdaBoostRegressor()
run_model(Adaboost, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_adaboost.csv')

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
parameters = [{'n_estimators':[25,50,75,100,150,200,250],'max_depth':[1,2,3,4,5,6,7,8,9,10,None],'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11]}]

In [None]:
%%time
model =RandomForestRegressor()
randomsearch = RandomizedSearchCV(model, parameters, verbose =1 ,cv=5, scoring ='neg_mean_absolute_error' )
randomsearch.fit(train_X, train_y)

In [None]:
randomsearch.best_score_

In [None]:
randomsearch.best_estimator_

In [None]:
train_pred = randomsearch.predict(train_X)
rmsle(train_y , train_pred)

In [None]:
test_pred = randomsearch.predict(test_X)
rmsle(test_y , test_pred)

In [None]:
model1 =DecisionTreeRegressor()
model2 =KNeighborsRegressor()
model3 = GradientBoostingRegressor()
voting = VotingRegressor(estimators=[('dt', model1),('rf',model2),('gra',model3)])

In [None]:
run_model(voting,train_X,train_y,test_X,test_y,X,y,test,'bengaluru_houseprice_voting.csv')

In [None]:
final_estimator = RandomForestRegressor()

In [None]:
estimators=[('dt', model1),('rf',model2),('gra',model3)]

In [None]:
stack =StackingRegressor(estimators=estimators, final_estimator=final_estimator)

In [None]:
run_model(stack,train_X,train_y,test_X,test_y,X,y,test,'flight_price_stack.csv')