In [69]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor , GradientBoostingRegressor

In [70]:
def rmsle(y_true, y_pred):
    return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())

In [71]:
train = pd.read_csv('train_preprocessed.csv')

In [72]:
train.head()

Unnamed: 0,availability,size,society,total_sqft,bath,balcony,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,...,location_Vittasandra,location_Volagerekallahalli,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_other,price
0,-1.959852,-0.656027,-1.670517,-0.418694,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,...,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507,39.07
1,0.510243,0.997617,0.612839,0.867743,1.718585,1.73776,-0.471113,-0.081226,2.343851,-1.386504,...,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507,120.0
2,0.510243,0.170795,0.90251,-0.098751,-0.518099,1.73776,2.122635,-0.081226,-0.426648,-1.386504,...,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507,62.0
3,0.510243,0.170795,0.320973,-0.031263,0.227462,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,...,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507,95.0
4,0.510243,-0.656027,0.90251,-0.298715,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,...,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507,51.0


In [73]:
train.isna().sum()

availability                   0
size                           0
society                        0
total_sqft                     0
bath                           0
                              ..
location_Yelahanka New Town    0
location_Yelenahalli           0
location_Yeshwanthpur          0
location_other                 0
price                          0
Length: 271, dtype: int64

In [74]:
test = pd.read_csv('test_preprocessed.csv')

In [75]:
test.head()

Unnamed: 0,availability,size,society,total_sqft,bath,balcony,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,...,location_Vishwapriya Layout,location_Vittasandra,location_Volagerekallahalli,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_other
0,0.510243,-0.656027,-0.147548,-0.277885,-0.518099,0.495387,-0.471113,-0.081226,-0.426648,0.721238,...,-0.030772,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507
1,0.510243,5.131728,0.90251,0.701106,4.70083,0.495387,-0.471113,-0.081226,2.343851,-1.386504,...,-0.030772,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507
2,-1.959852,0.997617,0.021427,0.076218,1.718585,0.495387,-0.471113,-0.081226,2.343851,-1.386504,...,-0.030772,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507
3,0.510243,0.170795,0.884954,-0.197067,0.227462,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,...,-0.030772,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507
4,0.510243,-0.656027,-2.056746,-0.331209,-0.518099,-0.746985,-0.471113,-0.081226,-0.426648,0.721238,...,-0.030772,-0.058806,-0.027274,-0.204131,-0.037697,-0.124532,-0.053982,-0.030772,-0.077784,-0.507507


In [76]:
X =train.drop(columns='price')

In [77]:
y =train['price']

In [78]:
y.shape

(13319,)

In [96]:
train_X, test_X,train_y, test_y  =train_test_split(X,y,random_state=42,test_size=0.2)

In [91]:
train_X.isna().sum()

availability                   0
size                           0
society                        0
total_sqft                     0
bath                           0
                              ..
location_Yelahanka             0
location_Yelahanka New Town    0
location_Yelenahalli           0
location_Yeshwanthpur          0
location_other                 0
Length: 270, dtype: int64

In [97]:
def run_model(model, train_X , train_y , test_X , test_y, test ,X, y,  filename):
    print(model)
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    print('train_rmsle:', rmsle(train_y , train_pred))
    test_pred = model.predict(test_X)
    print('test_rmsle:', rmsle(test_y , test_pred))
    model.fit(X,y)
    pred = model.predict(test)
    res =pd.DataFrame(pred, columns=['price'])
    res.to_csv(filename, index=False)

In [93]:
linearreg = LinearRegression()
run_model(linearreg, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_linearreg.csv')

LinearRegression()


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())
  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


train_rmsle: 0.7903514123442547
test_rmsle: 0.7674802611043735


In [83]:
lasso = Lasso()
run_model(lasso, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_lasso.csv')

Lasso()
train_rmsle: 0.7917801256755983
test_rmsle: 0.7754145950594893


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())
  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


In [84]:
ridge = Ridge()
run_model(ridge, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_ridge.csv')

Ridge()
train_rmsle: 0.7904705241637102
test_rmsle: 0.7857073025289646


  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())
  return 1 - np.sqrt(np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean())


In [85]:
knn =KNeighborsRegressor()
run_model(knn, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_decisiontree.csv')

KNeighborsRegressor()
train_rmsle: 0.8621404473763714
test_rmsle: 0.8114590720378393


In [86]:
DecisionTree = DecisionTreeRegressor()
run_model(DecisionTree, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_decisiontree.csv')

DecisionTreeRegressor()
train_rmsle: 0.9729001209102717
test_rmsle: 0.8223779726858051


In [98]:
Randomforest = RandomForestRegressor()
run_model(Randomforest, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_randomforest.csv')

RandomForestRegressor()
train_rmsle: 0.9372801732352212
test_rmsle: 0.8514141537035658


In [88]:
Gradient =GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200)
run_model(Gradient, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_gradientboost.csv')

GradientBoostingRegressor(min_samples_leaf=2, n_estimators=200)
train_rmsle: 0.8521063706978205
test_rmsle: 0.8390460092409688


In [89]:
Adaboost =AdaBoostRegressor()
run_model(Adaboost, train_X , train_y , test_X , test_y, test ,X, y,  'bengaluru_houseprice_adaboost.csv')

AdaBoostRegressor()
train_rmsle: 0.7539446003965307
test_rmsle: 0.7517141320151246


In [100]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [108]:
parameters = [{'n_estimators':[25,50,75,100,150,200,250,300,350],'max_depth':[1,2,3,4,5,6,7,8,9,10,None],'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,11]}]

In [109]:
%%time
model =RandomForestRegressor()
randomsearch = RandomizedSearchCV(model, parameters, verbose =1 ,cv=10, scoring ='neg_mean_absolute_error' )
randomsearch.fit(train_X, train_y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Wall time: 15min 29s


RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(),
                   param_distributions=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8,
                                                       9, 10, None],
                                         'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11],
                                         'n_estimators': [25, 50, 75, 100, 150,
                                                          200, 250, 300,
                                                          350]}],
                   scoring='neg_mean_absolute_error', verbose=1)

In [110]:
randomsearch.best_score_

-34.197086877719585

In [111]:
randomsearch.best_estimator_

RandomForestRegressor(min_samples_leaf=6, n_estimators=50)

In [112]:
train_pred = randomsearch.predict(train_X)
rmsle(train_y , train_pred)

0.8754288828841196

In [113]:
test_pred = randomsearch.predict(test_X)
rmsle(test_y , test_pred)

0.8478984448749309

In [116]:
model1 =DecisionTreeRegressor()
model2 =RandomForestRegressor()
model3 = GradientBoostingRegressor()
voting = VotingRegressor(estimators=[('dt', model1),('rf',model2),('gra',model3)])

In [117]:
run_model(voting,train_X,train_y,test_X,test_y,X,y,test,'bengaluru_houseprice_voting.csv')

VotingRegressor(estimators=[('dt', DecisionTreeRegressor()),
                            ('rf', RandomForestRegressor()),
                            ('gra', GradientBoostingRegressor())])
train_rmsle: 0.923877327629726
test_rmsle: 0.8506011055262497


ValueError: y should be a 1d array, got an array of shape (1480, 270) instead.

In [118]:
final_estimator = RandomForestRegressor()

In [119]:
estimators=[('dt', model1),('rf',model2),('gra',model3)]

In [120]:
stack =StackingRegressor(estimators=estimators, final_estimator=final_estimator)

In [121]:
run_model(stack,train_X,train_y,test_X,test_y,X,y,test,'flight_price_stack.csv')

StackingRegressor(estimators=[('dt', DecisionTreeRegressor()),
                              ('rf', RandomForestRegressor()),
                              ('gra', GradientBoostingRegressor())],
                  final_estimator=RandomForestRegressor())
train_rmsle: 0.8942732090766722
test_rmsle: 0.8430715626052587


ValueError: y should be a 1d array, got an array of shape (1480, 270) instead.

In [54]:
model=RandomForestRegressor()
model.fit(X,y)
model.predict([[1,1,0,4,1,2600,5.0,3.0,1,2,3]])



array([926.19866667])