In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

### Import training data set

In [50]:
# Locate train.csv. The directory can be overridden with the HOUSE_PRICE_DATA_DIR
# environment variable so the notebook is not tied to a single machine; when the
# variable is unset the original location is used.
train_csv_path = os.path.join(
    os.environ.get('HOUSE_PRICE_DATA_DIR', r'C:\Users\nikhi\OneDrive\Documents'),
    'train.csv',
)
train_df = pd.read_csv(train_csv_path)
train_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5
...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0


### Import testing data set

In [51]:
# Locate test.csv. The directory can be overridden with the HOUSE_PRICE_DATA_DIR
# environment variable so the notebook is not tied to a single machine; when the
# variable is unset the original location is used.
test_csv_path = os.path.join(
    os.environ.get('HOUSE_PRICE_DATA_DIR', r'C:\Users\nikhi\OneDrive\Documents'),
    'test.csv',
)
test_df = pd.read_csv(test_csv_path)
test_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,Owner,0,0,1,BHK,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700
1,Dealer,1,1,2,BHK,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600
4,Owner,0,0,1,BHK,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000
...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,BHK,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330
68716,Dealer,0,1,3,BHK,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082
68717,Dealer,1,1,1,BHK,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231
68718,Dealer,0,0,2,BHK,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670


### Drop unrelated columns, which don't affect the house price, from train.csv

In [52]:
# Discard every column except SQUARE_FT, LONGITUDE, LATITUDE and the target price.
# drop() returns a new frame, so train_df itself is left untouched.
cleaned_train_df = train_df.drop(
    columns=[
        'UNDER_CONSTRUCTION',
        'RERA',
        'BHK_NO.',
        'POSTED_BY',
        'READY_TO_MOVE',
        'RESALE',
        'BHK_OR_RK',
        'ADDRESS',
    ]
)
cleaned_train_df

Unnamed: 0,SQUARE_FT,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,1300.236407,12.969910,77.597960,55.0
1,1275.000000,12.274538,76.644605,51.0
2,933.159722,12.778033,77.632191,43.0
3,929.921143,28.642300,77.344500,62.5
4,999.009247,22.592200,88.484911,60.5
...,...,...,...,...
29446,2500.000000,27.140626,78.043277,45.0
29447,769.230769,39.945409,-86.150721,16.0
29448,1022.641509,26.928785,75.828002,27.1
29449,927.079009,12.900150,80.227910,67.0


### Drop unrelated columns, which don't affect the house price, from test.csv

In [54]:
# Apply the same column removal as for the training frame, leaving only
# SQUARE_FT, LONGITUDE and LATITUDE. test_df itself is not modified.
cleaned_test_df = test_df.drop(
    columns=[
        "UNDER_CONSTRUCTION",
        "RERA",
        "BHK_NO.",
        "POSTED_BY",
        "READY_TO_MOVE",
        "RESALE",
        "BHK_OR_RK",
        "ADDRESS",
    ]
)
cleaned_test_df

Unnamed: 0,SQUARE_FT,LONGITUDE,LATITUDE
0,545.171340,21.262000,73.047700
1,800.000000,18.966114,73.148278
2,1257.096513,22.592200,88.484911
3,1400.329489,26.988300,75.584600
4,430.477830,22.700000,72.870000
...,...,...,...
68715,856.555505,19.180000,72.963330
68716,2304.147465,30.661104,76.746082
68717,33362.792750,19.222101,72.988231
68718,1173.708920,18.496670,73.941670


### Separate the independent variables into x_train and the dependent variable into y_train

In [55]:
# Raw ndarray view of the cleaned training frame (kept for any later use).
train_df_values = cleaned_train_df.values

# Pick features and target by column name rather than by position, so a change in
# the CSV column order cannot silently swap a feature with the target.
x_train = cleaned_train_df[['SQUARE_FT', 'LONGITUDE', 'LATITUDE']].values
y_train = cleaned_train_df['TARGET(PRICE_IN_LACS)'].values

### Also insert the test data into x_test

In [56]:
# Raw ndarray view of the cleaned test frame (kept for any later use).
test_df_values = cleaned_test_df.values

# Select the features by name, in the same order used for training, instead of
# relying on the positional layout of the frame.
x_test = cleaned_test_df[['SQUARE_FT', 'LONGITUDE', 'LATITUDE']].values

In [43]:
# Sanity check: (number of test rows, number of features).
np.shape(x_test)

(68720, 3)

In [30]:
# Sanity check: feature-matrix shape and target shape for the training data.
(np.shape(x_train), np.shape(y_train))

((29451, 3), (29451,))

In [40]:
# Turn the target into a single-column 2-D array of shape (n_samples, 1).
# Safe to re-run: reshaping an (n, 1) array to (-1, 1) leaves it unchanged.
y_train = np.reshape(y_train, (-1, 1))

In [41]:
# Re-check the shapes after the target was reshaped into a column vector.
(np.shape(x_train), np.shape(y_train))

((29451, 3), (29451, 1))

# <font color = 'green'>Linear Regression</font>

In [45]:
# Ordinary least-squares model fit on the three training features.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)

In [46]:
# In-sample predictions: the model is asked about the same rows it was fit on.
y_pred = lin_reg.predict(X=x_train)
y_pred

array([[165.27640901],
       [168.75673551],
       [165.77701834],
       ...,
       [124.11445148],
       [161.81812806],
       [124.37895431]])

### Calculating the Root Mean Squared Error and $R^2$ score on the training data

In [47]:
# Root mean squared error of the current y_pred against the training targets.
# This is a training-set error, not an estimate of performance on unseen data.
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
rmse

600.8428789425808

In [49]:
# Coefficient of determination of the current y_pred on the training targets.
r2 = r2_score(y_true=y_train, y_pred=y_pred)
r2

0.16331195702385093

### Adding the predicted linear regression values to the end of the train data set

In [58]:
# Append the in-sample linear-regression predictions as a new column.
# The prediction array is flattened to 1-D so it maps onto a single column.
# Note: this mutates train_df in place.
train_df['Predicted_price_linear_regression'] = np.ravel(y_pred)

In [60]:
# Display the training frame, which now carries the linear-regression prediction column.
train_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),Predicted_price_linear_regression
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0,165.276409
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0,168.756736
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0,165.777018
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5,116.664320
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5,120.203198
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0,120.606009
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0,306.666888
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1,124.114451
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0,161.818128


### Now predicting with the testing data (x_test)

In [61]:
# Linear-regression price predictions for the test rows (no true prices are available for them).
y_pred_test = lin_reg.predict(X=x_test)

In [62]:
# Inspect the linear-regression predictions for the test rows.
y_pred_test

array([[145.5641077 ],
       [152.6258571 ],
       [120.23907213],
       ...,
       [156.573724  ],
       [153.04951982],
       [152.70931321]])

### Adding the predicted linear regression values to the end of the test data set

In [63]:
# Append the linear-regression predictions for the test rows as a new column.
# The prediction array is flattened to 1-D so it maps onto a single column.
# Note: this mutates test_df in place.
test_df['Predicted_price_linear_regression'] = np.ravel(y_pred_test)

In [64]:
# Display the test frame, which now carries the linear-regression prediction column.
test_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,Predicted_price_linear_regression
0,Owner,0,0,1,BHK,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700,145.564108
1,Dealer,1,1,2,BHK,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278,152.625857
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911,120.239072
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600,124.316629
4,Owner,0,0,1,BHK,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000,141.305375
...,...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,BHK,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330,152.221091
68716,Dealer,0,1,3,BHK,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082,111.379749
68717,Dealer,1,1,1,BHK,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231,156.573724
68718,Dealer,0,0,2,BHK,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670,153.049520


# <font color = 'green'>Decision Tree Regressor</font>

In [65]:
# Fully grown regression tree (no depth or leaf-size limit is set).
# random_state is fixed so that repeated runs build the same tree; the value
# matches the seed used for the random forest in this notebook.
decision_tree = DecisionTreeRegressor(random_state=7)
decision_tree.fit(x_train,y_train)

In [77]:
# In-sample predictions of the decision tree; this overwrites the earlier y_pred.
y_pred = decision_tree.predict(X=x_train)
y_pred

array([55. , 51. , 43. , ..., 27.1, 67. , 27.8])

### Calculating the Root Mean Squared Error and $R^2$ score on the training data

In [73]:
# Root mean squared error of the current y_pred against the training targets.
# The model is scored on the rows it was fit on, so a low value here does not
# show that it generalizes.
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
rmse

8.715009786368816

In [74]:
# Training-set coefficient of determination for the current y_pred.
r2 = r2_score(y_true=y_train, y_pred=y_pred)
r2

0.9998239737309261

### Adding the prediction values at the end of the train data set to compare the predictions

In [78]:
# Store the current y_pred (the decision tree's in-sample predictions) as a new
# column so it can be compared with the true prices. Mutates train_df in place.
train_df['Predicted_price_decision_tree'] = y_pred

In [79]:
# Display the training frame, which now also carries the decision-tree prediction column.
train_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),Predicted_price_linear_regression,Predicted_price_decision_tree
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0,165.276409,55.0
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0,168.756736,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0,165.777018,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5,116.664320,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5,120.203198,60.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0,120.606009,45.0
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0,306.666888,16.0
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1,124.114451,27.1
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0,161.818128,67.0


### Now predicting on the testing data 

In [72]:
# Decision-tree price predictions for the test rows; this overwrites the earlier y_pred_test.
y_pred_test = decision_tree.predict(X=x_test)
y_pred_test

array([  13.5,   81. ,   72. , ..., 8610. ,   84.4,  250. ])

### Adding the prediction values at the end of the test data set to compare the predictions

In [80]:
# Store the current y_pred_test (the decision tree's test predictions) as a new
# column. Mutates test_df in place.
test_df['Predicted_price_decision_tree'] = y_pred_test

In [81]:
# Display the test frame, which now also carries the decision-tree prediction column.
test_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,Predicted_price_linear_regression,Predicted_price_decision_tree
0,Owner,0,0,1,BHK,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700,145.564108,13.5
1,Dealer,1,1,2,BHK,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278,152.625857,81.0
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911,120.239072,72.0
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600,124.316629,35.0
4,Owner,0,0,1,BHK,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000,141.305375,19.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,BHK,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330,152.221091,69.0
68716,Dealer,0,1,3,BHK,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082,111.379749,120.0
68717,Dealer,1,1,1,BHK,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231,156.573724,8610.0
68718,Dealer,0,0,2,BHK,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670,153.049520,84.4


# <font color = 'green'>Random Forest Regressor</font>

In [88]:
# Ensemble of 200 trees; random_state is fixed so repeated runs give the same forest.
rand_forest = RandomForestRegressor(n_estimators = 200,random_state = 7)
# y_train is reshaped to a column vector of shape (n, 1) earlier in this notebook.
# The forest expects a 1-D target and emits a DataConversionWarning for a column
# vector, so flatten it here. np.ravel is a no-op if y_train is already 1-D.
rand_forest.fit(x_train, np.ravel(y_train))

In [89]:
# In-sample predictions of the random forest; this overwrites the earlier y_pred.
y_pred = rand_forest.predict(X=x_train)
y_pred

array([56.63  , 53.7825, 44.698 , ..., 27.3325, 59.959 , 27.695 ])

### Calculating the Root Mean Squared Error and $R^2$ score on the training data

In [90]:
# Root mean squared error of the current y_pred against the training targets.
# This is a training-set error, not an estimate of performance on unseen data.
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred))
rmse

84.26212462570449

In [91]:
# Training-set coefficient of determination for the current y_pred.
r2 = r2_score(y_true=y_train, y_pred=y_pred)
r2

0.9835446722540621

In [92]:
# Store the current y_pred (the random forest's in-sample predictions) as a new
# column so it can be compared with the true prices. Mutates train_df in place.
train_df['Predicted_price_random_forest'] = y_pred

In [93]:
# Display the training frame, which now also carries the random-forest prediction column.
train_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS),Predicted_price_linear_regression,Predicted_price_decision_tree,Predicted_price_random_forest
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0,165.276409,55.0,56.63000
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0,168.756736,51.0,53.78250
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0,165.777018,43.0,44.69800
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5,116.664320,62.5,58.63600
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5,120.203198,60.5,54.02435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0,120.606009,45.0,73.79850
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0,306.666888,16.0,25.24900
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1,124.114451,27.1,27.33250
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0,161.818128,67.0,59.95900


In [94]:
# Random-forest price predictions for the test rows; this overwrites the earlier y_pred_test.
y_pred_test = rand_forest.predict(X=x_test)
y_pred_test

array([  27.444  ,   74.9805 ,   65.72   , ..., 8140.47925,   92.056  ,
        677.45   ])

In [95]:
# Store the current y_pred_test (the random forest's test predictions) as a new column.
# The column name uses the same 'Predicted_price_*' capitalisation as every other
# prediction column in train_df and test_df. Mutates test_df in place.
test_df['Predicted_price_random_forest'] = y_pred_test

In [96]:
# Display the test frame, which now also carries the random-forest prediction column.
test_df

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,Predicted_price_linear_regression,Predicted_price_decision_tree,predicted_price_random_forest
0,Owner,0,0,1,BHK,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700,145.564108,13.5,27.444000
1,Dealer,1,1,2,BHK,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278,152.625857,81.0,74.980500
2,Dealer,0,0,2,BHK,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911,120.239072,72.0,65.720000
3,Dealer,0,0,3,BHK,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600,124.316629,35.0,35.216333
4,Owner,0,0,1,BHK,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000,141.305375,19.2,14.493000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68715,Dealer,0,1,2,BHK,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330,152.221091,69.0,78.087500
68716,Dealer,0,1,3,BHK,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082,111.379749,120.0,116.632000
68717,Dealer,1,1,1,BHK,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231,156.573724,8610.0,8140.479250
68718,Dealer,0,0,2,BHK,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670,153.049520,84.4,92.056000
