In [1]:
# House Price Prediction

# By adding the most used regressions, we will continue the project with the regression that makes the best prediction among them.


In [2]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
     ---------------------------------------- 99.7/99.7 MB 2.4 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0


In [6]:
# python library
import numpy as np
import pandas as pd 

# ML library
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor


In [7]:
# close warning library
from warnings import filterwarnings
filterwarnings("ignore")

In [8]:
df = pd.read_csv("C:/Users/HAZAL/OneDrive/Masaüstü/Projeler/house_price_prediction/kc_house_data.csv")

In [9]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [11]:
df.shape

(21613, 21)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [13]:
df = df.dropna()

In [16]:
df.drop(["id","date"], axis = 1, inplace = True)

In [21]:
def compML(df, y, alg):
    
    y = df.price.values.reshape(-1,1)
    x = df.drop(["price"], axis=1)
    
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    y = scaler.fit_transform(y)
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, shuffle=True)
    
    model = alg().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    r2 = r2_score(y_test, y_pred)
    
    model_ismi = alg.__name__
    
    print(model_ismi, "R2_Score ----> ", r2)
    

In [22]:
models = [LinearRegression,
         DecisionTreeRegressor,
         KNeighborsRegressor,
         MLPRegressor,
         RandomForestRegressor,
         GradientBoostingRegressor,
         SVR,
         XGBRegressor]

In [23]:
for i in models:
    compML(df, "price", i)

LinearRegression R2_Score ---->  0.6951648030401463
DecisionTreeRegressor R2_Score ---->  0.7784758697709763
KNeighborsRegressor R2_Score ---->  0.80341482854232
MLPRegressor R2_Score ---->  0.8154091411121364
RandomForestRegressor R2_Score ---->  0.8916822668051372
GradientBoostingRegressor R2_Score ---->  0.8664202852022378
SVR R2_Score ---->  0.29606313030184794
XGBRegressor R2_Score ---->  0.8952819395370255
