
<h2>Predicting house prices in Georgia</h2>

<h3>Importing libraries and dataset</h3>

<h4>Importing libraries</h4>


In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor



Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Using cached xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
Using cached nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.1


In [2]:

# Read the Georgia real-estate listings from the CSV export (the path is relative
# to the notebook's working directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="archive/RealEstate_Georgia.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,stateId,countyId,cityId,country,datePostedString,is_bankOwned,is_forAuction,event,...,parking,garageSpaces,hasGarage,levels,pool,spa,isNewConstruction,hasPetsAllowed,homeType,county
0,0,31503-110785431,16,17,55064,USA,2021-07-12,0,0,Listed for sale,...,0,0.0,0,0,0,0,0,0,SINGLE_FAMILY,Brantley County
1,1,31503-76611082,16,18,55064,USA,2021-07-12,0,0,Listed for sale,...,0,0.0,0,0,0,0,0,0,SINGLE_FAMILY,Ware County
2,2,31503-93126153,16,19,55064,USA,2021-07-10,0,0,Listed for sale,...,1,0.0,0,0,0,0,0,0,SINGLE_FAMILY,Ware County
3,3,31503-110785598,16,20,55064,USA,2021-07-09,0,0,Listed for sale,...,0,0.0,0,0,0,0,0,0,SINGLE_FAMILY,Brantley County
4,4,31503-2101070583,16,21,55064,USA,2021-07-06,0,0,Listed for sale,...,0,0.0,0,0,0,0,0,0,LOT,Ware County


- View the dataset's features using the pandas `.columns` attribute.

In [3]:
# List every column label in the raw frame to see which features are available.
data.columns

Index(['Unnamed: 0', 'id', 'stateId', 'countyId', 'cityId', 'country',
       'datePostedString', 'is_bankOwned', 'is_forAuction', 'event', 'time',
       'price', 'pricePerSquareFoot', 'city', 'state', 'yearBuilt',
       'streetAddress', 'zipcode', 'longitude', 'latitude', 'hasBadGeocode',
       'description', 'currency', 'livingArea', 'livingAreaValue',
       'lotAreaUnits', 'bathrooms', 'bedrooms', 'buildingArea', 'parking',
       'garageSpaces', 'hasGarage', 'levels', 'pool', 'spa',
       'isNewConstruction', 'hasPetsAllowed', 'homeType', 'county'],
      dtype='object')

- The target variable is `price`.

<h4>Data Cleaning</h4>

- First, remove the columns that add little value for predicting the target variable, `price` — or, equivalently, select only the columns that are needed.

In [4]:
# (rows, columns) of the raw dataset, recorded as a baseline before any columns are removed.
data.shape

(13804, 39)

In [5]:
# Work on an independent deep copy so that later edits to `trimmed_data` do not
# touch the raw `data` frame, then confirm the copy has the same dimensions.
trimmed_data = data.copy(deep=True)
trimmed_data.shape

(13804, 39)

In [6]:
# Alternative approach (disabled): remove the unwanted columns with DataFrame.drop.
# The call is wrapped in a string literal so it never executes; because that string
# is the cell's last expression, the notebook simply echoes it as the cell output.
"""
trimmed_data.drop(columns=[], axis=1,inplace=True)
"""


'\ntrimmed_data.drop(columns=[], axis=1,inplace=True)\n'

In [7]:
# Keep only the target and the candidate predictor columns; every other column
# of the frame is discarded. Column order below is the order of the result.
trimmed_data = trimmed_data[
    [
        # target and price-derived value
        "price",
        "pricePerSquareFoot",
        # location / address
        "city",
        "state",
        "yearBuilt",
        "streetAddress",
        # size of the property
        "livingArea",
        "livingAreaValue",
        "lotAreaUnits",
        "bathrooms",
        "bedrooms",
        "buildingArea",
        # parking and amenities
        "parking",
        "garageSpaces",
        "hasGarage",
        "pool",
        "isNewConstruction",
        "hasPetsAllowed",
        "county",
    ]
]
# Display the remaining column labels to verify the selection.
trimmed_data.columns

Index(['price', 'pricePerSquareFoot', 'city', 'state', 'yearBuilt',
       'streetAddress', 'livingArea', 'livingAreaValue', 'lotAreaUnits',
       'bathrooms', 'bedrooms', 'buildingArea', 'parking', 'garageSpaces',
       'hasGarage', 'pool', 'isNewConstruction', 'hasPetsAllowed', 'county'],
      dtype='object')

- We've dropped 20 of the original 39 features, leaving 19 columns (including the target, `price`).

In [8]:
# Compare dimensions and total cell counts: trimmed frame first, then the original.
(
    trimmed_data.shape,
    trimmed_data.size,
    data.shape,
    data.size,
)

((13804, 19), 262276, (13804, 39), 538356)

<h4>Simple summary statistics</h4>


In [9]:
# Print each remaining column's non-null count and dtype to spot missing values
# and to see which columns are non-numeric (object) and will need encoding.
trimmed_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13804 entries, 0 to 13803
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               13804 non-null  float64
 1   pricePerSquareFoot  13804 non-null  float64
 2   city                13804 non-null  object 
 3   state               13804 non-null  object 
 4   yearBuilt           13804 non-null  int64  
 5   streetAddress       13804 non-null  object 
 6   livingArea          13804 non-null  float64
 7   livingAreaValue     13804 non-null  float64
 8   lotAreaUnits        13804 non-null  object 
 9   bathrooms           13804 non-null  float64
 10  bedrooms            13804 non-null  float64
 11  buildingArea        13804 non-null  float64
 12  parking             13804 non-null  int64  
 13  garageSpaces        13804 non-null  float64
 14  hasGarage           13804 non-null  int64  
 15  pool                13804 non-null  int64  
 16  isNe

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0 in every column and a
# yearBuilt maximum of 9999 — presumably placeholder values; confirm before modeling.
trimmed_data.describe()

Unnamed: 0,price,pricePerSquareFoot,yearBuilt,livingArea,livingAreaValue,bathrooms,bedrooms,buildingArea,parking,garageSpaces,hasGarage,pool,isNewConstruction,hasPetsAllowed
count,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0,13804.0
mean,367097.3,116.78173,1347.435961,1858.872,1858.872,1.878079,2.348667,1176.98993,0.460301,0.488409,0.354825,0.054405,0.045784,0.003839
std,647834.6,1748.198773,928.113617,43074.42,43074.42,1.886242,2.076541,1813.717292,0.49844,0.929491,0.478477,0.226822,0.209024,0.061847
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,95000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,247000.0,95.0,1965.0,1280.0,1280.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,425000.0,161.0,1999.0,2299.25,2299.25,3.0,4.0,1996.25,1.0,0.0,1.0,0.0,0.0,0.0
max,30504000.0,205000.0,9999.0,5057316.0,5057316.0,89.0,89.0,87120.0,1.0,8.0,1.0,1.0,1.0,1.0


In [11]:
#Missing values