In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import multivariate_normal as mvn

import warnings
warnings.filterwarnings("ignore")

import geopy
from geopy.geocoders import Nominatim

In [42]:
# read the data file
ncdata = pd.read_csv("~/Desktop/Work/ML_EIT/Data/raw_house_data.csv")
ncdata.head()
ncdata.shape

(5000, 16)

In [3]:
# Drop the 'MLS", kitchen_features', 'floor_covering'
ncdata = ncdata.drop(columns = ['MLS', 'kitchen_features', 'floor_covering',"fireplaces","HOA"])
# Data shape (number of rwos and columns)
print(ncdata.shape)

(5000, 11)


In [4]:
# Replacing any special characters to NaN values in the entire data frame
ncdata = ncdata.replace(r'^\s*$',np.nan, regex = True)
# Replace "none' to "NaN" values in the data frame

col_none = list(ncdata.columns)
ncdata = ncdata[col_none].replace(['None'],'NaN')
ncdata.replace(0., np.nan, inplace= True)

In [5]:
# convert columns: object to float  
cols = ['sqrt_ft', 'garage', 'bathrooms']
ncdata[cols] = ncdata[cols].astype('float')
ncdata.dtypes

sold_price    float64
zipcode         int64
longitude     float64
latitude      float64
lot_acres     float64
taxes         float64
year_built      int64
bedrooms        int64
bathrooms     float64
sqrt_ft       float64
garage        float64
dtype: object

In [6]:
# filling the Nan values
ncdata['sqrt_ft'] = ncdata['sqrt_ft'].fillna(ncdata['sqrt_ft'].mode()[0])
# calc the sldprice/sqft
sqft = ncdata['sqrt_ft']
sldprice = ncdata["sold_price"]
pr_sqft = sldprice/sqft
print(pr_sqft[0:10])
ncdata.shape

0     504.761905
1     575.342466
2    1186.105620
3     498.946668
4     533.372420
5     475.007308
6     199.584200
7     280.237642
8     706.376480
9     501.543210
dtype: float64


(5000, 11)

In [7]:
# Replace Nan to mean value (important)
# ncdata.fillna(ncdata.mean())
ncdata.fillna(ncdata.median(), inplace=True)

In [8]:
#calc the price category
def prcat(val):
    return(val//100)

price_catgry = prcat(pr_sqft)
print(min(price_catgry),max(price_catgry))

0.0 12.0


In [9]:
print(price_catgry[0:100])

0      5.0
1      5.0
2     11.0
3      4.0
4      5.0
      ... 
95     3.0
96     3.0
97     5.0
98     5.0
99     3.0
Length: 100, dtype: float64


In [10]:
np.count_nonzero(price_catgry)

4973

In [11]:
ncdata["pr_sqft"] = pr_sqft
ncdata["price_catgry"] = price_catgry

ncdata.info()
ncdata.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sold_price    5000 non-null   float64
 1   zipcode       5000 non-null   int64  
 2   longitude     5000 non-null   float64
 3   latitude      5000 non-null   float64
 4   lot_acres     5000 non-null   float64
 5   taxes         5000 non-null   float64
 6   year_built    5000 non-null   int64  
 7   bedrooms      5000 non-null   int64  
 8   bathrooms     5000 non-null   float64
 9   sqrt_ft       5000 non-null   float64
 10  garage        5000 non-null   float64
 11  pr_sqft       5000 non-null   float64
 12  price_catgry  5000 non-null   float64
dtypes: float64(10), int64(3)
memory usage: 507.9 KB


(5000, 13)

In [12]:
# Select the downtown area
indx = np.where(((ncdata.latitude >= 32.0) & (ncdata.latitude < 32.7)) &((ncdata.longitude > -111.5) & (ncdata.longitude < -110.5)))[0]
print(len(indx))

4726


In [13]:
sp=ncdata.sold_price[indx]; zipp=ncdata.zipcode[indx]
long=ncdata.longitude[indx]; latt=ncdata.latitude[indx]
la=ncdata.lot_acres[indx]; tx=ncdata.taxes[indx]
yb=ncdata.year_built[indx]; br=ncdata.bedrooms[indx]
bath=ncdata.bathrooms[indx]; sft=ncdata.sqrt_ft[indx]
ga=ncdata.garage[indx]; prs=ncdata.pr_sqft[indx]
prca=ncdata.price_catgry[indx]

long.shape, latt.shape

dup = np.stack((sp,zipp,long,latt,la,tx,yb,br,bath,sft,ga,prs,prca),axis=1)
dup= np.array(dup)
print(dup.shape)

# creating DataFrame
cdata = pd.DataFrame(data=dup,columns =["sold_price","zipcode","longitude",
                                     "latitude","lot_acres","taxes","year_built",
                                    "bedrooms","bathrooms","sqrt_ft","garage",
                                       "pr_sqft","price_catgry"])

print(cdata.shape)
cdata.columns

(4726, 13)
(4726, 13)


Index(['sold_price', 'zipcode', 'longitude', 'latitude', 'lot_acres', 'taxes',
       'year_built', 'bedrooms', 'bathrooms', 'sqrt_ft', 'garage', 'pr_sqft',
       'price_catgry'],
      dtype='object')

In [14]:
colls = ['longitude','latitude']
features = ncdata[colls].to_numpy()
prices = ncdata["price_catgry"].to_numpy(dtype='int64')

# features = cdata[colls].to_numpy()
# prices = cdata["price_catgry"].to_numpy(dtype='int64')
print(prices)

[ 5  5 11 ...  2  1  1]


In [15]:
# without using sklearn function
X_train = features[:4200,:]
X_test= features[4200:,:]
y_train = prices[:4200]
y_test = prices[4200:]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4200, 2) (800, 2) (4200,) (800,)


## KNN Classifier

In [16]:
class KNNClassifier():
    def fit(self,X,y):
        self.X = X
        self.y = y
        
    def predict(self, X, K, epsilon=1e-3):
        N = len(X)
        y_hat = np.zeros(N)
        
        for i in range(N):
            dist2 = np.sum((self.X - X[i])**2,axis=1)
            idxt = np.argsort(dist2)[:K]
            gamma_k = 1/(np.sqrt(dist2[idxt])+epsilon)
            y_hat[i] = np.bincount(self.y[idxt], weights = gamma_k).argmax()
            
        return y_hat

In [17]:
knn = KNNClassifier()
knn.fit(X_train,y_train)

In [18]:
y_hat = knn.predict(X_test,10)

In [19]:
def accuracy(y,y_hat):
    return np.mean(y_test == y_hat)

In [20]:
accuracy(y_test,y_hat)

0.65125

## KNN Regression

In [21]:
ncdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sold_price    5000 non-null   float64
 1   zipcode       5000 non-null   int64  
 2   longitude     5000 non-null   float64
 3   latitude      5000 non-null   float64
 4   lot_acres     5000 non-null   float64
 5   taxes         5000 non-null   float64
 6   year_built    5000 non-null   int64  
 7   bedrooms      5000 non-null   int64  
 8   bathrooms     5000 non-null   float64
 9   sqrt_ft       5000 non-null   float64
 10  garage        5000 non-null   float64
 11  pr_sqft       5000 non-null   float64
 12  price_catgry  5000 non-null   float64
dtypes: float64(10), int64(3)
memory usage: 507.9 KB


In [22]:
##Normalize/rever normalize
# Normalize entire data frame except long and latitude columns: Panda
def normalize(dataset):
    dataNorm=((dataset-dataset.min())/(dataset.max()-dataset.min()))
    print("Minimum: ",dataset.min())
    dataNorm["longitude"] = dataset["longitude"]
    dataNorm["latitude"] = dataset["latitude"]
    return dataNorm

def normm (sprice,dataset):
    return ((sprice-dataset["sold_price"].min())/(dataset["sold_price"].max()-dataset["sold_price"].min()))

def revnormm(ndata,dataset):
    return ((ndata)*(dataset["sold_price"].max()-dataset["sold_price"].min())) + dataset["sold_price"].min()

In [23]:
ncdata1 =normalize(ncdata)
print(ncdata1.shape)
ncdata1.sample(5)

Minimum:  sold_price      169000.000000
zipcode          85118.000000
longitude         -112.520168
latitude            31.356362
lot_acres            0.020000
taxes                1.000000
year_built           0.000000
bedrooms             1.000000
bathrooms            1.000000
sqrt_ft           1100.000000
garage               0.000000
pr_sqft             24.544805
price_catgry         0.000000
dtype: float64
(5000, 13)


Unnamed: 0,sold_price,zipcode,longitude,latitude,lot_acres,taxes,year_built,bedrooms,bathrooms,sqrt_ft,garage,pr_sqft,price_catgry
2077,0.108361,0.522822,-110.750101,32.215913,0.002006,0.000459,0.978702,0.114286,0.085714,0.157218,0.066667,0.116893,0.083333
4341,0.074644,0.524481,-110.851864,32.292703,0.000427,0.000451,0.983655,0.028571,0.057143,0.080298,0.066667,0.14515,0.083333
1105,0.128825,0.419917,-110.738728,31.611187,0.036751,0.000553,0.988113,0.085714,0.057143,0.160691,0.066667,0.134248,0.083333
2000,0.106412,0.528631,-110.977216,32.438041,0.000232,0.00053,0.994056,0.057143,0.085714,0.092313,0.083333,0.176199,0.166667
2036,0.096667,0.517842,-111.026538,32.360568,0.000167,0.000597,0.987618,0.085714,0.085714,0.106251,0.066667,0.146256,0.083333


In [24]:
# normalize price only
# prices3 = ncdata["sold_price"]
# prices2 = (normm(prices3,ncdata))
# print(prices2)


In [25]:
colls1 = ['longitude','latitude','bedrooms','bathrooms','sqrt_ft','price_catgry']
# colls1 = ['bedrooms','bathrooms','sqrt_ft','price_catgry']
features1 = ncdata1[colls1].to_numpy()
prices1 = ncdata1["sold_price"].to_numpy(dtype='int64')
# prices2 = ncdata1["sold_price"]*20.
# price2 = prices2.to_numpy(dtype='int64')
print(prices1)

# test
datt = features1[:,2]
print(datt[0:10])

[1 0 0 ... 0 0 0]
[0.34285714 0.02857143 0.02857143 0.17142857 0.08571429 0.05714286
 0.22857143 0.14285714 0.11428571 0.11428571]


In [26]:
# without using sklearn function
X_train1 = features1[:4000,:]
X_test1= features1[4000:,:]
y_train1 = prices1[:4000]
y_test1 = prices1[4000:]

print(X_train1.shape, X_test1.shape, y_train1.shape, y_test1.shape)

(4000, 6) (1000, 6) (4000,) (1000,)


In [27]:
class KNNRegressor():
    def fit(self,X,y):
        self.X = X
        self.y = y
        
    def predict(self,X, K, epsilon=1e-03):
        N = len(X)
        y_hat = np.zeros(N)
        
        for i in range(N):
            dist2 = np.sum((self.X-X[i])**2,axis=1)
            idxt = np.argsort(dist2)[:K]
            gamma_k = 1/(np.sqrt(dist2[idxt]) + epsilon)
            y_hat[i] = gamma_k.dot(self.y[idxt])/gamma_k.sum() #traget value with the height weight
             
        return y_hat

In [28]:
# class KNNRegII():
#     def fit(self,X,y):
#         self.X = X
#         self.y = y
        
#     def predict(self,X, K, epsilon=1e-03):
#         N = len(X)
#         y_hat = np.zeros(N)
        
#         for i in range(N):
#             dist2 = np.sum((self.X-X[i])**2,axis=1)
#             idxt = np.argsort(dist2)[:K]
#             gamma_k = np.exp(-dist2[idxt])/(np.exp(-dist2[idxt]).sum()+epsilon)
#             y_hat[i] = gamma_k.dot(self.y[idxt])/gamma_k.sum() 
#         return y_hat
       

In [29]:
knnreg = KNNRegressor()

In [30]:
knnreg.fit(X_train1,y_train1)

In [31]:
y_hat1 = knnreg.predict(X_test1,300, epsilon=1e-3)

In [32]:
def accuracy(y,y_hat):
    return np.mean(y == y_hat1)

In [33]:
accuracy(y_test1,y_hat1)

0.994

## Single Home Price locator

In [41]:
geolocator=Nominatim(user_agent='myapp')
# from geopy.geocoders import Nominatim
address="10441 E Port Townsend St Tucson, AZ 85747"
# address="4650, West Lone Ridge Place, Marana, Pima County, Arizona, 85658"

# find the lon/lat using address
mlocation = geolocator.geocode(address)
mlat, mlon = mlocation.latitude, mlocation.longitude
# print(mlocation.latitude, mlocation.longitude)

bedrms= 4.0
bathrooms= 2.0
sqrft=1873.0

xknnclass = np.array([[mlon,mlat]])
prcat = knn.predict(xknnclass,10)
# print(prcat)

def normm (sprice,dataset):
    return ((sprice-dataset["bedrooms"].min())/(dataset["bedrooms"].max()-dataset["bedrooms"].min()))
bedr = normm(bedrms,ncdata1)

def normm (sprice,dataset):
    return ((sprice-dataset["bathrooms"].min())/(dataset["bathrooms"].max()-dataset["bathrooms"].min()))
bathr = normm(bathrooms,ncdata1)

def normm (sprice,dataset):
    return ((sprice-dataset["sqrt_ft"].min())/(dataset["sqrt_ft"].max()-dataset["sqrt_ft"].min()))
sqrt = normm(sqrft,ncdata1)
print(bedr,bathr,sqrt)

def normm (sprice,dataset):
    return ((sprice-dataset["price_catgry"].min())/(dataset["price_catgry"].max()-dataset["sqrt_ft"].min()))
sqrt = normm(prcat,ncdata1)
# print(bedr,bathr,sqrt,prcat)

# c_data =np.array([[mlon,mlat,bedr,bathr,sqrt,prcat]])
c_data =[mlon,mlat,bedr,bathr,np.asscalar(sqrt),np.asscalar(prcat)]
# print('Client values: ',c_data)

# c_data =np.array([[-110.77, 32.099, 0.167,32.09959666666667,mlat,bedr,bathr,normm]])

# val = knnreg.predict(c_data,100, epsilon=1e-3)
val = knnreg.predict(c_data,20)
# print("Predicted normalized price: ",val)

def revnormm(ndata,dataset):
    return ((ndata)*(dataset["sold_price"].max()-dataset["sold_price"].min())) + dataset["sold_price"].min()
prc = revnormm(val,ncdata1)
# print("Estimated Price: ",prc)


4.0 2.0 1873.0
