# Project Title: Property Usage
## By Go Code CO team "Tech Slope" - Aaron, Adam, Dex, Leo, Marc
### Version 1.1
### Last update: May 20, 2019

In [0]:
### Initiate
import pandas as pd
import geopandas as gpd
import time
import numpy as np
import matplotlib as plt
from matplotlib.path import Path
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder,LabelBinarizer

from google.colab import drive
drive.mount('/content/gdrive')

Using TensorFlow backend.


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Task 1 Training with Denver Data
Main parameters: Area sqft, zoning code <br>
Result: Property subtype provided by City of Denver <br>

*Note: Although we have processed crime, population, and traffic data, it seems that the only golden features are just the 'zoning code' and 'square footage' to determine the property usage type. We might put these attributes back to the model if we have a series of well-defined subtypes to use.*

In [0]:
## Load data
workdir='gdrive/My Drive/GoCode - PropertyUsage/Data processing/'
filename=workdir+'Clean Data/DenverParcels_ML.csv'
df = pd.read_csv(filename,na_values='None',dtype={"Area":np.float64,'PropertySubtype':'category'})
residential=['CONDOMINIUM','SINGLE FAMILY','ROWHOUSE']
vacant=['VCNT LAND R-X ZONE','VCNT LAND','FINANCIAL C&R']
df=df[~df['PropertySubtype'].isin(residential)] ## remove extra types
df=df[~df['PropertySubtype'].isin(vacant)] 
df=df.dropna(axis=0,how='any')
subtypelist = df['PropertySubtype'].unique()
num_classes = subtypelist.shape[0]

## Encode and normalize inputs to 0-1
lb = LabelBinarizer()
df_Y = lb.fit_transform(df['PropertySubtype'])
lb_make = LabelEncoder()
df["Zoning_Code"] = lb_make.fit_transform(df["ZoningCode"])
df_X = df.drop(columns=['PropertySubtype','ZoningCode'])

df_X_max =df_X['Area'].max()
df_X_min =df_X['Area'].min()
df_X_diff = df_X_max-df_X_min ##27940564
df_X['AreaNormalized']=(df_X['Area']-df_X_min)/df_X_diff
df_X.drop(columns='Area',inplace=True)
df_X.tail()

Unnamed: 0,Zoning_Code,AreaNormalized
231469,27,0.00036
231490,156,0.000142
231510,92,0.000636
231522,108,0.000111
231663,65,0.000228


In [0]:
df

Unnamed: 0,PropertySubtype,ZoningCode,Area,Zoning_Code
231469,"APT LOW-RISE>9UNT, WALK-UP",C-MX-5,9890.0,27
231490,AUTO DEALER MED SHOWROOM,U-MX-3,3909.0,156
231510,WAREHOUSE,I-MX-5,17449.0,92
231522,VCNT LAND R-X ZONE,PUD,3045.0,108
231663,APT W/2 UNITS,E-SU-DX,6250.0,65


In [0]:
# Create a model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical

## Fix random seed for reproducibility
seed = 42
np.random.seed(seed)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(df_X, df_Y, test_size=0.20, random_state=seed)
X_cols=X_train.shape[1]


def MultilabelClassification_model():
### YOUR TURN
    # Create a model with 2 convolutional layers and 1 dense layers before the output
    # Use Batch Normalization and Adam optimizer
    model = Sequential()
    model.add(Dense(155, activation='relu', input_shape=(X_cols,)))
    model.add(Dense(155, activation='relu'))
    model.add(Dense(num_classes))
    model.add(BatchNormalization())
    ####model.add(Activation('softmax')) # no good result 
    ####model.compile(loss= 'categorical_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
    model.add(Activation('sigmoid'))
    model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
    return model
# build the model
model=MultilabelClassification_model()

In [0]:
# Fit the model
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=200, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("CNN Error: %.2f%%" % (100-scores[1]*100)) ## only showing the result from last execution.

Train on 21847 samples, validate on 5462 samples
Epoch 1/10
 - 2s - loss: 0.7708 - acc: 0.5354 - val_loss: 0.7493 - val_acc: 0.5560
Epoch 2/10
 - 1s - loss: 0.6990 - acc: 0.5822 - val_loss: 0.9918 - val_acc: 0.5113
Epoch 3/10
 - 1s - loss: 0.6350 - acc: 0.6456 - val_loss: 0.7242 - val_acc: 0.5814
Epoch 4/10
 - 1s - loss: 0.5773 - acc: 0.7151 - val_loss: 0.5771 - val_acc: 0.7178
Epoch 5/10
 - 1s - loss: 0.5257 - acc: 0.7660 - val_loss: 0.5222 - val_acc: 0.7871
Epoch 6/10
 - 1s - loss: 0.4798 - acc: 0.8253 - val_loss: 0.5139 - val_acc: 0.7228
Epoch 7/10
 - 1s - loss: 0.4398 - acc: 0.9463 - val_loss: 0.4191 - val_acc: 0.9454
Epoch 8/10
 - 1s - loss: 0.4047 - acc: 0.9930 - val_loss: 0.3873 - val_acc: 0.9895
Epoch 9/10
 - 1s - loss: 0.3737 - acc: 0.9933 - val_loss: 0.3619 - val_acc: 0.9910
Epoch 10/10
 - 1s - loss: 0.3464 - acc: 0.9933 - val_loss: 0.3400 - val_acc: 0.9929
CNN Error: 0.71%


In [0]:
## Point test
code=80 ## Zoning code: G-RO-3, could be vacant land.
area= 28000/df_X_diff ## 5540 sqft
test=np.array([pd.Series([code, area]),])
result=model.predict(test)
lb.inverse_transform(result)

array(['OFFICE BLDG'], dtype='<U29')

In [0]:
code= 92 ## Zoning code: I-MX-5, could be vacant land.
area= 3000/df_X_diff ## 5540 sqft
test=np.array([pd.Series([code, area]),])
result=model.predict(test)
lb.inverse_transform(result)

array(['P.I. SERVICE STATION'], dtype='<U29')

## Task 2 Prepare final data (non-residential)


In [0]:
## Match Opportunity Zones
df_mesazone = pd.read_csv('Address_Mesa_Cleaned.csv',dtype={'StreetNumber':str}) ## 49849 rows
df_mesazone['OppZoneGeoID']='0'
df_mesazone['EntZone']=False
df_mesazone['ZoningCode']='0'
df_mesazone.head()

Unnamed: 0,Latitude,Longitude,StreetNumber,StreetName,Unit,City,State,ZIP,FullAddress,OppZoneGeoID,EntZone,ZoningCode
0,38.523159,-108.889763,56500,HWY 141,,GATEWAY,CO,81522,56500 HWY 141,0,False,0
1,38.534833,-108.896495,55002,HWY 141,,GATEWAY,CO,81522,55002 HWY 141,0,False,0
2,38.535193,-108.838723,34165,S 12 8/10 RD,,WHITEWATER,CO,81527,34165 S 12 8/10 RD,0,False,0
3,38.541657,-108.567979,30200,UNCOMPAHGRE DIVIDE RD,,WHITEWATER,CO,815XX,30200 UNCOMPAHGRE DIVIDE RD,0,False,0
4,38.563368,-108.921568,52248,HWY 141,,GATEWAY,CO,81522,52248 HWY 141,0,False,0


In [0]:
for i in range(len(df_mesazone)):
    point=Point(df_mesazone['Longitude'][i],df_mesazone['Latitude'][i])
    for j in range(len(df_opp)): 
        if df_opp['geometry'][j].contains(point):
            df_mesazone['OppZoneGeoID'][i]= df_opp['GEOID'][j]
            break
##    if (i%1000==0)and(df_mesazone['OppZoneGeoID'][i]!='0'): print(i,df_mesazone['OppZoneGeoID'][i])      


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


13000 08077000800
14000 08077000800
15000 08077000900
17000 08077000200
19000 08077000800
21000 08077000800
25000 08077000200
28000 08077000800
29000 08077000700
30000 08077000500
31000 08077000800
34000 08077000500
37000 08077000400
41000 08077000400
47000 08077000400
48000 08077000400


Unnamed: 0,Latitude,Longitude,StreetNumber,StreetName,Unit,City,State,ZIP,FullAddress,OppZoneGeoID,EntZone,ZoningCode
49844,39.089047,-108.55364,1156.0,BOOKCLIFF AVE,12.0,GRAND JUNCTION,CO,81501.0,1156.0 BOOKCLIFF AVE,8077000500,False,
49845,39.087296,-108.471576,550.0,WARRIOR WAY,302.0,GRAND JUNCTION,CO,81504.0,550.0 WARRIOR WAY,0,False,
49846,39.088367,-108.521428,2864.0,PRESLEY AVE,,GRAND JUNCTION,CO,81501.0,2864.0 PRESLEY AVE,0,False,
49847,39.087289,-108.471221,550.0,WARRIOR WAY,312.0,GRAND JUNCTION,CO,81504.0,550.0 WARRIOR WAY,0,False,
49848,,-108.0,,,,,,,,0,False,


In [0]:
### Match enterprise zones:
for i in range(len(df_mesazone)):
    point=Point(df_mesazone['Longitude'][i],df_mesazone['Latitude'][i])
    for j in range(len(df_ent)): 
        if df_ent['geometry'][j].contains(point):
            df_mesazone['EntZone'][i]= True
            break
    if (i%1000==0)and(df_mesazone['EntZone'][i]==True): print(i,df_mesazone['EntZone'][i])  

In [0]:
def write_checkpoint(df, dfname):
  filename=time.strftime("%Y%m%d-%H%M")+' '+ dfname+'.csv'
  df.to_csv(filename, index=False)
  return

def read_checkpoint(filename):
    return pd.read_csv(filename,index_col=0)
  
write_checkpoint(df_mesazone,'Mesa_Zone')
## df_mesazone = pd.read_csv('Mesa_Zone.csv',dtype={"StreetNumber":str,'OppZoneGeoID':str})
## df_mesazone.head()

Unnamed: 0,Latitude,Longitude,StreetNumber,StreetName,Unit,City,State,ZIP,FullAddress,OppZoneGeoID,EntZone,ZoningCode
0,38.523159,-108.889763,56500,HWY 141,,GATEWAY,CO,81522,56500 HWY 141,0,False,0
1,38.534833,-108.896495,55002,HWY 141,,GATEWAY,CO,81522,55002 HWY 141,0,False,0
2,38.535193,-108.838723,34165,S 12 8/10 RD,,WHITEWATER,CO,81527,34165 S 12 8/10 RD,0,False,0
3,38.541657,-108.567979,30200,UNCOMPAHGRE DIVIDE RD,,WHITEWATER,CO,815XX,30200 UNCOMPAHGRE DIVIDE RD,0,False,0
4,38.563368,-108.921568,52248,HWY 141,,GATEWAY,CO,81522,52248 HWY 141,0,False,0


In [0]:
### Match zoning:
for i in range(len(df_mesazone)):
    point=Point(df_mesazone['Longitude'][i],df_mesazone['Latitude'][i])
    for j in range(len(df_zoning_gj)): 
        if df_zoning_gj['geometry'][j].contains(point):
            df_mesazone['ZoningCode'][i]= df_zoning_gj['Name'][j]
            if i%1000==0: print(i,j,df_zoning_gj['Name'][j])
            break  

In [0]:
''' Point Check
point=Point(-108.56177,39.0676422)
for j in range(len(df_zoning_gj)):  ##1005 records, 10s per point
    if df_zoning_gj['geometry'][j].contains(point):
        print(df_zoning_gj['Name'][j])
df_mesazone.iloc[1109]
'''

In [0]:
print(df_mesazone.tail(),'\n','---------','\n')
print('Shape of the current dataframe:',df_mesazone.shape,'\n')
print('Missing value counts:')
print(df_mesazone.shape[0]-df_mesazone.count())

In [0]:
## Save/Checkpoint
def write_checkpoint(df, dfname):
  filename=time.strftime("%Y%m%d-%H%M")+' '+ dfname+'.csv'
  df.to_csv(filename, index=False)
  return

def read_checkpoint(filename):
    return pd.read_csv(filename,index_col=0)
  
write_checkpoint(df_mesazone,'Mesa_Zone')
write_checkpoint(df_denzone,'Denver_Zone')
write_checkpoint(df_zone,'Both_Zone')

###### df_mesazone=read_checkpoint('20190331-0937 Mesa_Zone.csv')
###### df_mesazone.head()
