## Contributors: 
Neal Friesenhahn

### Importing Libraries:

In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import math
import numpy as np
from sklearn.metrics import classification_report

from amb_sdk.sdk import DarwinSdk

### Configure Darwin:

In [2]:
# Login
# Credentials are read from login.txt (assumed layout: username on the first
# line, password on the second) so they are never hardcoded in the notebook.
ds = DarwinSdk()
with open("login.txt", "r") as file:
    # readline() takes a maximum character count, not a line index, so it is
    # called without arguments; strip() removes the trailing newline.
    username = file.readline().strip()
    password = file.readline().strip()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
# Pass the variables read above, not the literal strings 'username'/'password'.
status, msg = ds.auth_login_user(username, password)
if not status:
    print(msg)
    

#### Data Path
Make sure to set this to your local machine's path to the data.

In [3]:
# Base directory for the CSV files; combined with file names via os.path.join
# in the cells that write and upload datasets.
path = './'

### Importing Data:
Data used in this project:
https://data.austintexas.gov/Building-and-Development/Issued-Construction-Permits/3syk-w9eu

In [6]:
# Raw permits export and the name of the reduced file written for upload.
dataFile = "./Issued_Construction_Permits.csv"
filename= "test.csv"
data = pd.read_csv(dataFile, skipinitialspace=True)

#Columns with detected mixed types (positional indices into data.columns)
mixedData_col = [52,54,56,58,59,60,61,62,63,64,65,66]

columnsNames = data.columns.values
#excess labels included in feature drop
featureDrop = ['Permit Type Desc', 'Permit Num', 'Permit Class', 'Contractor Trade']
#add mixed datatype col to feature drop until later date of processing
print("Removed columns from dataset:")
for col in mixedData_col:
    print(columnsNames[col])
    featureDrop.append(columnsNames[col])

reduceData = data.drop(featureDrop, axis=1)
# Data is appended chronologically; for now keep only the first half of the
# rows. NOTE(review): head() is NOT a random sample -- it keeps the earliest
# half. Switch to DataFrame.sample(...) if a random subset is intended.
reduceSize = len(reduceData) // 2
reduceData = reduceData.head(reduceSize)
# index=False keeps the positional row index out of the file; otherwise it is
# written as an unnamed first column and ends up in the uploaded dataset as
# an extra feature.
reduceData.to_csv(os.path.join(path, filename), index=False)
reduceData.head()

  interactivity=interactivity, compiler=compiler, result=result)


Removed columns from dataset:
Contractor Company Name
Contractor Phone
Contractor Address 2
Contractor Zip
Applicant Full Name
Applicant Organization
Applicant Phone
Applicant Address 1
Applicant Address 2
Applicant City
Applicant Zip
Certificate Of Occupancy


Unnamed: 0,Permit Type,Permit Class Mapped,Work Class,Condominium,Project Name,Description,TCAD ID,Property Legal Description,Applied Date,Issued Date,...,Link,Project ID,Master Permit Num,Latitude,Longitude,Location,Contractor Full Name,Contractor Address 1,Contractor City,Total Lot SQFT
0,EP,Residential,Addition,No,1407 ALAMEDA DR,expand exst basement by 550 sq ft entirely wit...,301020315,LOT 30 BLK 3 TRAVIS HEIGHTS PLUS 12 ADJ VAC ALLEY,2007/03/13,2009/10/26,...,https://abc.austintexas.gov/web/permit/public-...,10012479,10012477.0,30.247589,-97.742178,"(30.24758944, -97.74217752)",Lucy Taus Katz,,Austin,
1,MP,Residential,Addition,No,1407 ALAMEDA DR,expand exst basement by 550 sq ft entirely wit...,301020315,LOT 30 BLK 3 TRAVIS HEIGHTS PLUS 12 ADJ VAC ALLEY,2007/03/13,2009/10/15,...,https://abc.austintexas.gov/web/permit/public-...,10012480,10012477.0,30.247589,-97.742178,"(30.24758944, -97.74217752)",Lucy Taus Katz,,Austin,
2,MP,Commercial,Remodel,No,1524 S IH 35 SVRD SB UNIT 125,interior remodel to existing office Staples ...,302030941,TRT A TEAGUEBUDA SUBD NO 1,2007/03/21,2009/06/18,...,https://abc.austintexas.gov/web/permit/public-...,10015275,10015274.0,30.243101,-97.736862,"(30.24310053, -97.73686219)",Darron Duggins,450 LITTLE EL PASO RD,Martindale,
3,MP,Residential,New,No,2104 Wychwood Drive A 00000,2 Stry Frm Res WMas Ven And Att Gar,105170437,LOT 8 BLK E BEECAVE WOODS SEC 3,1982/01/15,1982/01/15,...,https://abc.austintexas.gov/web/permit/public-...,3175681,513615.0,30.262058,-97.807113,"(30.26205758, -97.80711332)",,,,
4,EP,Commercial,Addition,No,1201 BARBARA JORDAN BLVD BLDG 13,ADDITION CHIPOTLE MEXICAN GRILL,221130406,LOT 2A BLK E MUELLER RETAIL PHS A,2007/03/29,2009/07/20,...,https://abc.austintexas.gov/web/permit/public-...,10020014,10020013.0,30.306353,-97.709165,"(30.30635266, -97.70916489)",Larry Cheatwood,5331 MCCULLOUGH,San Antonio,


In this dataset we will attempt to use Permit Type as the class label. Because of that, we need to remove columns that effectively act as labels themselves and might skew the results. Columns that are similar to Permit Type are:

    Permit Type Desc (the textual description of the permit type)
    Permit Num (since it contains the type in the code)
    Permit Class (functions much like a label)
    Contractor Trade (plumbers typically take plumbing jobs and electricians take electrical jobs, so it could act as a label)

Tangential job indicators: (subject to testing and feature engineering)

    *Plumbing Valuation
    *Plumbing Valuation Remodel
    *Electrical Valuation
    *Electrical Valuation Remodel
    *Mechanical Valuation
    *Mechanical Valuation Remodel 
    *MedGas Valuation
    *MedGas Valuation Remodel

It is worth noting that a renovation job might include plumbing costs; the difference may be a cost threshold that decides whether it is classified specifically as a plumbing job.

### Upload to Darwin

In [7]:
# Send the reduced CSV to Darwin; on failure `dataset` holds what the SDK
# returned instead, so print it.
upload_path = os.path.join(path, filename)
status, dataset = ds.upload_dataset(upload_path)
if not status:
    print(dataset)

#### Clean dataset

In [9]:
# Ask Darwin to clean the uploaded dataset, using "Permit Type" as the label.
target = "Permit Type"
status, job_id = ds.clean_data(filename, target = target)

# On failure print what the SDK returned; otherwise block until the job is done.
if not status:
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

{'status': 'Requested', 'starttime': '2019-04-16T22:11:27.501866', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanData', 'loss': None, 'generations': None, 'dataset_names': ['test.csv'], 'artifact_names': ['d1fd00f5a2264aba80b97e1ad975972b'], 'model_name': None, 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-16T22:11:27.501866', 'endtime': None, 'percent_complete': 10, 'job_type': 'CleanData', 'loss': None, 'generations': None, 'dataset_names': ['test.csv'], 'artifact_names': ['d1fd00f5a2264aba80b97e1ad975972b'], 'model_name': None, 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-16T22:11:27.501866', 'endtime': None, 'percent_complete': 10, 'job_type': 'CleanData', 'loss': None, 'generations': None, 'dataset_names': ['test.csv'], 'artifact_names': ['d1fd00f5a2264aba80b97e1ad975972b'], 'model_name': None, 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-16T22:11:27.501866', 'endtime': None, 'percent_complete': 10, 'job_type': 'Clea

### Create and Train Model

In [None]:
# The model name is derived from the target column name.
model = target + "_model1"
# Request training on the cleaned dataset with max_train_time '00:30'.
status, job_id = ds.create_model(
    dataset_names = filename,
    model_name = model,
    max_train_time = '00:30',
)
# On failure print what the SDK returned; otherwise wait for training to finish.
if not status:
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

### Analyze Model

In [None]:
# Retrieve feature importance of built model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Download only after a successful analysis: on failure `artifact` holds
    # the error returned by the SDK, not a job record with an 'artifact_name'.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
    if not status:
        # The download itself failed; show what the SDK returned.
        print(feature_importance)
else:
    print(artifact)


Display most important features of the model.

In [None]:
# Show the first ten entries of the downloaded feature-importance artifact.
feature_importance[:10]

### Predictions

#### Perform model prediction on the training dataset.

In [None]:
# Run the trained model on the training dataset.
status, artifact = ds.run_model(filename, model)
sleep(1)
# Check the status like the other job cells do: only a successful request
# yields a job record that can be waited on.
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)

Download predictions from Darwin.

In [None]:
# Download the artifact referenced by the most recent run_model call.
status, prediction = ds.download_artifact(artifact['artifact_name'])
# Preview the first rows (presumably a pandas DataFrame -- TODO confirm).
prediction.head()

Create visualizations for comparing predictions with actual target. 

In [None]:
# Load the ground truth for the training run. Without this, `df` is undefined
# on a fresh kernel (it was only assigned in the later test-set cell), and on a
# stale kernel it would silently refer to the test data instead.
df = pd.read_csv(os.path.join(path, filename))

# Map every predicted class to an integer code so both series can be plotted
# on one numeric axis.
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    # Boolean mask * code adds `i` at the rows whose label equals `q`.
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
# Label the integer codes with the class names they stand for.
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(df[target], prediction[target]))

#### Perform model prediction on test dataset from holdout method.
Upload test data.

In [None]:
# Hold-out file used to evaluate the model on unseen rows.
test_data = 'Permits_test.csv'
test_upload_path = os.path.join(path, test_data)
status, dataset = ds.upload_dataset(test_upload_path)
# On failure `dataset` holds what the SDK returned instead; print it.
if not status:
    print(dataset)

Clean test data. 

In [None]:
# Clean the hold-out data, passing the trained model's name to the cleaning job.
status, job_id = ds.clean_data(test_data, target = target, model_name = model)

# On failure print what the SDK returned; otherwise block until the job is done.
if not status:
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

Run model on test dataset. 

In [None]:
# Run the trained model on the hold-out dataset.
status, artifact = ds.run_model(test_data, model)
sleep(1)
# Check the status like the other job cells do: only a successful request
# yields a job record that can be waited on.
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)

Create visualizations for comparing predictions with actual target.

In [None]:
# Fetch the hold-out predictions and load the matching ground-truth file.
status, prediction = ds.download_artifact(artifact['artifact_name'])
test_csv = os.path.join(path, test_data)
df = pd.read_csv(test_csv)

# Assign each predicted class an integer code (reversed order of unique())
# so predicted and actual labels share one numeric axis.
unq = prediction[target].unique()[::-1]
n_rows = len(prediction)
p = np.zeros(n_rows)
a = np.zeros(n_rows)
for code, label in enumerate(unq):
    predicted_hits = (prediction[target] == label).values
    actual_hits = (df[target] == label).values
    p = p + code * predicted_hits
    a = a + code * actual_hits

#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
# Tick positions are the integer codes, tick labels the class names.
plt.yticks(list(range(len(unq))), list(unq));
print(classification_report(df[target], prediction[target]))

### Darwin's pick for machine learning model

In [None]:
# Look up the trained model's record and report the entry Darwin marked as best.
status, model_type = ds.lookup_model_name(model)
if status:
    print(model_type['description']['best'])
else:
    # On failure `model_type` holds what the SDK returned instead of the
    # model record, so show it rather than indexing into it.
    print(model_type)