## Contributors: 
Kyle McCarver

### Importing Libraries:

In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import math
import numpy as np
from sklearn.metrics import classification_report

from amb_sdk.sdk import DarwinSdk

### Configure Darwin:

In [18]:
# Login
ds = DarwinSdk()
# Credentials are kept out of the notebook and read from login.txt.
# NOTE(review): assumes the username is on the first line and the password on
# the second line of login.txt -- confirm against your local file.
# readline() takes a maximum *size*, not a line number, so it is called without
# arguments; strip() removes the trailing newline. The `with` block closes the file.
with open("login.txt", "r") as file:
    username = file.readline().strip()
    password = file.readline().strip()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
# Pass the values read from the file, not the literal strings 'username'/'password'.
status, msg = ds.auth_login_user(username, password)
if not status:
    print(msg)
    

#### Data Path
Make sure to set this to your local machine's path to the data.

In [3]:
# Directory that the train/test CSV files are written to and later uploaded from.
path = './'

### Importing Data:
Data used in this project:
https://data.austintexas.gov/Building-and-Development/Issued-Construction-Permits/3syk-w9eu

In [4]:
# Source CSV and the names of the train / hold-out files written to `path`.
dataFile = "./Issued_Construction_Permits.csv"
filename= "train.csv"
test = 'test.csv'
# Fixed seed so the train/test split is reproducible across kernel restarts.
SEED = 42
data = pd.read_csv(dataFile, skipinitialspace=True)

#Columns with detected mixed types
mixedData_col = [52,54,56,58,59,60,61,62,63,64,65,66]

columnsNames = data.columns.values
# Important features calculated from the model in the master notebook:
#   Master Permit Num       0.116584
#   Housing Units           0.039151
#   Total Job Valuation     0.027820
#   Number Of Floors        0.025094
#   Fiscal Year Issued      0.016996
#   Calendar Year Issued    0.016561
#   Longitude               0.014342
#   Latitude                0.013554
#   Work Class = New        0.012211
# "Permit Type" (the prediction target) and "Applied Date" are kept as well.
important_features = ["Permit Type", "Master Permit Num", "Housing Units", "Total Job Valuation", "Number Of Floors",
                      "Fiscal Year Issued", "Calendar Year Issued", "Longitude", "Latitude", "Work Class", "Applied Date"]
featureDrop = [x for x in columnsNames if x not in important_features]
print(featureDrop)

fullData = data.drop(columns=featureDrop)
#data added chronologically to dataset, so rows are drawn by random sampling
#pick sample sizes (max is half the dataset due to Darwin restrictions on Big Data)
trainSize = len(fullData) // 2
testSize = len(fullData) // 10

# Draw the hold-out set first, then sample the training set only from the
# remaining rows, so no test row can leak into the training data.
testSet = fullData.sample(n=testSize, random_state=SEED)
trainData = fullData.drop(testSet.index).sample(n=trainSize, random_state=SEED)

#write out datasets to disk to upload later
# index=False keeps the pandas row index from being written as an extra
# unnamed column, which would otherwise be uploaded as a feature.
testSet.to_csv(os.path.join(path, test), index=False)
trainData.to_csv(os.path.join(path, filename), index=False)

#show data / completed write to disk
trainData.head()

  interactivity=interactivity, compiler=compiler, result=result)


['Permit Type Desc', 'Permit Num', 'Permit Class Mapped', 'Permit Class', 'Condominium', 'Project Name', 'Description', 'TCAD ID', 'Property Legal Description', 'Issued Date', 'Day Issued', 'Issued In Last 30 Days', 'Issuance Method', 'Status Current', 'Status Date', 'Expires Date', 'Completed Date', 'Total Existing Bldg SQFT', 'Remodel Repair SQFT', 'Total New Add SQFT', 'Total Valuation Remodel', 'Building Valuation', 'Building Valuation Remodel', 'Electrical Valuation', 'Electrical Valuation Remodel', 'Mechanical Valuation', 'Mechanical Valuation Remodel', 'Plumbing Valuation', 'Plumbing Valuation Remodel', 'MedGas Valuation', 'MedGas Valuation Remodel', 'Original Address 1', 'Original City', 'Original State', 'Original Zip', 'Council District', 'Jurisdiction', 'Link', 'Project ID', 'Location', 'Contractor Trade', 'Contractor Company Name', 'Contractor Full Name', 'Contractor Phone', 'Contractor Address 1', 'Contractor Address 2', 'Contractor City', 'Contractor Zip', 'Applicant Full

Unnamed: 0,Permit Type,Work Class,Applied Date,Calendar Year Issued,Fiscal Year Issued,Total Job Valuation,Number Of Floors,Housing Units,Master Permit Num,Latitude,Longitude
1557453,EP,New,2002/04/30,2002,2002,470247.0,2.0,2.0,690668.0,30.364092,-97.780764
431488,EP,Wall,2016/04/18,2016,2016,,,,11515070.0,30.357097,-97.731317
1221236,PP,Remodel,1987/09/09,1987,1987,10000.0,,1.0,752648.0,30.365089,-97.705924
1257171,DS,,1990/11/28,1990,1991,108150.0,,,597288.0,30.350383,-97.612943
1143311,PP,New,1998/09/04,1998,1998,890000.0,3.0,24.0,657332.0,30.231502,-97.837432


This notebook attempts to reduce the overall dataset by keeping only the features Darwin deems most important. Those features were calculated from the model in the master notebook and are listed above in `important_features`. Every column not in that list is removed (Permit Type, the prediction target, is kept as well), and the training and testing sets are then sampled from the reduced data.

### Upload to Darwin

In [16]:
# Upload the training CSV from `path` to Darwin.
# When the call reports failure, the second return value is printed
# (presumably an error message -- confirm against the SDK docs).
status, dataset = ds.upload_dataset(os.path.join(path, filename))
if not status:
    print(dataset)

#### Clean dataset

In [19]:
# clean dataset
# `target` is the column the model will predict; it is reused by later cells.
target = "Permit Type"
# NOTE(review): `index` is assigned here but not passed to clean_data in this cell.
index = "Applied Date"
status, job_id = ds.clean_data(filename, target = target)
print(job_id)
# On success, wait for the cleaning job to finish; otherwise print the response again.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'job_name': '0172e09334684adc97d60b2c787afd34', 'artifact_name': '36df201c91364e05ac9a812cd9e1a16b'}
{'status': 'Requested', 'starttime': '2019-04-22T22:25:39.295079', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['36df201c91364e05ac9a812cd9e1a16b'], 'model_name': None, 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T22:25:39.295079', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['36df201c91364e05ac9a812cd9e1a16b'], 'model_name': None, 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T22:25:39.295079', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['36df201c91364e05ac9a812cd9e1a16b'], 'model_name': None, 'job_error': None}
{'st

In [20]:
# Wait again on the most recently submitted job (whatever `job_id` currently holds).
ds.wait_for_job(job_id['job_name'])

{'status': 'Complete', 'starttime': '2019-04-22T22:25:39.295079', 'endtime': '2019-04-22T22:28:05.827868', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['36df201c91364e05ac9a812cd9e1a16b'], 'model_name': None, 'job_error': ''}


(True, 'Job completed')

### Create and Train Model

In [21]:
# Name the model after the target column; later cells refer to it via `model`.
model = target + "_model"
# Arguments inside parentheses continue across lines without backslashes.
status, job_id = ds.create_model(
    dataset_names=filename,
    model_name=model,
    max_train_time='00:01',
    max_epochs=0,
)
# Report the response on failure; otherwise wait for the training job.
if not status:
    print(job_id)
else:
    ds.wait_for_job(job_id['job_name'])

{'status': 'Requested', 'starttime': '2019-04-22T22:32:08.957833', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T22:32:08.957833', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T22:32:08.957833', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T22:32:08.957833', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_n

#### Check status of job

In [14]:
# Re-check the most recent job: relies on `status` and `job_id` left in the
# kernel by whichever cell ran last, so run this directly after submitting a job.
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Complete', 'starttime': '2019-04-22T22:15:38.563651', 'endtime': '2019-04-22T22:18:31.048386', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': None, 'artifact_names': None, 'model_name': None, 'job_error': ''}


### Analyze Model

In [None]:
# Retrieve feature importance of built model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Download only when the analysis job was accepted; on failure `artifact`
    # holds the error response and has no usable 'artifact_name'.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
    if not status:
        print(feature_importance)
else:
    print(artifact)


Display most important features of the model.

In [None]:
# Show the first ten entries of the downloaded feature-importance artifact.
feature_importance[:10]

### Predictions

#### Perform model prediction on the training dataset.

In [None]:
# Run the trained model on the training dataset.
status, artifact = ds.run_model(filename, model)
sleep(1)
# Check the status like the other job cells do, so a failed request prints
# its response instead of failing on the 'job_name' lookup.
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)

Download predictions from Darwin.

In [None]:


# Download the artifact produced by the last run_model call and preview it.
status, prediction = ds.download_artifact(artifact['artifact_name'])
prediction.head()

#### Download predictions

In [None]:
# Save the downloaded predictions next to the other data files in `path`.
prediction.to_csv(os.path.join(path, "prediction10.csv"))

Create visualizations for comparing predictions with actual target. 

In [None]:
# Compare predictions with the actual target on the training data.
# `trainData` is the frame that was written to train.csv and run through the
# model; the previously referenced `reduceData` is not defined in this notebook.
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
# Encode each class label as its position in `unq` so both series can be plotted
# on one numeric axis. Actual labels that never occur in the predictions keep the
# initial value 0.
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(trainData[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(trainData[target], prediction[target]))

#### Perform model prediction on test dataset from holdout method.
Upload test data.

In [None]:

# Upload the hold-out CSV. Its file name is stored in `test` by the
# data-import cell; `test_data` is not defined anywhere in this notebook.
status, dataset = ds.upload_dataset(os.path.join(path, test))
if not status:
    print(dataset)

Clean test data. 

In [None]:
# Clean the hold-out dataset for the trained model. The file name is stored in
# `test` by the data-import cell; `test_data` is not defined in this notebook.
status, job_id = ds.clean_data(test, target = target, model_name = model)
print("Model:\n",model)
print("Target: \n",target)
print(job_id)
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

Run model on test dataset. 

In [None]:
# Run the trained model on the hold-out dataset. The file name is stored in
# `test` by the data-import cell; `test_data` is not defined in this notebook.
status, artifact = ds.run_model(test, model)
sleep(1)
# Check the status like the other job cells do, so a failed request prints
# its response instead of failing on the 'job_name' lookup.
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)

Create visualizations for comparing predictions with actual target.

In [None]:
# Download the hold-out predictions and load the matching ground truth from disk.
status, prediction = ds.download_artifact(artifact['artifact_name'])
# The hold-out file name is stored in `test` by the data-import cell;
# `test_data` is not defined in this notebook.
df = pd.read_csv(os.path.join(path, test))
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
# Encode each class label as its position in `unq` so both series can be plotted
# on one numeric axis. Actual labels that never occur in the predictions keep the
# initial value 0.
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(df[target], prediction[target]))

### Darwin's Pick for Machine Learning Model

In [None]:
# Look up the trained model's metadata and print the 'best_genome' entry of its
# description (presumably the architecture Darwin selected -- confirm against the SDK docs).
status, model_type = ds.lookup_model_name(model)
print(model_type['description']['best_genome'])

## Clean up

In [15]:
# Remove datasets, models and artifacts from Darwin.
# NOTE(review): the delete_all_* names suggest this wipes everything on the
# account, not just what this notebook created -- confirm before running.
ds.delete_all_datasets()
ds.delete_all_models()
ds.delete_all_artifacts()

Deleting longlat_test.csv
Error removing dataset "longlat_test.csv" - 403: FORBIDDEN - {"message": "Dataset is in use by an active job"}

Deleting Permit Type_model_loc1
Error removing model "Permit Type_model_loc1" - 403: FORBIDDEN - {"message": "Model is in use by an active job"}

Deleting 0cad448fa6db4e20845ede356d44edd5
Error removing artifact "0cad448fa6db4e20845ede356d44edd5" - 404: NOT FOUND - {"message": "Failed to find artifact 0cad448fa6db4e20845ede356d44edd5"}

Deleting f2d6110053354e5b80ece5cdc3668918
Error removing artifact "f2d6110053354e5b80ece5cdc3668918" - 404: NOT FOUND - {"message": "Failed to find artifact f2d6110053354e5b80ece5cdc3668918"}

Deleting 7a66ffae7ce6480897a4548c8491a2be
Error removing artifact "7a66ffae7ce6480897a4548c8491a2be" - 404: NOT FOUND - {"message": "Failed to find artifact 7a66ffae7ce6480897a4548c8491a2be"}

Deleting 32cbe86816554c8b9c905cf81e59e145
Error removing artifact "32cbe86816554c8b9c905cf81e59e145" - 404: NOT FOUND - {"message": "Fail

(True, None)