## Contributors: 
Kyle McCarver

### Importing Libraries:

In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import math
import numpy as np
from sklearn.metrics import classification_report

from amb_sdk.sdk import DarwinSdk

### Configure Darwin:

In [2]:
# Login
ds = DarwinSdk()
file = open("login.txt", "r")
username = file.readline(0)
password = file.readline(1)
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
status, msg = ds.auth_login_user('username', 'password')
if not status:
    print(msg)
    

#### Data Path
Make sure to set this to your local machine's path to the data.

In [3]:
path = './'

### Importing Data:
Data used in this project:
https://data.austintexas.gov/Building-and-Development/Issued-Construction-Permits/3syk-w9eu

In [4]:
dataFile = "./Issued_Construction_Permits.csv"
filename= "train.csv"
test = 'test.csv'
data = pd.read_csv(dataFile, skipinitialspace=True)

#Columns with detected mixed types
mixedData_col = [52,54,56,58,59,60,61,62,63,64,65,66]

columnsNames = data.columns.values
#excess labels included in feature drop
#Darwin doesn't like mulitple date fields either so they must be dropped as well

'''
Master Permit Num       0.116584
Housing Units           0.039151
Total Job Valuation     0.027820
Number Of Floors        0.025094
Fiscal Year Issued      0.016996
Calendar Year Issued    0.016561
Longitude               0.014342
Latitude                0.013554
Work Class = New        0.012211'''
important_features = ["Permit Type", "Master Permit Num", "Housing Units", "Total Job Valuation", "Number Of Floors", 
                      "Fiscal Year Issued", "Calendar Year Issued", "Longitude", "Latitude", "Work Class"]
featureDrop = [x for x in columnsNames if x not in important_features]
print(featureDrop)

#add mixed datatype col to feature drop until later date of processing
print("Removed columns from dataset:")
for col in mixedData_col:
    print(columnsNames[col])
    featureDrop.append(columnsNames[col])
    
fullData = data.drop(featureDrop, axis=1)
#data added chronologically to dataset, for now reduce by half for random sampling
#pick sample sizes (max is half the dataset due to Darwin restrictions on Big Data)
trainSize = math.floor(len(fullData)/2)
testSize = math.floor(len(fullData)/10)

#sample train and test sets, note currently using .sample() performs without replacement on each instance, meaning
#there might exist overlap between the two sets
testSet = fullData.sample(n=testSize)
trainData = fullData.sample(n=trainSize)

#write out datasets to disk to upload later
testSet.to_csv(os.path.join(path, test))
trainData.to_csv(os.path.join(path, filename))

#show data / completed write to disk
trainData.head()

  interactivity=interactivity, compiler=compiler, result=result)


['Permit Type Desc', 'Permit Num', 'Permit Class Mapped', 'Permit Class', 'Condominium', 'Project Name', 'Description', 'TCAD ID', 'Property Legal Description', 'Applied Date', 'Issued Date', 'Day Issued', 'Issued In Last 30 Days', 'Issuance Method', 'Status Current', 'Status Date', 'Expires Date', 'Completed Date', 'Total Existing Bldg SQFT', 'Remodel Repair SQFT', 'Total New Add SQFT', 'Total Valuation Remodel', 'Building Valuation', 'Building Valuation Remodel', 'Electrical Valuation', 'Electrical Valuation Remodel', 'Mechanical Valuation', 'Mechanical Valuation Remodel', 'Plumbing Valuation', 'Plumbing Valuation Remodel', 'MedGas Valuation', 'MedGas Valuation Remodel', 'Original Address 1', 'Original City', 'Original State', 'Original Zip', 'Council District', 'Jurisdiction', 'Link', 'Project ID', 'Location', 'Contractor Trade', 'Contractor Company Name', 'Contractor Full Name', 'Contractor Phone', 'Contractor Address 1', 'Contractor Address 2', 'Contractor City', 'Contractor Zip',

Unnamed: 0,Permit Type,Work Class,Calendar Year Issued,Fiscal Year Issued,Total Job Valuation,Number Of Floors,Housing Units,Master Permit Num,Latitude,Longitude
1046582,PP,Remodel,1985,1985,2000.0,,1.0,555854.0,30.37166,-97.729674
1173939,PP,,2002,2003,60000.0,,,,30.178747,-97.757743
1099587,PP,Remodel,1992,1992,3100.0,,1.0,3300364.0,30.28987,-97.722097
99612,EP,Remodel,2010,2011,,0.0,0.0,,30.342565,-97.757923
1890364,BP,Remodel,2006,2007,2000.0,,,,30.265644,-97.780056


In this dataset we will attempt to use the Permit Type as the class label for the data and because of that we need to remove some columns that also act as labels in that capacity that might skew the results. Columns that are similar to Permit type are:

    Permit Type Description (too hard to process due to variablility of descriptions, but also typically contains label name)
    Permit Num (since it contain the type in the code)
    Permit Class (functions much like a label)
    Contractor Trade (plummers typically take plumbing jobs, electricians take electrician jobs thus could act as a label)
    Project ID (applies to series of permits stored in folder in database, unsurprisingly permits are put in similar folders)

Tangental Job indicators: (subject to testing and feature engineering)

    *Plumbing Valuation
    *Plumbing Valuation Remodel
    *Electrical Valuation
    *Electrical Valuation Remodel
    *Mechanical Valuation
    *Mechanical Valuation Remodel 
    *MedGas Valuation
    *MedGas Valuation Remodel

It might be interesting to note that a renovation job might include plumbing costs and the difference being the cost threshold which decides if its specifically a plumbing job.

Interesting sidenote our data qualifies as big data under Darwin's algorithms (>500MB in size) as such not only can we sample
at most half our dataset but also the need to remove addional columns before uploading including those above.

### Upload to Darwin

In [5]:
status, dataset = ds.upload_dataset(os.path.join(path, filename))
if not status:
    print(dataset)

#### Clean dataset

In [6]:
# clean dataset
target = "Permit Type"
index = "Applied Date"
status, job_id = ds.clean_data(filename, target = target)
print(job_id)
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'job_name': 'c590a799e1904cc096b69b5b2ca5b306', 'artifact_name': 'b004e3e2bbb644c0ae040001d30f76a0'}
{'status': 'Requested', 'starttime': '2019-04-22T20:56:06.139236', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['b004e3e2bbb644c0ae040001d30f76a0'], 'model_name': None, 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-22T20:56:06.139236', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['b004e3e2bbb644c0ae040001d30f76a0'], 'model_name': None, 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-22T20:56:06.139236', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['b004e3e2bbb644c0ae040001d30f76a0'], 'model_name': None, 'job_error': ''}
{'status': '

In [7]:
ds.wait_for_job(job_id['job_name'])

{'status': 'Complete', 'starttime': '2019-04-22T20:56:06.139236', 'endtime': '2019-04-22T20:57:57.839087', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['train.csv'], 'artifact_names': ['b004e3e2bbb644c0ae040001d30f76a0'], 'model_name': None, 'job_error': ''}


(True, 'Job completed')

### Create and Train Model

In [13]:
model = target + "_model"
status, job_id = ds.create_model(dataset_names = filename, \
                                 model_name =  model, \
                                 max_train_time = '00:01', \
                                 max_epochs=0)
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Requested', 'starttime': '2019-04-22T21:00:57.817859', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T21:00:57.817859', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T21:00:57.817859', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': None}
{'status': 'Requested', 'starttime': '2019-04-22T21:00:57.817859', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_n

#### Check status of job

In [14]:
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Failed', 'starttime': '2019-04-22T21:00:57.817859', 'endtime': '2019-04-22T21:07:08.444792', 'percent_complete': 100, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['train.csv'], 'artifact_names': None, 'model_name': 'Permit Type_model', 'job_error': 'DarwinInternalError: uncaught'}


### Analyze Model

In [10]:
# Retrieve feature importance of built model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
else:
    print(artifact)
status, feature_importance = ds.download_artifact(artifact['artifact_name'])


404: NOT FOUND - {"message": "Failed to find a dataset that the model was trained on. You have requested this URI [/v1/analyze/model/Permit Type_model] but did you mean /v1/analyze/model/<model_name> or /v1/analyze/model/predictions/<model_name>/<dataset_name> ?"}



TypeError: string indices must be integers

Display most important features of the model.

In [None]:
feature_importance[:10]

### Predictions

#### Perform model prediction on the training dataset.

In [None]:
status, artifact = ds.run_model(filename, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

Download predictions from Darwin.

In [None]:


status, prediction = ds.download_artifact(artifact['artifact_name'])
prediction.head()

#### Download predictions

In [None]:
prediction.to_csv(os.path.join(path, "prediction10.csv"))

Create visualizations for comparing predictions with actual target. 

In [None]:
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(reduceData[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(reduceData[target], prediction[target]))

#### Perform model prediction on test dataset from holdout method.
Upload test data.

In [None]:

status, dataset = ds.upload_dataset(os.path.join(path, test_data))
if not status:
    print(dataset)

Clean test data. 

In [None]:
status, job_id = ds.clean_data(test_data, target = target, model_name = model)
print("Model:\n",model)
print("Target: \n",target)
print(job_id)
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

Run model on test dataset. 

In [None]:
status, artifact = ds.run_model(test_data, model)
sleep(1)
ds.wait_for_job(artifact['job_name'])

Create visualizations for comparing predictions with actual target.

In [None]:
status, prediction = ds.download_artifact(artifact['artifact_name'])
df = pd.read_csv(os.path.join(path,test_data))
unq = prediction[target].unique()[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
plt.yticks([i for i in range(len(unq))],[q for q in unq]);
print(classification_report(df[target], prediction[target]))

### Darwin' Pick for machine learning model

In [None]:
status, model_type = ds.lookup_model_name(model)
print(model_type['description']['best_genome'])

## Clean up

In [12]:
#ds.delete_all_datasets()
ds.delete_all_models()
#ds.delete_all_artifacts()

Deleting Permit Type_model


(True, None)