### Importing Libraries:

In [2]:
# Import necessary libraries
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import numpy as np
from sklearn.metrics import classification_report

from amb_sdk.sdk import DarwinSdk

### Configure Darwin:

In [11]:
# Login
# Assumes login.txt holds the username on its first line and the password on
# its second line -- TODO confirm against your local file.
ds = DarwinSdk()
with open("login.txt", "r") as login_file:
    # readline(n) caps the read at n characters, so readline(0) returned ''
    # and readline(1) a single character. Read whole lines and drop the
    # trailing line terminator instead.
    username = login_file.readline().rstrip("\r\n")
    password = login_file.readline().rstrip("\r\n")
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
# Pass the values read from the file, not the literal strings 'username'/'password'.
status, msg = ds.auth_login_user(username, password)
if not status:
    print(msg)
    

#### Data Path
Make sure to set this to your local machine's path to the data.

In [3]:
# Directory containing the CSV files used below; change this to the local data folder.
path = "/"

### Importing Data:
Data used in this project:
https://data.austintexas.gov/Building-and-Development/Issued-Construction-Permits/3syk-w9eu

In [12]:
filename = "Issued_Construction_Permits.csv"
# Load the permits CSV. This read has to run: the statements below use `data`,
# which is undefined on a fresh kernel while the read is commented out.
# The file is located via `path`, the same way the upload cell locates it.
data = pd.read_csv(os.path.join(path, filename), skipinitialspace=True)

# The first column is taken as the class label.
labels = data.iloc[:, 0]
# Drop the columns that act as stand-ins for the label (see the notes below).
raw_data = data.drop(columns=['Permit Type Desc', 'Permit Num', 'Permit Class', 'Contractor Trade'])

# Only the last expression of a cell is rendered, so display the first frame explicitly.
display(raw_data.head())
labels.head()

# Optional diagnostic: print the names of the columns pandas reported as mixed-type.
# The positional indices are specific to the CSV version they were collected from.
#mixed_data_col = [52,54,56,58,59,60,61,62,63,64,65,66]

#columnsNames = data.columns.values
#for col in mixed_data_col:
#    print(columnsNames[col])

Contractor Company Name
Contractor Phone
Contractor Address 2
Contractor Zip
Applicant Full Name
Applicant Organization
Applicant Phone
Applicant Address 1
Applicant Address 2
Applicant City
Applicant Zip
Certificate Of Occupancy


In this dataset we will attempt to use the Permit Type as the class label. Because of that, we need to remove the columns that effectively act as labels themselves and would otherwise skew the results. Columns that are similar to Permit Type are:


Permit Type Description

Permit Num (since it contain the type in the code)

Permit Class (functions much like a label)

Contractor Trade (plumbers typically take plumbing jobs and electricians take electrical jobs, so this column could act as a label)

Tangential Job indicators: (subject to testing and feature engineering)

    *Plumbing Valuation
    *Plumbing Valuation Remodel
    *Electrical Valuation
    *Electrical Valuation Remodel
    *Mechanical Valuation
    *Mechanical Valuation Remodel 
    *MedGas Valuation
    *MedGas Valuation Remodel

It might be interesting to note that a renovation job might include plumbing costs, with the difference being the cost threshold that decides whether it's specifically a plumbing job.

Mixed data types detected by Pandas (need to be cleaned prior to sending to Darwin)
    
    
    Contractor Company Name (string)
    Contractor Phone (convert to int?)
    Contractor Address 2 (String)
    Contractor Zip (int?)
    Applicant Full Name (String)
    Applicant Organization (String)
    Applicant Phone (int?)
    Applicant Address 1 (String)
    Applicant Address 2 (String)
    Applicant City (String, though all for Austin)
    Applicant Zip (int?)
    Certificate Of Occupancy


### Upload to Darwin

In [None]:
# Upload the training CSV located under `path` to Darwin.
dataset_path = os.path.join(path, filename)
status, dataset = ds.upload_dataset(dataset_path)
if not status:
    # On failure the second return value is printed as the diagnostic.
    print(dataset)

#### Clean dataset

In [None]:
# clean dataset
target = "Permit Type"
# Refer to the uploaded dataset by its file name, the same way the test-data
# cell calls clean_data, rather than passing the local `raw_data` DataFrame.
# NOTE(review): the uploaded file is the original CSV, so the columns dropped
# locally into `raw_data` are still present on the Darwin side -- confirm
# whether a reduced CSV should be uploaded instead.
status, job_id = ds.clean_data(filename, target = target)

if status:
    # Block until the cleaning job finishes.
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

### Create and Train Model

In [None]:
# Name the model after the target column and train it on the uploaded dataset.
model = target + "_model0"
# `dataset_name` was never defined in this notebook; the dataset is referred to
# by its uploaded file name, as in the test-data cells.
status, job_id = ds.create_model(dataset_names = filename,
                                 model_name = model,
                                 max_train_time = '00:30')
if status:
    # Block until the training job finishes.
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

### Analyze Model

In [None]:
# Retrieve feature importance of built model
status, artifact = ds.analyze_model(model)
sleep(1)
if status:
    ds.wait_for_job(artifact['job_name'])
    # Download only after a successful analysis: on failure `artifact` holds the
    # error payload, and indexing it for 'artifact_name' would fail.
    status, feature_importance = ds.download_artifact(artifact['artifact_name'])
    if not status:
        print(feature_importance)
else:
    print(artifact)


Display most important features of the model.

In [None]:
# Show the first ten entries of the feature-importance artifact downloaded above.
feature_importance[:10]

### Predictions

#### Perform model prediction on the training dataset.

In [None]:
# Run the trained model on the training dataset. `dataset_name` was never
# defined in this notebook; the dataset is referred to by its uploaded file name.
status, artifact = ds.run_model(filename, model)
sleep(1)
if status:
    # Block until the prediction job finishes.
    ds.wait_for_job(artifact['job_name'])
else:
    # On failure `artifact` is the error payload; print it like the other cells do.
    print(artifact)

Download predictions from Darwin.

In [None]:
# Fetch the artifact produced by the run_model job and preview its first rows.
artifact_name = artifact['artifact_name']
status, prediction = ds.download_artifact(artifact_name)
prediction.head()

Create visualizations for comparing predictions with actual target. 

In [None]:
# Load the training data locally so the predictions can be compared with the
# actual target; `df` was otherwise not defined until the test-set section.
df = pd.read_csv(os.path.join(path, filename))
# Encode over the union of predicted and actual classes. A class that occurs
# only in the actual column would otherwise keep code 0 and be drawn as if it
# were a different class.
unq = pd.unique(pd.concat([prediction[target], df[target]], ignore_index=True))[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    # Each class q contributes its integer code i at the rows where it occurs.
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
# Label each integer code on the y axis with its class name.
plt.yticks(list(range(len(unq))), list(unq))
print(classification_report(df[target], prediction[target]))

#### Perform model prediction on test dataset from holdout method.
Upload test data.

In [None]:
# Upload the hold-out CSV located under `path` to Darwin.
test_data = 'Permits_test.csv'
test_path = os.path.join(path, test_data)
status, dataset = ds.upload_dataset(test_path)
if not status:
    # On failure the second return value is printed as the diagnostic.
    print(dataset)

Clean test data. 

In [None]:
# Clean the uploaded test set, passing the trained model's name along with the target.
status, job_id = ds.clean_data(test_data, target=target, model_name=model)

if not status:
    print(job_id)
else:
    # Block until the cleaning job finishes.
    ds.wait_for_job(job_id['job_name'])

Run model on test dataset. 

In [None]:
# Run the trained model on the uploaded test dataset.
status, artifact = ds.run_model(test_data, model)
sleep(1)
if status:
    # Block until the prediction job finishes.
    ds.wait_for_job(artifact['job_name'])
else:
    # On failure `artifact` is the error payload; print it like the other cells do.
    print(artifact)

Create visualizations for comparing predictions with actual target.

In [None]:
# Download the test-set predictions and load the matching local CSV for the actual labels.
status, prediction = ds.download_artifact(artifact['artifact_name'])
df = pd.read_csv(os.path.join(path,test_data))
# Encode over the union of predicted and actual classes. A class that occurs
# only in the actual column would otherwise keep code 0 and be drawn as if it
# were a different class.
unq = pd.unique(pd.concat([prediction[target], df[target]], ignore_index=True))[::-1]
p = np.zeros((len(prediction),))
a = np.zeros((len(prediction),))
for i,q in enumerate(unq):
    # Each class q contributes its integer code i at the rows where it occurs.
    p += i*(prediction[target] == q).values
    a += i*(df[target] == q).values
#Plot predictions vs actual
plt.plot(a)
plt.plot(p)
plt.legend(['Actual','Predicted'])
# Label each integer code on the y axis with its class name.
plt.yticks(list(range(len(unq))), list(unq))
print(classification_report(df[target], prediction[target]))

### Darwin's Pick for Machine Learning Model

In [None]:
# Look up the trained model's metadata and print the entry stored under description -> best.
status, model_type = ds.lookup_model_name(model)
if status:
    print(model_type['description']['best'])
else:
    # On failure `model_type` is the error payload; indexing it would fail.
    print(model_type)