In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the Provisional CSV

In [2]:
df = pd.read_csv("Training_1.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the first column
df = df.iloc[: , 1:]

# Remove the 'Duplicate' status
status = df["SR Status"] != "Duplicate (closed)" 
df = df.loc[status]

# Remove the 'To Be Deleted' status
status = df["SR Status"] != "TO BE DELETED" 
df = df.loc[status]

# Remove the 'Closed -Incomplete Information' status
status = df["SR Status"] != "Closed -Incomplete Information"
df = df.loc[status]

df[["Total Seconds"]] = df[["Total Seconds"]].fillna(0).astype(int)

df = df.drop(columns = ["Service Request (SR) Number",
                        "Open Time",
                        "Open Time delta",
                        "Created Date",
                        "Created Date DT",
                        "Close Date",
                        "Closed Date DT",
                        "Status Change Date",
                        "Last Update Date",
                        "Map Page",
                  "Map Tile",
                  "State Plane X Coordinate",
                  "State Plane Y Coordinate",
                  "Latitude Coordinate",
                  "Longitude Coordinate",
                  "(Latitude.Longitude)",
                  "SR Location",
                  "Street Number",
                  "Street Name"])

df.head()

Unnamed: 0,SR Description,Department,Method Received,SR Status,Zip Code,Council District,City,County,Request Year,Total Seconds
0,Animal Control - Assistance Request,Animal Services,Phone,Closed,78745.0,5.0,AUSTIN,TRAVIS,2016,116467642
1,Animal Control - Assistance Request,Animal Services,Phone,Closed,78752.0,4.0,AUSTIN,TRAVIS,2016,113077920
2,Animal Control - Assistance Request,Animal Services,Phone,Closed,78745.0,5.0,AUSTIN,TRAVIS,2016,108941829
3,Austin Code - Request Code Officer,Austin Code,Phone,Closed,78745.0,5.0,Austin,TRAVIS,2018,644
4,Pavement Failure,Public Works,Phone,Closed,78704.0,5.0,AUSTIN,TRAVIS,2018,2524007


In [3]:
# df = df.head(1000)

# Split the Data into Training and Testing

In [4]:
# Create our features
X = df.drop(columns = ["SR Status"])

# Create dummy variables
X = pd.get_dummies(X)

# Create our target
y = df[["SR Status"]]

In [5]:
# Check the balance of our target values
y["SR Status"].value_counts()

Closed      448468
Resolved       189
Name: SR Status, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Balanced Random Forest Classifier

In [7]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rand_for_class = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
rand_for_class.fit(X_train, y_train.values.ravel())

BalancedRandomForestClassifier(random_state=1)

In [8]:
# Calculated the balanced accuracy score
y_prediction = rand_for_class.predict(X_test)
balanced_accuracy_score(y_test, y_prediction)

0.9283570283507857

In [9]:
# Display the confusion matrix
confusion_matrix(y_test, y_prediction)

array([[99463, 12669],
       [    1,    32]], dtype=int64)

In [10]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_prediction))

                   pre       rec       spe        f1       geo       iba       sup

     Closed       1.00      0.89      0.97      0.94      0.93      0.85    112132
   Resolved       0.00      0.97      0.89      0.01      0.93      0.87        33

avg / total       1.00      0.89      0.97      0.94      0.93      0.85    112165



In [11]:
# List the features sorted in descending order by feature importance
features = rand_for_class.feature_importances_
sort_features = np.argsort(features)[::-1]

for i in sort_features:
    print("{}: ({})".format(X.columns.values[i], features[i]))

Total Seconds: (0.12868903048548114)
Request Year: (0.09042017397589495)
Department_Watershed Check: (0.08021017902295999)
Department_Austin Code: (0.05711244990053812)
Zip Code: (0.053008123194248656)
Council District: (0.04395046031247003)
Department_Animal Services: (0.042754973068431956)
SR Description_Austin Code - Request Code Officer: (0.04117616017023353)
Department_Austin Transportation: (0.03456756717720176)
Department_Public Works: (0.03108442342067603)
SR Description_Debris in Street: (0.025260382094339638)
SR Description_Sign - Traffic Sign Maintenance: (0.021196407897104183)
Department_Austin Resource Recovery: (0.019451156495279606)
SR Description_Pothole Repair: (0.018951202096911483)
City_AUSTIN: (0.015562999345519593)
SR Description_Pavement Failure: (0.015093976404686791)
SR Description_Drainage - Miscellaneous: (0.01475109596410118)
Method Received_Spot311 Interface: (0.014534549238385154)
City_Austin: (0.014174879784872873)
SR Description_Street Light Issue- Addres

# Easy Ensemble AdaBoost Classifier

In [12]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy_ens_class = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
easy_ens_class.fit(X_train, y_train.values.ravel())

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [13]:
# Calculated the balanced accuracy score
y_prediction = easy_ens_class.predict(X_test)
balanced_accuracy_score(y_test, y_prediction)

0.942911033424892

In [14]:
# Display the confusion matrix
confusion_matrix(y_test, y_prediction)

array([[99329, 12803],
       [    0,    33]], dtype=int64)

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_prediction))

                   pre       rec       spe        f1       geo       iba       sup

     Closed       1.00      0.89      1.00      0.94      0.94      0.88    112132
   Resolved       0.00      1.00      0.89      0.01      0.94      0.90        33

avg / total       1.00      0.89      1.00      0.94      0.94      0.88    112165

