TODO: Replace the synthetic data below with real data once it is available.

#### Temporary: Synthetic Data

In [1]:
# Synthetic Data
import numpy as np
from sklearn.datasets import make_classification
# Generate a reproducible toy classification problem; only the seed is
# overridden, every other make_classification setting is the library default.
X, y = make_classification(random_state=42)

# Append the labels as the final column so features and target live in one
# 2-D array (column_stack promotes the 1-D label vector to a column).
data = np.column_stack((X, y))

### Preprocessing

In [2]:
import preprocess
# Seed NumPy's global generator so the same 100 seeds are drawn on every run.
np.random.seed(42)
states = np.random.randint(0, 1000000, size=100)  # 1-D array of 100 seeds drawn from [0, 1000000)

# Map every drawn seed to whatever preprocess.preprocess returns for it.
# Seeds are dict keys, so a repeated seed would collapse into a single entry.
data_splits = {seed: preprocess.preprocess(data, seed) for seed in states}

### Modeling

In [9]:
from modeling import model_decisiontree, model_randomforest, model_gradientboosting, model_lightgb

# Short model key -> function that fits that model on one split.
# Dict order is the order in which the models are run for each seed.
model_builders = {
    "dt": model_decisiontree,      # decision tree
    "rf": model_randomforest,      # random forest
    "gb": model_gradientboosting,  # gradient boosting
    "lg": model_lightgb,           # LightGBM
}

# Layout: model_res[random_state][model_key] -> return value of the model
# function; per the original author's note that is (predicted classes, model),
# e.g. model_res[rst]["dt"] --> (y_pred_dt, dt).
model_res = {}
for state, split in data_splits.items():
    per_model = model_res[state] = {}
    for model_key, build in model_builders.items():
        per_model[model_key] = build(*split)



### Evaluation

In [10]:
# basic view for now to see some metrics
from sklearn.metrics import accuracy_score, confusion_matrix
# Print accuracy and the confusion matrix for every (random state, model) pair.
#
# One loop over the model keys replaces four copy-pasted print pairs, and each
# metric is now prefixed with its model key so the output can be attributed.
#
# NOTE(review): the output saved with this notebook shows the same accuracy and
# the same confusion matrix for every random state and every model. Presumably
# the random state is not reaching the split inside preprocess/modeling --
# verify before trusting any comparison across seeds or models.
Y_TRUE_POS = 3  # slot of the true labels in each preprocess() result -- TODO confirm tuple order
Y_PRED_POS = 0  # slot of the predicted classes in each model result (see model_res note)

for rst in data_splits.keys():
    print(f"Stats for {rst}")
    y_true = data_splits[rst][Y_TRUE_POS]
    for model_key in ("dt", "rf", "gb", "lg"):
        y_pred = model_res[rst][model_key][Y_PRED_POS]
        print(f"{model_key} accuracy: {accuracy_score(y_true, y_pred)}")
        print(confusion_matrix(y_true, y_pred))

Stats for 121958
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 671155
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 131932
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 365838
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 259178
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 644167
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 110268
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 732180
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 54886
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
0.9
[[11  2]
 [ 0  7]]
Stats for 137337
0.9