In [1]:
import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from src.data import load_annotation, load_data
from src.utils.train import train_test_split

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# config
DATAFRAME_PATH = "../data/raw/data_frames"
ANNOTATION_PATH = "../data/processed/Annotation.csv"
CATEGORY = "Defecation"
THRESHOLD = 0.3

FEATURE_NAMES = ['Min', 'Max', 'Median', 'Mean', 'Variance', 'LinearTrend']
SOURCES = ['TotalWeight', 'WaterDistance', 'RadarSum']

ANNOTATIONS = load_annotation.get_annotation(ANNOTATION_PATH)
USER_IDS = load_annotation.get_complete_ids(ANNOTATION_PATH, CATEGORY)
TRAIN_IDS, TEST_IDS = train_test_split(USER_IDS[USER_IDS < 2000])

print ("Training on {} cases, {} ...".format(len(TRAIN_IDS), TRAIN_IDS[:5]))
print ("Testing  on {} cases, {} ...".format(len(TEST_IDS), TEST_IDS[:5]))

Training on 27 cases, [1898, 1923, 1831, 1944, 1995] ...
Testing  on 7 cases, [1854, 1870, 1875, 1882, 1890] ...


In [3]:
rf_train_config = {
    'USE_IDS': TRAIN_IDS,
    'ANNOTATION_PATH': ANNOTATION_PATH,
    'FEATURE_NAMES': FEATURE_NAMES,
    'SOURCES': SOURCES,
    'CATEGORY': CATEGORY
}

rf_test_config = {
    'USE_IDS': TEST_IDS,
    'ANNOTATION_PATH': ANNOTATION_PATH,
    'FEATURE_NAMES': FEATURE_NAMES,
    'SOURCES': SOURCES,
    'CATEGORY': CATEGORY
}

dataset = {}
dataset['train'] = load_data.RandomForestDataset(rf_train_config)
dataset['test'] = load_data.RandomForestDataset(rf_test_config)

In [4]:
train_x, train_y = dataset['train'].get_all_features_and_labels()
test_x, test_y = dataset['test'].get_all_features_and_labels()

Updating user : 1898
Updating user : 1923
Updating user : 1831
Updating user : 1944
Updating user : 1995
Updating user : 1943
Updating user : 1926
Updating user : 1941
Updating user : 1940
Updating user : 1992
Updating user : 1915
Updating user : 1933
Updating user : 1839
Updating user : 1830
Updating user : 1930
Updating user : 1937
Updating user : 1881
Updating user : 1955
Updating user : 1947
Updating user : 1919
Updating user : 1862
Updating user : 1893
Updating user : 1863
Updating user : 1999
Updating user : 1806
Updating user : 1912
Updating user : 1802
Updating user : 1854
Updating user : 1870
Updating user : 1875
Updating user : 1882
Updating user : 1890
Updating user : 1904
Updating user : 1994


In [None]:
train_x[train_x['TotalWeight_LogVariance'] == - np.inf]

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, plot_roc_curve

rf = RandomForestClassifier(n_estimators = 5, max_features = 3)
rf.fit(train_x, train_y)

RandomForestClassifier(max_features=3, n_estimators=5)

In [22]:
import pickle
import time

timestr = time.strftime("%Y%m%d-%H%M%S")
with open(f"../randomforest-defecate-{timestr}.pkl", "wb") as f:
    pickle.dump(rf, f)

In [7]:
def classification_result(model, testX, testY, threshold = 0.5):
    testYPredProb = model.predict_proba(testX)
    testYPred = (testYPredProb[:, 1] > threshold).astype(int)
    print (f"threshold = {threshold}", "\n")
    print (classification_report(testY, testYPred))

In [14]:
classification_result(
    model = rf,
    testX = train_x,
    testY = train_y,
    threshold = 0.3
)

threshold = 0.3 

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      4010
         1.0       0.91      1.00      0.95       235

    accuracy                           0.99      4245
   macro avg       0.95      0.99      0.97      4245
weighted avg       0.99      0.99      0.99      4245



In [23]:
train_x

Unnamed: 0,TotalWeight_Min,TotalWeight_Max,TotalWeight_Median,TotalWeight_Mean,TotalWeight_Variance,TotalWeight_LinearTrend,WaterDistance_Min,WaterDistance_Max,WaterDistance_Median,WaterDistance_Mean,WaterDistance_Variance,WaterDistance_LinearTrend,RadarSum_Min,RadarSum_Max,RadarSum_Median,RadarSum_Mean,RadarSum_Variance,RadarSum_LinearTrend
0,83.982000,84.132500,84.057250,84.057250,0.011325,-0.301000,12.12,12.18,12.150,12.148226,1.984400e-04,-1.103525e-02,51.622228,56.922524,54.176052,54.060497,2.069827,-3.722745
1,84.073000,84.309000,84.191000,84.191000,0.027848,0.472000,12.12,12.18,12.150,12.147302,1.135689e-04,3.968089e-03,51.023040,55.268160,53.164700,53.228138,1.726514,-0.584725
2,84.010333,84.249833,84.130083,84.130083,0.028680,-0.479000,12.12,12.18,12.150,12.151290,1.622422e-04,5.545901e-03,50.460412,57.283076,54.278936,53.771527,4.521573,3.576612
3,84.099000,84.124500,84.111750,84.111750,0.000325,0.051000,12.12,12.18,12.150,12.152063,1.134153e-04,-1.745368e-03,48.138480,53.330616,51.104640,51.135444,2.474212,-1.475713
4,83.950500,84.595000,84.272750,84.272750,0.207690,1.289000,12.12,12.18,12.150,12.154032,1.916711e-04,7.224069e-03,49.924264,57.632404,53.739730,53.867978,4.918860,3.957775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,8.870500,8.878000,8.874250,8.874250,0.000028,0.015000,12.89,1127.86,13.400,122.031452,1.101750e+05,5.300531e+02,46.526284,62.541804,57.731000,57.029995,17.988292,-8.522508
4241,8.872667,8.880500,8.876583,8.876583,0.000031,0.015667,9.71,1127.86,11.960,313.038254,2.493270e+05,-1.104624e+03,44.513864,61.338836,52.336020,51.463744,22.298302,-0.438980
4242,8.871500,8.883000,8.877250,8.877250,0.000066,-0.023000,9.76,1127.86,12.375,120.593387,1.104914e+05,-1.667434e+02,54.391184,81.661608,63.607640,64.451739,36.289194,11.703266
4243,8.875000,8.879000,8.877000,8.877000,0.000008,-0.008000,10.83,1127.86,1127.860,668.070317,3.057033e+05,1.111579e+03,56.110160,73.640448,64.842556,65.034010,26.677710,9.029761


In [None]:
print (TRAIN_IDS)

In [None]:
print (TEST_IDS)

In [21]:
classification_result(
    model = rf,
    testX = test_x,
    testY = test_y,
    threshold = 0.5
)

threshold = 0.5 

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1183
         1.0       0.71      0.79      0.75        52

    accuracy                           0.98      1235
   macro avg       0.85      0.89      0.87      1235
weighted avg       0.98      0.98      0.98      1235



In [None]:
def variable_importance(trainX, model):
    plt.bar(x = range(trainX.shape[1]), height = model.feature_importances_)
    xticks_pos = np.arange(trainX.shape[1])
    plt.xticks(xticks_pos, trainX.columns, rotation=45, ha = 'right')
    pass

In [None]:
# variable importance
plt.figure(figsize = (12, 6))
variable_importance(train_x, rf)