In [1]:
import numpy as np
import pandas as pa
import matplotlib.pyplot as plt

from src.utils import get_framed_label, train_test_split
from src.data import load_annotation
from src.data import load_radar, load_water_distance, load_weight_sensor, load_audio
from src import make_dataset

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, plot_roc_curve

# Urination

In [2]:
config = {
    'USE_IDS': [],
    'DATAFRAME_PATH': "C:/Users/Jiajun/Desktop/download-project/data/raw/data_frames",
    'ANNOTATION_PATH': "C:/Users/Jiajun/Desktop/download-project/data/processed/Annotation.csv",
    'FEATURE_NAMES': ['Max', 'Min', 'Mean', 'Median', 'LogVariance', 'LinearTrend'],
    'SOURCE_NAMES': ['TotalWeight', 'RadarSum', 'AudioDelay4'],
    'WINDOW_SECONDS': 2,
    'HOP_SECONDS': 1,
    'CATEGORY': "",
}

config['CATEGORY'] = "Urination"
complete_ids = load_annotation.get_complete_ids(
    annotation_filename = config['ANNOTATION_PATH'],
    category = config['CATEGORY']
)

In [3]:
selected_ids = complete_ids[:60]
TRAIN_IDS, TEST_IDS = train_test_split(selected_ids)

print(f"Category: {config['CATEGORY']}")
print(f"Training {len(TRAIN_IDS)} use_ids: {TRAIN_IDS[:5]}...")
print(f"Testing  {len(TEST_IDS)} use_ids: {TEST_IDS[:5]}...")

Category: Urination
Training 48 use_ids: [1891, 1893, 1808, 1906, 1912]...
Testing  12 use_ids: [1802, 1806, 1828, 1831, 1832]...


In [4]:
train_config = config.copy()
test_config = config.copy()

train_config['USE_IDS'] = TRAIN_IDS
test_config['USE_IDS'] = TEST_IDS

dataset = {}
dataset['train'] = make_dataset.RandomForestExtended(train_config)
dataset['test'] = make_dataset.RandomForestExtended(test_config)

In [5]:
train_x, train_y = dataset['train'].get_features_and_labels_from_users()
test_x, test_y = dataset['test'].get_features_and_labels_from_users()

updating 1891
updating 1893
updating 1808
updating 1906
updating 1912
updating 1896
updating 1884
updating 1887
updating 1892
updating 1878
updating 1875
updating 1915
updating 1898
updating 1818
updating 1877
updating 1888
updating 1870
updating 1833
updating 1914
updating 1863
updating 1894
updating 1918
updating 1854
updating 1882
updating 1911
updating 1883
updating 1904
updating 1890
updating 1917
updating 1836
updating 1871
updating 1913
updating 1881
updating 1885
updating 1874
updating 1826
updating 1839
updating 1880
updating 1879
updating 1916
updating 1920
updating 1862
updating 1921
updating 1864
updating 1876
updating 1829
updating 1897
updating 1830
updating 1802
updating 1806
updating 1828
updating 1831
updating 1832
updating 1834
updating 1835
updating 1841
updating 1845
updating 1889
updating 1895
updating 1919


In [6]:
train_x.shape, test_x.shape

((4790, 402), (1072, 402))

In [7]:
train_y.sum(), train_y.shape, test_y.sum(), test_y.shape

(742, (4790,), 194, (1072,))

In [8]:
rf = RandomForestClassifier(n_estimators = 30)
rf.fit(train_x, train_y)

RandomForestClassifier(n_estimators=30)

In [9]:
def classification_result(model, testX, testY, threshold = 0.5):
    testYPredProb = model.predict_proba(testX)
    testYPred = (testYPredProb[:, 1] > threshold).astype(int)
    print (f"threshold = {threshold}", "\n")
    print (classification_report(testY, testYPred))

In [10]:
classification_result(
    rf,
    test_x, test_y,
    threshold = 0.3
)

threshold = 0.3 

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       878
           1       0.92      0.85      0.88       194

    accuracy                           0.96      1072
   macro avg       0.94      0.91      0.93      1072
weighted avg       0.96      0.96      0.96      1072



In [11]:
classification_result(
    rf,
    test_x, test_y,
    threshold = 0.2
)

threshold = 0.2 

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       878
           1       0.87      0.90      0.88       194

    accuracy                           0.96      1072
   macro avg       0.92      0.93      0.93      1072
weighted avg       0.96      0.96      0.96      1072



In [22]:
!ls ../models/trained_models

randomforest-20210108-032342.pkl
randomforest-defecate-20210111-034703.pkl
randomforest-defecate-20210111-155654.pkl
seq2seq-20210107-164232.pt
seq2seq-20210107-203845.pt
seq2seq-20210108-140539.pt
urination-rf-sources-extended-embedding-0202.pkl


In [21]:
import pickle

with open("../models/trained_models/urination-rf-sources-extended-embedding-0202.pkl", "wb") as f:
    pickle.dump(rf, f)

# Defecation

In [5]:
config = {
    'USE_IDS': [],
    'DATAFRAME_PATH': "C:/Users/Jiajun/Desktop/download-project/data/raw/data_frames",
    'ANNOTATION_PATH': "C:/Users/Jiajun/Desktop/download-project/data/processed/Annotation.csv",
    'FEATURE_NAMES': ['Max', 'Min', 'Mean', 'Median', 'LogVariance', 'LinearTrend'],
    'SOURCE_NAMES': ['TotalWeight', 'RadarSum', 'AudioDelay4'],
    'WINDOW_SECONDS': 2,
    'HOP_SECONDS': 1,
    'CATEGORY': "Defecation",
}

complete_ids = load_annotation.get_complete_ids(
    annotation_filename = config['ANNOTATION_PATH'],
    category = config['CATEGORY']
)

In [6]:
selected_ids = [idx for idx in complete_ids if idx <= 1950 and idx >= 1800]
TRAIN_IDS, TEST_IDS = train_test_split(selected_ids)

print(f"Category: {config['CATEGORY']}")
print(f"Training {len(TRAIN_IDS)} use_ids: {TRAIN_IDS[:5]}...")
print(f"Testing  {len(TEST_IDS)} use_ids: {TEST_IDS[:5]}...")

Category: Defecation
Training 23 use_ids: [1898, 1930, 1919, 1926, 1941]...
Testing  6 use_ids: [1854, 1870, 1875, 1882, 1890]...


In [28]:
train_config = config.copy()
test_config = config.copy()

train_config['USE_IDS'] = TRAIN_IDS
test_config['USE_IDS'] = TEST_IDS

dataset = {}
dataset['train'] = make_dataset.RandomForestExtended(train_config)
dataset['test'] = make_dataset.RandomForestExtended(test_config)

In [29]:
train_x, train_y = dataset['train'].get_features_and_labels_from_users()
test_x, test_y = dataset['test'].get_features_and_labels_from_users()

updating 1898
updating 1930
updating 1919
updating 1926
updating 1941
updating 1923
updating 1831
updating 1881
updating 1937
updating 1933
updating 1839
updating 1830
updating 1940
updating 1862
updating 1915
updating 1893
updating 1863
updating 1943
updating 1806
updating 1912
updating 1802
updating 1904
updating 1947
updating 1854
updating 1870
updating 1875
updating 1882
updating 1890
updating 1944


In [30]:
print(f'train_x.shape: {train_x.shape} test_x.shape: {test_x.shape}')
print(f'No. Positive in training {train_y.sum()}/{train_y.shape}')
print(f'No. Positive in testing  {test_y.sum()}/{test_y.shape}')

train_x.shape: (3584, 402) test_x.shape: (1065, 402)
No. Positive in training 160/(3584,)
No. Positive in testing  37/(1065,)


In [31]:
rf = RandomForestClassifier(
    n_estimators = 10,
    class_weight = "balanced"
)
rf.fit(train_x, train_y)

RandomForestClassifier(class_weight='balanced', n_estimators=10)

In [32]:
def classification_result(model, testX, testY, threshold = 0.5):
    testYPredProb = model.predict_proba(testX)
    testYPred = (testYPredProb[:, 1] > threshold).astype(int)
    print (f"threshold = {threshold}", "\n")
    print (classification_report(testY, testYPred))

In [33]:
classification_result(
    rf,
    test_x, test_y,
    threshold = 0.3
)

threshold = 0.3 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1028
           1       0.67      0.84      0.75        37

    accuracy                           0.98      1065
   macro avg       0.83      0.91      0.87      1065
weighted avg       0.98      0.98      0.98      1065



In [34]:
classification_result(
    rf,
    test_x, test_y,
    threshold = 0.4
)

threshold = 0.4 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1028
           1       0.77      0.62      0.69        37

    accuracy                           0.98      1065
   macro avg       0.88      0.81      0.84      1065
weighted avg       0.98      0.98      0.98      1065



In [35]:
import pickle

with open('../models/trained_models/defecation-rf-sources-extended-embedding-0202.pkl', 'wb') as f:
    pickle.dump(rf, f)

In [None]:
def variable_importance(trainX, model, top=30):
    plt.figure(figsize=(20, 5))
    plt.bar(x = range(top), height = model.feature_importances_[:top])
    xticks_pos = np.arange(top)
    plt.xticks(xticks_pos, trainX.columns[:top], rotation=45, ha = 'right')
    pass

In [None]:
variable_importance(train_x, rf)