In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
import pyecoacc as acc 
from pyecoacc.models.pipeline import make_classifier_pipeline
from pyecoacc.util.analytics import compare_models_cv
from pyecoacc.features.transform import ACCStatsTransformer
from pyecoacc.models.deep.cnn import make_cnn_model

# Load data 

In [3]:
# Read the labelled mole-rat accelerometer table; the CSV's first column is
# used as the row index rather than as data.
molerats_data = pd.read_csv(
    filepath_or_buffer="data/molerats.csv",
    index_col=0,
)

In [4]:
# Build the model inputs WITHOUT mutating `molerats_data`, so this cell can be
# re-run safely (an in-place drop raises KeyError on the second execution
# because "Animal" is already gone). `molerats_data` therefore keeps its
# "Animal" column after this cell.
#
# The feature columns are selected by name rather than by position, so the
# label can never leak into X regardless of where "Behavior" sits in the CSV.
acc_signal_columns = molerats_data.drop(columns=["Animal", "Behavior"])

X = acc_signal_columns.values          # feature matrix: one row per sample
y = molerats_data["Behavior"].values   # behaviour label for each row

# Define models 

### Make a random forest model pipeline 
- ACCStatsTransformer to compute features from the raw ACC signal 
- no scaling 
- select the top 50 features (f-test)

In [5]:
# Random-forest pipeline: ACCStatsTransformer features, no feature scaling,
# and selection of the top 50 features, feeding a 250-tree forest with trees
# capped at depth 10.
#
# random_state fixes the forest's internal randomness (bootstrap samples and
# per-split feature sub-sampling) so the model itself no longer varies between
# runs of the notebook. NOTE(review): any shuffling done inside
# compare_models_cv is not controlled from here.
rf_model = make_classifier_pipeline(
    features=ACCStatsTransformer(),
    model=RandomForestClassifier(
        n_estimators=250,
        max_depth=10,
        random_state=42,
    ),
    feature_scaler=False,
    feature_selector=True,
    k_selection=50,
)

In [14]:
# Show the assembled random-forest pipeline as the cell output.
rf_model

### Make an XGBoost model

In [6]:
# XGBoost pipeline: ACCStatsTransformer features feeding a 250-estimator
# XGBClassifier. Scaling and feature selection are not specified here, so
# they follow whatever make_classifier_pipeline uses by default.
xg_model = make_classifier_pipeline(
    features=ACCStatsTransformer(),
    model=XGBClassifier(n_estimators=250),
)

In [15]:
# Show the assembled XGBoost pipeline as the cell output.
xg_model

### Make a CNN model 
- the pipeline includes reshaping to 3 channels (X, Y, Z) before entering the CNN 

In [7]:
# CNN model. Each row of X is split into three channels (X, Y, Z) before the
# network, so the per-channel input length is a third of the row width
# (integer division; assumes the column count is a multiple of 3 - TODO confirm).
# The number of output classes is the count of distinct behaviour labels in y.
cnn_model = make_cnn_model(
    input_dim=X.shape[1] // 3,
    num_behav=len(np.unique(y)),
    verbose=0,
)

In [16]:
# Show the assembled CNN model as the cell output.
cnn_model

In [8]:
# Display name -> model, in the order the models should be evaluated and
# listed in the comparison tables.
model_dict = dict(
    [
        ("random forest", rf_model),
        ("XGBoost", xg_model),
        ("CNN", cnn_model),
    ]
)

# Compare 
- compare_models_cv uses *cross_val_predict* and *classification_report* from sklearn for each model in the model_dict and combines results to form accuracy, precision, recall, and f1 tables comparing the models. 

In [9]:
# Evaluate every model in model_dict with cv=5 on the same X / y and unpack
# the four result tables (accuracy, precision, recall, f1). A
# "Starting model <name>..." line is printed as each model begins.
accuracy, precision, recall, f1  = compare_models_cv(X, y, model_dict, cv=5)

Starting model random forest...
Starting model XGBoost...
Starting model CNN...


In [10]:
# Overall accuracy per model, rounded to two decimals. Note the recorded
# values are on a percentage scale (e.g. 77.02), unlike the precision /
# recall / f1 tables, which are fractions between 0 and 1.
accuracy.round(decimals=2)

random forest    77.02
XGBoost          79.46
CNN              78.57
dtype: float64

In [11]:
# Precision per behaviour class, plus macro and weighted averages, one row
# per model, rounded to two decimals.
precision.round(decimals=2)

Unnamed: 0,Dig,Eat,Forward Loco,Rest,Stand,Sweep,macro avg,weighted avg
random forest,0.81,0.78,0.63,0.93,0.59,0.82,0.76,0.77
XGBoost,0.84,0.84,0.67,0.9,0.58,0.84,0.78,0.8
CNN,0.89,0.8,0.69,0.81,0.55,0.91,0.77,0.79


In [12]:
# Recall per behaviour class, plus macro and weighted averages, one row per
# model, rounded to two decimals.
recall.round(decimals=2)

Unnamed: 0,Dig,Eat,Forward Loco,Rest,Stand,Sweep,macro avg,weighted avg
random forest,0.87,0.79,0.51,0.9,0.62,0.72,0.74,0.77
XGBoost,0.88,0.78,0.61,0.93,0.67,0.77,0.77,0.79
CNN,0.82,0.78,0.68,0.95,0.62,0.85,0.78,0.79


In [13]:
# F1 score per behaviour class, plus macro and weighted averages, one row per
# model, rounded to two decimals.
f1.round(decimals=2)

Unnamed: 0,Dig,Eat,Forward Loco,Rest,Stand,Sweep,macro avg,weighted avg
random forest,0.84,0.79,0.56,0.91,0.6,0.77,0.75,0.77
XGBoost,0.86,0.81,0.64,0.91,0.62,0.8,0.78,0.8
CNN,0.85,0.79,0.69,0.87,0.58,0.88,0.78,0.79
