## Load packages

In [None]:
import pandas as pd
import pickle
from utils import post_processing, video_to_tank, calculate_overlap
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np

## Loading the data file

In [None]:
# Load the pickled tracking results into `data`.
# SECURITY: pickle.load can execute arbitrary code while deserialising --
# only open pickle files that come from a trusted source.
# Structure relied on later in this notebook: `data` is iterated by key, and
# each data[key] is an iterable of records whose first element unpacks to
# (h, w) and whose remaining elements are passed to post_processing.
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Empty feature table: one (still empty) column per behavioural predictor.
# The column order defined here is the column order of the final table.
data_frame = pd.DataFrame({
    column: []
    for column in (
        'init_place',
        'init_bottom_time',
        'total_dist',
        'mean_speed',
        'total_bottom_time',
        'freeze_count',
        'mean_freeze_time',
        'hyp_act_count',
        'hyp_act_total_time',
    )
})

## Translating data into predictors

In [None]:
# --- Feature-extraction parameters (values unchanged from the original code) --
FPS = 30                        # frames -> time factor; presumably 30 fps recordings -- confirm
INIT_WINDOW_FRAMES = 10 * FPS   # the first bottom visit must start within these frames
BOTTOM_Y = 10                   # tank y strictly above this is counted as "bottom"
FREEZE_MIN_OVERLAP = 0.95       # calculate_overlap must exceed this for a freeze step
FREEZE_MAX_STEP = 0.0165        # max displacement (total or vertical) of a freeze step
HYPERACTIVE_MIN_SPEED = 10      # step speed (distance * FPS) strictly above this is hyperactive


def extract_track_features(dat, h, w):
    """Compute the behavioural predictors for one track.

    Parameters
    ----------
    dat : sequence
        Per-frame detections as returned by ``post_processing``. Each item is
        passed to ``video_to_tank`` and ``calculate_overlap``.
    h, w :
        The two values unpacked from the first element of the track record;
        forwarded unchanged to ``video_to_tank`` (presumably the frame height
        and width -- confirm against ``utils``).

    Returns
    -------
    dict
        One value per column of the feature table. Tracks with fewer than two
        frames yield zeros instead of raising ``ZeroDivisionError``, so the
        table keeps exactly one row per track and stays aligned with the
        label list supplied later.

    Notes
    -----
    * ``init_place`` is 1 when the animal is seen at the bottom within the
      first ``INIT_WINDOW_FRAMES`` frames, else 0. ``init_bottom_time`` is the
      length of that first uninterrupted bottom visit.
    * NOTE(review): as in the original code, a run length is appended after
      *every* non-freeze step, including zero-length runs. ``freeze_count`` is
      therefore close to the number of non-freeze steps and
      ``mean_freeze_time`` is an average in frames, not seconds. Left
      unchanged so existing feature values stay comparable.
    """
    total_dist = 0
    speed_sum = 0
    n_steps = 0
    bottom_frames = 0

    init_place = 0
    init_bottom_frames = 0
    phase = 'searching'  # 'searching' -> 'bottom' -> 'done'

    freeze_runs = []
    freeze_run = 0
    hyper_runs = []
    hyper_run = 0

    # Each frame is converted to tank coordinates once and carried over to the
    # next step (assumes video_to_tank is deterministic for a given input).
    prev_pos = video_to_tank(dat[0], h, w) if len(dat) > 1 else None

    for step in range(1, len(dat)):
        cur_pos = video_to_tank(dat[step], h, w)
        frame_idx = step - 1

        # Zone membership is evaluated on the frame at the START of the step,
        # exactly as before; the very last frame is therefore never inspected.
        at_bottom = prev_pos[1] > BOTTOM_Y
        if at_bottom:
            bottom_frames += 1

        if phase == 'searching':
            if frame_idx >= INIT_WINDOW_FRAMES:
                phase = 'done'
            elif at_bottom:
                # The flag is set on arrival, so one-frame visits and an
                # arrival on the last frame of the window are recorded.
                phase = 'bottom'
                init_place = 1
                init_bottom_frames = 1
        elif phase == 'bottom':
            if at_bottom:
                init_bottom_frames += 1
            else:
                phase = 'done'

        d = distance.euclidean(prev_pos, cur_pos)
        total_dist += d
        speed_sum += d * FPS
        n_steps += 1

        overlap = calculate_overlap(dat[step - 1], dat[step])
        vertical_step = abs(prev_pos[1] - cur_pos[1])
        if overlap > FREEZE_MIN_OVERLAP and (
            d <= FREEZE_MAX_STEP or vertical_step <= FREEZE_MAX_STEP
        ):
            freeze_run += 1
        else:
            freeze_runs.append(freeze_run)
            freeze_run = 0

        if d * FPS > HYPERACTIVE_MIN_SPEED:
            hyper_run += 1
        else:
            hyper_runs.append(hyper_run)
            hyper_run = 0

        prev_pos = cur_pos

    # A run still open when the track ends used to be silently dropped.
    if freeze_run:
        freeze_runs.append(freeze_run)
    if hyper_run:
        hyper_runs.append(hyper_run)

    return {
        'init_place': init_place,
        'init_bottom_time': init_bottom_frames / FPS,
        'total_dist': total_dist,
        'mean_speed': speed_sum / n_steps if n_steps else 0.0,
        'total_bottom_time': bottom_frames / FPS,
        'freeze_count': len(freeze_runs),
        'mean_freeze_time': sum(freeze_runs) / len(freeze_runs) if freeze_runs else 0.0,
        'hyp_act_count': sum(1 for run in hyper_runs if run != 0),
        'hyp_act_total_time': sum(hyper_runs) / FPS,
    }


# One record per track, in iteration order of `data`. The table is built once
# at the end (no quadratic concat-in-a-loop), and re-running this cell
# replaces the rows instead of appending duplicates.
records = []
for video in data:
    for track in data[video]:
        dat, _start = post_processing(track[1:])
        h, w = track[0]
        records.append(extract_track_features(dat, h, w))

# Reuse the column order of the feature table created above; cast to float so
# dtypes match what row-wise concatenation onto the empty frame produced.
data_frame = pd.DataFrame(records, columns=list(data_frame.columns)).astype(float)

data_frame

Unnamed: 0,init_place,init_bottom_time,total_dist,mean_speed,total_bottom_time,freeze_count,mean_freeze_time,hyp_act_count,hyp_act_total_time
0,1.0,298.466667,795.887571,2.653253,298.466667,6251.0,0.43913,10.0,0.933333
1,0.0,0.0,961.695545,3.206008,288.866667,6477.0,0.389378,4.0,0.2
2,1.0,298.966667,100.552952,0.335214,298.966667,317.0,26.504732,4.0,0.433333
3,1.0,295.033333,78.417721,0.261421,295.033333,384.0,0.130208,1.0,0.133333
4,1.0,124.966667,204.511135,0.68178,287.133333,1304.0,5.792945,21.0,1.866667
10,1.0,1.466667,768.089441,2.560583,165.866667,4193.0,1.140711,11.0,2.466667
14,0.0,0.0,382.103822,1.273821,139.066667,3830.0,1.349608,44.0,2.833333
16,1.0,298.266667,316.993016,1.056761,298.266667,2332.0,2.775729,3.0,0.166667
21,1.0,72.066667,1033.571502,3.445621,242.0,6745.0,0.334173,23.0,1.233333
22,0.0,0.0,106.590508,0.355341,285.833333,942.0,8.537155,4.0,0.266667


## Standardization

In [None]:
# Keep the fitted scaler so that new samples can later be transformed with
# exactly the same per-column mean/std via scaler.transform(...).
# NOTE(review): the scaler is fitted on the whole table before the train/test
# split, so test rows contribute to the scaling statistics. Confirm this is
# acceptable for the model trained below, or fit on the training rows only.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(data_frame)

Provide the list of class labels here: one label per row of the dataframe, in the same order as the rows, e.g. ['Nicotine', 'Control', 'Nicotine', 'Control', 'Caffeine', 'Ethanol']

In [None]:
# Replace the placeholder with one label per row of the feature table.
target = [...]
# Encode labels as integer codes, numbered in order of first appearance.
# `target_names[code]` maps a code back to its label; it used to be discarded.
target, target_names = pd.factorize(pd.Series(target))

## Training a random forest and visualizing the confusion matrix

In [None]:
# Fixed seed: the split, the forest and every metric below are reproducible
# under "Restart & Run All".
RANDOM_STATE = 42

# Display names for the integer class codes 0..3, in that order. `target` is
# numbered by first appearance in the label list, so this order MUST match
# the order in which the classes first appear there.
CLASS_NAMES = ['Nicotine', 'Control', 'Caffeine', 'Ethanol']

X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, target, test_size=0.2, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# Rows = predicted class, columns = actual class (predictions are passed
# first on purpose, matching the axis labels below). Passing `labels` pins
# the matrix to one row/column per entry of CLASS_NAMES even when a class is
# absent from this particular test split, so the tick labels always line up.
class_codes = np.arange(len(CLASS_NAMES))
confusion_counts = metrics.confusion_matrix(y_pred, y_test, labels=class_codes)

# Normalise each row to sum to 1. Rows of classes that were never predicted
# stay at 0 instead of turning into NaN through a division by zero.
row_totals = confusion_counts.sum(axis=1, keepdims=True)
confusion_matrix = np.divide(
    confusion_counts.astype('float'),
    row_totals,
    out=np.zeros(confusion_counts.shape, dtype=float),
    where=row_totals != 0,
)

fig, ax = plt.subplots()
sns.heatmap(confusion_matrix,
            annot=True,
            xticklabels=CLASS_NAMES,
            yticklabels=CLASS_NAMES,
            cmap="Blues",
            ax=ax)
ax.set_ylabel('Prediction', fontsize=13)
ax.set_xlabel('Actual', fontsize=13)
ax.set_title('Confusion Matrix', fontsize=17)

plt.show()