In [None]:
import os
import sys
import datetime
from time import gmtime
from time import strftime
from pathlib import Path
from datetime import timedelta  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import signal
import pickle
import librosa
import librosa.display
from sklearn.model_selection import train_test_split
from feature_extraction import feature_extraction, extract_statistic

from regression_analysis import random_split_evaluation, independent_split_evaluation, random_baseline_metrics
from regression_analysis import evaluate_model_performance

import scipy
import scipy.signal
import scipy.fftpack

# Silence every Python warning unless the interpreter was started with
# explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Reload imported modules before each cell executes, so edits to imported
# modules (e.g. feature_extraction, regression_analysis) take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2


# Labels 2021

In [None]:
# Read the 2021 inspection annotations into a DataFrame.
data = pd.read_csv(
    "../data/annotations/inspections_2021.csv",
)
# Print column names, dtypes and non-null counts.
data.info()

In [None]:
# Replace missing values with 0, then parse the 'Date' column into datetimes.
data = data.fillna(0).assign(Date=lambda frame: pd.to_datetime(frame['Date']))
# Distinct hive tag numbers; the bare expression displays them as cell output.
unique_hives = data['Tag number'].unique()
unique_hives

In [None]:
# Split the 2021 annotations into one DataFrame per hive, keyed by tag number.
# Group by the scalar column name rather than a one-element list: with a list
# key, newer pandas expects get_group() to receive a 1-tuple, not a scalar.
grouped = data.groupby('Tag number')
dict_hives = {hive: grouped.get_group(hive) for hive in unique_hives}

In [None]:
# Build a 15-minute label series for every 2021 hive.
# Each hive is rebuilt from `data` instead of from the current dict entry, so
# re-running this cell does not fail on an already-dropped 'Date' column.
for hive in data['Tag number'].unique():
    # Rows of this hive, indexed by inspection date ('Date' leaves the columns).
    inspections = data[data['Tag number'] == hive].set_index('Date')
    # Regular 15-minute grid from the first inspection to one day after the last.
    idx = pd.date_range(inspections.index.min(),
                        inspections.index.max() + timedelta(days=1),
                        freq="15min")
    # Grid points without an inspection become NaN and are filled linearly.
    labels = inspections.reindex(idx).interpolate(method="linear")
    # Combined label: sum of the three "Fob" columns
    # (presumably frames of bees per box — confirm against the annotation sheet).
    labels["fob"] = labels["Fob 1st"] + labels["Fob 2nd"] + labels["Fob 3rd"]
    dict_hives[hive] = labels

# Labels 2022

In [None]:
# Read the 2022 inspection annotations into a DataFrame.
data_2022 = pd.read_csv(
    "../data/annotations/inspections_2022.csv",
)
# Print column names, dtypes and non-null counts.
data_2022.info()

In [None]:
# Parse the day-first date strings, keep only the calendar date, and make it the index.
data_2022 = (
    data_2022
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True).dt.date)
    .set_index('Date')
)

In [None]:
# Keep only the 'frames of bees' records. Copy the filtered rows so the column
# assignment below writes to an independent frame, not to a slice of the original.
data_2022 = data_2022[data_2022['Category'] == 'frames of bees'].copy()
# Convert the column from object to numeric, stored as 32-bit float ('f').
data_2022['Action detail'] = pd.to_numeric(data_2022['Action detail']).astype('f')

In [None]:
# Expose the inspected value under the label column name "fob".
data_2022 = data_2022.assign(fob=data_2022["Action detail"])

In [None]:
# Split the 2022 annotations into one DataFrame per hive, keyed by tag number.
# Group by the scalar column name rather than a one-element list: with a list
# key, newer pandas expects get_group() to receive a 1-tuple, not a scalar.
grouped = data_2022.groupby('Tag number')
dict_hives_2022 = {hive: grouped.get_group(hive)
                   for hive in data_2022['Tag number'].unique()}

In [None]:
# Build a 15-minute label series for every 2022 hive.
# Each hive is rebuilt from `data_2022` instead of from the current dict entry,
# so re-running this cell does not push the end of the range out by another day.
for hive in data_2022['Tag number'].unique():
    inspections = data_2022[data_2022['Tag number'] == hive]
    # Regular 15-minute grid from the first inspection to one day after the last.
    idx = pd.date_range(inspections.index.min(),
                        inspections.index.max() + timedelta(days=1),
                        freq="15min")
    # Grid points without an inspection become NaN and are filled linearly.
    dict_hives_2022[hive] = inspections.reindex(idx).interpolate(method="linear")

# MFCCs

In [None]:
# Hives to process, plus the values passed as n_fft / hop_length.
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]
win, shift = 1600, 800

# Extract MFCCs for all 2021 hives (enhancement off) and cache the result as a pickle.
df = feature_extraction(
    feature='mfccs', sample_rate=16000, n_fft=win, hop_length=shift,
    dict_hives=dict_hives, hives=hives, year=2021, enhancement=False,
)
df.to_pickle(f"../data/features/2021_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# Extract MFCCs for all 2022 hives (enhancement off) and cache the result as a pickle.
df = feature_extraction(
    feature='mfccs', sample_rate=16000, n_fft=win, hop_length=shift,
    dict_hives=dict_hives_2022, hives=hives, year=2022, enhancement=False,
)
df.to_pickle(f"../data/features/2022_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# MFCCs for 2021 with enhancement enabled: extract and save one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='mfccs', sample_rate=16000, n_fft=win, hop_length=shift,
        dict_hives=dict_hives, hives=[hive], year=2021, enhancement=True,
    )
    df.to_pickle(f"../data/features/2021_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
    

In [None]:
# MFCCs for 2022 with enhancement enabled: extract and save one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='mfccs', sample_rate=16000, n_fft=win, hop_length=shift,
        dict_hives=dict_hives_2022, hives=[hive], year=2022, enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
    

# LFCC

In [None]:
# Hives to process, plus the values passed as n_fft / hop_length.
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]
win = 1600
shift = 800

# Extract LFCCs for all 2021 hives (enhancement off) and cache the result.
df = feature_extraction(feature='lfccs', sample_rate=16000, n_fft=win,
                        hop_length=shift, dict_hives=dict_hives, hives=hives,
                        year=2021, enhancement=False)
# Path written with a single separator ("../data/features/"), matching the
# path the statistics cell reads from.
df.to_pickle(f"../data/features/2021_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# Extract LFCCs for all 2022 hives (enhancement off) and cache the result as a pickle.
df = feature_extraction(
    feature='lfccs', sample_rate=16000, n_fft=win, hop_length=shift,
    dict_hives=dict_hives_2022, hives=hives, year=2022, enhancement=False,
)
df.to_pickle(f"../data/features/2022_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# LFCCs for 2021 with enhancement enabled: extract and save one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='lfccs', sample_rate=16000, n_fft=win, hop_length=shift,
        dict_hives=dict_hives, hives=[hive], year=2021, enhancement=True,
    )
    df.to_pickle(f"../data/features/2021_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")

In [None]:
# LFCCs for 2022 with enhancement enabled: extract and save one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='lfccs', sample_rate=16000, n_fft=win, hop_length=shift,
        dict_hives=dict_hives_2022, hives=[hive], year=2022, enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
    

# Spectral features

In [None]:
# Hives to process.
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Extract the spectral shape descriptors for all 2021 hives (enhancement off)
# and cache the result as a pickle.
df = feature_extraction(
    feature='spectral_shape_descriptors', sample_rate=16000, n_fft=1600, hop_length=800,
    dict_hives=dict_hives, hives=hives, year=2021, enhancement=False,
)
df.to_pickle("../data/features/2021_df_spectral_nine_features.pkl")


In [None]:
# Extract the spectral shape descriptors for all 2022 hives (enhancement off)
# and cache the result as a pickle.
df = feature_extraction(
    feature='spectral_shape_descriptors', sample_rate=16000, n_fft=1600, hop_length=800,
    dict_hives=dict_hives_2022, hives=hives, year=2022, enhancement=False,
)
df.to_pickle("../data/features/2022_df_spectral_nine_features.pkl")


In [None]:
# Spectral shape descriptors for 2021 with enhancement enabled: one pickle per hive.
# The 2021 label dictionary is `dict_hives` (as in the other 2021 extraction
# cells); the name `dict_hives_2021` is never defined and raised a NameError.
for hive in hives:
    df = feature_extraction(feature='spectral_shape_descriptors', sample_rate=16000, n_fft=1600,
                            hop_length=800, dict_hives=dict_hives, hives=[hive], year=2021,
                            enhancement=True)
    df.to_pickle("../data/features/2021_df_ss_amp_spectral_nine_features_" + str(hive) + ".pkl")
    

In [None]:
# Spectral shape descriptors for 2022 with enhancement enabled: one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='spectral_shape_descriptors', sample_rate=16000, n_fft=1600, hop_length=800,
        dict_hives=dict_hives_2022, hives=[hive], year=2022, enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_spectral_nine_features_{hive}.pkl")
    

# Hand crafted features

In [None]:
# Hives to process.
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Extract the 'nectar_hand_crafted' features for all 2021 hives (enhancement off)
# and cache the result as a pickle.
df = feature_extraction(
    feature='nectar_hand_crafted', sample_rate=15625, n_fft=512, hop_length=512,
    dict_hives=dict_hives, hives=hives, year=2021, enhancement=False,
)
df.to_pickle("../data/features/2021_df_nectar_features.pkl")



In [None]:
# Extract the 'nectar_hand_crafted' features for all 2022 hives (enhancement off)
# and cache the result as a pickle.
df = feature_extraction(
    feature='nectar_hand_crafted', sample_rate=15625, n_fft=512, hop_length=512,
    dict_hives=dict_hives_2022, hives=hives, year=2022, enhancement=False,
)
df.to_pickle("../data/features/2022_df_nectar_features.pkl")


In [None]:
# 'nectar_hand_crafted' features for 2021 with enhancement enabled: one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='nectar_hand_crafted', sample_rate=15625, n_fft=512, hop_length=512,
        dict_hives=dict_hives, hives=[hive], year=2021, enhancement=True,
    )
    df.to_pickle(f"../data/features/2021_df_ss_amp_hand_crafted_{hive}.pkl")
    

In [None]:
# 'nectar_hand_crafted' features for 2022 with enhancement enabled: one pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='nectar_hand_crafted', sample_rate=15625, n_fft=512, hop_length=512,
        dict_hives=dict_hives_2022, hives=[hive], year=2022, enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_hand_crafted_{hive}.pkl")
    

# Extract statistic

In [None]:
win = 1600
shift = 800

# Load the cached MFCC frames. The extraction cells save the non-enhanced MFCCs
# with the "_n_filter_26" suffix, so that is the name read back here (reading
# "_n_mels_26" fails with FileNotFoundError on a fresh run).
# The 'date' column is parsed day-first and becomes the index.
mfccs_2021 = (
    pd.read_pickle(f"../data/features/2021_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
mfccs_2022 = (
    pd.read_pickle(f"../data/features/2022_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward
# (the last two arguments are presumably a time-of-day window — confirm in
# feature_extraction), then stack both years.
mfccs_feature_2021 = extract_statistic(mfccs_2021, mfccs_2021.columns[3:], 2021, '00:00', '23:00')
mfccs_feature_2022 = extract_statistic(mfccs_2022, mfccs_2022.columns[3:], 2022, '00:00', '23:00')
mfccs_features = pd.concat([mfccs_feature_2021, mfccs_feature_2022])

In [None]:
win, shift = 1600, 800

# Load the cached LFCC frames for each year; the 'date' column is parsed
# day-first and becomes the index.
lfccs_2021 = (
    pd.read_pickle(f"../data/features/2021_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
lfccs_2022 = (
    pd.read_pickle(f"../data/features/2022_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
lfccs_feature_2021 = extract_statistic(lfccs_2021, lfccs_2021.columns[3:], 2021, '00:00', '23:00')
lfccs_feature_2022 = extract_statistic(lfccs_2022, lfccs_2022.columns[3:], 2022, '00:00', '23:00')
lfccs_features = pd.concat([lfccs_feature_2021, lfccs_feature_2022])

In [None]:
# Load the cached spectral descriptor frames for each year; the 'date' column
# is parsed day-first and becomes the index.
spectral_2021 = (
    pd.read_pickle("../data/features/2021_df_spectral_nine_features.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
spectral_2022 = (
    pd.read_pickle("../data/features/2022_df_spectral_nine_features.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on the nine named descriptor columns,
# then stack both years.
spectral_feature_2021 = extract_statistic(
    spectral_2021,
    ['centroid', 'spread', 'skewness', 'kurtosis', 'flatness', 'rolloff', 'crest', 'flux', 'entropy'],
    2021, '00:00', '23:00',
)
spectral_feature_2022 = extract_statistic(
    spectral_2022,
    ['centroid', 'spread', 'skewness', 'kurtosis', 'flatness', 'rolloff', 'crest', 'flux', 'entropy'],
    2022, '00:00', '23:00',
)
spectral_features = pd.concat([spectral_feature_2021, spectral_feature_2022])

In [None]:
# Load the cached hand-crafted feature frames. The extraction cells save them as
# "<year>_df_nectar_features.pkl", so that is the name read back here (reading
# "<year>_df_hand_crafted_features.pkl" fails with FileNotFoundError on a fresh run).
# The 'date' column is parsed day-first and becomes the index.
hand_crafted_2021 = (
    pd.read_pickle("../data/features/2021_df_nectar_features.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
hand_crafted_2022 = (
    pd.read_pickle("../data/features/2022_df_nectar_features.pkl")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
hand_crafted_feature_2021 = extract_statistic(hand_crafted_2021, hand_crafted_2021.columns[3:], 2021, '00:00', '23:00')
hand_crafted_feature_2022 = extract_statistic(hand_crafted_2022, hand_crafted_2022.columns[3:], 2022, '00:00', '23:00')
hand_crafted_features = pd.concat([hand_crafted_feature_2021, hand_crafted_feature_2022])

In [None]:
win, shift = 1600, 800
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Concatenate the per-hive enhanced MFCC pickles of each year into one frame;
# the 'date' column is parsed day-first and becomes the index.
mfccs_ss_2021 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2021_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
mfccs_ss_2022 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2022_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
mfccs_ss_feature_2021 = extract_statistic(mfccs_ss_2021, mfccs_ss_2021.columns[3:], 2021, '00:00', '23:00')
mfccs_ss_feature_2022 = extract_statistic(mfccs_ss_2022, mfccs_ss_2022.columns[3:], 2022, '00:00', '23:00')
mfccs_ss_features = pd.concat([mfccs_ss_feature_2021, mfccs_ss_feature_2022])



In [None]:
win, shift = 1600, 800
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Concatenate the per-hive enhanced LFCC pickles of each year into one frame;
# the 'date' column is parsed day-first and becomes the index.
lfccs_ss_2021 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2021_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
lfccs_ss_2022 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2022_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
lfccs_ss_feature_2021 = extract_statistic(lfccs_ss_2021, lfccs_ss_2021.columns[3:], 2021, '00:00', '23:00')
lfccs_ss_feature_2022 = extract_statistic(lfccs_ss_2022, lfccs_ss_2022.columns[3:], 2022, '00:00', '23:00')
lfccs_ss_feature = pd.concat([lfccs_ss_feature_2021, lfccs_ss_feature_2022])


In [None]:

hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Concatenate the per-hive enhanced spectral descriptor pickles of each year
# into one frame; the 'date' column is parsed day-first and becomes the index.
spectral_ss_2021 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2021_df_ss_amp_spectral_nine_features_{hive}.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
spectral_ss_2022 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2022_df_ss_amp_spectral_nine_features_{hive}.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
spectral_ss_feature_2021 = extract_statistic(spectral_ss_2021, spectral_ss_2021.columns[3:], 2021, '00:00', '23:00')
spectral_ss_feature_2022 = extract_statistic(spectral_ss_2022, spectral_ss_2022.columns[3:], 2022, '00:00', '23:00')
spectral_ss_feature = pd.concat([spectral_ss_feature_2021, spectral_ss_feature_2022])


In [None]:

hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Concatenate the per-hive enhanced hand-crafted feature pickles of each year
# into one frame; the 'date' column is parsed day-first and becomes the index.
hand_crafted_ss_2021 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2021_df_ss_amp_hand_crafted_{hive}.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)
hand_crafted_ss_2022 = (
    pd.concat(
        [pd.read_pickle(f"../data/features/2022_df_ss_amp_hand_crafted_{hive}.pkl")
         for hive in hives],
        ignore_index=True,
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date'], dayfirst=True))
    .set_index('date')
)

# Run extract_statistic per year on every column from the fourth onward,
# then stack both years.
hand_crafted_ss_feature_2021 = extract_statistic(hand_crafted_ss_2021, hand_crafted_ss_2021.columns[3:], 2021, '00:00', '23:00')
hand_crafted_ss_feature_2022 = extract_statistic(hand_crafted_ss_2022, hand_crafted_ss_2022.columns[3:], 2022, '00:00', '23:00')
hand_crafted_ss_feature = pd.concat([hand_crafted_ss_feature_2021, hand_crafted_ss_feature_2022])


# Repeat 10 times - Random-split

In [None]:
# Feature set evaluated in the random-split experiment. The statistic frames
# built in this notebook are mfccs_features, lfccs_features, spectral_features
# and hand_crafted_features.
# NOTE(review): the accepted `feature` strings (nectar, mfccs, lfccs, spectral)
# are defined by regression_analysis — confirm before switching.
feature_data = mfccs_features
feature = "mfccs"

In [None]:
# Random-split evaluation: 10 iterations with the 'random forest' model.
model_predictions, y_tests = random_split_evaluation(
    feature_data,
    feature=feature,
    n_iterations=10,
    model="random forest",
)


In [None]:
# Random baseline for the same test targets: 10 iterations, random_range (0, 31).
random_predictions = random_baseline_metrics(
    y_tests,
    n_iterations=10,
    random_range=(0, 31),
)


In [None]:
# Evaluate the model predictions alongside the random baseline on the test targets.
results = evaluate_model_performance(
    y_tests,
    random_predictions,
    model_predictions,
)


# Repeat 10 times - Independent-split

In [None]:
# Feature set evaluated in the independent-split experiment. The statistic
# frames built in this notebook are mfccs_features, lfccs_features,
# spectral_features and hand_crafted_features.
# NOTE(review): the accepted `feature` strings (hand_crafted, mfccs, lfccs,
# spectral) are defined by regression_analysis — confirm before switching.
feature_data = mfccs_features
feature = "mfccs"

In [None]:
# Hive tag numbers assigned to the initial train / test / validation splits.
initial_train_hives = [3628, 3631, 6, 3640, 3693]
initial_test_hives = [3692, 3690]
initial_val_hives = [3691, 3627]

# Python rejects positional arguments after a keyword argument (SyntaxError),
# so `feature` is passed positionally, ahead of the hive lists.
# NOTE(review): assumes the signature order is
# (data, feature, train_hives, val_hives, test_hives, ...) — confirm in regression_analysis.
model_predictions, y_tests = independent_split_evaluation(
    feature_data, feature,
    initial_train_hives, initial_val_hives, initial_test_hives,
    n_iterations=10, model='random forest',
)

In [None]:
# Random baseline for the same test targets: 10 iterations, random_range (0, 31).
random_predictions = random_baseline_metrics(
    y_tests,
    n_iterations=10,
    random_range=(0, 31),
)


In [None]:
# Evaluate the model predictions alongside the random baseline on the test targets.
results = evaluate_model_performance(
    y_tests,
    random_predictions,
    model_predictions,
)
