In [1]:
import pandas as pd
from pandas import DataFrame
import math
import dataset
import main
import events
import os
import numpy as np

# Size of the aggregation window: this many consecutive source rows are merged
# into one output row (presumably one source row per minute — TODO confirm).
MINUTES = 20
# Directory holding the input CSV files.
SRC_DIR = 'dataset/bob_all_processed_mins'
# Output directory; the window size is part of the name so runs with different
# MINUTES values do not overwrite each other.
OUT_DIR = f'dataset/modern/full_dataset_{MINUTES}'
os.makedirs(OUT_DIR, exist_ok=True)
# NOTE: the directory listing (`files`) is intentionally not opened here. It is
# created right after the `columns` list is defined; opening it here as well
# only leaked an extra, never-consumed os.scandir() handle.

In [2]:
# Names of the per-event "next.dif" columns, in output order.
_event_columns = [
    f'{event}.next.dif'
    for event in ('queencell', 'feeding', 'honey', 'treatment', 'died', 'swarming')
]

# Schema (and column order) of every output frame written by this notebook.
columns = [
    'time',
    # sensor channels t_i_1 .. t_i_5
    *[f't_i_{channel}' for channel in range(1, 6)],
    't_o',
    'weight_kg',
    'h',
    't',
    'p',
    # calendar fields
    'year',
    'month',
    'day',
    'hour',
    'minute',
    *_event_columns,
]

# Single-use iterator over the entries of the source directory.
files = os.scandir(SRC_DIR)

In [3]:
def has_event(record: pd.Series, e: str):
    """Tell whether `record` carries a value for event `e`.

    The value is looked up under the key ``f'{e}.next.dif'``.

    Args:
        record: A row (or any mapping-like Series) containing the
            ``<event>.next.dif`` field.
        e: Event name without the ``.next.dif`` suffix, e.g. ``'feeding'``.

    Returns:
        False when the field is None, NaN/NA/NaT, an empty or blank string,
        or the literal string ``'NA'``; True for any other value.

    Raises:
        KeyError: If `record` has no ``f'{e}.next.dif'`` entry.
    """
    value = record[f'{e}.next.dif']
    if value is None:
        return False
    if isinstance(value, str):
        # math.isnan() rejects strings, so textual values (e.g. a date or a
        # number that was read as text) must be handled before the NaN check.
        return value.strip() not in ('', 'NA')
    # pd.isna covers float NaN, numpy NaN, pd.NA and NaT without raising.
    return not pd.isna(value)

def to_out_filename(name: str, queencell: bool, feeding: bool, honey: bool, treatment: bool, died: bool, swarming: bool):
    """Build the output CSV name for a source file.

    The result is ``<stem>-<tags>.csv`` where ``<stem>`` is everything in
    `name` before its first ``.`` and ``<tags>`` concatenates one letter per
    truthy flag, always in the order q, f, h, t, d, s.
    """
    stem = name.split(".")[0]
    tagged_flags = (
        ('q', queencell),
        ('f', feeding),
        ('h', honey),
        ('t', treatment),
        ('d', died),
        ('s', swarming),
    )
    tags = ''.join(letter for letter, present in tagged_flags if present)
    return f'{stem}-{tags}.csv'

In [4]:
# Columns replaced by their mean over each MINUTES-row window. Every other
# entry of `columns` is copied from the row that closes the window.
MEAN_COLUMNS = (
    't_i_1', 't_i_2', 't_i_3', 't_i_4', 't_i_5',
    't_o', 'weight_kg', 'h', 't', 'p',
)
SAMPLED_COLUMNS = [name for name in columns if name not in MEAN_COLUMNS]

# Columns passed through dataset.smooth_col, in the original call order.
# 't_o' was not smoothed in the original and is left out here as well.
SMOOTHED_COLUMNS = (
    'weight_kg',
    't_i_1', 't_i_2', 't_i_3', 't_i_4', 't_i_5',
    't', 'h', 'p',
)

# List the source directory here rather than relying on a shared single-use
# iterator: that iterator is exhausted after one pass, so re-running this cell
# would silently process nothing. Sorting makes the processing order
# deterministic; directories are skipped because read_csv cannot open them.
with os.scandir(SRC_DIR) as entries:
    source_files = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

for file in source_files:
    df = pd.read_csv(file.path, dtype={
        't_i_1': float,
        't_i_2': float,
        't_i_3': float,
        't_i_4': float,
        't_o': float,
        'weight_kg': float,
        "weight_delta": float,
        'numeric.time': float,
        'h': float,
        't': float,
        'p': float,
    }, low_memory=False)

    if df.empty:
        # df.iloc[0] below would raise IndexError on a header-only file.
        print(f"Skipping {file.name}: no rows")
        continue

    df = df.bfill()
    first = df.iloc[0]

    # Event presence is decided from the first (back-filled) row only.
    has_queencell = has_event(first, 'queencell')
    has_feeding = has_event(first, 'feeding')
    has_honey = has_event(first, 'honey')
    has_treatment = has_event(first, 'treatment')
    has_died = has_event(first, 'died')
    has_swarming = has_event(first, 'swarming')
    event_flags = {
        'queencell': has_queencell,
        'feeding': has_feeding,
        'honey': has_honey,
        'treatment': has_treatment,
        'died': has_died,
        'swarming': has_swarming,
    }

    print(f'Processing {file.name}:')
    for event_name, present in event_flags.items():
        print(f' {event_name.capitalize()}: {present}')

    # Build one record per window and create the frame once at the end;
    # growing a DataFrame row by row via .loc is quadratic and leaves every
    # column with object dtype.
    records = []
    total_rows = len(df)
    for i in range(0, total_rows, MINUTES):
        print(f"MI: {i}/{total_rows}", end='\r')
        current = df.iloc[i]
        if i == 0:
            # No preceding window exists yet: keep the first row as it is.
            records.append(current[columns].to_dict())
            continue

        # Means cover the MINUTES rows *preceding* row i; the calendar and
        # event fields are taken from row i itself (as in the original).
        window = df.iloc[i - MINUTES:i]
        record = {name: window[name].mean() for name in MEAN_COLUMNS}
        # The original dict literal had no 'time' entry and carried an 'X.1'
        # key that is not in `columns`. pandas aligns a dict row to the frame's
        # columns, so 'time' ended up NaN on every row but the first and 'X.1'
        # was dropped. Copying every sampled column fixes both.
        record.update({name: current[name] for name in SAMPLED_COLUMNS})
        records.append(record)

    # bfill() returns a new frame; the original discarded the result.
    odf = pd.DataFrame(records, columns=columns).bfill()

    # NOTE(review): smooth_col is defined in the project's `dataset` module;
    # presumably it interpolates/smooths the series — confirm there.
    for name in SMOOTHED_COLUMNS:
        odf[name] = dataset.smooth_col(odf[name])

    for event_name, present in event_flags.items():
        if present:
            event_indexes = events.get_event_indexes(odf, f'{event_name}.next.dif')
            dataset.populate_column_by_index(odf, event_name, event_indexes)

    # NOTE(review): the recorded run of this cell ended with
    # KeyError "... ['weight_delta_diff'] ...". The raising line is not in this
    # cell — presumably inside one of the populate_* helpers below. TODO confirm.
    dataset.populate_delta(odf)
    dataset.populate_temp_delta(odf)
    dataset.populate_humidity_delta(odf)

    out_filename = to_out_filename(file.name, has_queencell, has_feeding, has_honey, has_treatment, has_died, has_swarming)
    odf.to_csv(f'{OUT_DIR}/{out_filename}', index=False)
    

  df = df.bfill()


Processing 2020_79.csv
Processing 2020_79.csv:
 Queencell: False
 Feeding: True
 Honey: False
 Treatment: True
 Died: False
 Swarming: True
MI: 272920/272937

  odf.bfill()
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  c.interpolate(method='linear', inplace=True)
  perc_deltaDI = (curr[column] - prevDI[column]) / prevDI[column]


KeyError: "None of [Index(['weight_delta_diff'], dtype='object')] are in the [columns]"