In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

In [2]:
# The project root is the parent of the current working directory;
# input files are looked up in its `data` sub-directory.
root = Path.cwd().parent
data = root.joinpath('data')

In [4]:
# Load train_val.csv from the data directory, parsing the `date` column
# into datetimes while reading.
df = pd.read_csv(data.joinpath('train_val.csv'), parse_dates=['date'])

In [5]:
# Class ids are used directly as array positions, so `class_id` must hold
# non-negative integers. `np.bincount` sizes its result to cover the largest
# id, so non-contiguous ids yield zero-probability slots instead of an
# IndexError.
class_ids = df['class_id'].to_numpy()
n_classes = len(df['class_id'].unique())

# Prior of each class: its share of all observations.
class_counts = np.bincount(class_ids, minlength=n_classes)
class_priors = class_counts / class_counts.sum()

# Per-month class distribution, keyed by str(month) in order of first
# appearance. Every row is counted: the previous row-by-row loop created the
# month's array on the first row it saw for that month but only incremented
# the count in the `else` branch, so that first observation was dropped (and
# a month with a single row normalised to NaN).
month_keys = df['month'].map(str)
month_distributions = {}
for month, month_class_ids in df['class_id'].groupby(month_keys, sort=False):
    # minlength keeps every month's vector the same length as class_priors.
    month_counts = np.bincount(month_class_ids.to_numpy(), minlength=len(class_priors))
    month_distributions[month] = month_counts / month_counts.sum()

In [18]:
# Add `norm_date`: the `days` column rescaled with MinMaxScaler's default
# settings (fit on this same column).
df['norm_date'] = MinMaxScaler().fit_transform(df[['days']])

# Write the table back to train_val.csv without the `date_c` helper column.
# errors='ignore' keeps this cell re-runnable: once the file has been written
# without `date_c`, a fresh load no longer contains that column and a plain
# drop would raise KeyError.
# NOTE(review): this overwrites the CSV in the data directory in place.
df.drop(columns='date_c', errors='ignore').to_csv(data / 'train_val.csv', index=False)

In [32]:
import pickle

# Persist the computed distributions next to the project root. The `with`
# blocks guarantee each file is flushed and closed even if pickling fails;
# previously the handles returned by open() were never closed.
with open(root / 'month_distributions.pkl', 'wb') as fh:
    pickle.dump(month_distributions, fh)

with open(root / 'class_priors.pkl', 'wb') as fh:
    pickle.dump(class_priors, fh)