In [10]:
import pandas as pd
import numpy as np
import pickle
from datetime import timedelta

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

from tqdm import tqdm

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [33]:
num_days_data = 45
directory = './data/track_1/'

hydro_1day = pd.read_csv(directory + 'hydro_1day_prep.csv')
df = pd.read_csv(directory + 'train.csv')
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [34]:
hydro_1day['date'] = hydro_1day['date'].astype('datetime64[ns]')
stations = hydro_1day['station_id'].unique()

def get_data(year):
    if year-1 in hydro_1day['year'].values:
        part1 = hydro_1day[(hydro_1day['year']==year-1)].iloc[-num_days_data*len(stations):]    
    else:
        part1 = hydro_1day[(hydro_1day['year']==year)].iloc[-num_days_data*len(stations):]  
        if (year-1)%4:
            days = 365
        else:
            days = 366

        part1['date'] = part1['date'] - pd.Timedelta(days=days)

    part2 = hydro_1day[(hydro_1day['year']==year)].iloc[:num_days_data*len(stations)]
    result = pd.concat([part1, part2])
    result = result.drop(['date'], 1)
    return result


def predict(df):
    Y_pred = []
    full_train_df = df.copy()
    for index1, group in tqdm(full_train_df.groupby(['station_id','year'])):

        data = get_data(year=index1[1])
        
        X_days = []

        for i, (index, row) in enumerate(group.iterrows()):
            period = data.iloc[i:num_days_data*len(stations)+i]
            period = period.values.flatten()

            full_train_df.loc[index, 'ice_jam_pred'] = model.predict([np.append(row.drop('ice_jam').values, period)])
            
            
    return full_train_df

In [35]:
pred = predict(df)

100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [03:01<00:00,  1.42it/s]


In [36]:
f1_score(pred['ice_jam'], pred['ice_jam_pred'])

1.0

In [None]:
pred = predict(df)
pred['ice_jam'] = pred['ice_jam_pred']
pred = pred.drop(['ice_jam_pred'],1)
pred = pred.astype(int)
pred.to_csv(directory + 'predict_data.csv', index=None)

 38%|██████████████████████████████▊                                                  | 98/258 [01:09<01:48,  1.47it/s]