In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
directory = './data/track_1/'


def _read_table(file_name, **read_csv_kwargs):
    """Read one CSV file located under `directory` into a DataFrame."""
    return pd.read_csv(directory + file_name, **read_csv_kwargs)


# Hydrology
hydro_1day = _read_table('hydro_1day.csv', parse_dates=['date'])
# hydro_coord = pd.read_csv(directory + 'hydro_coord.csv')

# Meteorology — note that the files differ in temporal resolution
# meteo_3hours = pd.read_csv(directory + 'meteo_3hours.csv')
# meteo_1day = pd.read_csv(directory + 'meteo_1day.csv')
# meteo_1month = pd.read_csv(directory + 'meteo_1month.csv')
# meteo_coord = pd.read_csv(directory + 'meteo_coord.csv')

# Reference tables
# reference_water_codes = pd.read_csv(directory + 'reference_water_codes.csv')
# reference_horiz_visib = pd.read_csv(directory + 'reference_horiz_visib.csv')

# Train and test sets
train_df = _read_table('train.csv')
test_df = _read_table('test.csv')

# Обработка

In [3]:
# Reload the raw table: every step below rebinds or mutates `hydro_1day`, so
# starting from the file keeps this cell re-runnable on its own.
hydro_1day = pd.read_csv(directory + 'hydro_1day.csv', parse_dates=['date'])

# Split the comma-separated `water_code` string into one integer column per
# position; positions that a row does not have become 0.
water_code = hydro_1day['water_code'].str.split(',', expand=True).fillna(0).astype(int)
# Name the columns after however many parts the split actually produced
# instead of assuming exactly five.
water_code.columns = [f'water_code_{i}' for i in range(water_code.shape[1])]

# Attach the split codes to the main table and drop the raw string column.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas >= 2.0.
hydro_1day = pd.concat([hydro_1day, water_code], axis=1)
hydro_1day = hydro_1day.drop(columns=['water_code'])
hydro_1day = hydro_1day.sort_values(by=['year', 'day'])
hydro_1day = hydro_1day.reset_index(drop=True)

# Drop features that are not used further.
hydro_1day = hydro_1day.drop(columns=['month'])

# Fill missing values.
hydro_1day[['place', 'snow_height']] = hydro_1day[['place', 'snow_height']].fillna(0)
hydro_1day.loc[hydro_1day['temp'].isna(), 'temp'] = hydro_1day['temp'].median()

# Normalise: encode `place` as consecutive integer labels and take absolute
# values of the stage columns.
hydro_1day['place'] = hydro_1day['place'].astype(int)
lb_place = LabelEncoder()
hydro_1day['place'] = lb_place.fit_transform(hydro_1day['place'])
stage_columns = ['stage_avg', 'stage_min', 'stage_max']
hydro_1day[stage_columns] = hydro_1day[stage_columns].abs()

# Interpolate the gaps separately for every station.
# NOTE(review): `temp` has already been filled with the global median above,
# so interpolating it here finds no gaps — confirm which of the two is intended.
columns_interpolate = ['stage_avg', 'stage_min', 'stage_max', 'temp', 'discharge', 'ice_thickness']
stations = hydro_1day['station_id'].unique()
for station_id in tqdm(stations):
    # The row mask depends only on the station, so build it once per station.
    station_rows = hydro_1day['station_id'] == station_id
    for col in columns_interpolate:
        hydro_1day.loc[station_rows, col] = (
            hydro_1day.loc[station_rows, col].interpolate(limit_direction='both')
        )

# Interpolation cannot fill a station that has no discharge values at all;
# fall back to the global median for whatever is still missing.
hydro_1day['discharge'] = hydro_1day['discharge'].fillna(hydro_1day['discharge'].median())

100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 37.03it/s]


In [4]:
# Pad the table so that every (year, day) has one row for every station.
# `real` is 1 for observed rows and 0 for the filler rows added here.
# Only create the flag when it is absent: re-running this cell must not turn
# previously added filler rows into "real" ones.
if 'real' not in hydro_1day.columns:
    hydro_1day['real'] = 1

stations = hydro_1day['station_id'].unique()

# Filler rows keep the identifying columns and get 0 in every other column
# (including `real`). Selecting by name avoids depending on column positions.
key_columns = ['year', 'station_id', 'day', 'date']
zero_columns = [col for col in hydro_1day.columns if col not in key_columns]

empty_rows = []
for (year, day), day_group in tqdm(hydro_1day.groupby(['year', 'day'])):
    present = set(day_group['station_id'])
    # Filler rows reuse the date of the first row of the same (year, day).
    day_date = day_group['date'].iloc[0]
    for station in stations:
        if station not in present:
            filler = {'year': year, 'station_id': station, 'day': day, 'date': day_date}
            filler.update(dict.fromkeys(zero_columns, 0))
            empty_rows.append(filler)

if empty_rows:
    # Skip the concat entirely when nothing is missing, so an empty frame
    # cannot disturb the column dtypes.
    empty_df = pd.DataFrame(empty_rows, columns=hydro_1day.columns)
    hydro_1day = pd.concat([hydro_1day, empty_df], ignore_index=True)

# NOTE(review): the order of stations inside one (year, day) is whatever the
# sort leaves behind; code that slices this table by position should confirm
# that this is acceptable.
hydro_1day = hydro_1day.sort_values(by=['year', 'day'])

100%|████████████████████████████████████████████████████████████████████████████| 8960/8960 [00:04<00:00, 1820.70it/s]


In [5]:
# Sanity check after padding: column dtypes and non-null counts.
hydro_1day.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232960 entries, 0 to 232959
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   year           232960 non-null  int64         
 1   station_id     232960 non-null  int64         
 2   day            232960 non-null  int64         
 3   date           232960 non-null  datetime64[ns]
 4   stage_avg      232960 non-null  float64       
 5   stage_min      232960 non-null  float64       
 6   stage_max      232960 non-null  float64       
 7   temp           232960 non-null  float64       
 8   ice_thickness  232960 non-null  float64       
 9   snow_height    232960 non-null  float64       
 10  place          232960 non-null  int64         
 11  discharge      232960 non-null  float64       
 12  water_code_0   232960 non-null  int64         
 13  water_code_1   232960 non-null  int64         
 14  water_code_2   232960 non-null  int64         
 15  

In [6]:
# Window size in days: get_data() takes num_days_data * len(stations) rows
# from each side of the year boundary.
num_days_data = 45

In [7]:
def get_data(year=2000):
    """Return the hydrological rows surrounding the start of ``year``.

    The result stacks two positional slices of the module-level ``hydro_1day``
    table, each ``num_days_data * len(stations)`` rows long:

    1. the last rows of ``year - 1`` — or, when the table has no rows for
       ``year - 1``, the last rows of ``year`` itself as a stand-in;
    2. the first rows of ``year``.

    The slices are taken by row position, so the table is assumed to be sorted
    by (year, day) with one row per station per day — TODO confirm against the
    cell that builds ``hydro_1day``.

    Parameters
    ----------
    year : int, default 2000
        Year whose beginning the window is built around.

    Returns
    -------
    pandas.DataFrame
        The two slices concatenated (original index labels kept), without the
        ``date`` column.
    """
    window_rows = num_days_data * len(stations)
    current_year = hydro_1day[hydro_1day['year'] == year]

    if year - 1 in hydro_1day['year'].values:
        part1 = hydro_1day[hydro_1day['year'] == year - 1].iloc[-window_rows:]
    else:
        # No previous year available: reuse the tail of the requested year.
        # The former in-place shift of `date` on this slice is gone on purpose:
        # it triggered SettingWithCopyWarning and had no effect on the result,
        # because `date` is dropped before returning.
        part1 = current_year.iloc[-window_rows:]

    part2 = current_year.iloc[:window_rows]

    # `columns=` instead of a positional axis: `drop(labels, 1)` is rejected by
    # pandas >= 2.0.
    return pd.concat([part1, part2]).drop(columns=['date'])

# Example: the window around the start of 1986.
get_data(1986)

Unnamed: 0,year,station_id,day,stage_avg,stage_min,stage_max,temp,ice_thickness,snow_height,place,discharge,water_code_0,water_code_1,water_code_2,water_code_3,water_code_4,real
5486,1985,3019,321,36.0,36.0,36.0,0.00,29.386598,0.0,0,1210.000,15,0,0,0,0,1
5487,1985,3041,321,73.0,73.0,73.0,0.00,34.900000,0.0,0,1210.000,16,0,0,0,0,1
5488,1985,3042,321,137.0,137.0,137.0,11.75,42.000000,0.0,0,1470.000,16,0,0,0,0,1
5489,1985,3038,321,245.0,245.0,245.0,0.00,23.800000,0.0,0,1210.000,16,0,0,0,0,1
5490,1985,3555,321,280.0,280.0,280.0,0.00,47.608247,0.0,0,1210.000,5,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7821,1986,3106,45,26.0,26.0,26.0,11.75,74.597701,0.0,0,109.000,16,0,0,0,0,1
7822,1986,3180,45,209.0,209.0,209.0,11.75,66.080460,0.0,0,31.398,16,0,0,0,0,1
7823,1986,3035,45,14.0,14.0,14.0,11.75,74.840237,0.0,0,1210.000,16,0,0,0,0,1
7824,1986,3031,45,23.0,23.0,23.0,11.75,128.097561,0.0,0,1210.000,16,0,0,0,0,1


In [8]:
# Example: the window around the start of 1985. In the recorded output the
# leading rows are 1985's own late-year rows, i.e. the stand-in get_data()
# uses when the table has no rows for year - 1.
get_data(1985)

Unnamed: 0,year,station_id,day,stage_avg,stage_min,stage_max,temp,ice_thickness,snow_height,place,discharge,water_code_0,water_code_1,water_code_2,water_code_3,water_code_4,real
5486,1985,3019,321,36.0,36.0,36.0,0.00,29.386598,0.0,0,1210.000,15,0,0,0,0,1
5487,1985,3041,321,73.0,73.0,73.0,0.00,34.900000,0.0,0,1210.000,16,0,0,0,0,1
5488,1985,3042,321,137.0,137.0,137.0,11.75,42.000000,0.0,0,1470.000,16,0,0,0,0,1
5489,1985,3038,321,245.0,245.0,245.0,0.00,23.800000,0.0,0,1210.000,16,0,0,0,0,1
5490,1985,3555,321,280.0,280.0,280.0,0.00,47.608247,0.0,0,1210.000,5,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,1985,3106,45,37.0,37.0,37.0,11.75,85.574713,0.0,0,84.600,16,14,0,0,0,1
1166,1985,3180,45,230.0,230.0,230.0,11.75,94.988827,0.0,0,41.199,16,0,0,0,0,1
1167,1985,3035,45,38.0,38.0,38.0,11.75,113.540230,0.0,0,1210.000,16,0,0,0,0,1
1168,1985,3031,45,47.0,47.0,47.0,11.75,158.482759,0.0,0,1210.000,16,0,0,0,0,1


In [13]:
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)


def train_model(df):
    """Fit the module-level ``rfc`` on feature vectors built from ``df``.

    For every row of ``df`` the feature vector is the row's own values
    (without ``ice_jam``) followed by a flattened window of
    ``num_days_data * len(stations)`` rows taken from ``get_data`` for the
    row's year. The target is the row's ``ice_jam`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled rows; must contain ``station_id``, ``year`` and ``ice_jam``.

    Returns
    -------
    None
        The fitted model is kept in the global ``rfc``.
    """
    window_rows = num_days_data * len(stations)
    features = []
    targets = []

    for (_, year), group in tqdm(df.groupby(['station_id', 'year'])):
        history = get_data(year=year)

        # NOTE(review): the window start advances by the row's position inside
        # its group (one table row per step), not by the row's `day` — confirm
        # this is intended. predict() builds its windows the same way, so the
        # two must be changed together.
        for offset, (_, sample) in enumerate(group.iterrows()):
            window = history.iloc[offset:window_rows + offset].values.flatten()
            features.append(np.append(sample.drop('ice_jam').values, window))
            targets.append(sample['ice_jam'])

    rfc.fit(features, targets)

In [39]:
def predict(df):
    """Predict ``ice_jam`` for every row of ``df`` with the fitted ``rfc``.

    Feature vectors are built exactly as in ``train_model``: the row's own
    values followed by a flattened window of ``num_days_data * len(stations)``
    rows from ``get_data`` for the row's year.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain ``station_id`` and ``year``. An
        ``ice_jam`` column is optional, so both labelled hold-out frames and
        unlabelled test frames are accepted. An existing ``ice_jam_pred``
        column is ignored as a feature and overwritten.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the predictions in ``ice_jam_pred``. ``df``
        itself is not modified. For an empty ``df`` the copy is returned
        without that column.
    """
    window_rows = num_days_data * len(stations)
    result = df.copy()

    # Group the untouched input rather than `result`, so that adding the
    # prediction column cannot interfere with the iteration.
    for (_, year), group in tqdm(df.groupby(['station_id', 'year'])):
        history = get_data(year=year)

        # Target/prediction columns are not features; errors='ignore' keeps
        # this working for frames that do not have them.
        samples = group.drop(columns=['ice_jam', 'ice_jam_pred'], errors='ignore')

        labels = []
        batch = []
        # NOTE(review): the window start advances by the row's position inside
        # its group, not by the row's `day` — kept identical to train_model();
        # the two must be changed together.
        for offset, (label, sample) in enumerate(samples.iterrows()):
            window = history.iloc[offset:window_rows + offset].values.flatten()
            batch.append(np.append(sample.values, window))
            labels.append(label)

        # One model call per group instead of one per row: same predictions,
        # far less per-call overhead, and memory stays bounded by group size.
        predictions = rfc.predict(batch)
        for label, value in zip(labels, predictions):
            result.loc[label, 'ice_jam_pred'] = value

    return result

In [38]:
# Stratified hold-out split of the labelled rows (33 % held out). Both parts
# are full frames that still contain the 'ice_jam' column.
X_train, X_test = train_test_split(train_df, test_size=0.33, random_state=42, stratify=train_df['ice_jam'])

In [22]:
# Fit the global `rfc` on the training part of the split.
train_model(X_train)

100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:07<00:00, 35.03it/s]


In [41]:
# Score the held-out part; `pred` is a copy of X_test plus 'ice_jam_pred'.
pred = predict(X_test)

100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [01:05<00:00,  3.96it/s]


In [42]:
# Show the held-out rows with their predictions (`pred[]` was a syntax error).
pred

Unnamed: 0,year,station_id,day,ice_jam,ice_jam_pred
4382,1996,3230,27,0.0,0.0
3525,2007,3230,5,0.0,0.0
5162,1986,3045,14,0.0,0.0
133,2007,3019,1,0.0,0.0
1827,1987,3041,23,0.0,0.0
...,...,...,...,...,...
3330,1998,3030,30,0.0,0.0
1392,2010,3041,28,0.0,0.0
6414,1988,3028,35,0.0,0.0
10071,1996,3035,30,0.0,0.0


In [37]:
from sklearn.metrics import f1_score

In [43]:
# F1 score of the predictions against the true labels of the held-out rows.
f1_score(pred['ice_jam'], pred['ice_jam_pred'])

0.03703703703703704