In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

import lightgbm as lgbm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape, test.shape)

(2838, 370) (939, 369)


Объединяем train и test выборки

In [3]:
train_target = train['Culture']
train.drop('Culture', axis = 1, inplace = True)

train['is train'] = 1
test['is train'] = 0
df = pd.concat([train, test])

df_features = df[['Year', 'Field ID', 'Field Area', 'is train']]

days = list(train.columns[3:-1])

Cтоит заполнить пропуски. Для этого будем обучаться на полях, у которых известны все дни(365) и предсказывать для всех остальных

In [4]:
#для начала посмотрим количество информатвных дней у каждого
df_features['amt_days'] = (df[days].isna() == False).sum(axis = 1)

In [5]:
#Сначала подберём алгоритм, обучившись на 364 днях и предсказав на 365-й:
train_na = df[df_features['amt_days']>=365][days[:364]]
target_na = df[df_features['amt_days']>=365][days[364]]

kf = KFold(n_splits=3, shuffle = True, random_state=13)

for est in [100, 200, 300, 500]:
    for d in [2, 5, 8, 12, 16]:
        model = lgbm.LGBMRegressor(n_estimators = est, max_depth = d, random_state = 44, n_jobs = 4)
        score = cross_val_score(model, train_na, target_na, cv = kf, scoring='neg_mean_squared_error')
        print(est, d, np.mean(score))
        
for k in range(1, 25):
    model = KNeighborsRegressor(n_neighbors=k)
    score = cross_val_score(model, train_na, target_na, cv = kf, scoring='neg_mean_squared_error')
    print(k, np.mean(score))

100 2 -0.00010983799959275591
100 5 -6.436471580923183e-05
100 8 -6.151429194979794e-05
100 12 -6.0516276991428513e-05
100 16 -6.058763700035909e-05
200 2 -8.728547454058916e-05
200 5 -5.993426809563833e-05
200 8 -5.740956932137332e-05
200 12 -5.766898560864294e-05
200 16 -5.725678636866907e-05
300 2 -7.963730733924925e-05
300 5 -5.8404487788624595e-05
300 8 -5.7145584064076924e-05
300 12 -5.7521155197809425e-05
300 16 -5.697713625218716e-05
500 2 -7.487672426673317e-05
500 5 -5.81013399079825e-05
500 8 -5.750697101334243e-05
500 12 -5.8092893272918164e-05
500 16 -5.739050176608738e-05
1 -0.004102669421208281
2 -0.0034238358766370933
3 -0.003324593417828475
4 -0.003463654810942121
5 -0.0035735215114490915
6 -0.003654123115758344
7 -0.0037808599460265727
8 -0.003965647124656738
9 -0.004123713762433043
10 -0.004277225611913815
11 -0.004367863875813091
12 -0.004495117404913627
13 -0.0046080582869485005
14 -0.004701413766349379
15 -0.004810171100708821
16 -0.0049379253307687745
17 -0.00501

Будем использовать модель LGBMRegressor с n_estimators = 300 и max_depth = 8

In [36]:
start_df.columns

Index(['Year', 'Field ID', 'Field Area', 'Day 1', 'Day 2', 'Day 3', 'Day 4',
       'Day 5', 'Day 6', 'Day 7',
       ...
       'Day 245', 'Day 246', 'Day 247', 'Day 248', 'Day 249', 'Day 250',
       'Day 251', 'Day 252', 'Day 253', 'Day 254'],
      dtype='object', length=257)

In [37]:
start_df = df[df.columns[:256]]
for day in tqdm(days[253:]):
    start_df[day] = list(df[day])
    if (start_df[day].isna().sum()>0):
        train_day = start_df[start_df[day].isna() == False]
        test_day = start_df[start_df[day].isna()]
        
        res_train = train_day[['Year', 'Field ID', 'Field Area', day]]
        res_test = test_day[['Year', 'Field ID', 'Field Area']]
        target_day = train_day[day]
        
        train_day.drop(day, axis = 1, inplace = True)
        test_day.drop(day, axis = 1, inplace = True)
        start_df.drop(day, axis = 1, inplace = True)
        
        model = lgbm.LGBMRegressor(n_estimators = 300, max_depth = 8, random_state = 44, n_jobs = 4)
        model.fit(train_day, target_day)
        y_pred = model.predict(test_day)
        res_test[day]=y_pred
        start_df = pd.merge(start_df, pd.concat([res_train, res_test]), on = ['Year', 'Field ID', 'Field Area'])

100%|██████████| 113/113 [07:01<00:00,  3.73s/it]


In [38]:
start_df

Unnamed: 0,Year,Field ID,Field Area,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7,...,Day 357,Day 358,Day 359,Day 360,Day 361,Day 362,Day 363,Day 364,Day 365,Day 366
0,2018,517,48.64,0.5369,0.5371,0.5375,0.5379,0.5384,0.5390,0.5397,...,0.440200,0.436200,0.432200,0.428300,0.424300,0.420300,0.416300,0.412300,0.408400,0.392122
1,2019,1395,80.41,0.1159,0.1096,0.1034,0.0971,0.0909,0.0846,0.0783,...,-0.004773,-0.006073,-0.007973,-0.008298,-0.009524,-0.009825,-0.010031,-0.010170,-0.010817,-0.011907
2,2017,44,43.64,-0.0204,-0.0207,-0.0210,-0.0212,-0.0215,-0.0218,-0.0220,...,0.051600,0.083300,0.114900,0.146600,0.178200,0.209900,0.241500,0.273200,0.304800,0.291426
3,2018,1591,79.34,0.4827,0.4576,0.4324,0.4073,0.3821,0.3570,0.3318,...,0.264600,0.265700,0.266800,0.268000,0.269200,0.270500,0.271700,0.273000,0.274300,0.274513
4,2017,681,144.76,-0.0202,-0.0191,-0.0181,-0.0170,-0.0160,-0.0150,-0.0140,...,-0.035100,-0.035400,-0.035800,-0.036100,-0.036500,-0.036900,-0.037200,-0.037600,-0.037900,-0.028729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,2019,665,46.50,-0.0176,-0.0179,-0.0181,-0.0184,-0.0186,-0.0188,-0.0189,...,0.027822,0.025795,0.023634,0.015430,0.010265,0.009172,0.011404,0.008252,0.008069,0.002872
3773,2019,1277,229.24,0.0212,0.0181,0.0149,0.0118,0.0087,0.0056,0.0025,...,0.065797,0.059417,0.056833,0.050502,0.045515,0.040413,0.034999,0.028333,0.025110,0.017780
3774,2019,875,166.97,0.0460,0.0411,0.0362,0.0312,0.0263,0.0214,0.0165,...,0.112037,0.107812,0.098518,0.090323,0.085745,0.080592,0.074589,0.069685,0.065124,0.063064
3775,2019,301,123.52,0.2499,0.2483,0.2468,0.2452,0.2437,0.2422,0.2407,...,0.219314,0.223738,0.222823,0.219409,0.209657,0.198641,0.194691,0.193127,0.184791,0.196919


In [74]:
df = pd.merge(df[['Year', 'Field ID', 'Field Area', 'is train']], start_df, on = ['Year', 'Field ID', 'Field Area'])

In [75]:
df.to_csv('filled_df.csv', index = False)

Собираем признаки:

In [76]:
#стандартные статистические величины
df_features['days_mean'] = df[days].mean(axis = 1)
df_features['days_std'] = df[days].std(axis = 1)
df_features['days_min'] = df[days].min(axis = 1)
df_features['days_max'] = df[days].max(axis = 1)
df_features['days_max_minus_min'] = (df_features['days_max']-df_features['days_min'])/df_features['days_mean']

In [119]:
df_features = pd.concat([df_features, pd.get_dummies(df_features['Year'])], axis = 1)                         

In [131]:
df_features.columns

Index([              'Year',           'Field ID',         'Field Area',
                 'is train',           'amt_days',          'days_mean',
                 'days_std',           'days_min',           'days_max',
       'days_max_minus_min',                 2015,                 2016,
                       2017,                 2018,                 2019],
      dtype='object')

In [113]:
X_tt, X_tv, y_tt, y_tv = train_test_split(df_features[df_features['is train']==1].drop(['Field ID', 'is train'], axis = 1), 
                                          train_target, test_size=0.2, random_state=0)

In [147]:
X_tt, X_tv, y_tt, y_tv = train_test_split(df_1[df_1['is train']==1].drop(['Field ID', 'is train', 'Year'], axis = 1), 
                                          train_target, test_size=0.2, random_state=0)

In [146]:
df_1 = pd.merge(df, df_features, on = ['Year', 'Field ID', 'Field Area', 'is train'])

In [148]:
model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
model.fit(X_tt, y_tt)
preds_valid = model.predict(X_tv)

print(f1_score(y_tv, preds_valid, average='weighted'))

0.895717530977502


In [149]:
model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
model.fit(df_1[df_1['is train']==1].drop(['Field ID', 'is train', 'Year'], axis = 1), train_target)
preds = model.predict(df_1[df_1['is train']==0].drop(['Field ID', 'is train', 'Year'], axis = 1)) + 1

In [151]:
pd.Series(preds).to_csv('preds.csv', index=False, header=['Culture'])

In [None]:
preds = model.predict(X_test) + 1

pd.Series(preds).to_csv('preds.csv', index=False, header=['Culture'])