In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

import lightgbm as lgbm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape, test.shape)

(2838, 370) (939, 369)


Объединяем train и test выборки

In [3]:
train_target = train['Culture']
train.drop('Culture', axis = 1, inplace = True)

train['is train'] = 1
test['is train'] = 0
df = pd.concat([train, test])

df_features = df[['Year', 'Field ID', 'Field Area', 'is train']]

days = list(train.columns[3:-1])

Cтоит заполнить пропуски. Для этого будем обучаться на полях, у которых известны все дни(365) и предсказывать для всех остальных

In [4]:
#для начала посмотрим количество информатвных дней у каждого
df_features['amt_days'] = (df[days].isna() == False).sum(axis = 1)

In [None]:
#Сначала подберём алгоритм, обучившись на 364 днях и предсказав на 365-й:
train_na = df[df_features['amt_days']>=365][days[:364]]
target_na = df[df_features['amt_days']>=365][days[364]]

kf = KFold(n_splits=3, shuffle = True, random_state=13)

for est in [100, 200, 300, 500]:
    for d in [2, 5, 8, 12, 16]:
        model = lgbm.LGBMRegressor(n_estimators = est, max_depth = d, random_state = 44, n_jobs = 4)
        score = cross_val_score(model, train_na, target_na, cv = kf, scoring='neg_mean_squared_error')
        print(est, d, np.mean(score))
        
for k in range(1, 25):
    model = KNeighborsRegressor(n_neighbors=k)
    score = cross_val_score(model, train_na, target_na, cv = kf, scoring='neg_mean_squared_error')
    print(k, np.mean(score))

Будем использовать модель LGBMRegressor с n_estimators = 300 и max_depth = 8

In [37]:
start_df = df[df.columns[:256]]
for day in tqdm(days[253:]):
    start_df[day] = list(df[day])
    if (start_df[day].isna().sum()>0):
        train_day = start_df[start_df[day].isna() == False]
        test_day = start_df[start_df[day].isna()]
        
        res_train = train_day[['Year', 'Field ID', 'Field Area', day]]
        res_test = test_day[['Year', 'Field ID', 'Field Area']]
        target_day = train_day[day]
        
        train_day.drop(day, axis = 1, inplace = True)
        test_day.drop(day, axis = 1, inplace = True)
        start_df.drop(day, axis = 1, inplace = True)
        
        model = lgbm.LGBMRegressor(n_estimators = 300, max_depth = 8, random_state = 44, n_jobs = 4)
        model.fit(train_day, target_day)
        y_pred = model.predict(test_day)
        res_test[day]=y_pred
        start_df = pd.merge(start_df, pd.concat([res_train, res_test]), on = ['Year', 'Field ID', 'Field Area'])

100%|██████████| 113/113 [07:01<00:00,  3.73s/it]


In [38]:
start_df

Unnamed: 0,Year,Field ID,Field Area,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7,...,Day 357,Day 358,Day 359,Day 360,Day 361,Day 362,Day 363,Day 364,Day 365,Day 366
0,2018,517,48.64,0.5369,0.5371,0.5375,0.5379,0.5384,0.5390,0.5397,...,0.440200,0.436200,0.432200,0.428300,0.424300,0.420300,0.416300,0.412300,0.408400,0.392122
1,2019,1395,80.41,0.1159,0.1096,0.1034,0.0971,0.0909,0.0846,0.0783,...,-0.004773,-0.006073,-0.007973,-0.008298,-0.009524,-0.009825,-0.010031,-0.010170,-0.010817,-0.011907
2,2017,44,43.64,-0.0204,-0.0207,-0.0210,-0.0212,-0.0215,-0.0218,-0.0220,...,0.051600,0.083300,0.114900,0.146600,0.178200,0.209900,0.241500,0.273200,0.304800,0.291426
3,2018,1591,79.34,0.4827,0.4576,0.4324,0.4073,0.3821,0.3570,0.3318,...,0.264600,0.265700,0.266800,0.268000,0.269200,0.270500,0.271700,0.273000,0.274300,0.274513
4,2017,681,144.76,-0.0202,-0.0191,-0.0181,-0.0170,-0.0160,-0.0150,-0.0140,...,-0.035100,-0.035400,-0.035800,-0.036100,-0.036500,-0.036900,-0.037200,-0.037600,-0.037900,-0.028729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,2019,665,46.50,-0.0176,-0.0179,-0.0181,-0.0184,-0.0186,-0.0188,-0.0189,...,0.027822,0.025795,0.023634,0.015430,0.010265,0.009172,0.011404,0.008252,0.008069,0.002872
3773,2019,1277,229.24,0.0212,0.0181,0.0149,0.0118,0.0087,0.0056,0.0025,...,0.065797,0.059417,0.056833,0.050502,0.045515,0.040413,0.034999,0.028333,0.025110,0.017780
3774,2019,875,166.97,0.0460,0.0411,0.0362,0.0312,0.0263,0.0214,0.0165,...,0.112037,0.107812,0.098518,0.090323,0.085745,0.080592,0.074589,0.069685,0.065124,0.063064
3775,2019,301,123.52,0.2499,0.2483,0.2468,0.2452,0.2437,0.2422,0.2407,...,0.219314,0.223738,0.222823,0.219409,0.209657,0.198641,0.194691,0.193127,0.184791,0.196919


In [74]:
df = pd.merge(df[['Year', 'Field ID', 'Field Area', 'is train']], start_df, on = ['Year', 'Field ID', 'Field Area'])

In [75]:
df.to_csv('filled_df.csv', index = False)

In [6]:
df = pd.read_csv('filled_df.csv')

Собираем признаки:

In [7]:
#стандартные статистические величины
df_features['days_mean'] = df[days].mean(axis = 1)
df_features['days_std'] = df[days].std(axis = 1)
df_features['days_min'] = df[days].min(axis = 1)
df_features['days_max'] = df[days].max(axis = 1)
df_features['days_max_minus_min'] = (df_features['days_max']-df_features['days_min'])/df_features['days_mean']

In [8]:
df_features = pd.concat([df_features, pd.get_dummies(df_features['Year'])], axis = 1)                         

In [56]:
base_features=['amt_days','days_mean','days_std','days_min','days_max','days_max_minus_min', 2015, 2016, 2017, 2018, 2019]

Соберём агрегации по месяцам. Так как мы заполнили пропуски для всех дней(в том числе 366), считаем год високосным

In [16]:
month_days = dict()
month_days['Jan'] = days[:31]
month_days['Feb'] = days[31:60]
month_days['Mar'] = days[60:91]
month_days['Apr'] = days[91:121]
month_days['May'] = days[121:152]
month_days['Jun'] = days[152:182]
month_days['Jul'] = days[182:213]
month_days['Aug'] = days[213:244]
month_days['Sep'] = days[244:274]
month_days['Oct'] = days[274:305]
month_days['Nov'] = days[305:335]
month_days['Dec'] = days[335:366]

In [40]:
month_means = []
month_max_minus_mean = []
month_last_minus_first = []
month_mean_div_whole_mean = []
for month in month_days:
    df_features[month+'_mean'] = df[month_days[month]].mean(axis = 1)
    month_means.append(month+'_mean')
    df_features[month+'_max_minus_mean'] = df[month_days[month]].max(axis = 1)-df[month_days[month]].min(axis = 1)
    month_max_minus_mean.append(month+'_max_minus_mean')
    df_features[month+'_last_minus_first'] = df[month_days[month][0]]-df[month_days[month][-1]]
    month_last_minus_first.append(month+'_last_minus_first')
    df_features[month+'_mean_div_whole_mean'] = df_features[month+'_mean']/df_features['days_mean']
    month_mean_div_whole_mean.append(month+'_mean_div_whole_mean')

In [46]:
cumsums = df[days].cumsum(axis = 1)

In [48]:
for day in days:
    cumsums[day] = cumsums[day]/cumsums['Day 366']

In [51]:
month_cumsum = []
for i in [30, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]:
    df_features['cumsum_'+str(i)] = cumsums[days[i]]
    month_cumsum.append('cumsum_'+str(i))

Валидация

In [57]:
for lists in [base_features, month_means, month_max_minus_mean, month_last_minus_first, 
              month_mean_div_whole_mean, month_cumsum]:
    X_tt, X_tv, y_tt, y_tv = train_test_split(df_features[df_features['is train']==1][lists],
                                          train_target, test_size=0.2, random_state=23)
    model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
    model.fit(X_tt, y_tt)
    preds_valid = model.predict(X_tv)
    
    print(lists, f1_score(y_tv, preds_valid, average='weighted'))

['amt_days', 'days_mean', 'days_std', 'days_min', 'days_max', 'days_max_minus_min', 2015, 2016, 2017, 2018, 2019] 0.6713122608767789
['Jan_mean', 'Feb_mean', 'Mar_mean', 'Apr_mean', 'May_mean', 'Jun_mean', 'Jul_mean', 'Aug_mean', 'Sep_mean', 'Oct_mean', 'Nov_mean', 'Dec_mean'] 0.8451555103961657
['Jan_max_minus_mean', 'Feb_max_minus_mean', 'Mar_max_minus_mean', 'Apr_max_minus_mean', 'May_max_minus_mean', 'Jun_max_minus_mean', 'Jul_max_minus_mean', 'Aug_max_minus_mean', 'Sep_max_minus_mean', 'Oct_max_minus_mean', 'Nov_max_minus_mean', 'Dec_max_minus_mean'] 0.7899095868359964
['Jan_last_minus_first', 'Feb_last_minus_first', 'Mar_last_minus_first', 'Apr_last_minus_first', 'May_last_minus_first', 'Jun_last_minus_first', 'Jul_last_minus_first', 'Aug_last_minus_first', 'Sep_last_minus_first', 'Oct_last_minus_first', 'Nov_last_minus_first', 'Dec_last_minus_first'] 0.8277429479703636
['Jan_mean_div_whole_mean', 'Feb_mean_div_whole_mean', 'Mar_mean_div_whole_mean', 'Apr_mean_div_whole_mean', 'M

In [59]:
for lists in [base_features, month_max_minus_mean, month_last_minus_first, 
              month_mean_div_whole_mean, month_cumsum]:
    X_tt, X_tv, y_tt, y_tv = train_test_split(df_features[df_features['is train']==1][month_means+lists],
                                          train_target, test_size=0.2, random_state=23)
    model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
    model.fit(X_tt, y_tt)
    preds_valid = model.predict(X_tv)
    
    print(month_means+lists, f1_score(y_tv, preds_valid, average='weighted'))

['Jan_mean', 'Feb_mean', 'Mar_mean', 'Apr_mean', 'May_mean', 'Jun_mean', 'Jul_mean', 'Aug_mean', 'Sep_mean', 'Oct_mean', 'Nov_mean', 'Dec_mean', 'amt_days', 'days_mean', 'days_std', 'days_min', 'days_max', 'days_max_minus_min', 2015, 2016, 2017, 2018, 2019] 0.848425535109012
['Jan_mean', 'Feb_mean', 'Mar_mean', 'Apr_mean', 'May_mean', 'Jun_mean', 'Jul_mean', 'Aug_mean', 'Sep_mean', 'Oct_mean', 'Nov_mean', 'Dec_mean', 'Jan_max_minus_mean', 'Feb_max_minus_mean', 'Mar_max_minus_mean', 'Apr_max_minus_mean', 'May_max_minus_mean', 'Jun_max_minus_mean', 'Jul_max_minus_mean', 'Aug_max_minus_mean', 'Sep_max_minus_mean', 'Oct_max_minus_mean', 'Nov_max_minus_mean', 'Dec_max_minus_mean'] 0.8479993333771156
['Jan_mean', 'Feb_mean', 'Mar_mean', 'Apr_mean', 'May_mean', 'Jun_mean', 'Jul_mean', 'Aug_mean', 'Sep_mean', 'Oct_mean', 'Nov_mean', 'Dec_mean', 'Jan_last_minus_first', 'Feb_last_minus_first', 'Mar_last_minus_first', 'Apr_last_minus_first', 'May_last_minus_first', 'Jun_last_minus_first', 'Jul_la

In [60]:
df_features.drop()

Unnamed: 0,Year,Field ID,Field Area,is train,amt_days,days_mean,days_std,days_min,days_max,days_max_minus_min,...,cumsum_59,cumsum_90,cumsum_120,cumsum_151,cumsum_181,cumsum_212,cumsum_243,cumsum_273,cumsum_304,cumsum_334
0,2018,517,48.64,1,365,0.505194,0.202440,0.215800,0.9010,1.356310,...,0.192149,0.298609,0.429627,0.576725,0.665702,0.720710,0.765556,0.809170,0.867410,0.925639
1,2019,1395,80.41,1,326,0.351769,0.282001,-0.053300,0.7775,2.361775,...,0.003759,0.010158,0.077734,0.235985,0.394521,0.571817,0.738953,0.834942,0.919766,0.989227
2,2017,44,43.64,1,365,0.301936,0.236817,-0.042400,0.7949,2.773103,...,-0.010658,0.033949,0.101559,0.195515,0.360108,0.570142,0.690628,0.774730,0.886622,0.977451
3,2018,1591,79.34,1,365,0.386756,0.204191,-0.009000,0.8113,2.120975,...,0.141415,0.263577,0.416005,0.585893,0.670113,0.720361,0.791195,0.841540,0.888403,0.942558
4,2017,681,144.76,1,365,0.296840,0.300523,-0.062100,0.8716,3.145465,...,-0.008729,-0.022531,0.035052,0.221093,0.458681,0.656686,0.752493,0.844904,0.932725,0.997216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
934,2019,665,46.50,0,276,0.228434,0.171441,-0.064200,0.5901,2.864289,...,-0.026468,0.012394,0.087032,0.211954,0.409486,0.582429,0.690681,0.775881,0.858091,0.938221
935,2019,1277,229.24,0,320,0.471490,0.179972,0.066373,0.8393,1.639328,...,0.172942,0.269918,0.388335,0.534402,0.652834,0.733935,0.800463,0.857759,0.908256,0.950680
936,2019,875,166.97,0,324,0.327233,0.278732,-0.038000,0.8708,2.777230,...,0.002984,0.047342,0.121275,0.238080,0.421749,0.638429,0.813331,0.886668,0.929820,0.972856
937,2019,301,123.52,0,292,0.435520,0.227361,-0.007000,0.8992,2.080730,...,0.201501,0.307538,0.450213,0.615794,0.717488,0.771005,0.828190,0.882077,0.929172,0.953946


In [None]:
X_tt, X_tv, y_tt, y_tv = train_test_split(df_features[df_features['is train']==1][month_means+lists],
                                          train_target, test_size=0.2, random_state=23)
    model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
    model.fit(X_tt, y_tt)
    preds_valid = model.predict(X_tv)
    
    print(month_means+lists, f1_score(y_tv, preds_valid, average='weighted'))

In [79]:
X_tt, X_tv, y_tt, y_tv = train_test_split(df_features[df_features['is train']==1].drop(['Field ID', 'is train'], axis = 1), 
                                          train_target, test_size=0.2, random_state=0)

In [70]:
X_tt, X_tv, y_tt, y_tv = train_test_split(df_1[df_1['is train']==1].drop(['Field ID', 'is train'], axis = 1), 
                                          train_target, test_size=0.2, random_state=0)

In [23]:
df_1 = pd.merge(df, df_features, on = ['Year', 'Field ID', 'Field Area', 'is train'])

In [65]:
for est in [20, 50, 100, 200, 300, 500]:
    for d in [5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100]:
        model = RandomForestClassifier(random_state=42, n_estimators=est, max_depth=d, n_jobs = 8)
        model.fit(X_tt, y_tt)
        preds_valid = model.predict(X_tv)

        print(est, d, f1_score(y_tv, preds_valid, average='weighted'))

20 5 0.6656180935266178
20 10 0.8451841517668313
20 15 0.8909423574416322
20 20 0.8730442476805805
20 25 0.8785841567934874
20 30 0.8785841567934874
20 40 0.8785841567934874
20 50 0.8785841567934874
20 60 0.8785841567934874
20 70 0.8785841567934874
20 80 0.8785841567934874
20 90 0.8785841567934874
20 100 0.8785841567934874
50 5 0.6875932523045007
50 10 0.8624786997519104
50 15 0.8953841355521528
50 20 0.884387304307748
50 25 0.8895516272054121
50 30 0.8878561986081891
50 40 0.8878561986081891
50 50 0.8878561986081891
50 60 0.8878561986081891
50 70 0.8878561986081891
50 80 0.8878561986081891
50 90 0.8878561986081891
50 100 0.8878561986081891
100 5 0.6863682709496699
100 10 0.8685358754590745
100 15 0.8899515709923921
100 20 0.8925832692935858
100 25 0.8920516499360279
100 30 0.8920516499360279
100 40 0.8920516499360279
100 50 0.8920516499360279
100 60 0.8920516499360279
100 70 0.8920516499360279
100 80 0.8920516499360279
100 90 0.8920516499360279
100 100 0.8920516499360279
200 5 0.67972

KeyboardInterrupt: 

In [80]:
model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25, max_features = 'sqrt')
model.fit(X_tt, y_tt)
preds_valid = model.predict(X_tv)

print(f1_score(y_tv, preds_valid, average='weighted'))

0.9059297083653809


In [67]:
model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25)
model.fit(X_tt, y_tt)
preds_valid = model.predict(X_tv)

print(f1_score(y_tv, preds_valid, average='weighted'))

0.9059297083653809


In [81]:
model = RandomForestClassifier(random_state=0, n_estimators=200, max_depth=25)
model.fit(df_features[df_features['is train']==1].drop(['Field ID', 'is train'], axis = 1), train_target)
preds_valid = model.predict(df_features[df_features['is train']==0].drop(['Field ID', 'is train'], axis = 1))

pd.Series(preds).to_csv('v1.csv', index=False, header=['Culture'])

In [151]:
pd.Series(preds).to_csv('preds.csv', index=False, header=['Culture'])

In [19]:
df_1[df_1['is train']==0].head()

Unnamed: 0,Year,Field ID,Field Area,is train,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,...,days_mean,days_std,days_min,days_max,days_max_minus_min,2015,2016,2017,2018,2019
2838,2019,605,132.6,0,-0.0047,-0.0051,-0.0056,-0.0061,-0.0067,-0.0073,...,0.505194,0.20244,0.2158,0.901,1.35631,0,0,0,0,1
2839,2019,305,61.44,0,0.2445,0.2438,0.2431,0.2424,0.2417,0.241,...,0.351769,0.282001,-0.0533,0.7775,2.361775,0,0,0,0,1
2840,2019,1096,59.26,0,0.0145,0.0081,0.0018,-0.0046,-0.011,-0.0122,...,0.301936,0.236817,-0.0424,0.7949,2.773103,0,0,0,0,1
2841,2019,179,112.36,0,-0.0171,-0.02,-0.0209,-0.0216,-0.0221,-0.0223,...,0.386756,0.204191,-0.009,0.8113,2.120975,0,0,0,0,1
2842,2019,282,68.57,0,0.3908,0.3892,0.3876,0.3859,0.3843,0.3827,...,0.29684,0.300523,-0.0621,0.8716,3.145465,0,0,0,0,1


In [18]:
test.head()

Unnamed: 0,Year,Field ID,Field Area,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7,...,Day 358,Day 359,Day 360,Day 361,Day 362,Day 363,Day 364,Day 365,Day 366,is train
0,2019,605,132.6,-0.0047,-0.0051,-0.0056,-0.0061,-0.0067,-0.0073,-0.008,...,,,,,,,,,,0
1,2019,305,61.44,0.2445,0.2438,0.2431,0.2424,0.2417,0.241,0.2403,...,,,,,,,,,,0
2,2019,1096,59.26,0.0145,0.0081,0.0018,-0.0046,-0.011,-0.0122,-0.0133,...,,,,,,,,,,0
3,2019,179,112.36,-0.0171,-0.02,-0.0209,-0.0216,-0.0221,-0.0223,-0.0221,...,,,,,,,,,,0
4,2019,282,68.57,0.3908,0.3892,0.3876,0.3859,0.3843,0.3827,0.3811,...,,,,,,,,,,0


In [20]:
preds = model.predict(X_test) + 1

NameError: name 'X_test' is not defined

In [35]:
pd.Series(preds).to_csv('preds.csv', index=False, header=['Culture'])