## Timeseries Decision Tree & Random Forest

After looking at less time series oriented tree models, and armed with a better understanding of preparing timeseries data, 
I build these models with time based features and not in wide format

In [2]:
import numpy as np
import pandas as pd
import gc, re 

In [25]:
# Create the rowwise sum of n days of observations for the specified range of columns; this range includes BOTH the start and end columns.
def n_day_sum(df, n=0, start_col=0, end_col=0, name_prefix=''):
    try:
        if (end_col - (start_col-1))%n != 0:
            raise ValueError('Column range must be evenly divisible by n.')
    except ValueError as e:
        print(f'{e}; DataFrame Unchanged; invalid column range. Start & end columns are included in range.')
        return df

    col_index = 0
    col_range = end_col - (start_col-1)
    cols_added = 0
    while col_index < col_range:
        sum_columns = df.iloc[:, start_col+col_index-1:start_col+col_index+n-1]
        sum = sum_columns.sum(axis=1)
        df[f'{name_prefix}nSum{n}_{cols_added}'] = sum
        col_index += n
        cols_added += 1

    return df

### Dataframe preparation

In [7]:
del pr_m
gc.collect()

279

In [32]:
pr_m = pd.read_parquet('/Users/james/FutureCrop_Kaggle/Data/pr_maize_train.parquet')

In [33]:
# Downsample via sums to semimonthly measures of 15 days
pr_m = n_day_sum(pr_m, n=15, start_col=6, end_col=245)

pr_m

Unnamed: 0_level_0,crop,year,lon,lat,variable,0,1,2,3,4,...,nSum15_6,nSum15_7,nSum15_8,nSum15_9,nSum15_10,nSum15_11,nSum15_12,nSum15_13,nSum15_14,nSum15_15
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,maize,381.0,-122.25,48.25,pr,0.000227,0.000149,0.000017,0.000000,0.000021,...,0.000380,0.000023,0.000000,0.000173,0.000125,0.000736,0.001682,0.000624,0.000925,0.002383
1,maize,381.0,-122.25,48.75,pr,0.000242,0.000128,0.000000,0.000000,0.000000,...,0.000478,0.000052,0.000002,0.000175,0.000278,0.000986,0.001787,0.000688,0.000956,0.002327
2,maize,381.0,-122.25,49.25,pr,0.000483,0.000240,0.000010,0.000029,0.000000,...,0.000636,0.000133,0.000009,0.000328,0.000530,0.001700,0.003055,0.001240,0.001656,0.003820
3,maize,381.0,-116.75,43.25,pr,0.000000,0.000016,0.000000,0.000000,0.000000,...,0.000000,0.000003,0.000013,0.000104,0.000109,0.000671,0.000517,0.000320,0.000681,0.000649
4,maize,381.0,-116.75,43.75,pr,0.000015,0.000009,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000009,0.000100,0.000049,0.000726,0.000624,0.000268,0.000671,0.000611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349714,maize,419.0,132.75,46.75,pr,0.000025,0.000000,0.000000,0.000000,0.000000,...,0.000792,0.000054,0.000505,0.000433,0.000267,0.000407,0.000072,0.000120,0.000034,0.000007
349715,maize,419.0,132.75,47.25,pr,0.000025,0.000000,0.000000,0.000000,0.000000,...,0.000712,0.000097,0.000561,0.000471,0.000238,0.000257,0.000033,0.000125,0.000029,0.000008
349716,maize,419.0,133.25,45.25,pr,0.000000,0.000000,0.000000,0.000086,0.000001,...,0.000878,0.000543,0.000749,0.000519,0.000235,0.000382,0.000112,0.000299,0.000246,0.000007
349717,maize,419.0,133.25,47.25,pr,0.000035,0.000000,0.000000,0.000000,0.000000,...,0.000899,0.000104,0.000604,0.000463,0.000336,0.000327,0.000050,0.000170,0.000055,0.000013


In [34]:
start = pr_m.columns.get_loc('0')
stop = pr_m.columns.get_loc('239')
pr_m.drop(pr_m.iloc[:, start:stop+1], axis=1, inplace=True)

pr_m

Unnamed: 0_level_0,crop,year,lon,lat,variable,nSum15_0,nSum15_1,nSum15_2,nSum15_3,nSum15_4,...,nSum15_6,nSum15_7,nSum15_8,nSum15_9,nSum15_10,nSum15_11,nSum15_12,nSum15_13,nSum15_14,nSum15_15
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,maize,381.0,-122.25,48.25,pr,0.000987,0.000564,0.000501,0.000659,0.000043,...,0.000380,0.000023,0.000000,0.000173,0.000125,0.000736,0.001682,0.000624,0.000925,0.002383
1,maize,381.0,-122.25,48.75,pr,0.000799,0.000776,0.000877,0.000646,0.000102,...,0.000478,0.000052,0.000002,0.000175,0.000278,0.000986,0.001787,0.000688,0.000956,0.002327
2,maize,381.0,-122.25,49.25,pr,0.001517,0.001455,0.001381,0.001109,0.000264,...,0.000636,0.000133,0.000009,0.000328,0.000530,0.001700,0.003055,0.001240,0.001656,0.003820
3,maize,381.0,-116.75,43.25,pr,0.000073,0.000095,0.000348,0.000000,0.000218,...,0.000000,0.000003,0.000013,0.000104,0.000109,0.000671,0.000517,0.000320,0.000681,0.000649
4,maize,381.0,-116.75,43.75,pr,0.000049,0.000043,0.000313,0.000004,0.000127,...,0.000000,0.000000,0.000009,0.000100,0.000049,0.000726,0.000624,0.000268,0.000671,0.000611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349714,maize,419.0,132.75,46.75,pr,0.000112,0.000192,0.000517,0.000243,0.000819,...,0.000792,0.000054,0.000505,0.000433,0.000267,0.000407,0.000072,0.000120,0.000034,0.000007
349715,maize,419.0,132.75,47.25,pr,0.000086,0.000177,0.000583,0.000333,0.000905,...,0.000712,0.000097,0.000561,0.000471,0.000238,0.000257,0.000033,0.000125,0.000029,0.000008
349716,maize,419.0,133.25,45.25,pr,0.000131,0.000266,0.000116,0.000759,0.000896,...,0.000878,0.000543,0.000749,0.000519,0.000235,0.000382,0.000112,0.000299,0.000246,0.000007
349717,maize,419.0,133.25,47.25,pr,0.000116,0.000188,0.000670,0.000329,0.000808,...,0.000899,0.000104,0.000604,0.000463,0.000336,0.000327,0.000050,0.000170,0.000055,0.000013


In [36]:
i = 0
prefix = 'nSum15_'
while i < 16:
    pr_m.rename(columns={prefix+str(i): str(i)}, inplace=True)
    i+=1

pr_m

Unnamed: 0_level_0,crop,year,lon,lat,variable,0,1,2,3,4,...,6,7,8,9,10,11,12,13,14,15
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,maize,381.0,-122.25,48.25,pr,0.000987,0.000564,0.000501,0.000659,0.000043,...,0.000380,0.000023,0.000000,0.000173,0.000125,0.000736,0.001682,0.000624,0.000925,0.002383
1,maize,381.0,-122.25,48.75,pr,0.000799,0.000776,0.000877,0.000646,0.000102,...,0.000478,0.000052,0.000002,0.000175,0.000278,0.000986,0.001787,0.000688,0.000956,0.002327
2,maize,381.0,-122.25,49.25,pr,0.001517,0.001455,0.001381,0.001109,0.000264,...,0.000636,0.000133,0.000009,0.000328,0.000530,0.001700,0.003055,0.001240,0.001656,0.003820
3,maize,381.0,-116.75,43.25,pr,0.000073,0.000095,0.000348,0.000000,0.000218,...,0.000000,0.000003,0.000013,0.000104,0.000109,0.000671,0.000517,0.000320,0.000681,0.000649
4,maize,381.0,-116.75,43.75,pr,0.000049,0.000043,0.000313,0.000004,0.000127,...,0.000000,0.000000,0.000009,0.000100,0.000049,0.000726,0.000624,0.000268,0.000671,0.000611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349714,maize,419.0,132.75,46.75,pr,0.000112,0.000192,0.000517,0.000243,0.000819,...,0.000792,0.000054,0.000505,0.000433,0.000267,0.000407,0.000072,0.000120,0.000034,0.000007
349715,maize,419.0,132.75,47.25,pr,0.000086,0.000177,0.000583,0.000333,0.000905,...,0.000712,0.000097,0.000561,0.000471,0.000238,0.000257,0.000033,0.000125,0.000029,0.000008
349716,maize,419.0,133.25,45.25,pr,0.000131,0.000266,0.000116,0.000759,0.000896,...,0.000878,0.000543,0.000749,0.000519,0.000235,0.000382,0.000112,0.000299,0.000246,0.000007
349717,maize,419.0,133.25,47.25,pr,0.000116,0.000188,0.000670,0.000329,0.000808,...,0.000899,0.000104,0.000604,0.000463,0.000336,0.000327,0.000050,0.000170,0.000055,0.000013


In [37]:
# Melt the dataframe
# pr_m.reset_index(inplace=True)
pr_m = pr_m.melt(id_vars=['ID','crop','year','lon','lat','variable'], var_name='15day', value_name='pr_15daySum', ignore_index=False)

# Normalize year to start at 0, then create period column as year * day
pr_m['year'] = pr_m['year'] - 381
pr_m['15day'] = pd.to_numeric(pr_m['15day'], errors='coerce')
pr_m.insert(7, 'period', pr_m['year'] * pr_m['15day'])

pr_m

Unnamed: 0,ID,crop,year,lon,lat,variable,period,pr_15daySum,value
0,0,maize,0.0,-122.25,48.25,pr,0.0,0,0.000987
1,1,maize,0.0,-122.25,48.75,pr,0.0,0,0.000799
2,2,maize,0.0,-122.25,49.25,pr,0.0,0,0.001517
3,3,maize,0.0,-116.75,43.25,pr,0.0,0,0.000073
4,4,maize,0.0,-116.75,43.75,pr,0.0,0,0.000049
...,...,...,...,...,...,...,...,...,...
349714,349714,maize,38.0,132.75,46.75,pr,570.0,15,0.000007
349715,349715,maize,38.0,132.75,47.25,pr,570.0,15,0.000008
349716,349716,maize,38.0,133.25,45.25,pr,570.0,15,0.000007
349717,349717,maize,38.0,133.25,47.25,pr,570.0,15,0.000013


#### Differencing: 1st and 2nd Order

In [None]:
# Initialize columns for differences
pr_m['pr_1diff'] = 0
pr_m['pr_2diff'] = 0

unique_indexes = pr_m.index.unique()
index_set = set(unique_indexes)

# Slice the dataframe to get all matching ID, then perform differencing for that slice.
for index_val in index_set:
    matching_rows = pr_m.loc[pr_m.index == index_val]

In [17]:
pr_m.dtypes

crop         object
year        float64
lon         float64
lat         float64
variable     object
day          object
period      float64
value       float64
dtype: object