## <span style=color:blue>This notebook has code for taking the weather data that was archived in part-04, aggregates by MONTHY, and re-formats to form a very wide row for each year-state-county triple.  We finish by creating a wide table for ML, that includes year-state-county, yield, soil data, and weather data </span>

<span style=color:blue>First step is to pull import the weather files that were created for each year-state-county triple     </span>

In [60]:
import pandas as pd

archive_dir = '/Users/rick/AG-CODE--v03/ML-ARCHIVES--v01/'
yscyll_filename = 'year_state_county_yield_lon_lat.csv'

weather_dir = archive_dir + 'WEATHER-DATA--v01/'
wdtemplate = r'weather-data-for-index__{padded}.csv'

df_yscyll = pd.read_csv(archive_dir + yscyll_filename)
print(df_yscyll.shape)
print()

w_df = {}
for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    w_df[i] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    w_df[i].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    # w_df[i] = w_df[i].rename_axis(index='DATE')
    
print()
print(w_df[4].shape)
print(w_df[4].head())

(9952, 6)


(214, 7)
       date  T2M_MAX  T2M_MIN  PRECTOTCORR  GWETROOT  EVPTRNS  \
0  20180401    -0.91    -6.36         0.18      0.78      0.0   
1  20180402     5.39    -5.29         0.26      0.78      0.0   
2  20180403     7.16    -0.85         4.00      0.79      0.0   
3  20180404     2.90    -4.00         0.08      0.79      0.0   
4  20180405     3.16    -4.54         1.37      0.78      0.0   

   ALLSKY_SFC_PAR_TOT  
0               51.96  
1               83.03  
2               22.30  
3               95.11  
4               58.27  


<span style=color:blue>Test of grouping by MONTH </span>

In [61]:
# takes as input a dataframe whose index field is called "date" and
#   holds 8-character dates, and with columns 
#   ['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
# produces dataframe with same shape, but the values are grouped by MONTH,
#   with a particular aggregation used for each column

def create_monthly_df(df):
    df1 = df.copy()
    # convert index to datetime format    
    df1.index = pd.to_datetime(df['date'], format='%Y%m%d')
    # use 'M' for monthly, use 'W' for weekly
    df1_monthly = df1.resample('M').agg({'T2M_MAX':'mean',
                                       'T2M_MIN':'mean',
                                       'PRECTOTCORR':'sum',
                                       'GWETROOT':'mean',
                                       'EVPTRNS':'mean',
                                       'ALLSKY_SFC_PAR_TOT':'sum'})    

    # convert index back to string format YYYYMM
    df1_monthly.index = df1_monthly.index.strftime('%Y%m%d')
    
    return df1_monthly
    


print(create_monthly_df(w_df[4]).head(50))

            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20180430  10.749333  -0.650333        33.30  0.772333  0.168333   
20180531  25.752903  13.560323       126.08  0.723871  1.002581   
20180630  27.648333  17.241000       172.80  0.711667  1.758333   
20180731  27.861290  17.300000        65.35  0.636774  2.460323   
20180831  29.693548  18.430000       112.82  0.547419  0.941613   
20180930  25.644333  13.959667        86.09  0.581667  0.929333   
20181031  16.171613   4.874194       104.68  0.610968  0.216774   

          ALLSKY_SFC_PAR_TOT  
date                          
20180430             2661.21  
20180531             3424.52  
20180630             3447.89  
20180731             3992.43  
20180831             3056.50  
20180930             2444.94  
20181031             1708.35  


<span style=color:blue>Function that creates a list of all the column names I want for the MONTHLY weather data.    </span>

In [51]:
import json

df_t0 = w_df[0]
cols_narrow = df_t0.columns.values.tolist()[1:]
print(cols_narrow)

print()

df_t1 = create_monthly_df(df_t0)     # dfw['0001']
print(len(df_t1))
# print(df_t1.head())

cols_wide = []
for i in range(0,len(df_t1)):
    row = df_t1.iloc[i]
    # print(row)
    # can't use date, because it has year built in, and weeks start on different numbers...
    month_id = 'month_' + str(i).zfill(2)
    # print(date)
    for c in cols_narrow:
        cols_wide.append(month_id + '__' + c)
        
print(cols_wide)
print(len(cols_wide))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']

7
['month_00__T2M_MAX', 'month_00__T2M_MIN', 'month_00__PRECTOTCORR', 'month_00__GWETROOT', 'month_00__EVPTRNS', 'month_00__ALLSKY_SFC_PAR_TOT', 'month_01__T2M_MAX', 'month_01__T2M_MIN', 'month_01__PRECTOTCORR', 'month_01__GWETROOT', 'month_01__EVPTRNS', 'month_01__ALLSKY_SFC_PAR_TOT', 'month_02__T2M_MAX', 'month_02__T2M_MIN', 'month_02__PRECTOTCORR', 'month_02__GWETROOT', 'month_02__EVPTRNS', 'month_02__ALLSKY_SFC_PAR_TOT', 'month_03__T2M_MAX', 'month_03__T2M_MIN', 'month_03__PRECTOTCORR', 'month_03__GWETROOT', 'month_03__EVPTRNS', 'month_03__ALLSKY_SFC_PAR_TOT', 'month_04__T2M_MAX', 'month_04__T2M_MIN', 'month_04__PRECTOTCORR', 'month_04__GWETROOT', 'month_04__EVPTRNS', 'month_04__ALLSKY_SFC_PAR_TOT', 'month_05__T2M_MAX', 'month_05__T2M_MIN', 'month_05__PRECTOTCORR', 'month_05__GWETROOT', 'month_05__EVPTRNS', 'month_05__ALLSKY_SFC_PAR_TOT', 'month_06__T2M_MAX', 'month_06__T2M_MIN', 'month_06__PRECTOTC

<span style=color:blue>Function that takes in weather data for one year-state-city triple and produces list of all the MONTHLY weather values, in correct order     </span>

In [52]:
# starts with a df with the weekly aggregates for weather params,
# and produces a long sequence of all the MONTHLY weather values, in order corresponding to cols_wide

print(w_df[0].columns.tolist()[1:])
print(w_df[0].shape)
print(create_monthly_df(w_df[0]).shape)

def create_weather_seq_for_monthly(dfw):
    seq = []
    cols = dfw.columns.tolist()
    for i in range(0,len(dfw)):
        for c in cols:
            seq.append(dfw.iloc[i][c])
    return seq

# sanity check
dfw = create_monthly_df(w_df[0])
print(dfw.head(10))

seqw = create_weather_seq_for_monthly(dfw)
print(json.dumps(seqw, indent=4))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
(214, 7)
(7, 6)
            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20220430  13.714333   1.584000        89.94  0.746333  0.151000   
20220531  22.248387  11.422581        88.10  0.730323  0.744516   
20220630  27.599667  15.388000        70.19  0.642333  2.118333   
20220731  29.000968  17.890645       113.76  0.567419  1.590000   
20220831  28.850000  17.449677       126.05  0.547097  0.905484   
20220930  25.354000  12.574000        67.10  0.551000  0.582333   
20221031  18.133871   4.468065        49.51  0.540323  0.098065   

          ALLSKY_SFC_PAR_TOT  
date                          
20220430             2473.60  
20220531             2937.03  
20220630             4023.87  
20220731             3645.67  
20220831             3161.41  
20220930             2640.65  
20221031             2086.28  
[
    13.714333333

<span style=color:blue>Building a dictionary that has indexes from df_yscy as keys, and the MONTHLY weather sequences as values    </span>

In [53]:
import time

u_df = {}   # each entry will hold a df corresponding to a weather .csv file 
dfw = {}    # each entry will hold the df corresponding to monthly aggregation of a weather .csv file
seqw = {}   # each entry will hold the "flattening" of the monthly aggregation df


for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    # print(padded)
    u_df[padded] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    u_df[padded].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    
    dfw[padded] = create_monthly_df(u_df[padded])
    # print(dfw.head())

    seqw[i] = create_weather_seq_for_monthly(dfw[padded])
    # print(json.dumps(dictw, indent=4)
    
    # introducing a small occassional sleep because my python kernel kept complaining about
    # exceeding some I/O threshold
    if i % 30 == 0:
        time.sleep(0.05)
        
    if i > 9000 and i % 100 == 0:
        time.sleep(0.5)
    
    if i % 500 == 0:
        print('Completed processing for index: ', i)
    
    
# sanity check
# print(json.dumps(seqw, indent=4))


Completed processing for index:  0
Completed processing for index:  500
Completed processing for index:  1000
Completed processing for index:  1500
Completed processing for index:  2000
Completed processing for index:  2500
Completed processing for index:  3000
Completed processing for index:  3500
Completed processing for index:  4000
Completed processing for index:  4500
Completed processing for index:  5000
Completed processing for index:  5500
Completed processing for index:  6000
Completed processing for index:  6500
Completed processing for index:  7000
Completed processing for index:  7500
Completed processing for index:  8000
Completed processing for index:  8500
Completed processing for index:  9000
Completed processing for index:  9500


In [62]:
print(len(seqw))
print(json.dumps(seqw, indent=4))

9952


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<span style=color:blue>Converting the dictionary with a sequence for each year-state-county triple into a df     </span>

In [63]:
print(dfw['0000'].shape)
print(len(cols_wide))
print(len(df_yscyll))
print(len(seqw[0]))
print()

df_wide_weather_monthly = pd.DataFrame.from_dict(seqw, orient='index', columns=cols_wide)

print(df_wide_weather_monthly.shape)
print()
print(df_wide_weather_monthly.head())

(7, 6)
42
9952
42

(9952, 42)

   month_00__T2M_MAX  month_00__T2M_MIN  month_00__PRECTOTCORR  \
0          13.714333           1.584000                  89.94   
1          16.245667           4.572000                  59.88   
2          14.029333           2.215667                 107.80   
3          15.576000           3.632000                 129.76   
4          10.749333          -0.650333                  33.30   

   month_00__GWETROOT  month_00__EVPTRNS  month_00__ALLSKY_SFC_PAR_TOT  \
0            0.746333           0.151000                       2473.60   
1            0.710000           0.283333                       2868.94   
2            0.853000           0.133000                       2791.42   
3            0.817333           0.194333                       2491.24   
4            0.772333           0.168333                       2661.21   

   month_01__T2M_MAX  month_01__T2M_MIN  month_01__PRECTOTCORR  \
0          22.248387          11.422581                  88.1

<span style=color:blue>Merge projection of state-county-lat-lon-soil table into projection of yscyll table</span>

In [55]:
sclls_file = 'state_county_lon_lat_soil.csv'

df_scsoil = pd.read_csv(archive_dir + sclls_file).drop(columns=['lon','lat'])
print(df_scsoil.shape)
# print(df_scsoil.head())

# will continue working with df_yscyll because updated DU PAGE county 
#     (and might update other things in future versions...)

df_ysc_y_soil = pd.merge(df_yscyll, df_scsoil, on=['state_name','county_name'],how='left')

df_ysc_y_soil = df_ysc_y_soil.drop(columns=['lon','lat'])

print()
print(df_ysc_y_soil.shape)
print(df_ysc_y_soil.head())


(559, 27)

(9952, 29)
   year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
0  2022   ILLINOIS      BUREAU   67.5             10                10000   
1  2021   ILLINOIS      BUREAU   66.4             10                10000   
2  2020   ILLINOIS      BUREAU   64.8             10                10000   
3  2019   ILLINOIS      BUREAU   57.4             10                10000   
4  2018   ILLINOIS      BUREAU   68.5             10                10000   

   AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  SQH_5  SQH_6  SQH_7  SQL_1  SQL_2  SQL_3  \
0      0      1      0      0  ...      0      0      1      0      0      0   
1      0      1      0      0  ...      0      0      1      0      0      0   
2      0      1      0      0  ...      0      0      1      0      0      0   
3      0      1      0      0  ...      0      0      1      0      0      0   
4      0      1      0      0  ...      0      0      1      0      0      0   

   SQL_4  SQL_5  SQL_6  SQL_7  
0 

<span style=color:blue>Merge df_wide_weather_monthly into df_ysc_y_soil</span>

In [64]:
df_ysc_y_soil_weather_monthly = pd.concat([df_ysc_y_soil, df_wide_weather_monthly], axis='columns')

print(df_ysc_y_soil_weather_monthly.shape)
# print(df_ysc_y_soil_weather_monthly.head(10))
print(df_ysc_y_soil_weather_monthly.loc[28:32,:])

(9952, 71)
    year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
28  2014   ILLINOIS     CARROLL   60.4             10                10000   
29  2013   ILLINOIS     CARROLL   57.9             10                10000   
30  2012   ILLINOIS     CARROLL   59.0             10                10000   
31  2011   ILLINOIS     CARROLL   63.7             10                10000   
32  2010   ILLINOIS     CARROLL   58.9             10                10000   

    AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  month_05__PRECTOTCORR  \
28      0      1      0      0  ...                  83.12   
29      0      1      0      0  ...                  44.90   
30      0      1      0      0  ...                  46.86   
31      0      1      0      0  ...                  75.98   
32      0      1      0      0  ...                  56.90   

    month_05__GWETROOT  month_05__EVPTRNS  month_05__ALLSKY_SFC_PAR_TOT  \
28            0.572333           0.859333                       2540.37 

<span style=color:blue>Write the resulting table for ML learning to disk</span>

In [65]:
ml_tables_dir = archive_dir + 'ML-TABLES--v01/'

ml_file = 'ML-table-monthly.csv'

df_ysc_y_soil_weather_monthly.to_csv(ml_tables_dir + ml_file, index=False)

print('Wrote file ', ml_tables_dir + ml_file)

Wrote file  /Users/rick/AG-CODE--v03/ML-ARCHIVES--v01/ML-TABLES--v01/ML-table-monthly.csv
