## <span style=color:blue>This notebook has code for taking the weather data that was archived in part-04, aggregates it by WEEK, and re-formats it to form a very wide row for each year-state-county triple.  We finish by creating a wide table for ML, that includes year-state-county, yield, soil data, and weather data </span>

In [192]:
import pandas as pd

archive_dir = '/Users/rick/AG-CODE--v03/ML-ARCHIVES--v01/'
yscyll_filename = 'year_state_county_yield_lon_lat.csv'

weather_dir = archive_dir + 'WEATHER-DATA--v01/'
wdtemplate = r'weather-data-for-index__{padded}.csv'

df_yscyll = pd.read_csv(archive_dir + yscyll_filename)
print(df_yscyll.shape)
print()

w_df = {}
for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    w_df[i] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    w_df[i].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    # w_df[i] = w_df[i].rename_axis(index='DATE')
    
print()
print(w_df[4].shape)
print(w_df[4].head())

(9952, 6)


(214, 7)
       date  T2M_MAX  T2M_MIN  PRECTOTCORR  GWETROOT  EVPTRNS  \
0  20180401    -0.91    -6.36         0.18      0.78      0.0   
1  20180402     5.39    -5.29         0.26      0.78      0.0   
2  20180403     7.16    -0.85         4.00      0.79      0.0   
3  20180404     2.90    -4.00         0.08      0.79      0.0   
4  20180405     3.16    -4.54         1.37      0.78      0.0   

   ALLSKY_SFC_PAR_TOT  
0               51.96  
1               83.03  
2               22.30  
3               95.11  
4               58.27  


<span style=color:blue>Test of grouping by WEEK   </span>

In [193]:
# takes as input a dataframe whose index field is called "date" and
#   holds 8-character dates, and with columns 
#   ['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
# produces dataframe with same shape, but the values are grouped by WEEK,
#   with a particular aggregation used for each column

def create_weekly_df(df):
    df1 = df.copy()
    # convert index to datetime format    
    df1.index = pd.to_datetime(df['date'], format='%Y%m%d')
    # use 'M' for monthly, use 'W' for weekly
    df1_weekly = df1.resample('W').agg({'T2M_MAX':'mean',
                                       'T2M_MIN':'mean',
                                       'PRECTOTCORR':'sum',
                                       'GWETROOT':'mean',
                                       'EVPTRNS':'mean',
                                       'ALLSKY_SFC_PAR_TOT':'sum'})    

    # convert index back to string format YYYYMM
    df1_weekly.index = df1_weekly.index.strftime('%Y%m%d')
    
    return df1_weekly
    


print(create_weekly_df(w_df[4]).head(50))

            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20180401  -0.910000  -6.360000         0.18  0.780000  0.000000   
20180408   3.950000  -4.628571         7.44  0.782857  0.000000   
20180415  11.477143   0.915714        22.41  0.775714  0.085714   
20180422   9.645714  -1.272857         2.90  0.781429  0.101429   
20180429  17.667143   2.582857         0.37  0.754286  0.427143   
20180506  24.807143  11.792857        25.71  0.740000  0.695714   
20180513  23.182857   9.867143        18.32  0.722857  0.750000   
20180520  24.671429  12.370000        22.67  0.724286  0.910000   
20180527  28.014286  15.077143        26.74  0.718571  1.411429   
20180603  28.342857  16.855714        33.43  0.704286  1.600000   
20180610  27.300000  14.774286        68.09  0.682857  1.517143   
20180617  28.304286  19.144286        13.51  0.710000  1.641429   
20180624  26.662857  17.735714        77.59  0.735714  1.67285

<span style=color:blue>Function that creates a list of all the column names I want for the WEEKLY weather data.    </span>

In [194]:
import json

df_t0 = w_df[0]
cols_narrow = df_t0.columns.values.tolist()[1:]
print(cols_narrow)

print()

df_t1 = create_weekly_df(df_t0)     # dfw['0001']
# print(df_t1.head())

cols_wide = []
for i in range(0,len(df_t1)):
    row = df_t1.iloc[i]
    # print(row)
    # can't use date, because it has year built in, and weeks start on different numbers...
    week_id = 'week_' + str(i).zfill(2)
    # print(date)
    for c in cols_narrow:
        cols_wide.append(week_id + '__' + c)
        
print(cols_wide)

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']

['week_00__T2M_MAX', 'week_00__T2M_MIN', 'week_00__PRECTOTCORR', 'week_00__GWETROOT', 'week_00__EVPTRNS', 'week_00__ALLSKY_SFC_PAR_TOT', 'week_01__T2M_MAX', 'week_01__T2M_MIN', 'week_01__PRECTOTCORR', 'week_01__GWETROOT', 'week_01__EVPTRNS', 'week_01__ALLSKY_SFC_PAR_TOT', 'week_02__T2M_MAX', 'week_02__T2M_MIN', 'week_02__PRECTOTCORR', 'week_02__GWETROOT', 'week_02__EVPTRNS', 'week_02__ALLSKY_SFC_PAR_TOT', 'week_03__T2M_MAX', 'week_03__T2M_MIN', 'week_03__PRECTOTCORR', 'week_03__GWETROOT', 'week_03__EVPTRNS', 'week_03__ALLSKY_SFC_PAR_TOT', 'week_04__T2M_MAX', 'week_04__T2M_MIN', 'week_04__PRECTOTCORR', 'week_04__GWETROOT', 'week_04__EVPTRNS', 'week_04__ALLSKY_SFC_PAR_TOT', 'week_05__T2M_MAX', 'week_05__T2M_MIN', 'week_05__PRECTOTCORR', 'week_05__GWETROOT', 'week_05__EVPTRNS', 'week_05__ALLSKY_SFC_PAR_TOT', 'week_06__T2M_MAX', 'week_06__T2M_MIN', 'week_06__PRECTOTCORR', 'week_06__GWETROOT', 'week_06__EVPT

<span style=color:blue>Function that takes in weather data for one year-state-city and produces list of all the WEEKLY weather values, in correct order     </span>

In [197]:
# starts with a df with the weekly aggregates for weather params,
# and produces a long sequence of all the WEEKLY weather values, in order corresponding to cols_wide

print(w_df[0].columns.tolist()[1:])
print(w_df[0].shape)
print(create_weekly_df(w_df[0]).shape)

def create_weather_seq_for_weekly(dfw):
    seq = []
    for i in range(0,len(dfw)):
        for c in cols:
            seq.append(dfw.iloc[i][c])
    return seq

# sanity check
dfw = create_weekly_df(w_df[0])
print(dfw.head(10))

seqw = create_weather_seq_for_weekly(dfw)
print(json.dumps(seqw, indent=4))

['T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'GWETROOT', 'EVPTRNS', 'ALLSKY_SFC_PAR_TOT']
(214, 7)
(32, 6)
            T2M_MAX    T2M_MIN  PRECTOTCORR  GWETROOT   EVPTRNS  \
date                                                              
20220403   8.810000  -3.323333         6.97  0.740000  0.053333   
20220410  11.265714   0.378571        23.48  0.751429  0.088571   
20220417  13.488571  -0.097143        18.50  0.745714  0.105714   
20220424  17.155714   5.304286        29.73  0.750000  0.280000   
20220501  14.682857   3.537143        11.43  0.741429  0.152857   
20220508  15.775714   4.571429        30.01  0.761429  0.298571   
20220515  27.634286  17.031429         2.96  0.734286  0.730000   
20220522  23.060000  11.205714        20.09  0.712857  0.882857   
20220529  22.232857  11.291429        30.79  0.718571  0.872857   
20220605  25.942857  14.562857         5.70  0.685714  1.704286   

          ALLSKY_SFC_PAR_TOT  
date                          
20220403              248.80  
20

<span style=color:blue>Building a dictionary that has indexes from df_yscy as keys, and the WEEKLY weather sequences as values    </span>

In [201]:
import time

u_df = {}
dfw = {}
seqw = {}


for i in range(0,len(df_yscyll)):
    padded = str(i).zfill(4)
    # print(padded)
    u_df[padded] = pd.read_csv(weather_dir + wdtemplate.format(padded=padded))
    # Want to have a name for the index of my dataframe
    u_df[padded].rename(columns={'Unnamed: 0': 'date'}, 
                   inplace=True)
    
    dfw[padded] = create_weekly_df(u_df[padded])
    # print(dfw.head())

    seqw[i] = create_weather_seq(dfw[padded])
    # print(json.dumps(dictw, indent=4)

    # introducing a small occassional sleep because my python kernel kept complaining about
    # exceeding some I/O threshold
    # if i % 30 == 0:
    #     time.sleep(0.05)
        
    # if i > 4000 and i % 100 == 0:
    #     time.sleep(0.5)
    
    if i % 100 == 0:
        print('Completed processing of index ', i)
    
# sanity check
print(print(json.dumps(seqw, indent=4)))


Completed processing of index  0
Completed processing of index  100
Completed processing of index  200
Completed processing of index  300
Completed processing of index  400
Completed processing of index  500
Completed processing of index  600
Completed processing of index  700
Completed processing of index  800
Completed processing of index  900
Completed processing of index  1000
Completed processing of index  1100
Completed processing of index  1200
Completed processing of index  1300
Completed processing of index  1400
Completed processing of index  1500
Completed processing of index  1600
Completed processing of index  1700
Completed processing of index  1800
Completed processing of index  1900
Completed processing of index  2000
Completed processing of index  2100
Completed processing of index  2200
Completed processing of index  2300
Completed processing of index  2400
Completed processing of index  2500
Completed processing of index  2600
Completed processing of index  2700
Comp

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [202]:
print(len(seqw))

9952


In [205]:
print(dfw['0000'].shape)
print(len(cols_wide))
print(len(df_yscyll))
print(len(seqw[0]))
print()

df_wide_weather_weekly_prelim = pd.DataFrame.from_dict(seqw, orient='index', columns=cols_wide)

print(df_wide_weather_weekly_prelim.shape)
print()
print(df_wide_weather_weekly_prelim.head())

(32, 6)
192
9952
192

(9952, 192)

   week_00__T2M_MAX  week_00__T2M_MIN  week_00__PRECTOTCORR  \
0          8.810000         -3.323333                  6.97   
1         14.877500         -0.320000                  0.62   
2         14.792000          0.846000                  1.14   
3         14.284286          3.037143                  9.20   
4         -0.910000         -6.360000                  0.18   

   week_00__GWETROOT  week_00__EVPTRNS  week_00__ALLSKY_SFC_PAR_TOT  \
0             0.7400          0.053333                       248.80   
1             0.7125          0.215000                       462.83   
2             0.8720          0.122000                       483.64   
3             0.8300          0.084286                       488.84   
4             0.7800          0.000000                        51.96   

   week_01__T2M_MAX  week_01__T2M_MIN  week_01__PRECTOTCORR  \
0         11.265714          0.378571                 23.48   
1         19.291429          9.64

<span style=color:blue>Where are the NaN values coming from in the above?   </span>

<span style=color:blue>It is because for some years the 7 months end up creating 31 weeks whereas for others only 30 are created.  So we will simply drop week_31.  </span>

In [206]:
# print(cols_wide_weekly)
print(df_wide_weather_weekly_prelim.shape)
week_31_cols = ['week_31__T2M_MAX', 'week_31__T2M_MIN', 'week_31__PRECTOTCORR', 'week_31__GWETROOT', 'week_31__EVPTRNS', 'week_31__ALLSKY_SFC_PAR_TOT']

df_wide_weather_weekly = df_wide_weather_weekly_prelim.drop(columns=week_31_cols)

print()
print(df_wide_weather_weekly.shape)
print(df_wide_weather_weekly.head())


(9952, 192)

(9952, 186)
   week_00__T2M_MAX  week_00__T2M_MIN  week_00__PRECTOTCORR  \
0          8.810000         -3.323333                  6.97   
1         14.877500         -0.320000                  0.62   
2         14.792000          0.846000                  1.14   
3         14.284286          3.037143                  9.20   
4         -0.910000         -6.360000                  0.18   

   week_00__GWETROOT  week_00__EVPTRNS  week_00__ALLSKY_SFC_PAR_TOT  \
0             0.7400          0.053333                       248.80   
1             0.7125          0.215000                       462.83   
2             0.8720          0.122000                       483.64   
3             0.8300          0.084286                       488.84   
4             0.7800          0.000000                        51.96   

   week_01__T2M_MAX  week_01__T2M_MIN  week_01__PRECTOTCORR  \
0         11.265714          0.378571                 23.48   
1         19.291429          9.641429      

<span style=color:blue>Merge projection of state-county-lat-lon-soil table into projection of yscyll table</span>

In [207]:
sclls_file = 'state_county_lon_lat_soil.csv'

df_scsoil = pd.read_csv(archive_dir + sclls_file).drop(columns=['lon','lat'])
print(df_scsoil.shape)
# print(df_scsoil.head())

# will continue working with df_yscyll because updated DU PAGE county 
#     (and might update other things in future versions...)

df_ysc_y_soil = pd.merge(df_yscyll, df_scsoil, on=['state_name','county_name'],how='left')

df_ysc_y_soil = df_ysc_y_soil.drop(columns=['lon','lat'])

print()
print(df_ysc_y_soil.shape)
print(df_ysc_y_soil.head())


(559, 27)

(9952, 29)
   year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
0  2022   ILLINOIS      BUREAU   67.5             10                10000   
1  2021   ILLINOIS      BUREAU   66.4             10                10000   
2  2020   ILLINOIS      BUREAU   64.8             10                10000   
3  2019   ILLINOIS      BUREAU   57.4             10                10000   
4  2018   ILLINOIS      BUREAU   68.5             10                10000   

   AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  SQH_5  SQH_6  SQH_7  SQL_1  SQL_2  SQL_3  \
0      0      1      0      0  ...      0      0      1      0      0      0   
1      0      1      0      0  ...      0      0      1      0      0      0   
2      0      1      0      0  ...      0      0      1      0      0      0   
3      0      1      0      0  ...      0      0      1      0      0      0   
4      0      1      0      0  ...      0      0      1      0      0      0   

   SQL_4  SQL_5  SQL_6  SQL_7  
0 

<span style=color:blue>Merge df_wide_weather_weekly into df_ysc_y_soil</span>

In [209]:
df_ysc_y_soil_weather_weekly = pd.concat([df_ysc_y_soil, df_wide_weather_weekly], axis='columns')

print(df_ysc_y_soil_weather_weekly.shape)
# print(df_ysc_y_soil_weather_weekly.head(10))
print(df_ysc_y_soil_weather_weekly.loc[28:32,:])

(9952, 215)
    year state_name county_name  yield  nutr_ret_high  suit_irrig_high_soy  \
28  2014   ILLINOIS     CARROLL   60.4             10                10000   
29  2013   ILLINOIS     CARROLL   57.9             10                10000   
30  2012   ILLINOIS     CARROLL   59.0             10                10000   
31  2011   ILLINOIS     CARROLL   63.7             10                10000   
32  2010   ILLINOIS     CARROLL   58.9             10                10000   

    AEZ_1  AEZ_2  AEZ_3  AEZ_4  ...  week_29__PRECTOTCORR  week_29__GWETROOT  \
28      0      1      0      0  ...                  3.49           0.598571   
29      0      1      0      0  ...                  3.43           0.530000   
30      0      1      0      0  ...                 23.78           0.542857   
31      0      1      0      0  ...                  3.04           0.550000   
32      0      1      0      0  ...                 31.65           0.574286   

    week_29__EVPTRNS  week_29__ALLSKY_

<span style=color:blue>Write the resulting table for ML learning to disk</span>

In [210]:
ml_tables_dir = archive_dir + 'ML-TABLES--v01/'

ml_file = 'ML-table-weekly.csv'

df_ysc_y_soil_weather_weekly.to_csv(ml_tables_dir + ml_file, index=False)

print('Wrote file ', ml_tables_dir + ml_file)



Wrote file  /Users/rick/AG-CODE--v03/ML-ARCHIVES--v01/ML-TABLES--v01/ML-table-weekly.csv
