# IntegratingSpring 2021
### Chitra Agastya

This notebook is used to integrate the individual, serialized dataframes. We explore the metadata to get some descriptive statistics about our model. 

In [15]:
import pandas as pd
import tensorflow as tf
from glob import glob
import os
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
from tqdm import tqdm
import seaborn as sns
from matplotlib.cm import get_cmap
import folium

In [16]:
print(pd.__version__)
print(tf.__version__)
sns.set()

1.2.2
2.4.1


### Set Paths to Data
Data is actually kept in a folder shared with me. In order to access it from Colab, I added a shortcut to the Shared Folder to my 'My Drive' folder on Google Drive. A link is created that allows me to create the path below to get to the actual data without having to copy it back to my personal drive.

In [17]:
sentinel_path = '/home/cagastya/hdd/gee_central_valley'
patches = [os.path.join(sentinel_path, x) for x in os.listdir(sentinel_path)]
print(len(patches))

79


### Gather Serialzed DataFrames
Get a list of all the pkl files we need to process

In [19]:
pkl_dfs = []
for d in patches:
    pdf = [pdf for pdf in os.listdir(d) if '.pkl.clean' == pdf[-10:]]
    if pdf:
        pkl_dfs.append(os.path.join(d, pdf[0]))
    
#pkl_dfs = [pdf for pdf in os.listdir(sentinel_path) if '.pkl' == pdf[-4:]]
print(len(pkl_dfs))
pkl_dfs

79


['/home/cagastya/hdd/gee_central_valley/Chico_S2SR_9_2019_39.625_-121.875/Chico_S2SR_9_2019_39.625_-121.875.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Modesto_S2SR_11_2019_37.875_-121.375/Modesto_S2SR_11_2019_37.875_-121.375.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Bakersfield_S2SR_9_2019_35.125_-119.125/Bakersfield_S2SR_9_2019_35.125_-119.125.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Merced_S2SR_4_2019_37.375_-120.375/Merced_S2SR_4_2019_37.375_-120.375.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Modesto_S2SR_10_2019_37.625_-120.625/Modesto_S2SR_10_2019_37.625_-120.625.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Chico_S2SR_11_2019_39.375_-121.625/Chico_S2SR_11_2019_39.375_-121.625.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Modesto_S2SR_8_2019_37.625_-121.125/Modesto_S2SR_8_2019_37.625_-121.125.pkl.clean',
 '/home/cagastya/hdd/gee_central_valley/Fresno_S2SR_3_2019_36.875_-120.625/Fresno_S2SR_3_2019_36.875_-120.625.pkl.clean',
 '/hom

## Check how many have negative pixels

In [21]:
from collections import defaultdict 

#for k in x.keys():
#    if k.startswith('B'):
#        #cnt = cnt + 1 if (x[k] < 0).any() else cnt
#        badcnt = badcnt + 1 if (x[k] < 0).any() else badcnt
#        if badcnt:
#           break 

def get_keys(d):
    return list(d.keys())

def is_any_band_invalid(img_splt):
    for k in img_splt.keys():
        if k.startswith('B'):
            if (img_splt[k] < 0).any():
                return True
    return False

def checkNegatives(df):
    badImage = 0
    for x in df:
        if is_any_band_invalid(x):
            badImage += 1
    return badImage
             

bad_cnt = total_cnt = 0
# Loop over our pickle files
for pdf in tqdm(pkl_dfs):
#for i in range(1):
    # Read in each pickle file
    df = list(pd.read_pickle(pdf).values())
    #df = list(pd.read_pickle(pkl_dfs[i]).values())
    #print(df)
    bad_cnt += checkNegatives(df)
    total_cnt += len(df)
    
    
print(f'Total Bad: {bad_cnt} Total Img: {total_cnt} Percent: {bad_cnt/total_cnt * 100}')

100%|██████████| 79/79 [03:58<00:00,  3.03s/it]

Total Bad: 0 Total Img: 50194 Percent: 0.0





## Check If Cloud and Cirrus Bit Masks are Set

The presence of very high integer values in the data indicates that the cloud mask might be set.

In [15]:
def get_keys(d):
    #print(d.keys())
    return list(d.keys())

# Check for the cloud and cirrus bit adn see if it is set for the ones with high negative values
def replace_negatives_with_NaN(df):
    for k in get_keys(df):
        if k.startswith('B'):
            df[k] = np.where(df[k] < 0, np.nan, df[k])
    return df
    


In [None]:
for i in range(1):
    # Read in each pickle file
    row = 0
    df = list(pd.read_pickle(pkl_dfs[i]).values())
    for x in df:
        #print(x)
        result = replace_negatives_with_NaN(x)
        print(result)
        row += 1
        if row == 1:
            break
    #rdfs.append(df)
    #rdfs.append(compute_band_means2(df))

In [26]:
# Instantiate a list to contain individual dataframes
rdfs = []


def compute_band_means2(df):
    res = {}
    
    for k in get_keys(df[0]):
        if not k.startswith('B'):
            res[k] = df[0][k]
        else:
            #band = [np.where(x[k] < 0, np.nan, x[k]) for x in df]
            band = [x[k] for x in df]
            res[k] = np.nanmean(band)
             
    #print(res)
    return res

# Loop over our pickle files
for pdf in tqdm(pkl_dfs):
#for i in range(1):
    # Read in each pickle file
    df = list(pd.read_pickle(pdf).values())
    #rdfs.append(df)
    rdfs.append(compute_band_means2(df))

 39%|███▉      | 31/79 [01:06<01:43,  2.16s/it]


KeyboardInterrupt: 

In [25]:
rdfs

[{'patch_name': '/home/cagastya/hdd/gee_central_valley/Chico_S2SR_9_2019_39.625_-121.875',
  'month': 9,
  'B2': 466.06159486990674,
  'B3': 769.5775052160039,
  'B4': 782.8166608370152,
  'B5': 1277.8048612133857,
  'B6': 2277.462858165603,
  'B7': 2692.3799077483227,
  'B8': 2863.558517325315,
  'B8A': 2942.5295817992146,
  'B11': 2054.6815449599085,
  'B12': 1330.4899604197349}]

### Read Metadata
First we will explore some metadata. We cannot read the totality of our dataframes in at once because we are RAM limited. So we will do it in steps. The data is still uploading to Google Drive, so let's just get the infrastructure in place first.

In [28]:
import numpy as np

# Make sure the dataframe is deleted before trying to read in each of the individual dataframes
if 'total_df' in locals():
  del total_df

def get_keys(d):
    #print(d.keys())
    return list(d.keys())
    
def compute_band_means(df):
    res = {}
    
    for k in get_keys(df[0]):
        if not k.startswith('B'):
            res[k] = df[0][k]
        else:
            res[k] = np.mean([x[k] for x in df])
    #print(res)
    return res

# Instantiate a list to contain individual dataframes
dfs = []

# Loop over our pickle files
for pdf in tqdm(pkl_dfs):
    # Read in each pickle file
    df = list(pd.read_pickle(pdf).values())
    dfs.append(compute_band_means(df))


100%|██████████| 79/79 [02:02<00:00,  1.56s/it]


In [29]:
total_df = pd.DataFrame(dfs)
total_df.head()

Unnamed: 0,patch_name,month,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12
0,/home/cagastya/hdd/gee_central_valley/Chico_S2...,9,466.061595,769.577505,782.816661,1277.804861,2277.462858,2692.379908,2863.558517,2942.529582,2054.681545,1330.48996
1,/home/cagastya/hdd/gee_central_valley/Modesto_...,11,655.809842,888.271671,1073.736781,1313.922827,1675.602987,1841.400783,1973.890035,2034.730701,2293.881985,1884.897805
2,/home/cagastya/hdd/gee_central_valley/Bakersfi...,9,897.380866,1289.851596,1528.08673,1911.141248,2590.13529,2907.561489,3063.887163,3157.429187,3285.82957,2564.077367
3,/home/cagastya/hdd/gee_central_valley/Merced_S...,4,530.050492,792.777851,918.097144,1402.458747,2177.829775,2507.145438,2719.413477,2849.830679,2476.764277,1562.70935
4,/home/cagastya/hdd/gee_central_valley/Modesto_...,10,642.745278,937.109638,1188.935104,1543.553952,2048.621431,2292.195918,2561.798822,2587.219757,2785.551162,1995.116633


In [None]:
## Make sure the dataframe is deleted before trying to read in each of the individual dataframes
#if 'total_df' in locals():
#  del total_df

## Instantiate a list to contain individual dataframes
#dfs = []

## Loop over our pickle files
#for pdf in tqdm(pkl_dfs):
#  # Read in each pickle file but drop the MSI data and the predictions to fit within memory limitations
#  dfs.append(pd.read_pickle(os.path.join(sentinel_path, pdf)).drop(['msi', 'predictions'], axis=1))

## Concatenate the dataframes togethers
#total_df = pd.concat(dfs).reset_index(drop=True)

## Delete the list of dataframes to save memory
#del dfs

100%|██████████| 60/60 [10:03<00:00, 10.05s/it]


In [30]:
# High Level Statistics
sum = total_df.describe()
sum

Unnamed: 0,month,B2,B3,B4,B5,B6,B7,B8,B8A,B11,B12
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,6.708861,704.066031,1013.66256,1177.396698,1559.415758,2271.823718,2570.590655,2729.778848,2782.582092,2633.260001,1960.124207
std,3.076594,275.446338,341.972757,469.329506,439.344539,405.986918,462.807666,453.409184,465.306707,570.62154,593.4589
min,1.0,233.002412,403.936806,464.053654,784.56976,1361.212705,1491.846075,1656.833323,1667.395576,1372.780258,845.94384
25%,4.0,554.982913,826.954618,878.008321,1316.090424,2049.707873,2269.784307,2441.554634,2435.797561,2188.692367,1570.661929
50%,6.0,666.48391,948.049736,1118.906852,1508.805197,2278.584109,2657.023523,2795.689919,2870.802903,2678.541768,1916.204306
75%,9.0,788.724417,1121.767014,1371.649689,1734.846743,2508.43757,2865.091856,3021.53151,3079.22646,2979.518048,2237.210463
max,12.0,2175.35631,2844.896448,3522.679379,3792.920461,3818.323131,3950.427554,4055.023076,3977.47299,4852.19734,4666.239329


In [31]:
sum_data = sum.loc[['mean', 'std']].T.to_dict()
sum_data

{'mean': {'month': 6.708860759493671,
  'B2': 704.0660306667106,
  'B3': 1013.6625595886348,
  'B4': 1177.3966978795684,
  'B5': 1559.4157583764888,
  'B6': 2271.823718038332,
  'B7': 2570.590654856275,
  'B8': 2729.77884789601,
  'B8A': 2782.5820922432317,
  'B11': 2633.2600006272755,
  'B12': 1960.1242074549111},
 'std': {'month': 3.0765944323027594,
  'B2': 275.44633762623107,
  'B3': 341.9727574411451,
  'B4': 469.3295063736902,
  'B5': 439.3445385554108,
  'B6': 405.9869176625257,
  'B7': 462.80766566429446,
  'B8': 453.4091836811935,
  'B8A': 465.3067073795589,
  'B11': 570.6215399891606,
  'B12': 593.4589000748651}}

In [32]:
# dump summary to pickle file
import pickle
filename = os.path.join('/home/cagastya/MIDS_Capstone/data', 'central_valley_summary.pickle')
with open(filename, 'wb') as handle:
    pickle.dump(sum_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
with open(filename, 'rb') as handle:
    b = pickle.load(handle)
    
sum_data == b

True

### Function for Summarizing Key Statistics
- Land area of California from [Brittanica](https://www.britannica.com/place/California-state)
- Cropland area from [California Department of Food and Agriculture](https://www.cdfa.ca.gov/statistics/PDFs/2018-2019AgReportnass.pdf)
- 2013 Estimate of Irrigated Cropland in CA from [Federation of American Scientists](https://fas.org/sgp/crs/misc/R44093.pdf)




In [35]:
def summarize_metadata(df):
  '''
  '''

  MONTHS = {1:'January',
            2:'February',
            3:'March',
            4:'April',
            5:'May',
            6:'June',
            7:'July',
            8:'August',
            9:'September',
            10:'October',
            11:'November',
            12:'December'}

  CMAP = get_cmap('tab20').colors

  CALIFORNIA_AREA = 423967 # square kms
  CALIFORNIA_CROPLAND = 6.75e7 / 640 * 2.59 # acres to sq miles to sq kms
  CALIFORNIA_IRRIGATED_AREA = 7.9e6 / 640 * 2.59 # acres to sq miles to sq kms

  # Number of images - each image exists for 11 or 12 months of 2018
  num_imgs = len(df)

  # Each image is 120 x 120 pixels
  # There are 99 small images per base image
  # Thus, we use 0.9155 of the base image
  # At our latitude and longitude the 0.25 x 0.25 degree region
  # is about 616.55 km2
  # June present for all images
  unique_area = 616.55*0.9155 / 99 * len(df[df.month==6])

   
  print(10*'*' + 'SUMMARY' + 10*'*')
  print(f'Number of 120x120 10-channel "images" to process:\t{num_imgs}')
  print(f'Total Land Area in California:\t\t\t\t{CALIFORNIA_AREA} sq. km')
  print(f'Total Cropland Area in California:\t\t\t{CALIFORNIA_CROPLAND:.0f} sq. km')
  print(f'Area Covered in Images:\t\t\t\t\t{unique_area} sq. km')
  print(f'Fraction of California Area Covered in Images:\t\t{unique_area / CALIFORNIA_AREA:.3f} ')
  print(f'Fraction of Sample Irrigated:\t\t\t\t{irrigated_area:.3f} ')
  print(f'Estimated Irrigated Fraction of Land In California:\t{CALIFORNIA_IRRIGATED_AREA/CALIFORNIA_AREA:.3f}')
  print(2*'\n')
  
  # Generate Plot of Irrigated Land Fraction by Month
  avg_irr_by_month = total_df.groupby(['month']).tot_irr_locs.mean()
  fig, ax = plt.subplots(1,1, figsize=(10,5))
  ax.bar(x=[MONTHS[m] for m in avg_irr_by_month.index],
         height=avg_irr_by_month.values/10000,
         color=CMAP, edgecolor='black')
  ax.set_xticklabels([MONTHS[m] for m in avg_irr_by_month.index],
                     rotation=45, fontsize=11,fontweight='bold')
  ax.set_yticklabels([f'{n:.2f}' for n in np.arange(0.0,0.18,0.02)],
                     fontsize=11,fontweight='bold')
  ax.set_title('Mean Fraction of Land Irrigated by Month',fontsize=16,fontweight='bold')

  print(2*'\n')

  # Histogram of Irrigated Locations by Month
  irr_by_month = total_df.groupby(['month']).tot_irr_locs
  fig, axes = plt.subplots(2,6,figsize=(20,10), sharex=True, sharey=True)
  axes = axes.flatten()
  for g, group in irr_by_month:
    group = group / 10000
    group.hist(ax=axes[g-1], bins=20, color='tab:green', ec='black')
    axes[g-1].set_title(f'{MONTHS[g]}', fontweight='bold', fontsize=11)
  plt.tight_layout()
  plt.suptitle('Distribution of Irrigated Fraction by Month', y=1.02, fontsize=16, fontweight='bold')




### Top Level Summary

In [36]:
summarize_metadata(total_df)

**********SUMMARY**********
Number of 120x120 10-channel "images" to process:	79
Total Land Area in California:				423967 sq. km
Total Cropland Area in California:			273164 sq. km
Area Covered in Images:					79.82142777777777 sq. km
Fraction of California Area Covered in Images:		0.000 


NameError: name 'irrigated_area' is not defined

In [34]:
# Create interactive map with default basemap
map_osm = folium.Map(
    location=[total_df.lat.median(), total_df.lon.median()],
    zoom_start=8,
    tiles='Stamen Terrain'
)

centers = total_df.groupby(['lat']).lon.unique()
month_counts = total_df.groupby(['lat']).month.unique()
for l, lat in enumerate(centers.index):
  for lon in centers[l]:

    folium.Rectangle(
    bounds=[(float(lat)-0.125, float(lon)-0.125), (float(lat)+0.125, float(lon)+0.125)],
    popup=f'{lat}, {lon}\n{len(month_counts[l])} months',
    color='#708090',
    fill=True,
    fill_color='#708090'
    ).add_to(map_osm)

map_osm

AttributeError: 'DataFrame' object has no attribute 'lat'