-----------
## In this notebook:
* Select the SCHU station rows
* Create testing/training/validation DFs
* Copy images into respective folders
-----------

In [1]:
import pandas as pd
import numpy as np
import cv2
import os
import glob
import multiprocessing
import shutil
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import warnings
warnings.filterwarnings("ignore")

## Load Precip DF

In [2]:
# Load the precip DataFrame from a project-local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
day_precip_df = pd.read_pickle("../DFs/day_precip_df.pkl")

In [None]:
# Preview the loaded frame.
day_precip_df

## Create SCHU Precip Testing/Valid Subset

In [3]:
# Keep only the rows whose station is b'SCHU' (station codes are stored as bytes).
# .copy() yields an independent frame, so the column assignment made on df_SCHU
# afterwards does not write into a slice of day_precip_df (SettingWithCopyWarning).
df_SCHU = day_precip_df.loc[day_precip_df['station'] == b'SCHU'].copy()

In [4]:
# set the 'index' column to a running row number 0..n-1
# (this is a regular column, not the DataFrame's own index)
df_SCHU['index'] = range(len(df_SCHU))

In [5]:
# Number of rows in the SCHU subset.
len(df_SCHU)

13851

In [35]:
# Inspect the row at position 11449.
# NOTE(review): hard-coded position, presumably used to locate the 2019/2020 boundary — confirm it is still valid if the data changes.
df_SCHU.iloc[11449]

index                                                               11449
station                                                           b'SCHU'
time_5M                                               2019-12-31 19:30:00
tair                                                              1.10407
ta9m                                                              1.17228
precip                                                               1.47
precip_total                                                        350.8
precip_max_intensity                                                    0
snow_depth                                                     0.00747691
precip_diff                                                     0.0500183
img_path                /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
is_day                                                               True
Name: 1625692, dtype: object

In [None]:
# Look up the row(s) whose time_5M equals this exact timestamp.
df_SCHU.loc[df_SCHU['time_5M']=='2020-01-01 14:30:00']

In [33]:
# create a df with images up to the end of 2019 (2020 is saved for testing);
# the cutoff is the calendar year boundary rather than the timestamp of one
# particular 2020 row, so the split stays correct if the underlying data changes
df_SCHU_2019 = df_SCHU[df_SCHU['time_5M'] < '2020-01-01']

In [34]:
# Series holding only the image paths of the pre-2020 precip rows
df_SCHU_2019_WP_imgs = df_SCHU_2019.loc[:, 'img_path']
df_SCHU_2019_WP_imgs

0          /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
1          /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
2          /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
3          /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
4          /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
                                 ...                        
1625688    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
1625689    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
1625690    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
1625691    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
1625692    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
Name: img_path, Length: 11450, dtype: object

In [None]:
# Number of rows in the pre-2020 precip split.
len(df_SCHU_2019)

In [36]:
# create a df with images from 2020 onward (held out for testing);
# the cutoff is the calendar year boundary rather than the timestamp of the
# last 2019 row, so it does not depend on which rows happen to exist
df_SCHU_2020 = df_SCHU[df_SCHU['time_5M'] >= '2020-01-01']
df_SCHU_2020

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
1670944,11450,b'SCHU',2020-01-04 12:15:00,2.130165,2.627654,1.619995,353.100006,0.0,-0.000810,0.059998,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/SC...,True
1670945,11451,b'SCHU',2020-01-04 12:25:00,2.359442,2.653928,1.679993,353.160004,0.0,-0.001542,0.059998,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/SC...,True
1670946,11452,b'SCHU',2020-01-04 12:35:00,2.149962,2.564523,1.729980,353.220001,0.0,-0.000874,0.049988,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/SC...,True
1670947,11453,b'SCHU',2020-01-04 12:40:00,2.163660,2.564317,1.779999,353.269989,0.0,0.000087,0.050018,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/SC...,True
1670948,11454,b'SCHU',2020-01-04 12:55:00,2.335260,2.670384,1.839996,353.320007,0.0,0.000544,0.059998,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/SC...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2059930,13846,b'SCHU',2020-12-25 15:35:00,13.620380,13.957690,8.579987,311.429993,0.0,0.009156,0.130005,/tf/NYSM/archive/nysm/cam_photos/2020/12/25/SC...,True
2059931,13847,b'SCHU',2020-12-25 15:40:00,13.619330,13.967100,8.660004,311.559998,0.0,0.003011,0.080017,/tf/NYSM/archive/nysm/cam_photos/2020/12/25/SC...,True
2059932,13848,b'SCHU',2020-12-25 15:45:00,13.011610,13.779300,8.679993,311.640015,0.0,0.009981,0.019989,/tf/NYSM/archive/nysm/cam_photos/2020/12/25/SC...,True
2059936,13849,b'SCHU',2020-12-28 14:40:00,4.178930,4.138499,0.100006,312.109985,0.0,0.003794,0.100006,/tf/NYSM/archive/nysm/cam_photos/2020/12/28/SC...,True


In [None]:
# Show the image path at position 567 of the 2020 split (presumably an arbitrary spot check).
df_SCHU_2020['img_path'].iloc[567]

In [None]:
# spot check: read one image from the 2020 split and render it
sample_path = df_SCHU_2020['img_path'].iloc[567]
img = mpimg.imread(sample_path)
plt.imshow(img)
plt.show()

## Copy WP images to respective folder

# first image path of the pre-2020 SCHU precip split
# (the QUEE-named Series referenced here before is not defined in this notebook)
df_SCHU_2019_WP_imgs.iloc[0]

In [None]:
# Experiment: symlink a single image instead of copying it.
# NOTE(review): destination still points at the QUEE-2cl folder — confirm the intended target for SCHU images.
destination = "/tf/kokamura/Precip/QUEE-2cl/WP"
first_image = df_SCHU_2019_WP_imgs.iloc[0]
# os.symlink expects the path of the link itself, so place the link inside the
# destination folder under the image's own file name
os.symlink(first_image, os.path.join(destination, os.path.basename(first_image)))

In [37]:
# copy the pre-2020 precip images into the WP folder
destination = "/tf/kokamura/Precip/3st-2cl/WP"
# make sure the folder exists: if it did not, shutil.copy would treat the path
# as a file name and every image would overwrite the previous one
os.makedirs(destination, exist_ok=True)
for image in df_SCHU_2019_WP_imgs:
    shutil.copy(image, destination)

#### TODO: consider using os.symlink instead of copying when working with more images

----------
## Load No Precip DF

In [38]:
# Load the no-precip image DataFrame from a project-local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
no_precip_df_images = pd.read_pickle("../DFs/no_precip_df_images.pkl")

## Create Subset

In [39]:
# select the SCHU station rows of the no-precip frame
is_schu = no_precip_df_images['station'] == b'SCHU'
NP_df_SCHU = no_precip_df_images.loc[is_schu]

In [None]:
# Number of rows in the SCHU no-precip subset
# (the ANDE-named frame referenced here before is not defined in this notebook).
len(NP_df_SCHU)

In [None]:
# Preview the SCHU no-precip subset
# (the ANDE-named frame referenced here before is not defined in this notebook).
NP_df_SCHU

In [40]:
# define functions

# determine day images
def is_day(path):
    """Flag whether the image at ``path`` has any colour information.

    An image whose blue, green and red channels are identical at every pixel
    is treated as a non-day image (presumably a grayscale night frame).

    Parameters
    ----------
    path : str
        Image file path handed to ``cv2.imread``.

    Returns
    -------
    True or None
        ``True`` when at least one pixel differs between channels. ``None``
        when all three channels are identical, or when the file cannot be
        read. ``None`` (rather than ``False``) is returned so callers can
        filter the resulting column with ``isnull()`` / ``notnull()``.
    """
    image = cv2.imread(path)
    # cv2.imread signals a missing or undecodable file by returning None
    # instead of raising; without this guard the indexing below raises
    # TypeError and one bad file aborts the whole batch
    if image is None:
        return None
    b, g, r = image[:, :, 0], image[:, :, 1], image[:, :, 2]
    if (b == g).all() and (b == r).all():
        return None
    return True
    
# label every row with the is_day result for its image
def loopy_d(df):
    """Return a copy of ``df`` with an ``is_day`` column added.

    The new column holds ``is_day(path)`` for each value in
    ``df['img_path']``. The frame passed in is not modified.
    """
    labelled = df.copy()
    labelled['is_day'] = labelled['img_path'].apply(is_day)
    return labelled

In [41]:
# label day images in parallel, one chunk of rows per worker process
NUM_CORES = 10
# split by row position rather than passing the DataFrame itself to
# np.array_split, which goes through the deprecated DataFrame.swapaxes;
# the resulting chunks contain the same rows in the same order
row_groups = np.array_split(np.arange(len(NP_df_SCHU)), NUM_CORES)
df_chunks = [NP_df_SCHU.iloc[rows] for rows in row_groups]

with multiprocessing.Pool(NUM_CORES) as pool:
    # ignore_index=True renumbers the concatenated rows from 0
    no_df_SCHU = pd.concat(pool.map(loopy_d, df_chunks), ignore_index = True)

In [42]:
# Count the rows whose is_day value is null.
no_df_SCHU['is_day'].isnull().sum()

50762

In [43]:
# keep only the rows whose is_day is not null;
# .copy() yields an independent frame, so the 'index' column assignment made on
# SCHU_no_df afterwards does not write into a slice of no_df_SCHU
SCHU_no_df = no_df_SCHU.loc[no_df_SCHU['is_day'].notnull()].copy()
SCHU_no_df

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,0,b'SCHU',2015-08-13 16:30:00,22.238859,22.051840,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
1,1,b'SCHU',2015-08-13 16:35:00,21.994610,21.835501,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
2,2,b'SCHU',2015-08-13 16:40:00,21.792700,21.673880,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
3,3,b'SCHU',2015-08-13 16:55:00,20.111839,19.999201,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
4,4,b'SCHU',2015-08-13 17:00:00,20.223181,20.070271,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
295727,31926983,b'SCHU',2020-12-31 21:20:00,2.106877,2.116866,4.369995,316.609985,0.0,0.001295,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295728,31926984,b'SCHU',2020-12-31 21:25:00,1.926355,2.010462,4.369995,316.609985,0.0,0.001868,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295729,31926985,b'SCHU',2020-12-31 21:30:00,1.920723,1.998626,4.369995,316.609985,0.0,0.002890,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295730,31926986,b'SCHU',2020-12-31 21:35:00,1.893512,2.034337,4.369995,316.609985,0.0,0.001973,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True


In [44]:
# set the 'index' column to a running row number 0..n-1
# (this is a regular column, not the DataFrame's own index)
SCHU_no_df['index'] = range(len(SCHU_no_df))
SCHU_no_df

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,0,b'SCHU',2015-08-13 16:30:00,22.238859,22.051840,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
1,1,b'SCHU',2015-08-13 16:35:00,21.994610,21.835501,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
2,2,b'SCHU',2015-08-13 16:40:00,21.792700,21.673880,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
3,3,b'SCHU',2015-08-13 16:55:00,20.111839,19.999201,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
4,4,b'SCHU',2015-08-13 17:00:00,20.223181,20.070271,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
295727,244977,b'SCHU',2020-12-31 21:20:00,2.106877,2.116866,4.369995,316.609985,0.0,0.001295,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295728,244978,b'SCHU',2020-12-31 21:25:00,1.926355,2.010462,4.369995,316.609985,0.0,0.001868,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295729,244979,b'SCHU',2020-12-31 21:30:00,1.920723,1.998626,4.369995,316.609985,0.0,0.002890,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True
295730,244980,b'SCHU',2020-12-31 21:35:00,1.893512,2.034337,4.369995,316.609985,0.0,0.001973,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/SC...,True


## Sanity Check

In [None]:
# sanity check: read one of the retained no-precip images and render it
check_path = SCHU_no_df['img_path'].iloc[678]
img = mpimg.imread(check_path)
plt.imshow(img)
plt.show()

## Create SCHU NP Testing/Valid Subset

In [72]:
# Inspect the row at position 192716.
# NOTE(review): hard-coded position, presumably used to locate the 2019/2020 boundary — confirm it is still valid if the data changes.
SCHU_no_df.iloc[192716]

index                                                              192716
station                                                           b'SCHU'
time_5M                                               2020-01-01 12:05:00
tair                                                             0.428311
ta9m                                                             0.609719
precip                                                           0.630005
precip_total                                                       351.48
precip_max_intensity                                                    0
snow_depth                                                     0.00763087
precip_diff                                                             0
img_path                /tf/NYSM/archive/nysm/cam_photos/2020/01/01/SC...
is_day                                                               True
Name: 233661, dtype: object

In [73]:
# create a df with images up to the end of 2019 (2020 is saved for testing);
# the cutoff is the calendar year boundary rather than the timestamp of one
# particular 2020 row, and the stray leading space in the date string is gone
NP_SCHU_df_2019 = SCHU_no_df[SCHU_no_df['time_5M'] < '2020-01-01']

In [74]:
# Preview the pre-2020 no-precip split.
NP_SCHU_df_2019

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,0,b'SCHU',2015-08-13 16:30:00,22.238859,22.051840,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
1,1,b'SCHU',2015-08-13 16:35:00,21.994610,21.835501,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
2,2,b'SCHU',2015-08-13 16:40:00,21.792700,21.673880,0.000000,24.986000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
3,3,b'SCHU',2015-08-13 16:55:00,20.111839,19.999201,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
4,4,b'SCHU',2015-08-13 17:00:00,20.223181,20.070271,0.212000,25.198000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
233626,192711,b'SCHU',2019-12-31 21:25:00,1.181103,1.350393,1.470001,350.850006,0.0,0.006229,0.0,/tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...,True
233627,192712,b'SCHU',2019-12-31 21:30:00,1.162010,1.355412,1.470001,350.850006,0.0,0.005288,0.0,/tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...,True
233628,192713,b'SCHU',2019-12-31 21:35:00,1.172545,1.373394,1.470001,350.850006,0.0,0.006256,0.0,/tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...,True
233629,192714,b'SCHU',2019-12-31 21:40:00,1.112934,1.361817,1.470001,350.850006,0.0,0.006447,0.0,/tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...,True


In [76]:
# Series holding only the image paths of the pre-2020 no-precip rows
NP_SCHU_df_2019_imgs = NP_SCHU_df_2019.loc[:, 'img_path']
NP_SCHU_df_2019_imgs

0         /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
1         /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
2         /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
3         /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
4         /tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...
                                ...                        
233626    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
233627    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
233628    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
233629    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
233630    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/SC...
Name: img_path, Length: 192716, dtype: object

## Copy NP images to respective folder

In [None]:
# fraction of the pre-2020 SCHU images that contain precipitation, computed
# from the two splits built in this notebook instead of hard-coded counts
# (the previous literals 6911 and 121833 do not match these frames)
len(df_SCHU_2019) / (len(df_SCHU_2019) + len(NP_SCHU_df_2019))

In [77]:
# draw a reproducible random sample of 11,500 no-precip image paths for the NP folder
NP_SCHU_img_sample = NP_SCHU_df_2019_imgs.sample(n=11500, random_state=1)

In [78]:
# copy the sampled no-precip images into the NP folder
destination = "/tf/kokamura/Precip/3st-2cl/NP"
# make sure the folder exists: if it did not, shutil.copy would treat the path
# as a file name and every image would overwrite the previous one
os.makedirs(destination, exist_ok=True)
for image in NP_SCHU_img_sample:
    shutil.copy(image, destination)