-----------
## In this notebook:
* Filter out ANDE rows
* Create testing/training/validation DFs
* Copy images into respective folders
-----------

In [1]:
import pandas as pd
import numpy as np
import cv2
import os
import glob
import multiprocessing
import shutil
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import warnings
warnings.filterwarnings("ignore")

## Load Precip DF

In [2]:
# Load the precipitation DataFrame from a local pickle.
# NOTE(review): read_pickle can execute arbitrary code on load -- only use pickles produced by a trusted step.
day_precip_df = pd.read_pickle("../DFs/day_precip_df.pkl")

In [3]:
# Preview the loaded frame (pandas truncates the display to the first/last rows).
day_precip_df

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,0,b'SCHU',2015-08-13 16:45:00,21.526239,21.358690,0.146999,24.986000,0.000,,0.146999,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
1,1,b'SCHU',2015-08-13 16:50:00,20.540680,20.426950,0.212000,25.132999,0.000,,0.065001,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
2,2,b'SCHU',2015-08-13 17:20:00,20.132040,20.003860,0.520000,25.198000,0.000,,0.308001,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
3,3,b'SCHU',2015-08-13 17:25:00,19.782150,19.688551,0.607000,25.506001,0.000,,0.087000,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
4,4,b'SCHU',2015-08-13 17:30:00,19.587971,19.559811,0.975000,25.593000,0.127,,0.368000,/tf/NYSM/archive/nysm/cam_photos/2015/08/13/SC...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2073158,1684820,b'YORK',2020-12-24 21:45:00,7.989942,8.129804,2.320000,61.520000,0.000,-0.004730,0.149998,/tf/NYSM/archive/nysm/cam_photos/2020/12/24/YO...,True
2073159,1684821,b'YORK',2020-12-24 21:50:00,7.869830,8.115767,2.360001,61.669998,0.000,-0.008403,0.040001,/tf/NYSM/archive/nysm/cam_photos/2020/12/24/YO...,True
2073160,1684822,b'YORK',2020-12-24 21:55:00,7.515274,8.021632,2.520000,61.709999,0.000,-0.007266,0.160000,/tf/NYSM/archive/nysm/cam_photos/2020/12/24/YO...,True
2073179,1684823,b'YORK',2020-12-25 19:35:00,-0.835582,-1.015465,14.009995,80.930000,0.000,-0.007240,0.089996,/tf/NYSM/archive/nysm/cam_photos/2020/12/25/YO...,True


## Create ANDE Precip Testing/Valid Subset

In [4]:
# Keep only the ANDE station rows (station codes are stored as bytes).
# .copy() gives an independent frame, so the 'index' column written in the
# next cell does not assign into a slice of day_precip_df.
df_ANDE = day_precip_df.loc[day_precip_df['station'] == b'ANDE'].copy()

In [5]:
# Renumber the 'index' column 0..n-1 for the ANDE subset.
# .assign builds a new frame instead of writing a column into a filtered
# slice, which avoids pandas' chained-assignment warning.
df_ANDE = df_ANDE.assign(index=range(len(df_ANDE)))

In [6]:
# Preview the ANDE subset after renumbering the 'index' column.
df_ANDE

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
56772,0,b'ANDE',2016-10-16 15:45:00,13.942360,13.967540,0.170000,0.000000,0.0,,0.170000,/tf/NYSM/archive/nysm/cam_photos/2016/10/16/AN...,True
56773,1,b'ANDE',2016-10-20 12:35:00,6.878686,6.808193,0.110000,1.210000,0.0,,0.110000,/tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...,True
56774,2,b'ANDE',2016-10-20 12:40:00,6.756670,6.716099,0.140000,1.320000,0.0,,0.030000,/tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...,True
56775,3,b'ANDE',2016-10-20 12:55:00,6.959150,6.882154,0.270000,1.350000,0.0,,0.130000,/tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...,True
56776,4,b'ANDE',2016-10-20 13:00:00,7.099862,7.018954,0.300000,1.480000,0.0,,0.030000,/tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2021866,14113,b'ANDE',2020-12-29 15:45:00,-5.551324,-5.834967,0.699982,344.459991,0.0,-0.003624,0.029999,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021867,14114,b'ANDE',2020-12-29 16:00:00,-5.458502,-5.741222,0.809998,344.489990,0.0,0.002980,0.110016,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021868,14115,b'ANDE',2020-12-29 16:05:00,-5.476278,-5.752946,0.829987,344.600006,0.0,0.005022,0.019989,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021869,14116,b'ANDE',2020-12-29 16:20:00,-5.602849,-5.846936,0.919983,344.619995,0.0,-0.003444,0.089996,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True


In [7]:
# Look up the ANDE row stamped 2020-01-01 14:30:00 to read off its 'index' value.
df_ANDE.loc[df_ANDE['time_5M']=='2020-01-01 14:30:00']

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
1642254,11346,b'ANDE',2020-01-01 14:30:00,-2.929516,-2.942815,1.459991,391.269989,0.0,-0.002811,0.059998,/tf/NYSM/archive/nysm/cam_photos/2020/01/01/AN...,True


In [6]:
# create a df with images up to 2019 (save 2020 for testing)
# Cut at the calendar boundary rather than at the timestamp of the first 2020
# row, so the split does not depend on which rows happen to exist in the data.
df_ANDE_2019 = df_ANDE[df_ANDE['time_5M'] < '2020-01-01']

In [7]:
# keep only the image-path column (this is a pandas Series, not a DataFrame)
df_ANDE_2019_WP_imgs = df_ANDE_2019['img_path']
df_ANDE_2019_WP_imgs

56772      /tf/NYSM/archive/nysm/cam_photos/2016/10/16/AN...
56773      /tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...
56774      /tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...
56775      /tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...
56776      /tf/NYSM/archive/nysm/cam_photos/2016/10/20/AN...
                                 ...                        
1578802    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
1578803    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
1578804    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
1578805    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
1578806    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
Name: img_path, Length: 11346, dtype: object

In [None]:
# number of pre-2020 ANDE precipitation rows
len(df_ANDE_2019)

In [8]:
# create a df with images 2020+
# Use the same calendar boundary as the pre-2020 subset so the two subsets
# are exact complements and no row can land in both.
df_ANDE_2020 = df_ANDE[df_ANDE['time_5M'] >= '2020-01-01']
df_ANDE_2020

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
1642254,11346,b'ANDE',2020-01-01 14:30:00,-2.929516,-2.942815,1.459991,391.269989,0.0,-0.002811,0.059998,/tf/NYSM/archive/nysm/cam_photos/2020/01/01/AN...,True
1642255,11347,b'ANDE',2020-01-01 15:25:00,-2.910642,-2.982537,1.600006,391.329987,0.0,-0.003871,0.140015,/tf/NYSM/archive/nysm/cam_photos/2020/01/01/AN...,True
1642256,11348,b'ANDE',2020-01-01 15:30:00,-2.882289,-2.949813,1.630005,391.470001,0.0,-0.009938,0.029999,/tf/NYSM/archive/nysm/cam_photos/2020/01/01/AN...,True
1642269,11349,b'ANDE',2020-01-04 14:00:00,3.462889,3.649639,4.640015,396.079987,0.0,-0.005854,0.060028,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/AN...,True
1642270,11350,b'ANDE',2020-01-04 14:10:00,3.516216,3.831187,4.660004,396.140015,0.0,-0.005030,0.019989,/tf/NYSM/archive/nysm/cam_photos/2020/01/04/AN...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2021866,14113,b'ANDE',2020-12-29 15:45:00,-5.551324,-5.834967,0.699982,344.459991,0.0,-0.003624,0.029999,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021867,14114,b'ANDE',2020-12-29 16:00:00,-5.458502,-5.741222,0.809998,344.489990,0.0,0.002980,0.110016,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021868,14115,b'ANDE',2020-12-29 16:05:00,-5.476278,-5.752946,0.829987,344.600006,0.0,0.005022,0.019989,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True
2021869,14116,b'ANDE',2020-12-29 16:20:00,-5.602849,-5.846936,0.919983,344.619995,0.0,-0.003444,0.089996,/tf/NYSM/archive/nysm/cam_photos/2020/12/29/AN...,True


In [None]:
# show the file path of one arbitrary 2020 image (row position 567)
df_ANDE_2020['img_path'].iloc[567]

In [None]:
# Sanity check: read the same 2020 image (row position 567) from disk and render it inline.
img = mpimg.imread(df_ANDE_2020['img_path'].iloc[567])
plt.imshow(img)

plt.show()

## Copy WP images to respective folder

df_ANDE_2019_WP_imgs.iloc[0]

In [None]:
# NOTE(review): `df_QUEE_2019_WP_imgs` is not defined in any visible cell of this
# notebook (only `df_ANDE_2019_WP_imgs` is), so this cell looks like a leftover
# from another station's notebook and would raise NameError on a fresh kernel
# -- confirm, then rename or remove.
destination = "/tf/kokamura/Precip/QUEE-2cl/WP"
# os.symlink(src, dst) creates a link *named* dst. Presumably `destination` is an
# existing directory, in which case this raises FileExistsError; a per-image link
# would need os.path.join(destination, os.path.basename(src)) -- TODO confirm intent.
os.symlink(df_QUEE_2019_WP_imgs.iloc[0], destination)

In [9]:
# copy the pre-2020 ANDE precipitation images into the WP folder
destination = "/tf/kokamura/Precip/3st-2cl/WP"
# Make sure the target is a directory: if it is missing, shutil.copy treats
# `destination` as a file name and each image overwrites the previous one.
os.makedirs(destination, exist_ok=True)
for image in df_ANDE_2019_WP_imgs:
    # shutil.copy keeps the source file name when the target is a directory
    shutil.copy(image, destination)

#### Potentially learn how to use os.symlink when using more images

----------
## Load No Precip DF

In [10]:
# Load the no-precipitation DataFrame (with image paths) from a local pickle.
# NOTE(review): read_pickle can execute arbitrary code on load -- only use pickles produced by a trusted step.
no_precip_df_images = pd.read_pickle("../DFs/no_precip_df_images.pkl")

## Create Subset

In [11]:
# create ANDE subset of the no-precip frame (station codes are stored as bytes)
NP_df_ANDE = no_precip_df_images.loc[no_precip_df_images['station']==b'ANDE']

In [None]:
# number of ANDE no-precip rows before the day-image filter
len(NP_df_ANDE)

In [None]:
# preview the ANDE no-precip subset
NP_df_ANDE

In [12]:
# define functions

# determine day images
def is_day(path):
    """Flag an image as a day image based on its colour channels.

    The image at ``path`` is read with OpenCV. If its three channels are
    pixel-for-pixel identical (i.e. the frame carries no colour information)
    it is treated as not-day -- presumably these are the camera's night-mode
    frames; confirm against the data source.

    Parameters
    ----------
    path : str
        File path of the image to inspect.

    Returns
    -------
    True or None
        ``True`` when at least one pixel differs between channels; ``None``
        when all channels are identical or when the file cannot be read.
        Callers filter on ``notnull()``, so ``None`` rows are dropped.
    """
    image = cv2.imread(path)
    # cv2.imread does not raise on a missing/corrupt file, it returns None.
    # Treat that as "not a usable day image" instead of crashing a pool worker.
    if image is None:
        return None
    b, g, r = image[:, :, 0], image[:, :, 1], image[:, :, 2]
    if (b == g).all() and (b == r).all():
        return None
    return True
    
# flag day images for every row of a frame
def loopy_d(df):
    """Return a copy of ``df`` with an added ``is_day`` column.

    Each value of ``df['img_path']`` is passed to ``is_day``; the result is
    stored per row in ``is_day``. The input frame is left untouched.
    """
    day_flags = df['img_path'].apply(is_day)
    return df.assign(is_day=day_flags)

In [13]:
# flag day images in parallel, one chunk of rows per worker
NUM_CORES = 10
# Split row positions and slice with .iloc instead of passing the DataFrame
# itself to np.array_split: that path goes through DataFrame.swapaxes, which
# newer pandas deprecates. The resulting chunks are the same contiguous blocks.
row_groups = np.array_split(np.arange(len(NP_df_ANDE)), NUM_CORES)
df_chunks = [NP_df_ANDE.iloc[rows] for rows in row_groups]

with multiprocessing.Pool(NUM_CORES) as pool:
    # pool.map preserves chunk order; ignore_index renumbers rows 0..n-1
    no_df_ANDE = pd.concat(pool.map(loopy_d, df_chunks), ignore_index = True)

In [None]:
# count rows for which is_day() returned None (i.e. not flagged as day images)
no_df_ANDE['is_day'].isnull().sum()

In [14]:
# Keep only rows flagged as day images.
# .copy() gives an independent frame, so the 'index' column written in the
# next cell does not assign into a slice of no_df_ANDE.
ANDE_no_df = no_df_ANDE.loc[no_df_ANDE['is_day'].notnull()].copy()
ANDE_no_df

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,1552184,b'ANDE',2016-10-13 21:40:00,11.274610,11.308010,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
1,1552185,b'ANDE',2016-10-13 21:45:00,11.193380,11.241410,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
2,1552186,b'ANDE',2016-10-13 21:50:00,11.100970,11.164340,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
3,1552187,b'ANDE',2016-10-13 21:55:00,11.138290,11.205270,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
4,1552188,b'ANDE',2016-10-13 22:00:00,11.098840,11.198260,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
250571,31543219,b'ANDE',2020-12-31 21:35:00,-1.383750,-1.521260,7.049988,351.839996,0.0,0.002758,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250572,31543220,b'ANDE',2020-12-31 21:40:00,-1.376182,-1.531667,7.049988,351.839996,0.0,0.002732,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250573,31543221,b'ANDE',2020-12-31 21:45:00,-1.423195,-1.570673,7.049988,351.839996,0.0,0.001896,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250574,31543222,b'ANDE',2020-12-31 21:50:00,-1.454483,-1.590882,7.049988,351.839996,0.0,0.002004,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True


In [15]:
# Renumber the 'index' column 0..n-1 for the day-only no-precip rows.
# .assign builds a new frame instead of writing a column into a filtered
# slice, which avoids pandas' chained-assignment warning.
ANDE_no_df = ANDE_no_df.assign(index=range(len(ANDE_no_df)))
ANDE_no_df

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff,img_path,is_day
0,0,b'ANDE',2016-10-13 21:40:00,11.274610,11.308010,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
1,1,b'ANDE',2016-10-13 21:45:00,11.193380,11.241410,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
2,2,b'ANDE',2016-10-13 21:50:00,11.100970,11.164340,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
3,3,b'ANDE',2016-10-13 21:55:00,11.138290,11.205270,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
4,4,b'ANDE',2016-10-13 22:00:00,11.098840,11.198260,24.720001,0.000000,0.0,,0.0,/tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
250571,207043,b'ANDE',2020-12-31 21:35:00,-1.383750,-1.521260,7.049988,351.839996,0.0,0.002758,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250572,207044,b'ANDE',2020-12-31 21:40:00,-1.376182,-1.531667,7.049988,351.839996,0.0,0.002732,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250573,207045,b'ANDE',2020-12-31 21:45:00,-1.423195,-1.570673,7.049988,351.839996,0.0,0.001896,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True
250574,207046,b'ANDE',2020-12-31 21:50:00,-1.454483,-1.590882,7.049988,351.839996,0.0,0.002004,0.0,/tf/NYSM/archive/nysm/cam_photos/2020/12/31/AN...,True


## Sanity Check

In [None]:
# Sanity check: read one no-precip day image (row position 678) from disk and render it inline.
img = mpimg.imread(ANDE_no_df['img_path'].iloc[678])
plt.imshow(img)

plt.show()

## Create ANDE NP Testing/Valid Subset

In [None]:
# Inspect the row at position 157024 -- presumably the last pre-2020 row
# (the pre-2020 image Series built below has length 157025); TODO confirm.
ANDE_no_df.iloc[157024]

In [16]:
# create a df with images up to 2019 (save 2020 for testing)
# Cut at the calendar boundary rather than at a hand-picked time on 2020-01-01,
# so no 2020 row can slip into the pre-2020 subset if the data changes.
NP_ANDE_df_2019 = ANDE_no_df[ANDE_no_df['time_5M'] < '2020-01-01']

In [None]:
# preview the pre-2020 no-precip subset
NP_ANDE_df_2019

In [17]:
# keep only the image-path column (this is a pandas Series, not a DataFrame)
NP_ANDE_df_2019_imgs = NP_ANDE_df_2019['img_path']
NP_ANDE_df_2019_imgs

0         /tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...
1         /tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...
2         /tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...
3         /tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...
4         /tf/NYSM/archive/nysm/cam_photos/2016/10/13/AN...
                                ...                        
189909    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
189910    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
189911    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
189912    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
189913    /tf/NYSM/archive/nysm/cam_photos/2019/12/31/AN...
Name: img_path, Length: 157025, dtype: object

## Copy NP images to respective folder

In [None]:
# Share of pre-2020 ANDE images that contain precipitation, computed from the
# frames built in this notebook instead of hard-coded counts (the previous
# literals did not match the ANDE subset lengths shown here).
len(df_ANDE_2019) / (len(df_ANDE_2019) + len(NP_ANDE_df_2019))

In [18]:
# choose 11,000 random images to copy over to the NP folder
# (fixed random_state so the same sample is drawn on every run)
NP_ANDE_img_sample = NP_ANDE_df_2019_imgs.sample(11000, random_state=1)

In [19]:
# copy the sampled images into the no precip folder
destination = "/tf/kokamura/Precip/3st-2cl/NP"
# Make sure the target is a directory: if it is missing, shutil.copy treats
# `destination` as a file name and each image overwrites the previous one.
os.makedirs(destination, exist_ok=True)
for image in NP_ANDE_img_sample:
    # shutil.copy keeps the source file name when the target is a directory
    shutil.copy(image, destination)