# prepare grooming and nongrooming data

In [1]:
import pathlib
import shutil
# from multiprocessing import Pool, current_process

import random
import math 
from tqdm.autonotebook import tqdm
import cv2 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from time import time
import platform

from collections import defaultdict
import re
from re import sub             
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

%matplotlib inline
mpl.rcParams['figure.dpi']= 120

plt.rcParams.update({'figure.max_open_warning': 0})
np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.float_format', '{:,.5f}'.format)
pd.set_option('display.max_colwidth', 80)

_platform = platform.platform()
print('platform:', _platform, platform.node())
if 'Linux' in _platform: # linux
    rat_path = '/home/ece/rat_data/'
    tsn_path = '/home/ece/tsn_data/'
    
elif 'macOS' in _platform: # MAC OS X
    rat_path = '/Users/cclee/rat_data/'
    tsn_path = '/Users/cclee/tsn_data/'     
elif 'Windows' in _platform: # Windows
    if platform.node()=='Mozart':
        rat_path = 'e:/rat_data/'
        tsn_path = 'e:/tsn_data/' 
    else:
        rat_path = 'd:/rat_data/'   
        tsn_path = 'd:/tsn_data/' 
  
path_rat = pathlib.Path(rat_path)
path_tsn = pathlib.Path(tsn_path)
path_new_grooming = path_rat.joinpath('new_grooming')

print(cv2.__version__)


  from tqdm.autonotebook import tqdm


platform: Windows-10-10.0.19041-SP0 Mozart
4.5.1


In [4]:
finished_lst = [921111, 930302, 930316, 921216, 930203, 930217] 

def get_rat_lst(allrat=True):
    rat_lst = []
    if allrat:
        all_rat_set = set()
        csv_lst = sorted(path_new_grooming.glob('9*.csv'))
        for csvf in csv_lst:
            tok = csvf.stem.split('_')
            rat_date = int(tok[0])
            all_rat_set.add(rat_date)
        
        rat_lst = list(all_rat_set)
        rat_lst.sort()
        
        print('all ', rat_lst)
        remove_lst = []
        for i in range(len(rat_lst)-1):
            if rat_lst[i]+1==rat_lst[i+1]:
                remove_lst.append(rat_lst[i+1])
          
        print('remove_lst ', remove_lst)
        for r in remove_lst:
            rat_lst.remove(r)    
                
#         for r in finished_lst:
#             rat_lst.remove(r)                        
        
    print('rat_lst ', rat_lst)
    return rat_lst

rat_lst = get_rat_lst()
print(len(rat_lst), rat_lst)

all  [921111, 921112, 921113, 921216, 921217, 921218, 930203, 930204, 930205, 930217, 930218, 930219, 930302, 930303, 930304, 930309, 930310, 930311, 930316, 930317, 930318, 930323, 930324, 930325, 930330]
remove_lst  [921112, 921113, 921217, 921218, 930204, 930205, 930218, 930219, 930303, 930304, 930310, 930311, 930317, 930318, 930324, 930325]
rat_lst  [921111, 921216, 930203, 930217, 930302, 930309, 930316, 930323, 930330]
9 [921111, 921216, 930203, 930217, 930302, 930309, 930316, 930323, 930330]


# step 1. copy grooming/non-grooming mp4 files

In [3]:
rat_lst = [930330] #get_rat_lst() #[921111, 930302, 930316, 921216, 930203] #930217

def copy_clips(copy=False):
    rat_gm_lst = []
    csv_lst = sorted(path_new_grooming.glob('9*.csv'))
    for csvf in csv_lst:
        tok = csvf.stem.split('_')
        rat_date = int(tok[0])
        if rat_date in rat_date_lst:
            rat_gm_lst.append(tok[0]+'_'+tok[1])

    print('rat_gm_lst', len(rat_gm_lst))
    #     print(rat_gm_lst)

    clip_dir_lst = [x for x in path_rat.iterdir() if x.is_dir() and x.name[0]=='9' and int(x.name[:6]) in rat_date_lst]
    clip_dir_lst = sorted(clip_dir_lst)
    if len(clip_dir_lst)==0:
        print('cannot find rat in rat_data')

    grooming_lst = []
    nongrooming_lst = []
    for clip_dir in clip_dir_lst:
        clip_lst = sorted(clip_dir.glob('*.mp4'))
        print('copy ', clip_dir, len(clip_lst))
        for clip in clip_lst:
            tok = clip.stem.split('_')
            clip_name = tok[0]+'_'+tok[1]
    #             print(clip_name)
            if clip_name in rat_gm_lst:
                grooming_lst.append(str(clip)+'\n')
                if copy:
                    shutil.copyfile(clip, path_rat_grooming.joinpath(clip.name) )
            else:
                nongrooming_lst.append(str(clip)+'\n')
                if copy:
                    shutil.copyfile(clip, path_rat_nongrooming.joinpath(clip.name) )
    
    return grooming_lst, nongrooming_lst

for rat in rat_lst:
    outpath = path_tsn.joinpath('{}'.format(rat))
    try:
        if not outpath.exists():
            outpath.mkdir() 
    except Exception as ex:
        print(ex.__class__.__name__)

    path_frames = outpath.joinpath('frames')
#     path_rat_grooming = outpath.joinpath('grooming')
#     path_rat_nongrooming = outpath.joinpath('nongrooming')
    if not path_frames.exists():
        path_frames.mkdir()   
#     if path_rat_grooming.exists():
#         shutil.rmtree(str(path_rat_grooming))
#         path_rat_grooming.mkdir()   
#     if path_rat_nongrooming.exists():
#         shutil.rmtree(str(path_rat_nongrooming))
#         path_rat_nongrooming.mkdir()


    rat_date_lst = [ rat +i for i in range(3)]
    print(rat_date_lst)
    print(path_frames)
#     print(path_rat_grooming)
#     print(path_rat_nongrooming)
    
    grooming_lst, nongrooming_lst = copy_clips(copy=False)
    
    fname = outpath.joinpath('grooming_lst.txt')
    with open(fname, 'w') as f:
        f.writelines( grooming_lst )
        
    fname = outpath.joinpath('nongrooming_lst.txt')
    with open(fname, 'w') as f:
        f.writelines( nongrooming_lst )    

[930330, 930331, 930332]
e:\tsn_data\930330\frames
rat_gm_lst 106
copy  e:\rat_data\930330-base-1d 934


# step2. generate optical flow images

In [26]:
# https://github.com/qijiezhao/py-denseflow

import os,sys
import numpy as np
import cv2
from PIL import Image
# from multiprocessing import Pool

# from IPython import embed #to debug

import imageio


def ToImg(raw_flow,bound):
    '''
    this function scale the input pixels to 0-255 with bi-bound
    :param raw_flow: input raw pixel value (not in 0-255)
    :param bound: upper and lower bound (-bound, bound)
    :return: pixel value scale from 0 to 255
    '''
    flow=raw_flow
    flow[flow>bound]=bound
    flow[flow<-bound]=-bound
    flow-=-bound
    flow*=(255/float(2*bound))
    return flow

def save_flows(flows,image,save_dir,num,bound):
    '''
    To save the optical flow images and raw images
    :param flows: contains flow_x and flow_y
    :param image: raw image
    :param save_dir: save_dir name (always equal to the video id)
    :param num: the save id, which belongs one of the extracted frames
    :param bound: set the bi-bound to flow images
    :return: return 0
    '''
    #rescale to 0~255 with the bound setting
    flow_x=ToImg(flows[...,0],bound).astype(np.uint8)
    flow_y=ToImg(flows[...,1],bound).astype(np.uint8)
#    if not os.path.exists(os.path.join(data_root,new_dir,save_dir)):
#        os.makedirs(os.path.join(data_root,new_dir,save_dir))

    #save the image
    save_img=save_dir.joinpath('img_{:05d}.jpg'.format(num))
#    scipy.misc.imsave(save_img,image)
    cv2.imwrite(str(save_img), image)

    #save the flows
    save_x=save_dir.joinpath('flow_x_{:05d}.jpg'.format(num))
    save_y=save_dir.joinpath('flow_y_{:05d}.jpg'.format(num))
    flow_x_img=Image.fromarray(flow_x)
    flow_y_img=Image.fromarray(flow_y)
    imageio.imwrite(save_x,flow_x_img)
    imageio.imwrite(save_y,flow_y_img)
    
    if num==2:
        shutil.copyfile(save_x, save_dir.joinpath('flow_x_00001.jpg'))
        shutil.copyfile(save_y, save_dir.joinpath('flow_y_00001.jpg'))
        
#     cv2.imwrite(str(save_x), flow_x_img)
#     cv2.imwrite(str(save_y), flow_y_img)
    return 0

In [28]:
bound=20
rat_lst = get_rat_lst() # [921111, 930302, 930316, 921216, 930203, 930217]

def calc_optical_flow(fname, out_dir):
    
    cap = cv2.VideoCapture(str(fname))

    bOpenVideo = cap.isOpened()
    if bOpenVideo == False:
        print('Open Video failed')
    else:
        fcount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT ))
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))    
#         print('%s: fps = %d, w %d, h %d, count %d' % (clip.name, fps, width, height, fcount))


        ret, frame1 = cap.read()
        if ret==False:
            print('cap.read() error, frame:')
            return
        
        save_img=out_dir.joinpath('img_{:05d}.jpg'.format(1))
        cv2.imwrite(str(save_img), frame1)
        
        prvs_fm = cv2.cvtColor(frame1,cv2.COLOR_BGR2GRAY)
#         hsv = np.zeros_like(frame1)
#         hsv[...,1] = 255

        i = 2
#         pbar = tqdm(total=fcount-1, ascii=True) 
        if platform.node()=='Mozart':
            cuMat1 = cv2.cuda_GpuMat()
            cuMat2 = cv2.cuda_GpuMat()
        while(1):
            ret, frame2 = cap.read()
            if ret==False:
                break

            next_fm = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

            if platform.node()=='Mozart':
                cuMat1.upload(prvs_fm)
                cuMat2.upload(next_fm)
                optical_flow = cv2.cuda_OpticalFlowDual_TVL1.create()
                cuFlow = optical_flow.calc(cuMat1, cuMat2, None)
                flow = cuFlow.download()    
            else:
                optical_flow = cv2.optflow.DualTVL1OpticalFlow_create()
                flow = optical_flow.calc(prvs_fm, next_fm, None)

#             mag, ang = cv2.cartToPolar(flow[...,0], flow[...,1])
#             hsv[...,0] = ang*180/np.pi/2
#             hsv[...,2] = cv2.normalize(mag,None,0,255,cv2.NORM_MINMAX)
#             rgb = cv2.cvtColor(hsv,cv2.COLOR_HSV2BGR)

#             frame_name = out_dir.joinpath('img_{:05d}.jpg'.format(i))
#             opf_name = out_dir.joinpath('flow_{:05d}.jpg'.format(i))
#             cv2.imwrite(str(frame_name), frame2)
#             cv2.imwrite(str(opf_name), rgb)
            
            save_flows(flow, frame2, out_dir, i, bound)

            prvs_fm = next_fm
            i +=1
#             pbar.update(1)
            
#         pbar.close()
        
    cap.release()   
        
        
for rat in rat_lst:
    outpath = path_tsn.joinpath('{}'.format(rat))
    if not outpath.exists():
        print('folder not exists', outpath)
        break 
    

    path_frames = outpath.joinpath('frames')
#     path_rat_grooming = outpath.joinpath('grooming')
#     path_rat_nongrooming = outpath.joinpath('nongrooming')
  
    print(path_frames)
#     print(path_rat_grooming)
#     print(path_rat_nongrooming)
    
#     clip_gm_lst = sorted(path_rat_grooming.glob('9*.mp4'))
    fname = outpath.joinpath('grooming_lst.txt')
    with open(fname) as f:
        clip_gm_lst = f.readlines()
    print('clip_gm_lst',len(clip_gm_lst))

#     clip_nongm_lst = sorted(path_rat_nongrooming.glob('9*.mp4'))
    fname = outpath.joinpath('nongrooming_lst.txt')
    with open(fname) as f:
        clip_nongm_lst = f.readlines()
    print('clip_nongm_lst',len(clip_nongm_lst))

    print('calculate optical flow for grooming list')
    pbar = tqdm(total=len(clip_gm_lst), ascii=True)
    for clip in clip_gm_lst:
        cpath = pathlib.Path(clip)
        vid_name = cpath.stem.split('.')[0]
        clip_dir = path_frames.joinpath('Grooming_'+vid_name)
        if  clip_dir.exists():
            shutil.rmtree(str(clip_dir))
        clip_dir.mkdir()  
        

        calc_optical_flow(clip, clip_dir )            
        pbar.update(1)

    pbar.close()    

    print('calculate optical flow for non-grooming list')
    pbar = tqdm(total=len(clip_nongm_lst), ascii=True)
    for clip in clip_nongm_lst:
        cpath = pathlib.Path(clip)
        vid_name = cpath.stem.split('.')[0]
        clip_dir = path_frames.joinpath('Nongrooming_'+vid_name)
        if  clip_dir.exists():
            shutil.rmtree(str(clip_dir))
        clip_dir.mkdir()   

        calc_optical_flow(clip, clip_dir )            
        pbar.update(1)

    pbar.close() 

all  [921111, 921112, 921113, 921216, 921217, 921218, 930203, 930204, 930205, 930217, 930218, 930219, 930302, 930303, 930304, 930309, 930310, 930311, 930316, 930317, 930318, 930323, 930324, 930325, 930330]
remove_lst  [921112, 921113, 921217, 921218, 930204, 930205, 930218, 930219, 930303, 930304, 930310, 930311, 930317, 930318, 930324, 930325]
rat_lst  [930217, 930309, 930323, 930330]
e:\tsn_data\930217\frames
clip_gm_lst 579
clip_nongm_lst 2456
calculate optical flow for grooming list


HBox(children=(FloatProgress(value=0.0, max=579.0), HTML(value='')))


calculate optical flow for non-grooming list


HBox(children=(FloatProgress(value=0.0, max=2456.0), HTML(value='')))


e:\tsn_data\930309\frames
clip_gm_lst 515
clip_nongm_lst 2495
calculate optical flow for grooming list


HBox(children=(FloatProgress(value=0.0, max=515.0), HTML(value='')))


calculate optical flow for non-grooming list


HBox(children=(FloatProgress(value=0.0, max=2495.0), HTML(value='')))


e:\tsn_data\930323\frames
clip_gm_lst 521
clip_nongm_lst 2420
calculate optical flow for grooming list


HBox(children=(FloatProgress(value=0.0, max=521.0), HTML(value='')))


calculate optical flow for non-grooming list


HBox(children=(FloatProgress(value=0.0, max=2420.0), HTML(value='')))


e:\tsn_data\930330\frames
clip_gm_lst 106
clip_nongm_lst 828
calculate optical flow for grooming list


HBox(children=(FloatProgress(value=0.0, max=106.0), HTML(value='')))


calculate optical flow for non-grooming list


HBox(children=(FloatProgress(value=0.0, max=828.0), HTML(value='')))




In [None]:

# frame1 = (cv2.imread('basketball1.png', cv2.IMREAD_GRAYSCALE))
# frame2 = (cv2.imread('basketball2.png', cv2.IMREAD_GRAYSCALE))

# nvof = cv2.cuda_NvidiaOpticalFlow_1_0.create(frame1.shape[1], frame1.shape[0], 5, False, False, False, 0)

# flow = nvof.calc(frame1, frame2, None)

# flowUpSampled = nvof.upSampler(flow[0], frame1.shape[1], frame1.shape[0], nvof.getGridSize(), None)

# cv2.writeOpticalFlow('OpticalFlow.flo', flowUpSampled)

# nvof.collectGarbage()

# image resize to (340, 256)
## copy to newframes dir

In [3]:
rat_lst = [921111, 930302, 930316, 921216, 930203, 930217]

path_newframes = path_tsn.joinpath('newframes')
print('path_newframes', path_newframes)
if not path_newframes.exists():
    print('create path_newframes ')
    path_newframes.mkdir()
        
for rat in rat_lst:
    path_frames = path_tsn.joinpath(str(rat), 'frames')
    print(path_frames)
    if not path_frames.exists():
        print('cannot find path ', path_frames)
        break
        
    oldframe_dir_lst = [x for x in path_frames.iterdir() if x.is_dir()]
    print('oldframe_dir_lst', len(oldframe_dir_lst))
    pbar = tqdm(total=len(oldframe_dir_lst), ascii=True)
    for dd in oldframe_dir_lst:
        new_frame_dir = path_newframes.joinpath(dd.name)
        new_frame_dir.mkdir()
        image_lst = dd.glob('*.jpg')
        for img in image_lst:
            pic = cv2.imread(str(img))
            pic = cv2.resize(pic, (340, 256), interpolation=cv2.INTER_CUBIC)
            cv2.imwrite(str(new_frame_dir.joinpath(img.name)), pic)
        
        pbar.update(1)

    pbar.close() 

path_newframes e:\tsn\newframes
e:\tsn\921111\frames
oldframe_dir_lst 3207


HBox(children=(FloatProgress(value=0.0, max=3207.0), HTML(value='')))


e:\tsn\930302\frames
oldframe_dir_lst 3093


HBox(children=(FloatProgress(value=0.0, max=3093.0), HTML(value='')))


e:\tsn\930316\frames
oldframe_dir_lst 3263


HBox(children=(FloatProgress(value=0.0, max=3263.0), HTML(value='')))


e:\tsn\921216\frames
oldframe_dir_lst 2986


HBox(children=(FloatProgress(value=0.0, max=2986.0), HTML(value='')))


e:\tsn\930203\frames
oldframe_dir_lst 3026


HBox(children=(FloatProgress(value=0.0, max=3026.0), HTML(value='')))


e:\tsn\930217\frames
oldframe_dir_lst 3035


HBox(children=(FloatProgress(value=0.0, max=3035.0), HTML(value='')))




# step 3. generate train_lst and val_lst files
## Rat dependent
The user-dependent (UD) training methods require training data from each user, from which a user-specific model is generated. 

The UI training methods require training data from multiple participants and a generalized model, or a ‘UI’ model, is generated such that it can be applied to unseen users. 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

SPLIT = 1
SEED = 43

rat_lst = get_rat_lst() # [921111, 930302, 930316, 921216, 930203, 930217]

df_train_lst = []
df_test_lst = []


for rat in rat_lst:
    path_frames = path_tsn.joinpath(str(rat), 'frames')
    print(path_frames)
    if not path_frames.exists():
        print('cannot find path ', path_frames)
        break    

    grooming_train_lst = [str(x) for x in path_frames.iterdir() if x.is_dir() and x.name[0]=='G' and str(rat) in x.name]
    non_grooming_train_lst = [str(x) for x in path_frames.iterdir() if x.is_dir() and x.name[0]=='N' and str(rat) in x.name]
    
    print('grooming_train_lst', len(grooming_train_lst))
    print('non_grooming_train_lst', len(non_grooming_train_lst))
    x = grooming_train_lst.copy()
    
    x.extend(non_grooming_train_lst)
    print('total x', len(x))
    
    y = list(np.ones(len(grooming_train_lst), np.int8))
    y.extend(list(np.zeros(len(non_grooming_train_lst), np.int8)))
    
    print('total y',len(y), 'sum y',sum(y), sum(y)/len(y))
    
    skf = StratifiedShuffleSplit(n_splits=SPLIT, random_state=SEED, test_size=0.2)
    for train_index, test_index in skf.split(x, y):
        print("TRAIN:", len(train_index), "TEST:", len(test_index), 'SUM:', len(train_index)+len(test_index))

    df = pd.DataFrame({'x':x, 'y':y}) 
    print(df)
    df_train = df.iloc[train_index]
    print('df_train', df_train.shape)
    
    # nongroom 取1.2倍的grooming 數量
    num_train_groom = sum(df_train['y'])
    num_train_nongroom = int(num_train_groom * 1.2)
    
    df_train_groom = df_train[df_train['y']==1]
    df_train_nongroom1 = df_train[df_train['y']==0]
    df_train_nongroom2 = df_train_nongroom1.sample(n=num_train_nongroom, random_state=SEED)
    df_train_nongroom3 = df_train_nongroom1.drop(df_train_nongroom2.index)
    
    df_train = df_train_groom.append(df_train_nongroom2)
    df_train = df_train.sample(frac=1, random_state=SEED)
    
    print('df_train: sum of 1', sum(df_train['y']), sum(df_train['y'])/len(df_train))
    print('df_train_nongroom1 (origin)', len(df_train_nongroom1))
    print('df_train_nongroom2', len(df_train_nongroom2))
    print('df_train_nongroom3 (rest)', len(df_train_nongroom3))
    print('df_train', len(df_train))
    
    df_test = df.iloc[test_index]
    print('df_test', df_test.shape)
    
    # nongroom 取1.2倍的grooming 數量
    num_test_groom = sum(df_test['y'])
    num_test_nongroom = int(num_test_groom * 1.2)
    
    df_test_groom = df_test[df_test['y']==1]
    df_test_nongroom1 = df_test[df_test['y']==0]
    df_test_nongroom2 = df_test_nongroom1.sample(n=num_test_nongroom, random_state=SEED)
    df_test_nongroom3 = df_test_nongroom1.drop(df_test_nongroom2.index)
    
    df_test = df_test_groom.append(df_test_nongroom2)
    df_test = df_test.sample(frac=1, random_state=SEED)
    
    
    print('df_test: sum of 1', sum(df_test['y']), sum(df_test['y'])/len(df_test))
    print('df_test (after append)', len(df_test))
    print('df_test: sum of 1', sum(df_test['y']), sum(df_test['y'])/len(df_test))
 
    df_train_lst.append(df_train)
    df_test_lst.append(df_test)
    
df_train = pd.concat(df_train_lst) 
df_test = pd.concat(df_test_lst)

#### output train file list
df_train = df_train.reset_index(drop=True)
frame_count_lst = []
for row in df_train.itertuples():
    path_folder = pathlib.Path(row.x)
    img_lst = list(path_folder.glob('img*.jpg'))
    frame_count_lst.append(len(img_lst))

ss = pd.Series(frame_count_lst)    
df_train.insert(1, 'count', ss)
print('df_train')
display(df_train)

path_tsn_data = path_tsn.joinpath('data')
if not path_tsn_data.exists():
    path_tsn_data.mkdir() 
fname = path_tsn_data.joinpath('train_lst.txt')
df_train.to_csv(fname, header=False, index = False, sep = ' ')

###########################################
#### output test file list
df_test = df_test.reset_index(drop=True)
frame_count_lst = []
for row in df_test.itertuples():
    path_folder = pathlib.Path(row.x)
    img_lst = list(path_folder.glob('img*.jpg'))
    frame_count_lst.append(len(img_lst))

ss = pd.Series(frame_count_lst)    
df_test.insert(1, 'count', ss)
print('df_test')
display(df_test)

fname = path_tsn_data.joinpath('val_lst.txt')
df_test.to_csv(fname, header=False, index = False, sep = ' ')

all  [921111, 921112, 921113, 921216, 921217, 921218, 930203, 930204, 930205, 930217, 930218, 930219, 930302, 930303, 930304, 930309, 930310, 930311, 930316, 930317, 930318, 930323, 930324, 930325, 930330]
remove_lst  [921112, 921113, 921217, 921218, 930204, 930205, 930218, 930219, 930303, 930304, 930310, 930311, 930317, 930318, 930324, 930325]
rat_lst  [921111, 921216, 930203, 930217, 930302, 930309, 930316, 930323, 930330]
d:\tsn_data\921111\frames
cannot find path  d:\tsn_data\921111\frames


ValueError: No objects to concatenate

In [64]:
path_folder = pathlib.Path('/Users/cclee/tsn/930217/frames/Nongrooming_930219_L32905_002725')
img_lst = list(path_folder.glob('img*.jpg'))
print(len(img_lst))

119
