# Data Pre-Processing and Data Augmentation

1) The .pkl files containing dataframes of all participants are read.<br>

2) The finger and phalanx touches are obtained through a custom blob-detection algorithm written with OpenCV.<br>

3) The detected blobs are filtered using a minimum-area threshold, and the blobs that pass are appended to the same dataframe.<br> 

4) Data augmentation is also performed on the obtained blobs to enlarge the dataset.


In [1]:
import cv2
import pandas as pd
import numpy as np
import time
import os


In [2]:
def blob_detection(cap_matrix):
    '''
    Detect the largest blob in a capacitive matrix and return its bounding box.

    The matrix is cleaned of negative values, converted to an 8-bit image,
    binarised with a fixed noise threshold and searched for contours. The
    contour with the largest area is taken as the blob.

    NOTE: negative entries of ``cap_matrix`` are set to 0 IN PLACE, so the
    caller's array is modified.

    Parameters:
        cap_matrix: 2-D numpy array of capacitive values.

    Returns:
        ``[(min_x, min_y), (max_x, max_y)]`` -- the top-left and bottom-right
        corners (column, row) of the largest contour, or
        ``[(0, 0), (0, 0)]`` when no contour is found.
    '''
    cap_matrix[cap_matrix < 0] = 0  # Negative pixel values are set to 0 (in place).

    # Saturate at 255 before the cast so that large readings do not wrap
    # around to small values and get mistaken for noise.
    image = np.clip(cap_matrix, 0, 255).astype(np.uint8)

    # Pixels at or below this value are considered noise.
    noise_threshold = 30
    _, binary_image = cv2.threshold(image, noise_threshold, 255, cv2.THRESH_BINARY)

    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contours are always at [-2].
    contours = cv2.findContours(binary_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # If there are no contours return null coordinates.
    if not contours:
        return [(0, 0), (0, 0)]

    # Each contour has shape (N, 1, 2); drop the middle axis to get (x, y) rows.
    largest = max(contours, key=cv2.contourArea)[:, 0]
    x_values, y_values = largest[:, 0], largest[:, 1]

    return [
        (int(x_values.min()), int(y_values.min())),
        (int(x_values.max()), int(y_values.max())),
    ]

# alternate method 
#     else:
#         c = max(contours, key=cv2.contourArea)  # Determines maximum area contour.

#         # determine the most extreme points along the contour
#         extLeft = tuple(c[c[:, :, 0].argmin()][0])
#         extRight = tuple(c[c[:, :, 0].argmax()][0])
#         extTop = tuple(c[c[:, :, 1].argmin()][0])
#         extBot = tuple(c[c[:, :, 1].argmax()][0])

#         alist = [extLeft, extRight, extTop, extBot]
#         temp = tuple(map(sorted, zip(*alist)))
#         min_x, max_x, min_y, max_y = temp[0][0], temp[0][-1], temp[1][0], temp[1][-1]
#         # print(min_x, max_x, min_y, max_y)

#         extreme_left = (min_x, min_y)
#         extreme_right = (max_x, max_y)

#         rec_pts = [extreme_left, extreme_right]

#     return rec_pts

In [3]:
def data_augmentation(df=None, participant_no=None,
                      aug_data_save_path='/home/rahul/Documents/phalanx_detection/pre_processed_aug/'):
    '''
    Augment the blobs of one participant by flipping and save the result.

    Every entry of the ``Cropped_Matrix`` column is flipped up/down, then the
    original and up/down-flipped rows are both flipped left/right, which makes
    the dataset 4x as large. Row order in the output is: original, up/down,
    left/right, up/down + left/right. All other columns are repeated as-is.

    The result is pickled to ``Participant_<participant_no>.pkl`` inside
    ``aug_data_save_path`` (created if missing). The input frame is not modified.

    Parameters:
        df: DataFrame with a ``Cropped_Matrix`` column of 2-D arrays. Any
            index is accepted; it is discarded in the output.
        participant_no: identifier used in the log message and file name.
        aug_data_save_path: directory the augmented pickle is written to.

    Raises:
        ValueError: if ``df`` is None.
    '''
    if df is None:
        raise ValueError('df must be a DataFrame with a Cropped_Matrix column')

    def _with_flipped_blobs(frame, flip):
        # Return a copy of frame whose Cropped_Matrix entries went through flip.
        # Iterating by position (not by label) keeps this independent of the index.
        flipped = frame.copy(deep=True)
        blobs = np.empty(len(frame), dtype=object)
        for position, blob in enumerate(frame['Cropped_Matrix']):
            blobs[position] = flip(blob)
        flipped['Cropped_Matrix'] = blobs
        return flipped

    df_ud = _with_flipped_blobs(df, np.flipud)
    df_original_axes = pd.concat([df, df_ud], ignore_index=True)

    df_lr = _with_flipped_blobs(df_original_axes, np.fliplr)
    df_final = pd.concat([df_original_axes, df_lr], ignore_index=True)
    print('Writing augmented data for Participant ' + str(participant_no), 'length', len(df_final))

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(aug_data_save_path, exist_ok=True)

    # Save augmented data file for individual participant.
    df_final.to_pickle(os.path.join(aug_data_save_path, 'Participant_' + str(participant_no) + '.pkl'))

In [8]:
%%time

# Read data files of all participants for Pre-processing.
dropped_images = 0

for participant_no in range(1, 26):
    
    DATA_PATH = '/home/rahul/Documents/phalanx_detection/pickle_data/Participant_' + str(participant_no) + '.pkl'
    print('Reading data from Participant ' + str(participant_no) + '.')
    
    data_frame = pd.read_pickle(DATA_PATH)
    data_frame['Cropped_Matrix'] = [[0]] * len(data_frame)
    
    for j, matrix in enumerate(data_frame.Matrix):
        
        cap_matrix = np.reshape(matrix, (27, 15))
        rec_pts = blob_detection(cap_matrix)  # Run blob detection on reshaped capacitive matrix.

        # Cropping capacitive matrix with blob coordinates with offset of 1 extra row and column.
        crop_matrix = cap_matrix[rec_pts[0][1] - 1:rec_pts[1][1] + 1, rec_pts[0][0] - 1:rec_pts[1][0] + 1]
        min_area = crop_matrix.shape[0] * crop_matrix.shape[1]
        
        if min_area > 5:  # If area of pixels is greater than 5 then save in dataframe or drop it as noise.
            data_frame.at[j, 'Cropped_Matrix'] = crop_matrix
        else:
            data_frame.drop(j, inplace=True)
            dropped_images += 1  # will keep track of deleted images.

    print("Number of noisy images deleted are - ", dropped_images)
    data_frame = data_frame.reset_index()
    pickle_data_save_path = '/home/rahul/Documents/phalanx_detection/pre_processed/'
    
    if not os.path.exists(pickle_data_save_path):
        os.makedirs(pickle_data_save_path)

    # Store individual participant's pre processed data as pickle before augmentation.
    data_frame.to_pickle(os.path.join(pickle_data_save_path, 'Participant_' + str(participant_no) + '.pkl'))
    print('Writing preprocessed data for Participant ' + str(participant_no), 'length', len(data_frame))

    # Before running augmentation on data set, drop the following columns which are not necessary for training set.
    data_frame_compact = data_frame.drop(['Handedness', 'Finger', 'index', 'Timestamp', 'Matrix'], axis=1)
    data_augmentation(data_frame_compact, participant_no)


Reading data from Participant 1.
Number of noisy images deleted are -  8971
Writing preprocessed data for Participant 1 length 7948
Writing augmented data for Participant 1 length 31792
Reading data from Participant 2.
Number of noisy images deleted are -  20045
Writing preprocessed data for Participant 2 length 10675
Writing augmented data for Participant 2 length 42700
Reading data from Participant 3.
Number of noisy images deleted are -  29604
Writing preprocessed data for Participant 3 length 9383
Writing augmented data for Participant 3 length 37532
Reading data from Participant 4.
Number of noisy images deleted are -  42254
Writing preprocessed data for Participant 4 length 11724
Writing augmented data for Participant 4 length 46896
Reading data from Participant 5.
Number of noisy images deleted are -  57011
Writing preprocessed data for Participant 5 length 13483
Writing augmented data for Participant 5 length 53932
Reading data from Participant 6.
Number of noisy images deleted