## Environment

In [1]:
# IPython magic: (re)load the `autoreload` extension.
%reload_ext autoreload
# IPython magic: select autoreload mode 2 (per the IPython docs, modules are
# re-imported before each cell is executed, so edits to local modules are
# picked up without restarting the kernel).
%autoreload 2

In [2]:
import glob
import json
import os
import random
import re
import sys
from collections import defaultdict
from itertools import chain
from pathlib import Path
from typing import List, Dict, Any, NoReturn, Tuple, Optional, Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from tqdm import tqdm_notebook


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Networks
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (DataLoader, SequentialSampler, 
                              Dataset, TensorDataset)

In [4]:
# System adjustments - widen pandas output so that all columns fit
# (the default display width is 80 characters).
pd.set_option('display.width', 2500)
# A single value for the row limit: it used to be set twice (999, then 1000),
# and only the last assignment took effect.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

### Define paths

In [5]:
# Data locations for the train/test split used in training mode.
# NOTE: the root below is a machine-specific absolute Windows path.
GENERAL_DATA_DIR = Path('D:\\Data\\EyesSimulation Sessions\\Export_full')

# Sub-directories of the root.
INITIAL_DATA_DIR = GENERAL_DATA_DIR.joinpath("Export_full")
TRAIN_DATA_DIR = GENERAL_DATA_DIR.joinpath("train")
TEST_SEEN_DATA_DIR = GENERAL_DATA_DIR.joinpath("test_seen")
TEST_UNSEEN_DATA_DIR = GENERAL_DATA_DIR.joinpath("test_unseen")

## Preprocessing

In [7]:
def groupby_session(data: pd.DataFrame,
                    filter_threshold: int=50) -> List[pd.DataFrame]:
    """
    Split the recordings table into one DataFrame per (user_id, session_id).

    Sessions that do not have more than ``filter_threshold`` rows are dropped.
    The number of kept sessions is printed.
    :param data: single DataFrame with all recorded sessions; it is grouped
                 by 'user_id' and 'session_id'
    :param filter_threshold: a session is kept only when its row count is
                             strictly greater than this value
    :return: list of sessions (as DataFrames).
    """
    kept_sessions = []
    grouped = data.groupby(by=['user_id', 'session_id'])
    for (user_id, session_id), session in grouped:
        # Write the group key back onto the rows of this session.
        session['user_id'] = user_id
        session['session_id'] = session_id
        if len(session) <= filter_threshold:
            continue
        kept_sessions.append(session)
    print(f"Resulted list of data length: {len(kept_sessions)}")
    return kept_sessions


def horizontal_align_data(df: pd.DataFrame,
                          grouping_cols: Union[str, List[str]],
                          aligning_cols: List[str]) -> pd.DataFrame:
    """
    Turn each group of rows into a single wide row of interleaved coordinates.

    For every group the two coordinate columns are flattened row by row into
    columns ``x_0, y_0, x_1, y_1, ...`` and the group key values are written
    back into columns named after ``grouping_cols``.
    :param df: long-form DataFrame with one coordinate pair per row
    :param grouping_cols: column name or list of column names to group by
    :param aligning_cols: exactly two column names (first -> ``x_*``,
                          second -> ``y_*``)
    :return: DataFrame with one row per group; ``df`` itself, unchanged, when
             ``aligning_cols`` does not contain exactly two names; an empty
             DataFrame with the grouping columns when ``df`` has no groups.
    """
    if len(aligning_cols) != 2:
        print("Should be given two separate columns of coordinates.")
        return df

    # A bare string would otherwise be iterated character by character below.
    group_keys = [grouping_cols] if isinstance(grouping_cols, str) else list(grouping_cols)

    wide_rows = []
    for group_name, group_df in tqdm(df.groupby(by=group_keys)):
        # Depending on the pandas version, a single grouping column yields
        # either a scalar key or a 1-tuple; normalise to a tuple.
        key_values = group_name if isinstance(group_name, tuple) else (group_name,)

        coords = group_df[aligning_cols].values  # shape (n_rows, 2)
        col_names = [prefix + str(row_n)
                     for row_n in range(coords.shape[0])
                     for prefix in ("x_", "y_")]
        # Row-major flattening gives x0, y0, x1, y1, ... matching col_names.
        wide = pd.DataFrame(coords.reshape(1, -1), columns=col_names)
        for col_name, value in zip(group_keys, key_values):
            wide[col_name] = value
        wide_rows.append(wide)

    if not wide_rows:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=group_keys)
    return pd.concat(wide_rows, axis=0)


def vertical_align_data(data: pd.DataFrame,
                        data_col: Union[str, List[str]],
                        target_col: str,
                        guid_col: str) -> pd.DataFrame:
    """
    Transform the data to a long-form DataFrame with one (x, y) point per row.

    :param data: DataFrame where every row holds one whole sequence
    :param data_col: either the name of one column whose cells are arrays of
                     interleaved coordinates (x0, y0, x1, y1, ...), or a list
                     ``[x_column, y_column]`` with separate coordinate cells
    :param target_col: column holding the label of each sequence
    :param guid_col: column holding the identifier of each sequence
    :return: DataFrame with columns ``i`` (position inside the sequence),
             ``x``, ``y``, ``label`` (cast to int) and ``guid``; an empty
             DataFrame with these columns when ``data`` has no rows.
    """
    out_columns = ['x', 'y', 'label', 'guid']
    joined_xy = isinstance(data_col, str)

    ts_df = []
    for _, row in data.iterrows():
        df = pd.DataFrame(columns=out_columns)
        if joined_xy:
            # Joined array of x and y: reshape once into (n_points, 2)
            xy = row[data_col].reshape(-1, 2)
            df['x'] = xy[:, 0]
            df['y'] = xy[:, 1]
        else:
            # Separately x and y
            df['x'] = row[data_col[0]]
            df['y'] = row[data_col[1]]
        # Scalars are broadcast over all points of the sequence.
        df['label'] = row[target_col]
        df['guid'] = row[guid_col]
        ts_df.append(df)

    if not ts_df:
        # pd.concat raises on an empty list; return an empty long-form frame.
        return pd.DataFrame(columns=['i'] + out_columns)

    # The per-sequence row index becomes the point position column "i".
    data = pd.concat(ts_df).reset_index().rename({"index": "i"}, axis=1)
    data.label = data.label.astype(int)
    return data



def split_dataset(dataset: pd.DataFrame, label_col_name: str,
                  max_seq_len: int):
    """
    Cut every row's sequence into chunks of at most ``max_seq_len`` values.

    The sequence of a row is taken from all columns whose name contains a
    digit, with NaN values removed. A chunk is kept only when it is longer
    than 85% of ``max_seq_len``; this applies to short sequences and to the
    trailing partial chunk of long sequences alike (the tail used to be
    dropped unconditionally even though its length was being checked).
    :param dataset: wide DataFrame, one sequence per row
    :param label_col_name: column holding the label of each row
    :param max_seq_len: maximum number of values per chunk, must be positive
    :return: list of dicts with keys ``guid`` (running counter starting at 1),
             ``data`` (1-D array chunk) and ``label``
    :raises ValueError: if ``max_seq_len`` is not positive
    """
    if max_seq_len <= 0:
        raise ValueError(f"max_seq_len must be positive, got {max_seq_len}")

    min_chunk_len = 0.85 * max_seq_len
    labels = dataset[label_col_name].values
    sequences = dataset.filter(regex=r"[\d]+").values

    data = []
    guid_cnt = 0
    for label, xy in tqdm(zip(labels, sequences), total=len(labels)):
        xy = xy[~np.isnan(xy)]  # not nan values

        for start in range(0, len(xy), max_seq_len):
            chunk = xy[start: start + max_seq_len]
            if len(chunk) > min_chunk_len:
                guid_cnt += 1
                data.append({"guid": guid_cnt,
                             "data": chunk,
                             "label": label})

    return data


def pad_dataset(data: List[Dict[str, Any]], max_seq_len: int,
                pad_symbol: float):
    """
    Right-pad every sequence shorter than ``max_seq_len`` with ``pad_symbol``.

    The input list is consumed from its end: it is empty afterwards and the
    returned list is in reverse order of the input. Items that are already
    long enough are passed through unchanged.
    :param data: list of dicts with keys 'guid', 'data' (1-D array) and 'label'
    :param max_seq_len: target length of every 'data' array
    :param pad_symbol: value appended to short sequences
    :return: list of dicts with the same keys
    """
    ret_data = []
    # `while data` cannot pop from an empty list, so no exception handler is
    # needed; malformed items now raise instead of silently cutting the
    # result short.
    while data:
        data_pair = data.pop()
        missing = max_seq_len - len(data_pair['data'])
        if missing > 0:
            ret_data.append({'guid': data_pair['guid'],
                             'data': np.pad(data_pair['data'],
                                            pad_width=(0, missing),
                                            mode='constant',
                                            constant_values=pad_symbol),
                             'label': data_pair['label']})
        else:
            ret_data.append(data_pair)
    return ret_data


def truncate_dataset(data: List[Dict[str, Any]], max_seq_len: int):
    """
    Cut every sequence longer than ``max_seq_len`` down to its first values.

    The input list is consumed from its end: it is empty afterwards and the
    returned list is in reverse order of the input. Items that are already
    short enough are passed through unchanged.
    :param data: list of dicts with keys 'guid', 'data' (sliceable sequence)
                 and 'label'
    :param max_seq_len: maximum length of every 'data' sequence
    :return: list of dicts with the same keys
    """
    ret_data = []
    # `while data` cannot pop from an empty list, so no exception handler is
    # needed; malformed items now raise instead of silently cutting the
    # result short.
    while data:
        data_pair = data.pop()
        if len(data_pair['data']) > max_seq_len:
            ret_data.append({'guid': data_pair['guid'],
                             'data': data_pair['data'][:max_seq_len],
                             'label': data_pair['label']})
        else:
            ret_data.append(data_pair)
    return ret_data

## Dataset

In [6]:
class SessionsDataset(Dataset):
    """Eye Gaze Sessions dataset: one tab-separated session file per item."""

    def __init__(self, targets_fn: str, root_dir: str, 
                 do_transform: bool=False, transform=None):
        """
        Args:
            targets_fn (string): Path to a ';'-separated CSV file; its
                ``session_filename`` and ``user_id`` columns are used to look
                up the target of a session.
            root_dir (string): Directory with the session ``*.csv`` files.
                Files ending with ``_affmatrix.csv`` are skipped.
            do_transform (bool): Whether to apply ``transform`` to samples.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self._targets_mapping = pd.read_csv(targets_fn, sep=";", encoding="utf-8")
        self._data_root_dir = root_dir
        sess_fns = [fn for fn in glob.glob(os.path.join(root_dir, "*.csv"))
                    if not fn.endswith("_affmatrix.csv")]
        # random.shuffle shuffles in place and returns None, so its result
        # must not be assigned.
        random.shuffle(sess_fns)
        self._sess_fns = sess_fns
        self._do_transform = do_transform
        self._transform = transform
        
    def __len__(self):
        """Return the number of session files found in the root directory."""
        return len(self._sess_fns)
    
    def __getitem__(self, idx):
        """
        Load one session.

        Returns a dict with 'sess' (the session file as a DataFrame) and
        'target' (the ``user_id`` entries of the targets mapping whose
        ``session_filename`` equals the session path, as a pandas Series),
        or the result of ``transform`` applied to that dict.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sess_fn = self._sess_fns[idx]
        sess = pd.read_csv(sess_fn, sep='\t')
        # NOTE(review): the lookup compares against the path as returned by
        # glob (root_dir joined with the file name) - confirm the mapping
        # file stores paths in the same form.
        matches = self._targets_mapping.session_filename == sess_fn
        target = self._targets_mapping.loc[matches].user_id
        sample = {'sess': sess, 'target': target}
        
        # Skip the transform when none was given instead of calling None.
        if self._do_transform and self._transform is not None:
            sample = self._transform(sample)

        return sample

In [None]:
def read_json(path: Union[str, Path]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
    """
    Load a UTF-8 encoded JSON file.

    :param path: location of the JSON file
    :return: the decoded JSON content (a dict for a JSON object, a list for
             a JSON array)
    """
    with open(path, "r", encoding='utf-8') as f:
        return json.load(f)


class EyeMovementsClassification(object):
    
    def __init__(self, config):
        """
        Load the filtering/model parameters and build the IVDT model.

        :param config: object whose ``get(section, option)`` returns the path
                       of a JSON parameter file (presumably a
                       ``configparser.ConfigParser`` - confirm with caller)
        """
        section = "EyemovementClassification"

        # Parameters
        self._filter_params = dict(read_json(config.get(section, "filtering_params")))
        self._model_params = dict(read_json(config.get(section, "model_params")))
        # The model parameters live on the instance; a bare `model_params`
        # name was never defined in this scope.
        model_params = self._model_params

        # Classification model (kept on the instance; it used to be built
        # and then discarded as a local variable)
        self._ivdt = IVDT(saccade_min_velocity=model_params.get('saccade_min_velocity'),
                          saccade_min_duration=model_params.get('min_saccade_duration_threshold'),
                          saccade_max_duration=model_params.get('max_saccade_duration_threshold'),
                          window_size=model_params.get('window_size'),
                          dispersion_threshold=model_params.get('dispersion_threshold'))

        # Post filtering thresholds (likewise kept on the instance)
        threshold_names = ('min_saccade_duration_threshold',
                           'max_saccade_duration_threshold',
                           'min_fixation_duration_threshold',
                           'min_sp_duration_threshold')
        self._thresholds_dict = {name: model_params.get(name) for name in threshold_names}

    def __call__(self, sample):
        """
        Preprocess a recording: split it into sessions, then filter them.

        :param sample: DataFrame with all recorded sessions, as expected by
                       ``groupby_session``
        NOTE(review): no return statement is visible in this part of the
        file - confirm what the method is meant to return.
        """
        # Split the recording into per-(user_id, session_id) DataFrames.
        data = groupby_session(sample)
        # NOTE(review): `config` is neither a parameter nor an attribute here,
        # so this line relies on a global named `config`. The same JSON file
        # is already loaded into self._filter_params by __init__ - confirm and
        # reuse it. `sgolay_filter_dataset` is defined elsewhere (presumably
        # Savitzky-Golay smoothing - verify).
        data = sgolay_filter_dataset(data, **dict(read_json(config.get("EyemovementClassification",
                                                     "filtering_params"))))