# Окружение

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import re
import sys
import glob
import random
import numpy as np
import pandas as pd
from shutil import copy
from pprint import pprint

from pathlib import Path
from tqdm import tqdm_notebook
from collections import defaultdict
from itertools import chain
from typing import List, Dict, Any, NoReturn, Tuple, Optional, Union

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 100)

import warnings
warnings.simplefilter('ignore')

In [3]:
sys.path.insert(0, "..")
from config import init_config, config

## Определим пути

In [4]:
GENERAL_DATA_DIR = Path('D:\\Data\\EyesSimulation Sessions\\Export3')

init_config("../set_locations.ini")
TRAIN_DIR = config.get("DataPaths", "train_data")
OWNER_DIR = config.get("DataPaths", "owner_data")
RUN_DIR = config.get("DataPaths", "run_data")

# Целый датасет

In [5]:
sess_filenames = [fn for fn in glob.glob(str(GENERAL_DATA_DIR) + "\\*.csv") if not fn.endswith("_affmatrix.csv")]
print(f"Number of sessions files avaliable: {len(sess_filenames)}")

users_filenames = glob.glob(str(GENERAL_DATA_DIR) + "\\*.txt")
print(f"Number of users meta files avaliable: {len(users_filenames)}")

Number of sessions files avaliable: 243
Number of users meta files avaliable: 243


In [6]:
users_df = []
for fn in users_filenames:
    df = pd.read_csv(fn, delimiter = ";", encoding="Windows-1251", header=None, names=['name'])
    df['name'] = df['name'].str.replace('\t', ' ', regex=True)
    df['name'] = df['name'].str.strip()
    df['user_fns'] = fn
    users_df.append(df)
users_df = pd.concat(users_df).groupby(by='name').agg({'user_fns': lambda x: list(x)}).reset_index()
users_df['n_sessions'] = users_df.user_fns.apply(lambda x: len(x))
users_df['sess_fns'] = users_df.user_fns.apply(lambda x: [("_".join(fn.split("_")[:-1]) + ".csv")
                                                          for fn in x])
print(f"Unique users: {users_df.name.nunique()}")
users_df

Unique users: 15


Unnamed: 0,name,user_fns,n_sessions,sess_fns
0,Алина Танакова,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-17_09-23-03_cat_f#1.mp4_metadata.txt, D:\Da...",10,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-17_09-23-03_cat_f#1.mp4.csv, D:\Data\EyesSi..."
1,Ань Ту Чан,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-16_14-02-57_cat_f#1.mp4_metadata.txt, D:\Da...",10,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-16_14-02-57_cat_f#1.mp4.csv, D:\Data\EyesSi..."
2,Ван Туан Нгуен,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_20-06-26_cat_f#1.mp4_metadata.txt, D:\Da...",20,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_20-06-26_cat_f#1.mp4.csv, D:\Data\EyesSi..."
3,Вьет Данг Нгуен,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_20-12-52_cat_f#1.mp4_metadata.txt, D:\Da...",18,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_20-12-52_cat_f#1.mp4.csv, D:\Data\EyesSi..."
4,Вячяеслав Пинчук,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-17_10-33-05_cat_f#1.mp4_metadata.txt, D:\Da...",10,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-17_10-33-05_cat_f#1.mp4.csv, D:\Data\EyesSi..."
5,Даниил Бонк,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_15-16-47_cat_f#1.mp4_metadata.txt, D:\Da...",22,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_15-16-47_cat_f#1.mp4.csv, D:\Data\EyesSi..."
6,Динь Кыонг Ле,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_16-00-08_cat_f#1.mp4_metadata.txt, D:\Da...",20,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_16-00-08_cat_f#1.mp4.csv, D:\Data\EyesSi..."
7,Елизавета Новикова,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_14-07-54_cat_f#1.mp4_metadata.txt, D:\Da...",12,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_14-07-54_cat_f#1.mp4.csv, D:\Data\EyesSi..."
8,Куанг Хынг Нгуен,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_18-08-40_cat_f#1.mp4_metadata.txt, D:\Da...",21,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_18-08-40_cat_f#1.mp4.csv, D:\Data\EyesSi..."
9,Нят Чыонг Динь,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_18-13-27_cat_f#1.mp4_metadata.txt, D:\Da...",20,"[D:\Data\EyesSimulation Sessions\Export3\exp-2020-12-15_18-13-27_cat_f#1.mp4.csv, D:\Data\EyesSi..."


# Тест №1. 
1. Выбирается владелец и одна его запись (случайная);
2. Выбирается случайные N записей других пользователей (из числа тех, у кого записей больше среднего) и M записей владельца (случайных, кроме выбранной в п.1);
3. Сохраняются в ранее определенные пути и выбранные N+M записей исключаются из обучающего набора.

In [8]:
def copy_files_to_folder(filenames: List[str], to_folder: str):
    for fn in tqdm_notebook(filenames, total=len(filenames)):
        try:
            copy(fn, to_folder)
        except:
            print(f"Can't copy file {fn} to {to_folder}!")
            
            
def clear_folders(folders_paths: List[str]):
    for folder_fn in tqdm_notebook(folders_paths, total=len(folders_paths)):
        files = glob.glob(folder_fn + "/*")
        for f in files:
            try:
                os.remove(f)
            except IsADirectoryError:
                print(f"Oops, found a folder inside clearing folder. Skipping.")

In [8]:
def test_split_var1(df: pd.DataFrame, owner_name: str, N: int, M: int):
    clear_folders([RUN_DIR, TRAIN_DIR, OWNER_DIR])
    
    # Split owner data
    all_owner_meta_fns = df.loc[df.name == owner_name].user_fns.values[0]
    random.shuffle(all_owner_meta_fns)
    owner_meta_fns = random.sample(all_owner_meta_fns, k=M+1)
    owner_train_meta_fns = [fn for fn in all_owner_meta_fns if fn not in owner_meta_fns]
    owner_train_data_fns = ["_".join(fn.split("_")[:-1]) + ".csv" for fn in owner_train_meta_fns]
    owner_main_fns = [owner_meta_fns[0], ("_".join(owner_meta_fns[0].split("_")[:-1]) + ".csv")]
    owner_run_meta_fns = [fn for fn in owner_meta_fns[1:]]
    owner_run_data_fns = ["_".join(fn.split("_")[:-1]) + ".csv" for fn in owner_run_meta_fns]
    
    # Select other users data
    all_others_meta_fns = list(chain.from_iterable(df.loc[(df.n_sessions > df.n_sessions.mean()) &
                                                         (df.name != owner_name)].user_fns.to_list()))
    random.shuffle(all_others_meta_fns)
    others_meta_fns = random.sample(all_others_meta_fns, k=N)
    others_train_meta_fns = [fn for fn in all_others_meta_fns if fn not in others_meta_fns]
    others_train_data_fns = ["_".join(fn.split("_")[:-1]) + ".csv" for fn in others_train_meta_fns]
    others_run_meta_fns = [fn for fn in others_meta_fns]
    others_run_data_fns = ["_".join(fn.split("_")[:-1]) + ".csv" for fn in others_run_meta_fns]
    
    print(f"Owner all: {len(all_owner_meta_fns)}, train: {len(owner_train_meta_fns)},"
          f" main: {len(owner_main_fns) // 2}, run: {len(owner_run_meta_fns)}") 
    print(f"Others all: {len(all_others_meta_fns)}, train: {len(others_train_meta_fns)}, run: {len(others_run_meta_fns)}") 
    
    # Copy owner data
    copy_files_to_folder(owner_train_meta_fns + others_train_data_fns, TRAIN_DIR)
    copy_files_to_folder(owner_main_fns, OWNER_DIR)
    copy_files_to_folder(owner_run_meta_fns + owner_run_data_fns, RUN_DIR)
    
    # Copy others data
    copy_files_to_folder(others_train_meta_fns + others_train_data_fns, TRAIN_DIR)
    copy_files_to_folder(others_run_meta_fns + others_run_data_fns, RUN_DIR)
    
    print("Done coping!")
    return ({
        "owner_run": owner_run_meta_fns + owner_run_data_fns,
        "owner_main": owner_main_fns,
        "others_run": others_run_meta_fns + others_run_data_fns
    })

In [9]:
data_fns = test_split_var1(users_df, "Даниил Бонк", N=10, M=4)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Owner all: 22, train: 17, main: 1, run: 4
Others all: 161, train: 151, run: 10


HBox(children=(FloatProgress(value=0.0, max=168.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=302.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Done coping!


# Тест №2. 
1. Выбирается владелец и одна **определенная** его запись (по наиментованию);
2. Выбирается случайные N записей других пользователей (из числа тех, у кого записей больше среднего) и M записей владельца (случайных, кроме выбранной в п.1);
3. Сохраняются в ранее определенные пути и выбранные N+M записей исключаются из обучающего набора.

In [9]:
def meta_to_data(fn: str):
    return "_".join(fn.split("_")[:-1]) + ".csv"

def data_to_meta(fn: str):
    return ".".join(fn.split(".")[:-1]) + "_metadata.txt"

In [10]:
def test_split_var2(df: pd.DataFrame, owner_name: str, 
                    owner_init_fn: str, N: int, M: int):
    clear_folders([RUN_DIR, TRAIN_DIR, OWNER_DIR])
    
    # Split owner data
    all_owner_meta_fns = df.loc[df.name == owner_name].user_fns.values[0]
    random.shuffle(all_owner_meta_fns)
    
    # Select main owner data and meta files
    owner_main_meta_fn = all_owner_meta_fns.pop(all_owner_meta_fns.index(owner_init_fn))
    owner_main_fns = [owner_main_meta_fn, meta_to_data(owner_main_meta_fn)]
    # Train owner data
    owner_meta_fns = random.sample(all_owner_meta_fns, k=M+1)
    owner_train_meta_fns = [fn for fn in all_owner_meta_fns if fn not in owner_meta_fns]
    owner_train_data_fns = [meta_to_data(fn) for fn in owner_train_meta_fns]
    # Run owner data
    owner_run_meta_fns = [fn for fn in owner_meta_fns[1:]]
    owner_run_data_fns = ["_".join(fn.split("_")[:-1]) + ".csv" for fn in owner_run_meta_fns]
    
    # Select other users data
    all_others_meta_fns = list(chain.from_iterable(df.loc[(df.n_sessions > df.n_sessions.mean()) &
                                                         (df.name != owner_name)].user_fns.to_list()))
    random.shuffle(all_others_meta_fns)
    others_meta_fns = random.sample(all_others_meta_fns, k=N)
    others_train_meta_fns = [fn for fn in all_others_meta_fns if fn not in others_meta_fns]
    others_train_data_fns = [meta_to_data(fn) for fn in others_train_meta_fns]
    others_run_meta_fns = [fn for fn in others_meta_fns]
    others_run_data_fns = [meta_to_data(fn) for fn in others_run_meta_fns]
    
    print(f"Owner all: {len(all_owner_meta_fns)}, train: {len(owner_train_meta_fns)},"
          f" main: {len(owner_main_fns) // 2}, run: {len(owner_run_meta_fns)}") 
    print(f"Others all: {len(all_others_meta_fns)}, train: {len(others_train_meta_fns)}, run: {len(others_run_meta_fns)}") 
    
    # Copy owner data
    copy_files_to_folder(owner_train_meta_fns + others_train_data_fns, TRAIN_DIR)
    copy_files_to_folder(owner_main_fns, OWNER_DIR)
    copy_files_to_folder(owner_run_meta_fns + owner_run_data_fns, RUN_DIR)
    
    # Copy others data
    copy_files_to_folder(others_train_meta_fns + others_train_data_fns, TRAIN_DIR)
    copy_files_to_folder(others_run_meta_fns + others_run_data_fns, RUN_DIR)
    
    print("Done coping!")
    return ({
        "owner_run": owner_run_meta_fns + owner_run_data_fns,
        "owner_main": owner_main_fns,
        "others_run": others_run_meta_fns + others_run_data_fns
    })

In [12]:
data_fns = test_split_var2(users_df, "Даниил Бонк", 
                              'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-19-00_cat_s#1.mp4_metadata.txt',
                               N=10, M=4)

data_fns

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Owner all: 21, train: 16, main: 1, run: 4
Others all: 161, train: 151, run: 10


HBox(children=(FloatProgress(value=0.0, max=167.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=302.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Done coping!


{'owner_run': ['D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-19-46_cat_f#1.mp4_metadata.txt',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-18-23_cat_s#1.mp4_metadata.txt',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-17-09_cat_f#1.mp4_metadata.txt',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-15_15-17-19_cat_f#1.mp4_metadata.txt',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-19-46_cat_f#1.mp4.csv',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-18-23_cat_s#1.mp4.csv',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-17-09_cat_f#1.mp4.csv',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-15_15-17-19_cat_f#1.mp4.csv'],
 'owner_main': ['D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-19-00_cat_s#1.mp4_metadata.txt',
  'D:\\Data\\EyesSimulation Sessions\\Export3\\exp-2020-12-16_18-19-00_cat_s#1.mp4.csv'],
 'others_run': ['D:\\Data\\EyesSimul