In [1]:
import pickle
from glob import glob
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm

from dataset import PersonalityDataset
from params import LocationConfig

In [2]:
# Root folder of the dataset: the label CSV is read from here and the .jpg
# files are globbed from its immediate sub-directories.
images_dir = Path('dataset')

In [5]:
# Read the Big Five label table stored in the dataset folder and preview it.
labels_csv_path = images_dir / 'bigfive_labels.csv'
df = pd.read_csv(labels_csv_path)
df.head(3)

Unnamed: 0,VideoName,ValueExtraversion,ValueAgreeableness,ValueConscientiousness,ValueNeurotisicm,ValueOpenness
0,GQczMGrVgbc.001.mp4,0.570093,0.703297,0.640777,0.666667,0.544444
1,-utrsarZeIY.004.mp4,0.523364,0.516484,0.436893,0.333333,0.411111
2,3HA2W1s4oP8.001.mp4,0.401869,0.538462,0.427184,0.510417,0.388889


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,ValueExtraversion,ValueAgreeableness,ValueConscientiousness,ValueNeurotisicm,ValueOpenness
count,8000.0,8000.0,8000.0,8000.0,8000.0
mean,0.476313,0.548898,0.524053,0.520605,0.56629
std,0.151226,0.134234,0.155351,0.152635,0.1461
min,0.0,0.0,0.0,0.0,0.0
25%,0.373832,0.461538,0.417476,0.416667,0.466667
50%,0.476636,0.56044,0.524272,0.53125,0.566667
75%,0.579439,0.637363,0.640777,0.625,0.666667
max,1.0,1.0,1.0,0.979167,1.0


In [6]:
# Keep only the part of each VideoName before its first '.',
# e.g. 'GQczMGrVgbc.001.mp4' -> 'GQczMGrVgbc', and store it as a new column.
ShortVideoName = [clip_name.split('.')[0] for clip_name in df.VideoName.values]
df['ShortVideoName'] = ShortVideoName

In [10]:
# Average the five trait scores over all rows that share a ShortVideoName.
cols = [
    'ValueExtraversion',
    'ValueAgreeableness',
    'ValueConscientiousness',
    'ValueNeurotisicm',
    'ValueOpenness',
    'ShortVideoName',
]
grouped_df = df.loc[:, cols].groupby(by='ShortVideoName')
# reset_index turns the group key back into an ordinary column.
mean_df = grouped_df.mean().reset_index()
# mean_df.to_csv(images_dir.joinpath('bigfive_labels_mean.csv'))
mean_df.head(3)

Unnamed: 0,ShortVideoName,ValueExtraversion,ValueAgreeableness,ValueConscientiousness,ValueNeurotisicm,ValueOpenness
0,--Ymqszjv54,0.390187,0.491758,0.453883,0.385417,0.525
1,-2qsCrkXdWs,0.476636,0.593407,0.572816,0.604167,0.611111
2,-55DRRMTppE,0.658879,0.708791,0.728155,0.671875,0.666667


In [20]:
# Index the averaged labels by ShortVideoName so a row can be fetched with
# mean_df.loc[<short video name>].
# The guard keeps this cell idempotent: once the column has become the index,
# calling set_index('ShortVideoName') again would raise KeyError.
if mean_df.index.name != 'ShortVideoName':
    mean_df = mean_df.set_index('ShortVideoName')
mean_df.head(3)

Unnamed: 0_level_0,ValueExtraversion,ValueAgreeableness,ValueConscientiousness,ValueNeurotisicm,ValueOpenness
ShortVideoName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
--Ymqszjv54,0.390187,0.491758,0.453883,0.385417,0.525
-2qsCrkXdWs,0.476636,0.593407,0.572816,0.604167,0.611111
-55DRRMTppE,0.658879,0.708791,0.728155,0.671875,0.666667


### Create new dataset

In [8]:
# Split on the mean_df index (the ShortVideoName values), not on individual
# images, so every image of one video is assigned to the same side of the split.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test = train_test_split(np.array(mean_df.index), test_size=0.09, random_state=42)
X_train.shape, X_test.shape

((2626,), (260,))

In [19]:
import cv2

In [21]:
# Build the train/test dictionaries: 'X' collects the images read by OpenCV and
# 'Y' the thresholded label row of the video each image belongs to.
images_dict_train = {'X':[], 'Y':[]}
images_dict_test = {'X':[], 'Y':[]}
Y_threshold = 0.5  # a mean score strictly above this becomes 1, otherwise 0

# Set membership is O(1); `in` on the NumPy array re-scanned it for every image.
test_groups = set(X_test)
# Materialise and sort the paths so the progress bar shows the real file count
# (instead of a hard-coded number) and the stored sample order is reproducible
# rather than dependent on filesystem enumeration order.
image_paths = sorted(images_dir.glob('*/*.jpg'))
unreadable_paths = []

for image_path in tqdm(image_paths):
    X = cv2.imread(str(image_path))
    if X is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; keep such entries out of the dataset and report them below.
        unreadable_paths.append(image_path)
        continue
    # The file name starts with the short video name used as mean_df's index.
    image_group = image_path.name.split('.')[0]
    Y = mean_df.loc[image_group].values
    Y = list(np.where(Y>Y_threshold, 1, 0))
    target_dict = images_dict_test if image_group in test_groups else images_dict_train
    target_dict['X'].append(X)
    target_dict['Y'].append(Y)

if unreadable_paths:
    print(f'Skipped {len(unreadable_paths)} unreadable image(s), e.g. {unreadable_paths[0]}')

len(images_dict_train['Y']), len(images_dict_test['Y'])

100%|██████████| 30935/30935 [00:13<00:00, 2282.27it/s]


(28136, 2799)

In [22]:
# Persist both splits. The target folders are created first because opening a
# file in 'wb' mode does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout.
for split_dict, pickle_path in (
    (images_dict_train, Path('data_connected/train/train.pickle')),
    (images_dict_test, Path('data_connected/test/test.pickle')),
):
    pickle_path.parent.mkdir(parents=True, exist_ok=True)
    with pickle_path.open('wb') as handle:
        pickle.dump(split_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Load a previously pickled test set.
# NOTE: this rebinds `df` -- from here on it is the unpickled object (indexed
# with 'X' and 'Y' in the next cell), no longer the labels DataFrame.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
test_pickle_path = Path('data/test/test_clselfie_v7.pickle')
with test_pickle_path.open('rb') as handle:
    df = pickle.load(handle)

In [17]:
# Take the first two entries of 'X' and of 'Y' to form a tiny subset.
new_data = {field: df[field][:2] for field in ('X', 'Y')}

In [18]:
# Save the tiny subset. Create the target folder first: opening a file in 'wb'
# mode does not create missing parent directories.
small_pickle_path = Path('small_data/small.pickle')
small_pickle_path.parent.mkdir(parents=True, exist_ok=True)
with small_pickle_path.open('wb') as handle:
    pickle.dump(new_data, handle, protocol=pickle.HIGHEST_PROTOCOL)