# 03 Create Data for Path Animation Rating Model

In this notebook, we create a training dataset for a model that rates the path animations of an SVG file. For this, the following steps are required:
* Create a dataset of randomly animated logos
* Upload the individual randomly animated logos to our label website and then rate how well the animated paths are animated
* Download the labelled dataset from the website and preprocess it so that it can be used to train a model

## I. Create a dataset of randomly animated logos

In [None]:
# Helpers for generating random animations and merging their per-chunk dataframes.
# (The trailing comma was removed: it is a SyntaxError without surrounding parentheses.)
from src.data.create_random_animations import create_random_animations, combine_dataframes

In [None]:
import os
import pickle
import warnings
import pandas as pd

In [None]:
# Notebook-wide settings: ignore every warning and limit dataframe previews to 10 rows.
warnings.filterwarnings(action='ignore')
pd.options.display.max_rows = 10

In [None]:
# All settings of this run are collected in one place and passed by keyword.
random_animation_kwargs = dict(
    folder_svg_dataset="../data/external/logos_dataset",
    animation_path_label="../data/interim/path_animation_decision/path_animation_decision_label.pkl",
    path_relevance_order="../data/interim/logos_meta_data/path_relevance_order.pkl",
    nb_animations=25,
    split_df=True,
    very_random=False,
)
create_random_animations(**random_animation_kwargs)

In [None]:
# Build `animated_logos_vectors` from the contents of the interim
# "animated_logos_information" folder.
animated_logos_vectors = combine_dataframes(
    "../data/interim/animated_logos_information"
)

In [None]:
# Save the combined dataframe as a pickle. The context manager guarantees the
# file handle is closed even if `pickle.dump` raises.
with open("../data/processed/model_data/animation_vectors.pkl", "wb") as output:
    pickle.dump(animated_logos_vectors, output)

## II. Upload the individual randomly animated logos to our label website and then rate how well the animated paths are animated

We have to upload the randomly animated logos to the website backend, together with information about which paths have been animated, so that the [label website](https://animate-logos.web.app/label-paths.html) can display only the animated versions. After they have been labelled by enough people, the data can be downloaded in the next step.

In [None]:
from src.data.interact_with_website_database import *
from src.data.create_random_animations import create_backend_mapping_df

In [None]:
# Derive the backend mapping that is uploaded to the label website below.
backend_mapping_path_information = create_backend_mapping_df(
    animated_logos_vectors
)

In [None]:
# Open the Firestore connection used for the upload; the admin key is read from
# the local secrets folder.
firestore_client = connect_to_firestore(
    credentials="../secrets/animate_logos_admin_key.json",
    database_url="https://animate-logos.firebaseio.com/",
)

In [None]:
# Upload the backend mapping into the "labelpath" collection.
write_documents_to_collection(
    firestore_client,
    backend_mapping_path_information,
    collection="labelpath",
)

## III. Get data from website and prepare for modelling

We have to download the data from the website's database and then prepare it for modelling.

In [None]:
import os
import pickle

import numpy as np
import pandas as pd

from src.data.interact_with_website_database import *
from src.data.create_path_animation_labels import create_path_animation_labels
from src.preprocessing.sm_label_transformer import *
from src.features.get_svg_size_pos import get_relative_pos_to_bounding_box_of_animated_paths

In [None]:
# Open the Firestore connection used for downloading the ratings.
firestore_client = connect_to_firestore(
    credentials="../secrets/animate_logos_admin_key.json",
    database_url="https://animate-logos.firebaseio.com/",
)

In [None]:
# Fetch the documents of the "label_animation_path" collection.
animation_paths_ratings = retrieve_documents_from_collection(
    firestore_client,
    collection="label_animation_path",
)

In [None]:
# Combine the downloaded ratings with the animation vectors created in section I.
create_path_animation_labels(
    "../data/processed/logos_animated_random_paths_rating",
    animation_paths_ratings,
    animated_logos_vectors,
)

In [2]:
# Load the labelled animation vectors.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('data/surrogate_model/sm_animation_vectors_label_23042021.pkl', 'rb') as pickle_file:
    animations = pickle.load(pickle_file)

In [5]:
# Stroke and opacity features that are not meaningful here; removed from both splits.
uninformative_columns = [
    'stroke_width', 'opacity', 'stroke_opacity',
    'stroke_r', 'stroke_g', 'stroke_b',
    'svg_stroke_r', 'diff_stroke_r',
    'svg_stroke_g', 'diff_stroke_g',
    'svg_stroke_b', 'diff_stroke_b',
]

X_train = pd.read_csv("data/model_1/model_1_train.csv").drop(columns=uninformative_columns)
X_test = pd.read_csv("data/model_1/model_1_test.csv").drop(columns=uninformative_columns)

X_train.head(n=5)

Unnamed: 0,filename,animation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,logo_0,1,13.535693,6.971131,-0.009867,-0.915823,-2.984741,5.383109,0.495111,-2.146576,...,-41.6,162.6,-162.6,127.4,-4.4,0.40018,0.472019,0.70009,0.564323,10
1,logo_0,0,-1.000982,4.641413,1.455743,-0.487705,-1.037431,6.984421,-0.485484,-3.933893,...,11.4,162.6,24.4,127.4,-127.4,0.5,0.671553,0.25,0.335776,10
2,logo_0,2,10.590673,0.53337,8.743198,0.241546,-2.132272,0.620374,1.687153,6.592582,...,-2.6,162.6,-46.6,127.4,-123.4,0.787194,0.328324,0.499991,0.835838,10
3,logo_0,3,4.222565,-0.735711,5.308626,-0.09091,-4.875907,2.410124,0.314957,-1.771255,...,-41.6,162.6,-162.6,127.4,-4.4,0.400206,0.472019,0.299897,0.564323,10
4,logo_0,4,1.776277,8.866785,-2.770646,-0.955766,-4.725605,5.412009,0.597616,2.331442,...,-2.6,162.6,-46.6,127.4,-123.4,0.5869,0.315475,0.500106,0.170381,10


In [6]:
# Preview the first five labelled animation rows.
animations.head(n=5)

Unnamed: 0,file,animation_id,order_id,path_prob,begin_value,model_output,animated_animation_ids,animated_order_ids,backend_mapping,logo_id,animation_number,alias,animation_file,time,label
0,logo_316_animation_0,4,1,0.2,1.0,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
1,logo_316_animation_0,24,6,0.6,1.25,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
2,logo_316_animation_0,23,7,0.2,1.5,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,0,Jakob,animation/logo_316_animation_0.svg,"Timestamp(seconds=1617783014, nanoseconds=3260...",Good
3,logo_316_animation_1,4,1,0.2,1.0,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Very Bad
4,logo_316_animation_1,24,6,0.6,1.25,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...","[4, 24, 23]","[1, 6, 7]","[0, 1, 0, 0, 0, 0, 1, 1]",316,1,Jonathan,animation/logo_316_animation_1.svg,"Timestamp(seconds=1617813571, nanoseconds=9700...",Okay


### 2. Prepare animation data for merge

In [7]:
# Build the merge key by cutting off everything from "_animation" onwards,
# e.g. "logo_316_animation_0" -> "logo_316".
animations['filename'] = animations['file'].map(lambda name: name.partition("_animation")[0])

In [8]:
# Tim's feature: for every row, call the helper with the logo's SVG file, the
# path's animation id and the ids of all animated paths. The first element of
# the result is stored as the x value, the second as the y value.
def _relative_position_for_row(row):
    svg_file = f"data/initial_svgs/{row['filename']}.svg"
    return get_relative_pos_to_bounding_box_of_animated_paths(svg_file, row["animation_id"], row["animated_animation_ids"])

animations["rel_position_to_animations"] = animations.apply(_relative_position_for_row, axis=1)
animations["rel_x_position_to_animations"] = animations["rel_position_to_animations"].map(lambda position: position[0])
animations["rel_y_position_to_animations"] = animations["rel_position_to_animations"].map(lambda position: position[1])

data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero
data/initial_svgs/logo_394.svg, animation_id 14, animated_animation_ids [14]: rel_y_position not defined and set to 0.5. float division by zero


In [9]:
# Columns that are not needed after the feature above has been computed.
unused_columns = [
    'file', 'order_id', 'path_prob', 'begin_value',
    'animated_animation_ids', 'animated_order_ids', 'backend_mapping',
    'logo_id', 'animation_number', 'alias', 'animation_file',
    'time', 'rel_position_to_animations',
]

# Rating names -> label indexes. Rows still holding 'no_rating' are removed afterwards.
mapping = {'Very Bad': 0, 'Bad': 1, 'Okay': 2, 'Good': 3, 'Very Good': 4}

animations = animations.drop(columns=unused_columns).replace({'label': mapping})
animations = animations.loc[animations['label'] != 'no_rating'].reset_index(drop=True)

In [10]:
# Preview the cleaned animation rows.
animations.head(n=5)

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974


In [11]:
# Number of rows per label index.
animations['label'].value_counts()

2    4670
0    3078
1    2556
3    2424
4     509
Name: label, dtype: int64

### 3. Merge animation data with path vectors

In [12]:
# Left-join the path features onto every labelled animation row. Rows without a
# match in the respective split get NaN feature values.
merge_keys = ['filename', 'animation_id']
train = pd.merge(animations, X_train, how='left', on=merge_keys)
test = pd.merge(animations, X_test, how='left', on=merge_keys)

In [13]:
# Preview the merged training rows.
train.head(n=5)

Unnamed: 0,animation_id,model_output,label,filename,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,...,diff_fill_r,svg_fill_g,diff_fill_g,svg_fill_b,diff_fill_b,rel_width,rel_height,rel_x_position,rel_y_position,nr_paths_svg
0,4,"[0, 0, 0, 1, 0, 0, -1.0, -1.0, -1.0, -1.0, 0.8...",3,logo_316,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
1,24,"[0, 0, 0, 0, 0, 1, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0
2,23,"[0, 0, 0, 0, 1, 0, -1.0, -1.0, -1.0, -1.0, -1....",3,logo_316,0.714309,0.778553,8.7067,3.79783,8.069836,-5.748081,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362904,0.501511,0.63794,24.0
3,4,"[1, 0, 0, 0, 0, 0, 0.13436424411240122, 0.8474...",0,logo_316,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,...,-4.541667,64.541667,-4.541667,64.541667,-4.541667,0.054752,0.084239,0.033838,0.04212,24.0
4,24,"[0, 0, 1, 0, 0, 0, -1.0, -1.0, -1.0, 0.7637746...",2,logo_316,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,...,102.458333,64.541667,102.458333,64.541667,102.458333,0.395994,0.362888,0.501511,0.579289,24.0


In [14]:
# Missing values per column, largest first.
train.isna().sum().sort_values(ascending=False)

dtype: int64

In [15]:
# Drop all rows with missing values and report the shapes before and after.
print(f"Before: Train: {train.shape}. Test: {test.shape}")
train = train.dropna()
test = test.dropna()
print(f"After: Train: {train.shape}. Test: {test.shape}")

Before: Train: (22465, 30). Test: (22462, 30)
After: Train: (10769, 30). Test: (2469, 30)


### 4. Transform animation vector into multiple dataframe columns and change column ordering

In [16]:
# Spread the list stored in 'model_output' over twelve scalar columns
# an_vec_0 .. an_vec_11, then remove the list column.
an_vec_columns = [f'an_vec_{i}' for i in range(12)]

train[an_vec_columns] = pd.DataFrame(train['model_output'].tolist(), index=train.index)
test[an_vec_columns] = pd.DataFrame(test['model_output'].tolist(), index=test.index)

# 'animation_id' and 'filename' stay in the frames: the column ordering that
# follows still lists both of them.
train = train.drop(columns=['model_output'])
test = test.drop(columns=['model_output'])

In [17]:
# Preview the training rows after expanding the animation vector.
train.head(n=5)

Unnamed: 0,label,rel_x_position_to_animations,rel_y_position_to_animations,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,an_vec_10,an_vec_11
0,3,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,0.919059,-0.791849,3.824877,...,0,1,0,0,-1.0,-1.0,-1.0,-1.0,0.844422,0.757954
1,3,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,-2.27915,6.263388,1.082239,...,0,0,0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,3,0.714309,0.778553,8.7067,3.79783,8.069836,-5.748081,-1.975505,6.366743,1.201885,...,0,0,1,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,0,0.039501,0.051404,-12.294147,3.474895,-2.736042,1.6579,0.919059,-0.791849,3.824877,...,0,0,0,0,0.134364,0.847434,-1.0,-1.0,-1.0,-1.0
4,2,0.714309,0.706974,7.627292,4.582326,7.082139,-5.864271,-2.27915,6.263388,1.082239,...,1,0,0,0,-1.0,-1.0,-1.0,0.763775,-1.0,-1.0


In [18]:
# Final column layout: identifiers, animation vector, embedding, colour
# features, size/position features, path count and finally the label.
_animation_vector_cols = [f'an_vec_{i}' for i in range(12)]
_embedding_cols = [f'emb_{i}' for i in range(10)]
_colour_cols = [
    f'{prefix}_{channel}'
    for prefix in ('fill', 'svg_fill', 'diff_fill')
    for channel in ('r', 'g', 'b')
]
_position_cols = [
    'rel_height', 'rel_width',
    'rel_x_position', 'rel_y_position',
    'rel_x_position_to_animations', 'rel_y_position_to_animations',
]

col_order = (
    ['filename', 'animation_id']
    + _animation_vector_cols
    + _embedding_cols
    + _colour_cols
    + _position_cols
    + ['nr_paths_svg', 'label']
)

In [19]:
# Select the columns in the order defined by `col_order`.
train = train.loc[:, col_order]
test = test.loc[:, col_order]

In [20]:
# Preview the reordered training rows.
train.head(n=5)

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,diff_fill_g,diff_fill_b,rel_height,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,label
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,3
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,3
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,102.458333,102.458333,0.362904,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,3
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,-4.541667,-4.541667,0.084239,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,102.458333,102.458333,0.362888,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,2


### 5. Encode labels into 4-binary labels

In [21]:
# Replace the single label column by four binary columns rating_0 .. rating_3.
# Judging by the recorded output, rating_k is 1 when label > k (e.g. label 2 ->
# 1, 1, 0, 0) -- TODO confirm against `encode_classes`.
rating_columns = [f'rating_{i}' for i in range(4)]

train[rating_columns] = pd.DataFrame(encode_classes(np.array(train['label'])), index=train.index)
test[rating_columns] = pd.DataFrame(encode_classes(np.array(test['label'])), index=test.index)

train = train.drop(columns=['label']).reset_index(drop=True)
test = test.drop(columns=['label']).reset_index(drop=True)

In [22]:
# Preview the training rows with the encoded rating columns.
train.head(n=5)

Unnamed: 0,an_vec_0,an_vec_1,an_vec_2,an_vec_3,an_vec_4,an_vec_5,an_vec_6,an_vec_7,an_vec_8,an_vec_9,...,rel_width,rel_x_position,rel_y_position,rel_x_position_to_animations,rel_y_position_to_animations,nr_paths_svg,rating_0,rating_1,rating_2,rating_3
0,0,0,0,1,0,0,-1.0,-1.0,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,1,1,1,0
1,0,0,0,0,0,1,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,1,0
2,0,0,0,0,1,0,-1.0,-1.0,-1.0,-1.0,...,0.395994,0.501511,0.63794,0.714309,0.778553,24.0,1,1,1,0
3,1,0,0,0,0,0,0.134364,0.847434,-1.0,-1.0,...,0.054752,0.033838,0.04212,0.039501,0.051404,24.0,0,0,0,0
4,0,0,1,0,0,0,-1.0,-1.0,-1.0,0.763775,...,0.395994,0.501511,0.579289,0.714309,0.706974,24.0,1,1,0,0


### 6. Save data

In [23]:
# Write both splits as CSV files without the index column.
for split_frame, csv_path in (
    (train, 'data/surrogate_model/sm_train_data.csv'),
    (test, 'data/surrogate_model/sm_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# NOTE(review): from here to the end of the notebook the code repeats the
# executed pipeline above (load, prepare, merge, transform, encode, save)
# without recorded outputs. Consider keeping only one copy.

# Load the labelled animation vectors.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('data/surrogate_model/sm_animation_vectors_label_23042021.pkl', 'rb') as pickle_file:
    animations = pickle.load(pickle_file)

# Stroke and opacity features that are not meaningful here; removed from both splits.
uninformative_columns = [
    'stroke_width', 'opacity', 'stroke_opacity',
    'stroke_r', 'stroke_g', 'stroke_b',
    'svg_stroke_r', 'diff_stroke_r',
    'svg_stroke_g', 'diff_stroke_g',
    'svg_stroke_b', 'diff_stroke_b',
]

X_train = pd.read_csv("data/model_1/model_1_train.csv").drop(columns=uninformative_columns)

X_test = pd.read_csv("data/model_1/model_1_test.csv").drop(columns=uninformative_columns)

X_train.head(n=5)

animations.head(n=5)

### 2. Prepare animation data for merge

# Build the merge key by cutting off everything from "_animation" onwards,
# e.g. "logo_316_animation_0" -> "logo_316".
animations['filename'] = animations['file'].map(lambda name: name.partition("_animation")[0])

# Tim's feature: for every row, call the helper with the logo's SVG file, the
# path's animation id and the ids of all animated paths. The first element of
# the result is stored as the x value, the second as the y value.
def _relative_position_for_row(row):
    svg_file = f"data/initial_svgs/{row['filename']}.svg"
    return get_relative_pos_to_bounding_box_of_animated_paths(svg_file, row["animation_id"], row["animated_animation_ids"])

animations["rel_position_to_animations"] = animations.apply(_relative_position_for_row, axis=1)
animations["rel_x_position_to_animations"] = animations["rel_position_to_animations"].map(lambda position: position[0])
animations["rel_y_position_to_animations"] = animations["rel_position_to_animations"].map(lambda position: position[1])

# Columns that are not needed after the feature above has been computed.
unused_columns = [
    'file', 'order_id', 'path_prob', 'begin_value',
    'animated_animation_ids', 'animated_order_ids', 'backend_mapping',
    'logo_id', 'animation_number', 'alias', 'animation_file',
    'time', 'rel_position_to_animations',
]

# Rating names -> label indexes. Rows still holding 'no_rating' are removed afterwards.
mapping = {'Very Bad': 0, 'Bad': 1, 'Okay': 2, 'Good': 3, 'Very Good': 4}

animations = animations.drop(columns=unused_columns).replace({'label': mapping})
animations = animations.loc[animations['label'] != 'no_rating'].reset_index(drop=True)

animations.head(n=5)

animations['label'].value_counts()

### 3. Merge animation data with path vectors

# Left-join the path features onto every labelled animation row. Rows without a
# match in the respective split get NaN feature values.
merge_keys = ['filename', 'animation_id']
train = pd.merge(animations, X_train, how='left', on=merge_keys)
test = pd.merge(animations, X_test, how='left', on=merge_keys)

train.head(n=5)

train.isna().sum().sort_values(ascending=False)

# Drop all rows with missing values and report the shapes before and after.
print(f"Before: Train: {train.shape}. Test: {test.shape}")
train = train.dropna()
test = test.dropna()
print(f"After: Train: {train.shape}. Test: {test.shape}")

### 4. Transform animation vector into multiple dataframe columns and change column ordering

# Spread the list stored in 'model_output' over twelve scalar columns
# an_vec_0 .. an_vec_11, then remove the list column.
an_vec_columns = [f'an_vec_{i}' for i in range(12)]

train[an_vec_columns] = pd.DataFrame(train['model_output'].tolist(), index=train.index)
test[an_vec_columns] = pd.DataFrame(test['model_output'].tolist(), index=test.index)

# 'animation_id' and 'filename' stay in the frames: `col_order` below lists both.
train = train.drop(columns=['model_output'])
test = test.drop(columns=['model_output'])

train.head(n=5)

# Final column layout: identifiers, animation vector, embedding, colour
# features, size/position features, path count and finally the label.
col_order = (
    ['filename', 'animation_id']
    + an_vec_columns
    + [f'emb_{i}' for i in range(10)]
    + [f'{prefix}_{channel}' for prefix in ('fill', 'svg_fill', 'diff_fill') for channel in ('r', 'g', 'b')]
    + ['rel_height', 'rel_width', 'rel_x_position', 'rel_y_position']
    + ['rel_x_position_to_animations', 'rel_y_position_to_animations']
    + ['nr_paths_svg', 'label']
)

train = train.loc[:, col_order]
test = test.loc[:, col_order]

train.head(n=5)

### 5. Encode labels into 4-binary labels

# Replace the single label column by four binary columns rating_0 .. rating_3.
rating_columns = [f'rating_{i}' for i in range(4)]

train[rating_columns] = pd.DataFrame(encode_classes(np.array(train['label'])), index=train.index)
test[rating_columns] = pd.DataFrame(encode_classes(np.array(test['label'])), index=test.index)

train = train.drop(columns=['label']).reset_index(drop=True)
test = test.drop(columns=['label']).reset_index(drop=True)

train.head(n=5)

### 6. Save data

# Write both splits as CSV files without the index column.
for split_frame, csv_path in (
    (train, 'data/surrogate_model/sm_train_data.csv'),
    (test, 'data/surrogate_model/sm_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)