# Description

In this notebook, I will read, explore and load dataset

In [13]:
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from collections import Counter
import gensim
import gensim.downloader as api
import multiprocessing

from utils.utils_read_dataset import *
from utils.util_data_augmentation import *

In [2]:
PATH_CSV_FILE = r"data/raw_data/full_emoji.csv"
LIST_COLUMNS = ['emoji', 'unicode', 'name']

PATH_FOLDER_PROCESSED_DATA = r"data/processed_data"

N_SAMPLES = 1_000

GLOVE_SIMILAR_MODEL = gensim.downloader.load("glove-wiki-gigaword-50")

# 1. Read dataset

In [3]:
df_raw = pd.read_csv(PATH_CSV_FILE)
df_raw.drop(columns=['SoftBank', 'DoCoMo', 'KDDI'], inplace=True)  # Drop uncommon emotion

# Drop row with NaN
df_raw.dropna(inplace=True)
df_raw.reset_index(inplace=True)

# if N_SAMPLES != None:
#     df_raw = df_raw[:N_SAMPLES]
#     df_raw.reset_index(inplace=True)


print(f"Shape of df_raw: {df_raw.shape}")
df_raw.head()

Shape of df_raw: (708, 13)


Unnamed: 0,index,#,emoji,unicode,name,Apple,Google,Facebook,Windows,Twitter,JoyPixels,Samsung,Gmail
0,0,1,üòÄ,U+1F600,grinning face,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAPAKIFAJh3AP/z..."
1,1,2,üòÉ,U+1F603,grinning face with big eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIFAF5LAP/z..."
2,2,3,üòÑ,U+1F604,grinning face with smiling eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIGAF5LAJh3..."
3,3,4,üòÅ,U+1F601,beaming face with smiling eyes,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhDAAMAKIGAIoAAf/v..."
4,4,5,üòÜ,U+1F606,grinning squinting face,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","data:image/png;base64,R0lGODlhEAAMAKIFAF5LAP/z..."


In this application, we only consider the list of columns `['emoji', 'unicode', 'name']`

In [4]:
df = df_raw[LIST_COLUMNS]
print(f"Shape of df: {df.shape}")
df.head()

Shape of df: (708, 3)


Unnamed: 0,emoji,unicode,name
0,üòÄ,U+1F600,grinning face
1,üòÉ,U+1F603,grinning face with big eyes
2,üòÑ,U+1F604,grinning face with smiling eyes
3,üòÅ,U+1F601,beaming face with smiling eyes
4,üòÜ,U+1F606,grinning squinting face


In [5]:
print(f"Dtype of each column:")
df.info()

Dtype of each column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   emoji    708 non-null    object
 1   unicode  708 non-null    object
 2   name     708 non-null    object
dtypes: object(3)
memory usage: 16.7+ KB


In [6]:
idx = np.random.randint(0, len(df))
test_emoji = df.loc[idx, 'emoji']
test_unicode = df.loc[idx, 'unicode']
test_name = df.loc[idx, 'name']

print(f"Emoji: {test_emoji}")
print(f"Python type of emoju: {type(test_emoji)}")
print(f"unicode: {test_unicode}")
print(f"Name: {test_name}")

Emoji: üç≠
Python type of emoju: <class 'str'>
unicode: U+1F36D
Name: lollipop


# 2. Load dataset

In this section, I will load dataset into `npy` format. Including:

- We only take SINGLE unicode. 
- Data augmentation.
- Save to `npy` format.

## 2.1. Load dataset 

In [7]:
list_unicode = []
list_name = []

for i in range(len(df)):
    unicode = df.loc[i, 'unicode']
    name = df.loc[i, 'name']

    if len(unicode.split()) == 1:
        list_unicode.append(unicode)
        list_name.append(name)

assert len(list_unicode) == len(list_name)
print(f"Number of emoji: {len(list_unicode)}")

Number of emoji: 690


In [8]:
idx = np.random.randint(0, len(list_unicode))

unicode = list_unicode[idx]
name = list_name[idx]

print(f"Emoji: {convert_unicode_2_emoji(unicode)}")
print(f"Name: {name}")

Emoji: üéÇ
Name: birthday cake


## 2.2 Data augmentation

In [18]:
def augment_text(sentence, glove_similar_model, max_iter=3):

    iter = 0
    while 1:
        iter += 1
        if iter > max_iter:  break
        try:
            seed = random.choice([1, 2, 3, 4])
            if seed == 1:
                new_sentence = sentence_synonyms_replacement(sentence)
            elif seed == 2:
                new_sentence = random_swap(sentence)
            elif seed  == 3:
                new_sentence = back_translation(sentence)
            elif seed == 4:
                new_sentence = sentence_similar_replacement(sentence, glove_similar_model)
            else:
                new_sentence = sentence
            
            if sentence != new_sentence:   break  # Find new text
        except:
            pass

    return new_sentence

In [27]:
def create_augment_text(sentence, glove_similar_model, n_iter=20):
    list_new_sentence = set()
    list_new_sentence.add(sentence)
    
    for i in range(n_iter):
        new_sentence = augment_text(sentence, glove_similar_model)
        list_new_sentence.add(new_sentence)
    list_new_sentence.remove(sentence)
    
    return list(list_new_sentence)


def create_augment_text_parallel(sentence, glove_similar_model, n_iter=20, n_cpus=4):
    pool = multiprocessing.Pool(processes=n_cpus) 

    input_args = [(sentence, glove_similar_model) for i in range(n_iter)]
    list_new_sentence = pool.starmap(augment_text, input_args)
    list_new_sentence = set(list_new_sentence)
    pool.close()
    pool.join()

    return list(list_new_sentence)

In [26]:
%time
idx = np.random.randint(0, len(list_unicode))

unicode = list_unicode[idx]
name = list_name[idx]

print(f"Emoji: {convert_unicode_2_emoji(unicode)}")
print(f"Name: {name}")

print(f"List possible new name:")
# print(create_augment_text(name, GLOVE_SIMILAR_MODEL, n_iter=20))
print(create_augment_text_parallel(name, GLOVE_SIMILAR_MODEL, n_iter=20, n_cpus=multiprocessing.cpu_count()))

CPU times: user 2 ¬µs, sys: 1e+03 ns, total: 3 ¬µs
Wall time: 9.54 ¬µs
Emoji: ‚ôª
Name: recycling symbol
List possible new name:
['symbol recycling', 'recycle image', 'reuse symbolic_representation', 'recycling icon', 'reprocess symbolisation', 'waste symbols', 'reuse symbolizes', 'recycle symbolizes', 'waste symbolizes', 'reprocess symbolic_representation']


In [28]:
list_new_unicode = []
list_new_name = []

for (unicode, name) in zip(list_unicode, list_name):
    list_new_unicode.append(unicode)
    list_new_name.append(name)

    try:
        # list_augmented_name = create_augment_text(name, GLOVE_SIMILAR_MODEL)
        list_augmented_name = create_augment_text_parallel(name, GLOVE_SIMILAR_MODEL)
        for augmented_name in list_augmented_name:
            list_new_unicode.append(unicode)
            list_new_name.append(augmented_name)
    except:
        pass

assert len(list_new_unicode) == len(list_new_name)
print(f"Number of NEW emoji: {len(list_new_unicode)}")

Number of NEW emoji: 7061


In [32]:
idx = np.random.randint(0, len(list_unicode))

unicode = list_unicode[idx]
name = list_name[idx]

print(f"Emoji: {convert_unicode_2_emoji(unicode)}")
print(f"Name: {name}")

Emoji: üê≠
Name: mouse face


# 3. Save dataset

We will save the processed dataset into `npy` file.

## 3.1 Remove label with only 1 sample

In [33]:
histogram = Counter(list_new_unicode)

removal_list = [unicode for unicode in list_new_unicode if histogram[unicode] <= 5]
print(f"List removal: {removal_list}")

List removal: ['U+1F4AB', 'U+1F4AB', 'U+1F4AB', 'U+1F4AB', 'U+1F4AB', 'U+1F44C', 'U+1F443', 'U+1F443', 'U+1F443', 'U+1F443', 'U+1F443', 'U+1F645', 'U+1F478', 'U+1F478', 'U+1F478', 'U+1F478', 'U+1F3C2', 'U+1F3C2', 'U+1F3C2', 'U+1F3C2', 'U+1F3C2', 'U+1F411', 'U+1F411', 'U+1F411', 'U+1F411', 'U+1F411', 'U+1F42B', 'U+1F42B', 'U+1F42B', 'U+1F42B', 'U+1F418', 'U+1F418', 'U+1F418', 'U+1F418', 'U+1F418', 'U+1F43B', 'U+1F43B', 'U+1F43B', 'U+1F43B', 'U+1F43B', 'U+1F427', 'U+1F427', 'U+1F427', 'U+1F427', 'U+1F427', 'U+1F419', 'U+1F419', 'U+1F419', 'U+1F419', 'U+1F419', 'U+1F33A', 'U+1F33A', 'U+1F33A', 'U+1F33A', 'U+1F33A', 'U+1F331', 'U+1F331', 'U+1F331', 'U+1F331', 'U+1F331', 'U+1F348', 'U+1F348', 'U+1F348', 'U+1F348', 'U+1F348', 'U+1F35D', 'U+1F35D', 'U+1F35D', 'U+1F35D', 'U+1F35D', 'U+1F362', 'U+1F362', 'U+1F362', 'U+1F362', 'U+1F362', 'U+1F361', 'U+1F361', 'U+1F361', 'U+1F361', 'U+1F361', 'U+1F36A', 'U+1F36A', 'U+1F36A', 'U+1F36A', 'U+1F36A', 'U+1F36E', 'U+1F36E', 'U+1F36E', 'U+1F36E', 'U+1F3

In [34]:
removal_indices = [np.where(element == list_new_unicode)[0] for element in removal_list]
print(f"Indices of removal: {removal_indices}")

Indices of removal: [array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int64), array([], dtype=int

In [35]:
list_new_unicode = np.delete(list_new_unicode, removal_indices)
list_new_name = np.delete(list_new_name, removal_indices)

In [36]:
print(f"Number of target name: {len(np.unique(list_new_unicode))}") 

Number of target name: 690


## 3.2. Save into `npy` file

In [38]:
list_new_unicode = np.array(list_new_unicode)
list_new_name = np.array(list_new_name)

np.save(os.path.join(f"{PATH_FOLDER_PROCESSED_DATA}", "list_processed_unicode.npy"), list_new_unicode)
np.save(os.path.join(f"{PATH_FOLDER_PROCESSED_DATA}", "list_processed_name.npy"), list_new_name)