In [108]:
import json

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_feature_encoding
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

In [109]:
from pathlib import Path

In [110]:
source_data_dir = Path(r"F:\Datasets\Fairness\TweetAAE\Processed\sentiment_race")

In [111]:
texts = {}
for name in ['pos_pos', 'pos_neg', 'neg_pos', 'neg_neg']:
    with open(source_data_dir / (name + "_text"), encoding="latin-1") as f:
        _texts = f.readlines()
        _texts = [_t.strip() for _t in _texts]
        texts[name] = _texts

In [112]:
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, 32)

Tokenizing using dictionary from d:\Project\torchMoji/model/vocabulary.json


In [113]:
from tqdm import tqdm

In [114]:
tokenized_texts = {}
for name in texts.keys():
    tokenized = []
    for _text in tqdm(texts[name]):
        try:
            _tokenized, _, _ = st.tokenize_sentences([_text])
        except:
            _tokenized = [None]
        tokenized.append(_tokenized[0])
    tokenized_texts[name] = tokenized

100%|██████████| 73094/73094 [00:12<00:00, 5693.15it/s]
100%|██████████| 100007/100007 [00:18<00:00, 5293.93it/s]
100%|██████████| 44059/44059 [00:07<00:00, 5879.52it/s]
100%|██████████| 100001/100001 [00:17<00:00, 5664.65it/s]


In [115]:
import pandas as pd

In [116]:
data_dfs = {}
for name in texts.keys():
    data_dfs[name] = pd.DataFrame({
        "text":texts[name],
        "tokenized_texts":tokenized_texts[name],
    })

In [117]:
data_dfs["pos_neg"]

Unnamed: 0,text,tokenized_texts
0,Yayyyy the macs fixed,"[1, 10, 12646, 2259, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,Ik nothing about basketball sooooo imma keep q...,"[3701, 190, 52, 1998, 1, 2831, 276, 1531, 34, ..."
2,"Monsters and coffee at 9 o'clock!! Oh ya , it'...","[5357, 12, 511, 56, 4, 5042, 50, 501, 1964, 14..."
3,I can't wait until Monday & gt ; & gt ; & gt ;...,"[18, 164, 262, 393, 1875, 193, 2952, 385, 193,..."
4,Surprise mudding trip,"[1146, 22084, 978, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
100002,_TWITTER-ENTITY_ Hahaha lolololol it means dir...,"[1, 1, 1, 1561, 11295, 26, 689, 3111, 1156, 1,..."
100003,_TWITTER-ENTITY_ haha I eat cats dogs are nast...,"[1, 1, 1, 924, 18, 330, 2050, 900, 24, 1062, 1..."
100004,_TWITTER-ENTITY_ the facial expressions are th...,"[1, 1, 1, 10, 4884, 11677, 24, 10, 204, 805, 4..."
100005,Well that was extremely awkward,"[100, 23, 25, 662, 2064, 0, 0, 0, 0, 0, 0, 0, ..."


In [118]:
df = data_dfs["pos_neg"]
df = df[df["tokenized_texts"].notnull()]
df = df.sample(n=44000, random_state=2020)
df

Unnamed: 0,text,tokenized_texts
37519,_TWITTER-ENTITY_ tell me it's not funny !,"[1, 1, 1, 307, 76, 68, 20, 288, 19, 0, 0, 0, 0..."
99003,â _TWITTER-ENTITY_ : wtf do you mean you don...,"[1, 1, 1, 1, 1, 1, 148, 1047, 126, 13, 376, 13..."
70610,Everyone has their opinion so SHUT THE FUCK UP...,"[314, 81, 74, 735, 60, 505, 10, 133, 112, 386,..."
72997,I actually put jeans on today and I already wa...,"[18, 286, 250, 2883, 44, 468, 12, 18, 386, 113..."
71307,_TWITTER-ENTITY_ LMFAO . I actually sing good ...,"[1, 1, 1, 1916, 11, 18, 286, 1613, 32, 12, 18,..."
...,...,...
38855,What do ducks smoke ? Quack .,"[39, 126, 10086, 1273, 61, 19412, 11, 0, 0, 0,..."
69142,_TWITTER-ENTITY_ we went into protective mode ...,"[1, 1, 1, 54, 239, 185, 8056, 3638, 1, 40, 362..."
22623,Spending all day tomorrow studying for exams,"[1263, 40, 196, 1576, 2895, 16, 4035, 0, 0, 0,..."
83328,"_TWITTER-ENTITY_ hahah , you flipped out when ...","[1, 1, 1, 2658, 14, 13, 8578, 53, 63, 13, 627,..."


In [119]:
for name in data_dfs.keys():
    df = data_dfs[name]
    df = df[df["tokenized_texts"].notnull()]
    df = df.sample(n=44000, random_state=2020)
    data_dfs[name] = df

In [120]:
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

Loading model from d:\Project\torchMoji/model/pytorch_model.bin.


  nn.init.uniform(self.embed.weight.data, a=-0.5, b=0.5)
  nn.init.xavier_uniform(t)
  nn.init.orthogonal(t)
  nn.init.constant(t, 0)


Loading weights for embed.weight
Loading weights for lstm_0.weight_ih_l0
Loading weights for lstm_0.weight_hh_l0
Loading weights for lstm_0.bias_ih_l0
Loading weights for lstm_0.bias_hh_l0
Loading weights for lstm_0.weight_ih_l0_reverse
Loading weights for lstm_0.weight_hh_l0_reverse
Loading weights for lstm_0.bias_ih_l0_reverse
Loading weights for lstm_0.bias_hh_l0_reverse
Loading weights for lstm_1.weight_ih_l0
Loading weights for lstm_1.weight_hh_l0
Loading weights for lstm_1.bias_ih_l0
Loading weights for lstm_1.bias_hh_l0
Loading weights for lstm_1.weight_ih_l0_reverse
Loading weights for lstm_1.weight_hh_l0_reverse
Loading weights for lstm_1.bias_ih_l0_reverse
Loading weights for lstm_1.bias_hh_l0_reverse
Loading weights for attention_layer.attention_vector
Ignoring weights for output_layer.0.weight
Ignoring weights for output_layer.0.bias
TorchMoji(
  (embed): Embedding(50000, 256)
  (embed_dropout): Dropout2d(p=0, inplace=False)
  (lstm_0): LSTMHardSigmoid(256, 512, batch_first

In [60]:
import numpy as np

np.stack(df["tokenized_texts"].to_list(), axis=1).T

array([[ 128,  943,  248, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0],
       ...,
       [ 907,   17,   67, ...,    0,    0,    0],
       [ 249,   10, 2895, ...,    0,    0,    0],
       [   1,    1,    1, ...,    0,    0,    0]], dtype=uint16)

In [63]:
_encoding = model(np.stack(df["tokenized_texts"].to_list(), axis=1).T[:200])

In [121]:
print('Encoding texts..')

for name in data_dfs.keys():
    df = data_dfs[name]
    _encoding = model(np.stack(df["tokenized_texts"].to_list(), axis=1).T)
    df["encoding"] = list(_encoding)
    data_dfs[name] = df

Encoding texts..




In [124]:
data_dfs[name].iloc[:5]

Unnamed: 0,text,tokenized_texts,encoding
36083,I'm sitting next to the most awkward couple on...,"[128, 943, 248, 17, 10, 169, 2064, 659, 44, 10...","[0.08898257, 0.014492264, -0.0073325643, 0.0, ..."
31787,... _TWITTER-ENTITY_ you snapped me a pic of y...,"[1, 1, 1, 1, 13, 6438, 76, 15, 1788, 21, 13, 1...","[-0.047121312, -0.15688002, -0.0153493155, -0...."
33964,_TWITTER-ENTITY_ I had my head covered or hand...,"[1, 1, 1, 18, 58, 41, 471, 1920, 127, 868, 118...","[0.0033988312, -0.0069653085, -0.030654537, 0...."
26097,I just need a charger for my ipod,"[18, 42, 159, 15, 1898, 16, 41, 2024, 0, 0, 0,...","[0.000186576, 0.033296738, 0.0, 0.0, -0.003505..."
89039,$18 to go to the 103rd floor & amp ; stand in ...,"[179, 4, 17, 157, 17, 10, 4, 1855, 1092, 193, ...","[0.0014130066, 0.004298318, -0.09110178, 0.0, ..."


In [125]:
for name in data_dfs.keys():
    data_dfs[name].iloc[:40000].to_pickle(source_data_dir / ("train_"+name+"_df.pkl"))
    data_dfs[name].iloc[40000:42000].to_pickle(source_data_dir / ("dev_"+name+"_df.pkl"))
    data_dfs[name].iloc[42000:44000].to_pickle(source_data_dir / ("test_"+name+"_df.pkl"))