# Clasificador basado en transformers

[BERTweet-base for Emoji prediction](https://huggingface.co/cardiffnlp/bertweet-base-emoji)

In [1]:
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [2]:
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. Each token that starts with '@' and
    is longer than one character becomes '@user'; each token that starts
    with 'http' becomes 'http'. All other tokens are kept unchanged.

    Args:
        text: Input string.

    Returns:
        The tokens joined back together with single spaces.
    """
    def _placeholder(token):
        # A lone '@' is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

In [3]:
task = 'emoji'
MODEL = f"cardiffnlp/bertweet-base-{task}"
# Local directory name derived from the hub id ("cardiffnlp" -> "modelos").
folder = MODEL.replace('cardiffnlp', 'modelos')

# Try the local copy first; if it cannot be loaded, fetch the tokenizer by
# its hub id and save it into `folder`.
try:
    tokenizer = AutoTokenizer.from_pretrained(folder)
except (OSError, ValueError):
    # The model-loading cell handles the same "no local copy" situation via
    # OSError, so both exception types are accepted here as well.
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [4]:
# Download the label mapping for `task` and keep the second tab-separated
# column of every row that has at least two columns.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

labels = []
for row in csv.reader(mapping_lines, delimiter='\t'):
    if len(row) > 1:
        labels.append(row[1])

In [6]:
# Try the local copy first; if it cannot be loaded, fetch the model by its
# hub id and save it into `folder`.
try:
    model = AutoModelForSequenceClassification.from_pretrained(folder)
except (OSError, ValueError):
    # The tokenizer-loading cell handles the same "no local copy" situation
    # via ValueError, so both exception types are accepted here as well.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(folder)


Palabras rellenar

In [7]:
def eval_text(text):
    """Return the most probable emoji index for `text` together with all scores.

    The text is normalised with `preprocess`, tokenised with the module-level
    `tokenizer` and fed to the module-level `model`. The first row of the
    model's first output is converted to NumPy and passed through softmax.

    Args:
        text: Input string to classify.

    Returns:
        A tuple `(label, scores)` where `scores` is the softmax-normalised
        NumPy array and `label` is `np.argmax(scores)`.
    """
    text = preprocess(text)
    # NOTE(review): no truncation is requested here, so inputs longer than
    # the model's maximum sequence length are passed as-is -- confirm the
    # data cannot exceed it.
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return np.argmax(scores), scores

In [8]:
def rank_emojis_text(text):
    """Print every emoji label for `text`, ordered from most to least probable.

    Each line has the form "<rank>) <label> <score>", with the score rounded
    to four decimals. Scores come from `eval_text`; labels from the
    module-level `labels` list.

    Args:
        text: Input string to classify.
    """
    _, scores = eval_text(text)
    # argsort is ascending; reverse it so the best score comes first.
    descending = np.argsort(scores)[::-1]
    for position, idx in enumerate(descending, start=1):
        emoji = labels[idx]
        prob = scores[idx]
        print(f"{position}) {emoji} {np.round(float(prob), 4)}")

In [9]:
# Classify one sample sentence and print the chosen emoji with its index.
ejemplo = "Looking forward to Christmas"
label, _ = eval_text(ejemplo)
print(f"{ejemplo}\nemoji escogido = {labels[label]} ({label})")

Looking forward to Christmas
emoji escogido = 🎄 (17)


In [10]:
# Print the probability-ordered emoji ranking for the same example sentence.
rank_emojis_text(ejemplo)

1) 🎄 0.7746
2) ❤ 0.0532
3) 😊 0.0275
4) 😍 0.0265
5) 😁 0.0154
6) ✨ 0.0141
7) 💕 0.0135
8) 😂 0.0115
9) 💜 0.0106
10) 😘 0.0099
11) 💙 0.0091
12) 😜 0.0067
13) 😉 0.0054
14) 🔥 0.0041
15) ☀ 0.004
16) 💯 0.0039
17) 😎 0.0034
18) 🇺🇸 0.0027
19) 📷 0.0026
20) 📸 0.0013


Iteramos sobre el test set

In [10]:
import pickle

path = "../../Data/test/df_us_test.pickle"
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once loading finishes.
with open(path, "rb") as test_file:
    df_us_test = pickle.load(test_file)

In [11]:
%%time
# Predict one emoji index per text of the test set, in dataset order;
# only the label returned by eval_text is kept, the scores are discarded.
y_pred = [eval_text(texto)[0] for texto in df_us_test['text']]

CPU times: user 3h 42min 23s, sys: 1min 32s, total: 3h 43min 55s
Wall time: 37min 22s


In [12]:
from sklearn.metrics import classification_report

# Compare the integer-cast ground-truth labels against the predictions and
# print the per-class report, using the emoji labels as class names.
y_true = df_us_test["label"].astype(int)
report = classification_report(y_true, y_pred, target_names=labels)
print(report)

              precision    recall  f1-score   support

           ❤       0.81      0.85      0.83     10798
           😍       0.33      0.53      0.40      4830
           😂       0.42      0.66      0.51      4534
           💕       0.28      0.24      0.26      2605
           🔥       0.74      0.44      0.56      3716
           😊       0.16      0.19      0.17      1613
           😎       0.21      0.25      0.23      1996
           ✨       0.34      0.41      0.37      2749
           💙       0.34      0.06      0.10      1549
           😘       0.18      0.24      0.21      1175
           📷       0.31      0.71      0.43      1432
          🇺🇸       0.82      0.57      0.67      1949
           ☀       0.75      0.58      0.65      1265
           💜       0.23      0.01      0.03      1114
           😉       0.18      0.07      0.10      1306
           💯       0.29      0.27      0.28      1244
           😁       0.14      0.05      0.07      1153
           🎄       0.65    

Guardamos los resultados en pickle (la lista)

In [13]:
import os
import pickle

# Create the output directory if needed so the predictions are not lost to
# a FileNotFoundError when the folder does not exist yet.
os.makedirs('resultados_test', exist_ok=True)

# Persist the list of predicted labels.
with open('resultados_test/bertweet-base.pickle', 'wb') as handle:
    pickle.dump(y_pred, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Read the saved prediction list back from disk.
with open('resultados_test/bertweet-base.pickle', 'rb') as saved_predictions:
    preds = pickle.load(saved_predictions)

La celda anterior permite volver a cargar la lista guardada.

In [15]:
preds[:10]  # first 10 labels

[1, 10, 6, 1, 1, 17, 2, 10, 12, 10]