# Clasificador basado en transformers

[Twitter-roBERTa-base for Emoji prediction](https://huggingface.co/cardiffnlp/twitter-roberta-base-emoji)

In [1]:
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [2]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens are kept unchanged, and the
    tokens are joined back with single spaces.

    Args:
        text: Input string.

    Returns:
        The string with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone "@" is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [3]:
# The task name selects which checkpoint identifier is built below.
task='emoji'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
# Local folder used as a cache for the tokenizer files.
folder = MODEL.replace('cardiffnlp','modelos')

# Try the local copy first; on failure, load by the MODEL identifier and save
# a copy to the local folder. Loading from a missing or incomplete folder can
# raise either OSError or ValueError, so both trigger the fallback (this also
# matches the OSError handling used when loading the model).
try:
    tokenizer = AutoTokenizer.from_pretrained(folder)
except (OSError, ValueError):
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tokenizer.save_pretrained(folder)

In [4]:
# Download the label mapping file for the selected task
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")
# Rows are tab-separated; the second field of each row is kept as the label.
# Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
labels = [
    fields[1]
    for fields in csv.reader(mapping_lines, delimiter='\t')
    if len(fields) > 1
]

In [5]:
# Load the classification model from the local folder when it is available;
# otherwise load it by the MODEL identifier and save a copy to the local
# folder so that later runs can reuse it.
try:
    model = AutoModelForSequenceClassification.from_pretrained(folder)
except OSError:
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model.save_pretrained(folder)

Palabras rellenar

In [6]:
def eval_text(text):
    """Classify a single text and return the best label index and all scores.

    The text is normalised with ``preprocess``, tokenised, and passed through
    the model; the first output row is converted to probabilities with softmax.

    Args:
        text: Raw input string.

    Returns:
        A tuple ``(index, scores)`` where ``index`` is the position of the
        highest score and ``scores`` is a NumPy array of softmax probabilities.
    """
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: disabling gradient tracking avoids building the autograd
    # graph on every call, which saves time and memory without changing the
    # resulting scores.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    return np.argmax(scores), scores

In [7]:
def rank_emojis_text(text):
    """Print every label with its score for ``text``, highest score first.

    Each output line has the form ``<rank>) <label> <score>``, with the score
    rounded to four decimals.

    Args:
        text: Raw input string, passed on to ``eval_text``.
    """
    scores = eval_text(text)[1]
    # argsort is ascending, so reverse it to go from most to least probable.
    descending = np.argsort(scores)[::-1]
    for position, idx in enumerate(descending, start=1):
        print(f"{position}) {labels[idx]} {np.round(float(scores[idx]), 4)}")

In [8]:
# Example sentence: show the label with the highest score and its index.
ejemplo = "Looking forward to Christmas"
label = eval_text(ejemplo)[0]
print(f"{ejemplo}\nemoji escogido = {labels[label]} ({label})")

Looking forward to Christmas
emoji escogido = 🎄 (17)


In [9]:
# Print all labels for the example sentence, ordered from highest to lowest score.
rank_emojis_text(ejemplo)

1) 🎄 0.5457
2) 😊 0.1417
3) 😁 0.0649
4) 😍 0.0395
5) ❤ 0.03
6) 😜 0.028
7) ✨ 0.0263
8) 😉 0.0237
9) 😂 0.0177
10) 😎 0.0166
11) 😘 0.0143
12) 💕 0.014
13) 💙 0.0076
14) 💜 0.0068
15) 🔥 0.0065
16) 💯 0.004
17) 🇺🇸 0.0037
18) 📷 0.0034
19) ☀ 0.0033
20) 📸 0.0021


Iteramos sobre el test set

In [15]:
import pickle

path =  "../../Data/test/df_us_test.pickle"
# Use a context manager so the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(path, "rb") as test_file:
    df_us_test = pickle.load(test_file)

In [16]:
%%time
y_pred = []

for texto in df_us_test['text']:
    label, _ = eval_text(texto)
    y_pred.append(label)

CPU times: user 3h 25min 31s, sys: 1min 10s, total: 3h 26min 41s
Wall time: 34min 28s


In [18]:
from sklearn.metrics import classification_report

# Compare the predictions against the test set's "label" column (cast to int)
# and print per-label metrics, using the downloaded labels as display names.
y_true = df_us_test["label"].astype(int)
print(classification_report(y_true, y_pred, target_names=labels))

              precision    recall  f1-score   support

           ❤       0.74      0.86      0.80     10798
           😍       0.32      0.44      0.37      4830
           😂       0.45      0.53      0.49      4534
           💕       0.28      0.14      0.18      2605
           🔥       0.58      0.53      0.55      3716
           😊       0.14      0.27      0.18      1613
           😎       0.19      0.23      0.20      1996
           ✨       0.31      0.42      0.35      2749
           💙       0.24      0.09      0.13      1549
           😘       0.18      0.20      0.19      1175
           📷       0.30      0.70      0.42      1432
          🇺🇸       0.72      0.53      0.61      1949
           ☀       0.71      0.62      0.67      1265
           💜       0.34      0.02      0.04      1114
           😉       0.16      0.10      0.12      1306
           💯       0.34      0.12      0.18      1244
           😁       0.14      0.03      0.05      1153
           🎄       0.64    

Guardamos los resultados (la lista de predicciones) en un archivo pickle

In [19]:
import os
import pickle

# Create the output folder if it does not exist yet; otherwise open() would
# fail with FileNotFoundError and the predictions would not be saved.
os.makedirs('resultados_test', exist_ok=True)
with open('resultados_test/twitter-roberta-base.pickle', 'wb') as handle:
    pickle.dump(y_pred, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Read the saved prediction list back from disk.
with open('resultados_test/twitter-roberta-base.pickle', 'rb') as saved_file:
    preds = pickle.load(saved_file)

Para volver a cargar la lista

In [22]:
preds[:10]  # first 10 predicted labels

[1, 10, 6, 1, 1, 17, 2, 10, 12, 10]