In [59]:
import torch
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from transformers import RobertaModel

In [64]:
tokenizer_path = "./tokenizer_roberta"
model_path = "./model_roberta"

tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

In [65]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits

    probabilities = torch.softmax(logits, dim=1).squeeze()

    predicted_class = torch.argmax(probabilities).item()
    predicted_sentiment = "positive" if predicted_class == 1 else "negative"

    return predicted_sentiment, probabilities


In [66]:
text = "Hello, how are you?"
sentiment, probabilities = predict_sentiment(text)
print("Sentiment prédit:", sentiment)
print("Probabilités:", probabilities)

Sentiment prédit: positive
Probabilités: tensor([0.0429, 0.9571])


In [67]:
tokenizer_path = "./tokenizer_roberta"
model_path = "./model_roberta"

tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaModel.from_pretrained(model_path)

Some weights of RobertaModel were not initialized from the model checkpoint at ./model_roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
def interpret_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Calculer les embeddings
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    logits = output.logits
    embeddings = model.roberta.embeddings.word_embeddings(input_ids)

    # Récupérer les couches d'attention
    attentions = output.attentions

    return embeddings, attentions

In [69]:
sentence = "Hello, how are you?"
embeddings, attentions = interpret_sentence(sentence)

AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'

In [71]:
import torch
from transformers import RobertaTokenizer, RobertaModel
import matplotlib.pyplot as plt
import seaborn as sns

tokenizer_path = "./tokenizer_roberta"
model_path = "./model_roberta"

tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaModel.from_pretrained(model_path)
model.eval()

def interpret_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Calculer les embeddings
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    embeddings = output.last_hidden_state
    attentions = output.attentions

    return embeddings, attentions

def visualize_attention(sentence, attentions):
    # Récupérer la longueur de la phrase
    n_tokens = len(tokenizer(sentence)['input_ids'])

    # Créer une heatmap pour chaque couche d'attention
    for layer, layer_attention in enumerate(attentions):
        plt.figure(figsize=(10, 6))
        sns.heatmap(layer_attention[0][:n_tokens, :n_tokens], cmap='viridis', annot=False)
        plt.title(f'Layer {layer+1} Attention')
        plt.xlabel('To')
        plt.ylabel('From')
        plt.show()

# Utiliser la fonction pour visualiser l'attention
sentence = "Hello, how are you?"
embeddings, attentions = interpret_sentence(sentence)
visualize_attention(sentence, attentions)


Some weights of RobertaModel were not initialized from the model checkpoint at ./model_roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: 'NoneType' object is not iterable

In [47]:
import pandas as pd

data = pd.read_csv('../Data/tweets.csv', encoding='latin-1', header=None)
data = data.rename(columns={data.columns[0]: 'target'})
data = data.rename(columns={data.columns[1]: 'id'})
data = data.rename(columns={data.columns[2]: 'date'})
data = data.rename(columns={data.columns[3]: 'flag'})
data = data.rename(columns={data.columns[4]: 'user'})
data = data.rename(columns={data.columns[5]: 'text'})
data = data.sample(1000)
data

Unnamed: 0,target,id,date,flag,user,text
999755,4,1879920060,Thu May 21 23:32:07 PDT 2009,NO_QUERY,buddhapest,@NovaWildstar my dear husband will have to go ...
793985,0,2326819086,Thu Jun 25 07:33:10 PDT 2009,NO_QUERY,Sorabu,I wanna go to the beach
1418798,4,2057852183,Sat Jun 06 13:47:07 PDT 2009,NO_QUERY,cwoffeegirl,@cleff re:hangover - ahhhh I so want to see th...
1070802,4,1966217564,Fri May 29 17:20:55 PDT 2009,NO_QUERY,aminorjourney,Wow. I was featured on AutoBlog Green Sweet.
918493,4,1753664697,Sun May 10 02:47:30 PDT 2009,NO_QUERY,mob61uk,"@SteveLangton Yes, they clearly relished actin..."
...,...,...,...,...,...,...
1492623,4,2069325416,Sun Jun 07 15:40:16 PDT 2009,NO_QUERY,Arantza92,@LisaHopeCyrus well if u're tired go to sleep ...
1335722,4,2017084567,Wed Jun 03 07:50:09 PDT 2009,NO_QUERY,lauraeatworld,trying to write
126342,0,1834499716,Mon May 18 03:10:14 PDT 2009,NO_QUERY,tewitje,Having hayfever when it is raining is so wrong.
1315983,4,2014100940,Wed Jun 03 00:27:46 PDT 2009,NO_QUERY,KeiraShakesby,is watching bones b4 work


In [48]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
text_list = data['text'].tolist()
y_pred = classifier(text_list)

In [49]:
y_pred_tab = []
for i in y_pred:
    if i['label'] == 'LABEL_1':
        y_pred_tab.append(1)
    else :
        y_pred_tab.append(0)

In [50]:
y_test_tab = []
for i in range(len(data)):
    if data['target'].iloc[i] == 0:
        y_test_tab.append(0)
    else :
        y_test_tab.append(1)

In [51]:
from sklearn.metrics import accuracy_score

base_accuracy = accuracy_score(y_test_tab, y_pred_tab)

In [52]:
base_accuracy

0.838

In [53]:
def interpret_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Calculer les embeddings
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)

    logits = output.logits
    embeddings = model.roberta.embeddings.word_embeddings(input_ids)

    attentions = output.attentions

    return embeddings, attentions


In [54]:
sentence = "Go fuck yourself i hate you"
embeddings, attentions = interpret_sentence(sentence)

In [55]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

In [56]:
visualization.visualize_text_attention(sentence, attentions)

AttributeError: module 'captum.attr._utils.visualization' has no attribute 'visualize_text_attention'

In [58]:
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_attention(sentence, attentions):
    # Récupérer la longueur de la phrase
    n_tokens = len(tokenizer(sentence)['input_ids'])

    # Créer une heatmap pour chaque couche d'attention
    for layer, layer_attention in enumerate(attentions):
        plt.figure(figsize=(10, 6))
        sns.heatmap(layer_attention[0][:n_tokens, :n_tokens], cmap='viridis', annot=False)
        plt.title(f'Layer {layer+1} Attention')
        plt.xlabel('To')
        plt.ylabel('From')
        plt.show()

# Utiliser la fonction pour visualiser l'attention
visualize_attention(sentence, attentions)

TypeError: 'NoneType' object is not iterable