## Load model from Hugging Face

In [1]:
# Load the tokenizer and the sequence-classification model for the sentiment
# checkpoint. `from_pretrained` resolves the identifier below (a Hub repo id,
# or a local directory with the same path if one exists).
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

2023-09-18 20:24:08.937400: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the model configuration (its `id2label` mapping is used later when
# printing predictions) and write the model and tokenizer files to a local
# directory whose relative path equals the checkpoint identifier.
MODEL_NAME = "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"

config = AutoConfig.from_pretrained(MODEL_NAME)
model.save_pretrained(MODEL_NAME)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(MODEL_NAME)

('citizenlab/twitter-xlm-roberta-base-sentiment-finetunned/tokenizer_config.json',
 'citizenlab/twitter-xlm-roberta-base-sentiment-finetunned/special_tokens_map.json',
 'citizenlab/twitter-xlm-roberta-base-sentiment-finetunned/sentencepiece.bpe.model',
 'citizenlab/twitter-xlm-roberta-base-sentiment-finetunned/added_tokens.json',
 'citizenlab/twitter-xlm-roberta-base-sentiment-finetunned/tokenizer.json')

## Load all libraries

In [11]:
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.special import softmax

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/sulu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sulu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Function to preprocess text

In [4]:
def preprocess(text):
    """Lemmatize a text and mask user mentions and links.

    The input is split on whitespace and every token is passed through
    ``WordNetLemmatizer.lemmatize``. After that, a token that starts with
    ``@`` and is longer than one character is replaced by ``@user``, and a
    token that starts with ``http`` is replaced by ``http``.

    Args:
        text: Raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    def mask(token):
        # Mentions are checked first; '@user' never starts with 'http',
        # so the two replacements cannot interfere with each other.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(mask(lemmatizer.lemmatize(token)) for token in text.split())

In [5]:
# Run the model on one example sentence and list every label with its score.
text = preprocess("Good night 😊")
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First element of the model output, first row of the batch, turned into a
# NumPy array and normalised with softmax.
scores = softmax(output[0][0].detach().numpy())

# Label ids ordered from the highest to the lowest score.
ranking = np.argsort(scores)[::-1]
for rank, label_id in enumerate(ranking, start=1):
    label = config.id2label[label_id]
    prob = scores[label_id]
    print(f"{rank}) {label} {np.round(float(prob), 4)}")

1) Positive 0.8976
2) Neutral 0.1007
3) Negative 0.0017


## Test model on validation dataset

In [6]:
# Load the validation CSV (adjust the path to point at your own file).
csv_file = "../../data/val_data.csv"
df = pd.read_csv(csv_file)

# Predicted class ids, one per row, in DataFrame order.
predicted_sentiments = []
# Ground-truth labels as a plain list, taken directly from the label column.
actual_labels = df['label'].tolist()

# Inference only: no_grad() avoids recording an autograd graph for every
# forward pass, which saves memory and time over the whole dataset.
with torch.no_grad():
    for text in df['text']:
        text = preprocess(text)
        # truncation=True caps the input at the tokenizer's maximum length so
        # an unusually long text cannot exceed what the model accepts.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        softmax_scores = softmax(scores)

        # The predicted class is the index with the highest probability.
        # NOTE(review): the comparison below assumes the integer labels in the
        # CSV use the same id order as the model's output classes - confirm.
        predicted_label_id = int(np.argmax(softmax_scores))
        predicted_sentiments.append(predicted_label_id)

# Attach the predictions to the DataFrame; later cells read this column.
df['predicted_sentiment'] = predicted_sentiments

# Accuracy = share of rows whose prediction equals the ground-truth label.
correct_predictions = (df['predicted_sentiment'] == df['label']).sum()
total_predictions = len(df)
accuracy = correct_predictions / total_predictions

print(f"Genauigkeit: {accuracy:.2%}")

Genauigkeit: 62.10%


## Print statistics and confusion matrix

In [7]:
# Preview the first rows of the validation frame.
df.head()

Unnamed: 0,text,label,predicted_sentiment
0,Dark Souls 3 April Launch Date Confirmed With ...,1,1
1,"""National hot dog day, national tequila day, t...",2,1
2,When girls become bandwagon fans of the Packer...,0,1
3,@user I may or may not have searched it up on ...,1,1
4,Here's your starting TUESDAY MORNING Line up a...,1,1


In [8]:
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Class ids covered by the report. Passing them explicitly fixes the shape of
# every result to len(class_ids), so the row/column names below always match
# even if a class happens to be missing from the labels or the predictions.
class_ids = [0, 1, 2]
y_true = df['label']
y_pred = df['predicted_sentiment']

# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)

# Precision and recall for each class separately.
precision = precision_score(y_true, y_pred, labels=class_ids, average=None)
recall = recall_score(y_true, y_pred, labels=class_ids, average=None)

# Macro average: the unweighted mean of the per-class values.
macro_precision = precision_score(y_true, y_pred, labels=class_ids, average='macro')
macro_recall = recall_score(y_true, y_pred, labels=class_ids, average='macro')

# Wrap the confusion matrix in a labelled DataFrame.
confusion_df = pd.DataFrame(
    conf_matrix,
    columns=[f"Predicted Class {c}" for c in class_ids],
    index=[f"True Class {c}" for c in class_ids],
)

print("Verwirrungsmatrix:")
print(confusion_df)

# Per-class precision and recall side by side.
precision_recall_df = pd.DataFrame(
    {"Precision": precision, "Recall": recall},
    index=[f"Class {c}" for c in class_ids],
)

print("\nPrecision und Recall pro Klasse:")
print(precision_recall_df)

# Macro-averaged summary values.
print(f"\nMacro Precision: {macro_precision:.2f}")
print(f"Macro Recall: {macro_recall:.2f}")


Verwirrungsmatrix:
              Predicted Class 0  Predicted Class 1  Predicted Class 2
True Class 0                144                166                  2
True Class 1                 51                804                 14
True Class 2                 12                513                294

Precision und Recall pro Klasse:
         Precision    Recall
Class 0   0.695652  0.461538
Class 1   0.542144  0.925201
Class 2   0.948387  0.358974

Macro Precision: 0.73
Macro Recall: 0.58


In [9]:
# Show the per-class precision/recall table using the notebook's rich display.
precision_recall_df

Unnamed: 0,Precision,Recall
Class 0,0.695652,0.461538
Class 1,0.542144,0.925201
Class 2,0.948387,0.358974


In [10]:
# Show the labelled confusion matrix using the notebook's rich display.
confusion_df

Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
True Class 0,144,166,2
True Class 1,51,804,14
True Class 2,12,513,294
