In [1]:
# installations for colab
#!pip install transformers accelerate torch huggingface_hub datasets emoji

# BerTweet Sentiment Analysis

In [None]:
# using bertweet-sentiment analysis
# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis

from transformers import AutoTokenizer, AutoModelForSequenceClassification
bertweet_t = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

bertweet = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    device_map="auto",
    num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch

def sentiment_score(text, tokenizer, model):
    tokens = tokenizer.encode(text, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))

In [None]:
import torch

#def bert_pred(model, tokenizer, df, text_col):

    # moving model to GPU and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(model.device)
    model.eval()

    batch_size = 32
    num_batches = len(df) // batch_size + 1
    predictions = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch_tokens = tokenizer(df[text_col][start_idx:end_idx].tolist(),
                                 padding=True, truncation=True, return_tensors="pt", max_length=128)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(batch_predictions.cpu().numpy())

    df["bert_pred"] = predictions

In [None]:
import pandas as pd

#df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")

# read in data after NER csv
df_brd_sa = pd.read_csv("./data after NER.csv")

df_brd_sa.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{""word"": ""."", ""entity"": ""B-person""}, {""word"":..."
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":..."
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":..."
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{""word"": ""\u0120festival"", ""entity"": ""B-event..."
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{""word"": ""xt"", ""entity"": ""I-event""}, {""word"":..."


In [None]:
df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [None]:
df_brd_sa.dropna(inplace=True)

In [None]:
df_brd_sa["sentiment_prediction"] = df_brd_sa["tweet_text"].apply(lambda x: sentiment_score(x[:500], bertweet_t, bertweet))

NameError: name 'sentiment_score' is not defined

In [None]:
df_brd_sa["sentiment_prediction"].unique()

array([1, 2, 0], dtype=int64)

In [None]:
# 0 negative
# 1 neutral
# 2 positive
df_brd_sa["sentiment"] = df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].replace({"Negative emotion" : 0, "Positive emotion" : 2, "No emotion toward brand or product" : 1, "I can't tell" : 1})
df_brd_sa.head()

  df_brd_sa["sentiment"] = df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].replace({"Negative emotion" : 0, "Positive emotion" : 2, "No emotion toward brand or product" : 1, "I can't tell" : 1})


Unnamed: 0.1,Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities,sentiment
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{""word"": ""."", ""entity"": ""B-person""}, {""word"":...",0
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":...",2
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":...",2
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{""word"": ""\u0120festival"", ""entity"": ""B-event...",0
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{""word"": ""xt"", ""entity"": ""I-event""}, {""word"":...",2


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def performance_metrics(df, label, prediction):
     accuracy = accuracy_score(df[label], df[prediction])
     precision = precision_score(df[label], df[prediction], average="weighted")
     recall = recall_score(df[label], df[prediction], average="weighted")
     f1 = f1_score(df[label], df[prediction], average="weighted")

     print(f"Accuracy: {accuracy}")
     print(f"Precision: {precision}")
     print(f"Recall: {recall}")
     print(f"F1-Score: {f1}")

In [None]:
performance_metrics(df_brd_sa, "sentiment", "sentiment_prediction")

Accuracy: 0.7289577635976907
Precision: 0.8810658835129812
Recall: 0.7289577635976907
F1-Score: 0.7884397587742731


In [None]:
df_brd_sa.to_csv("./data after SA.csv")

In [None]:
bert_pred(bertweet, bertweet_t, df_brd_sa, "tweet_text")

In [None]:
df_brd_sa.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities,sentiment,bert_pred
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{""word"": ""."", ""entity"": ""B-person""}, {""word"":...",0,1
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":...",2,2
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{""word"": ""@"", ""entity"": ""B-person""}, {""word"":...",2,2
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{""word"": ""\u0120festival"", ""entity"": ""B-event...",0,1
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{""word"": ""xt"", ""entity"": ""I-event""}, {""word"":...",2,2


In [None]:
performance_metrics(df_brd_sa, "sentiment", "bertweet_pred")

Accuracy: 0.7289577635976907
Precision: 0.8810658835129812
Recall: 0.7289577635976907
F1-Score: 0.7884397587742731


## testing with sentiment corpus

In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

Unnamed: 0,text,labels
0,Nun LIVE ▶️ [PC│GER] Bugisoft bestes Spiel Kap...,1
1,{{MENTION}} Pick,1
2,ETTelecom | Fake Apple items worth Rs 13.8L se...,0
3,RT {{MENTION}} ⚠️ The affected giants include ...,1
4,Is there any way of deleting an app that won't...,0


In [None]:
bert_pred(bertweet, bertweet_t, df_sc, "text")

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [None]:
performance_metrics(df_sc, "labels", "bert_pred")

Accuracy: 0.6272260790824027
Precision: 0.6611441262229791
Recall: 0.6272260790824027
F1-Score: 0.6272425239648367


## bertweet fine-tuning

In [None]:
# trying to fine-tune this model (goal of accuracy > 72,89% and precision > 88,1%)
from datasets import Dataset

dataset = Dataset.from_pandas(df_brd_sa)

In [None]:
def tokenize_function(example):
    return tokenizer(example['tweet_text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column('sentiment', 'labels')
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/3291 [00:00<?, ? examples/s]

In [None]:
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, # adjust depending on resources
    per_device_eval_batch_size=4,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: hausbichler-georg (hausbichler-georg-wirtschaftsuniversit-t-wien). Use `wandb login --relogin` to force relogin


  0%|          | 0/1316 [00:00<?, ?it/s]

{'loss': 0.4696, 'grad_norm': 34.91965103149414, 'learning_rate': 1.2401215805471124e-05, 'epoch': 0.76}


  0%|          | 0/165 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3217962682247162, 'eval_accuracy': 0.9044006069802731, 'eval_precision': 0.8776387724816433, 'eval_recall': 0.9044006069802731, 'eval_f1': 0.8906713195717271, 'eval_runtime': 125.3936, 'eval_samples_per_second': 5.255, 'eval_steps_per_second': 1.316, 'epoch': 1.0}
{'loss': 0.3243, 'grad_norm': 56.448238372802734, 'learning_rate': 4.80243161094225e-06, 'epoch': 1.52}


  0%|          | 0/165 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3860619068145752, 'eval_accuracy': 0.91350531107739, 'eval_precision': 0.8822575684694961, 'eval_recall': 0.91350531107739, 'eval_f1': 0.8969385263946141, 'eval_runtime': 115.0542, 'eval_samples_per_second': 5.728, 'eval_steps_per_second': 1.434, 'epoch': 2.0}
{'train_runtime': 7890.3636, 'train_samples_per_second': 0.667, 'train_steps_per_second': 0.167, 'train_loss': 0.36438215539810503, 'epoch': 2.0}


TrainOutput(global_step=1316, training_loss=0.36438215539810503, metrics={'train_runtime': 7890.3636, 'train_samples_per_second': 0.667, 'train_steps_per_second': 0.167, 'total_flos': 346257257730048.0, 'train_loss': 0.36438215539810503, 'epoch': 2.0})

In [None]:
results = trainer.evaluate(test_dataset)
print(results)

  0%|          | 0/165 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3217962682247162, 'eval_accuracy': 0.9044006069802731, 'eval_precision': 0.8776387724816433, 'eval_recall': 0.9044006069802731, 'eval_f1': 0.8906713195717271, 'eval_runtime': 114.8893, 'eval_samples_per_second': 5.736, 'eval_steps_per_second': 1.436, 'epoch': 2.0}


In [None]:
model.save_pretrained('./models/bertweetSA_after_inital_finetuning')
tokenizer.save_pretrained('./models/bertweetSA_after_inital_finetuning')

('./models/bertweetSA_after_inital_finetuning\\tokenizer_config.json',
 './models/bertweetSA_after_inital_finetuning\\special_tokens_map.json',
 './models/bertweetSA_after_inital_finetuning\\vocab.txt',
 './models/bertweetSA_after_inital_finetuning\\bpe.codes',
 './models/bertweetSA_after_inital_finetuning\\added_tokens.json')

# Testing fine-tuned BERTweet-SA

In [None]:
#loading and trying the saved model
tokenizer2 = AutoTokenizer.from_pretrained("./models/bertweetSA_after_inital_finetuning")

model2 = AutoModelForSequenceClassification.from_pretrained("./models/bertweetSA_after_inital_finetuning")

In [None]:
df_brd_sa["sentiment_prediction_finetuned"] = df_brd_sa["tweet_text"].apply(lambda x: sentiment_score(x[:500], tokenizer2, model2))

In [None]:
# very high results, but may not be too telling because it was tested on same data it was finetuned on
performance_metrics(df_brd_sa, "sentiment", "sentiment_prediction_finetuned")

Accuracy: 0.9310240048617442
Precision: 0.9037352697049685
Recall: 0.9310240048617442
F1-Score: 0.9170998038436116


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## preprocessing Tweets Big Tech data

In [None]:
import pandas as pd
df_tweets_bigtech_10ksample = pd.read_csv("tweets_bigtech_10ksample.csv")

In [None]:
# creating small test dataset with random sampling for later
df_tweets_bigtech_test = df_tweets_bigtech.sample(n=4000, seed=42)
df_tweets_bigtech_test.to_csv("./tweets_bigtech_test.csv")

In [None]:
df_tweets_bigtech_10ksample["sentiment_prediction_finetuned"] = df_tweets_bigtech_10ksample["text"].apply(lambda x: sentiment_score(x[:500], tokenizer2, model2))

In [None]:
df_tweets_bigtech_10ksample.head()

Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment,sentiment_prediction_finetuned
0,2020-07-12 09:24:26,AMD,25,114,AMD,United Kingdom,0.0,moffphcgaming,#AMD,Been on holiday so back now. Gonna try get som...,1.282244e+18,🕹MoffPHC Gaming🕹,-0.3102,Technology,AMD,0,2
1,2020-07-12 08:16:45,AMD,1719,1,AMD,digitalocean,1.0,LinuxDreams,#AMD,RT @LinuxReviews: #Linux architect Linus Torva...,1.282227e+18,LinuxDreams,-0.3612,Technology,AMD,0,0
2,2020-07-12 08:11:41,AMD,69,135,AMD,Amsterdam,1.0,LinuxReviews,#AMD,"#Linux architect Linus Torvalds: AVX512 Is ""A ...",1.282226e+18,LinuxReviews,-0.3612,Technology,AMD,0,0
3,2020-07-12 02:22:50,AMD,34,155,AMD,San Francisco,0.0,NdrewGarcia,#AMD,#AMD stuck in a range box chart https://t.co/5...,1.282138e+18,Encino_Man,-0.25,Technology,AMD,0,0
4,2020-07-11 23:58:44,AMD,802,730,AMD,"New Jersey, USA",0.0,Roger_Clinton1,#AMD,$AMD Epyc Milan Leak – Three early Genesis sam...,1.282102e+18,Roger Ocasio-Clinton,-0.34,Technology,AMD,0,2


In [None]:
performance_metrics(df_tweets_bigtech_10ksample, "sentiment", "sentiment_prediction_finetuned")

Accuracy: 0.5633684210526316
Precision: 0.5890056459480304
Recall: 0.5633684210526316
F1-Score: 0.48415694314369734


In [None]:
# comparison with not fine-tuned model on tweets-bigtech-data
df_tweets_bigtech_10ksample["sentiment_prediction"] = df_tweets_bigtech_10ksample["text"].apply(lambda x: sentiment_score(x[:500], tokenizer, model))

In [None]:
performance_metrics(df_tweets_bigtech_10ksample, "sentiment", "sentiment_prediction")

Accuracy: 0.5675789473684211
Precision: 0.6583776609118885
Recall: 0.5675789473684211
F1-Score: 0.5742206759300184


# Second Fine-tuning of BERTweet-SA with big tech data

In [None]:
# finetuning on tweets_bigtech_sample
import pandas as pd
df_tweets_bigtech_sample = pd.read_json("./tweets_bigtech_sample.json", orient="records")
df_tweets_bigtech_sample.head()

Unnamed: 0,text,labels
0,#Apple 'to postpone 5G #iPhone 12 launch until...,0
1,"Very impressive performance, no pun intended, ...",1
2,#News #App #Apple Kuo: Chinese iPhone Shipment...,0
3,#Facebook Says #Apple Changes to #iOS Will Dra...,0
4,RT @latestly: COVID-19 Effect: Quarter of a bi...,0


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_tweets_bigtech_sample)

In [None]:
def tokenize_function(example):
    return bertweet_t(example['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
#tokenized_datasets = tokenized_datasets.rename_column('sentiment', 'labels')
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/14100 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)



In [None]:
trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.43179,0.849277,0.85047,0.849277,0.848607
2,0.501100,0.402467,0.853726,0.854587,0.853726,0.853005


TrainOutput(global_step=638, training_loss=0.4641368232176969, metrics={'train_runtime': 241.0762, 'train_samples_per_second': 84.513, 'train_steps_per_second': 2.646, 'total_flos': 1340168193197568.0, 'train_loss': 0.4641368232176969, 'epoch': 2.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.41193854808807373,
 'eval_accuracy': 0.8520094562647754,
 'eval_precision': 0.8523947769998391,
 'eval_recall': 0.8520094562647754,
 'eval_f1': 0.8511460660354356,
 'eval_runtime': 6.486,
 'eval_samples_per_second': 326.089,
 'eval_steps_per_second': 10.33,
 'epoch': 2.0}

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16, # adjust depending on resources
    per_device_eval_batch_size=16,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.350235,0.887368,0.890646,0.887368,0.886573
2,0.461400,0.345854,0.902105,0.901972,0.902105,0.901811


TrainOutput(global_step=950, training_loss=0.3481455752724095, metrics={'train_runtime': 203.2829, 'train_samples_per_second': 74.773, 'train_steps_per_second': 4.673, 'total_flos': 999830987366400.0, 'train_loss': 0.3481455752724095, 'epoch': 2.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.34585440158843994,
 'eval_accuracy': 0.9021052631578947,
 'eval_precision': 0.901972325838357,
 'eval_recall': 0.9021052631578947,
 'eval_f1': 0.9018111448193392,
 'eval_runtime': 6.0186,
 'eval_samples_per_second': 315.69,
 'eval_steps_per_second': 19.772,
 'epoch': 2.0}

In [None]:
trainer.save_model("./models/bertweet-sa-after-tweetsbigtech")
bertweet_t.save_pretrained("./models/bertweet-sa-after-tweetsbigtech")

('./models/bertweet-sa-after-tweetsbigtech/tokenizer_config.json',
 './models/bertweet-sa-after-tweetsbigtech/special_tokens_map.json',
 './models/bertweet-sa-after-tweetsbigtech/vocab.txt',
 './models/bertweet-sa-after-tweetsbigtech/bpe.codes',
 './models/bertweet-sa-after-tweetsbigtech/added_tokens.json')

In [None]:
model_after_tweetsbigtech = AutoModelForSequenceClassification.from_pretrained(
    "./models/bertweet-sa-after-tweetsbigtech",
    num_labels=3)
tokenizer_after_tweetsbigtech = AutoTokenizer.from_pretrained("./models/bertweet-sa-after-tweetsbigtech")

In [None]:
bert_pred(model_after_tweetsbigtech, tokenizer_after_tweetsbigtech, df_sc_10ksample, "text")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
performance_metrics(df_sc_10ksample, "labels", "bert_pred_sa")

KeyError: 'bert_pred_sa'

In [None]:
df_tweets_bigtech_test = pd.read_csv("./tweets_bigtech_test.csv")
df_tweets_bigtech_test.head()

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment
0,266816,2020-07-17 20:33:44,Apple,57,169,Apple,Bruxelles - Luxembourg,0.0,realSebThomas,#Apple OR #Iphone,2 days after the #Apple #Stateaid judgment of ...,1.284225e+18,Sébastien Thomas,0.0,Technology,Apple,1
1,259583,2020-07-17 12:39:25,Apple,6,89,Apple,Australia,235.0,holycrapps,#Apple OR #Iphone,RT @GFSquad__com: [PIC] 200717- @GFRDofficial ...,1.284105e+18,『 』,0.0,Technology,Apple,1
2,434342,2020-08-10 11:21:00,Twitch,134,142,Twitch,Leeds,2.0,FarrerArt,#Twitch,✨✍🏻💫MEET THE ARTIST &amp; SUB SKETCHES💫✍🏻✨ St...,1.292783e+18,Farrer Art,0.0,Technology,Twitch,1
3,144183,2020-07-12 08:58:23,Netflix,279,323,Netflix,ur heart,102.0,apphiashohe,#Netflix,RT @nakulnarayanak: To the casting director of...,1.282238e+18,a 🦋,0.0,Technology,Netflix,1
4,191249,2020-07-28 22:45:49,Google,198,117,Google,Northern California,0.0,rdmind,#Google,RDM Industrial Products Inc on #Google https:/...,1.288244e+18,RDM Industrial Prod.,0.0,Technology,Google,1


In [None]:
import torch

def sentiment_score(text, tokenizer, model):
    tokens = tokenizer.encode(text, return_tensors='pt', truncation = True, padding="max_length") #max_length=128 necessary?
    result = model(tokens)
    return int(torch.argmax(result.logits))

In [None]:
df_tweets_bigtech_test["sentiment_prediction_finetuned"] = df_tweets_bigtech_test["text"].apply(lambda x: sentiment_score(x[:500], tokenizer_after_tweetsbigtech, model_after_tweetsbigtech))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
performance_metrics(df_tweets_bigtech_test, "sentiment", "sentiment_prediction_finetuned")

Accuracy: 0.5525
Precision: 0.6684950730088312
Recall: 0.5525
F1-Score: 0.5044394096592938


# finetuning bertweet sa with corpus

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bertweet_t = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

bertweet = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    device_map="auto",
    num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

Unnamed: 0,text,labels
0,{{MENTION}} {{MENTION}} {{MENTION}} 226 x #tes...,1
1,{{MENTION}} Our fleet's on fleek. {{URL}} LMFA...,1
2,So many kitchen blenders are missing dick. #T...,0
3,#tesla. Not too rare anymore. {{URL}},1
4,iPhone users at #SXSW - any of you have your G...,0


In [4]:
# examining length of tokens in the dataset to determine maxlength
df_sc["text"].str.split().str.len().agg(["mean","max","std"])

Unnamed: 0,text
mean,21.100025
max,63.0
std,9.915533


In [5]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

def tokenize_function(example):
    return bertweet_t(example['text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/19915 [00:00<?, ? examples/s]

In [6]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [7]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score
from transformers import Trainer, TrainingArguments

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [8]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.500071,0.810236,0.81345,0.810236,0.807268
2,0.596100,0.482049,0.823228,0.823472,0.823228,0.822293


TrainOutput(global_step=900, training_loss=0.503199479844835, metrics={'train_runtime': 304.5665, 'train_samples_per_second': 94.475, 'train_steps_per_second': 2.955, 'total_flos': 1892706370426368.0, 'train_loss': 0.503199479844835, 'epoch': 2.0})

In [9]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.4998858571052551,
 'eval_accuracy': 0.8162650602409639,
 'eval_precision': 0.8164540663746535,
 'eval_recall': 0.8162650602409639,
 'eval_f1': 0.8151020631427773,
 'eval_runtime': 9.1122,
 'eval_samples_per_second': 327.911,
 'eval_steps_per_second': 10.316,
 'epoch': 2.0}

In [10]:
trainer.save_model("./bertweet-sa-corpus")

## applying the model

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

bertweet_corpus_t = AutoTokenizer.from_pretrained("./bertweet-sa-corpus")

bertweet_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./bertweet-sa-corpus",
    device_map="auto",
    num_labels=3)

In [12]:
corpus_test_df = test_dataset.to_pandas()
corpus_test_df.head()

Unnamed: 0,text,labels,input_ids,token_type_ids,attention_mask
0,iPhone battery is going quickly. Guy behind me...,0,"[0, 1381, 5390, 17, 117, 20171, 1747, 4, 4199,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Oops that's going to kill many people's dreams...,0,"[0, 11872, 6139, 20, 117, 9, 897, 239, 20644, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,RT {{MENTION}} #BREAKING:Are they SERIOUSLY st...,0,"[0, 246, 61450, 61450, 15757, 31521, 57362, 30...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Get Death Stranding with NVIDIA RTX NOW! {{UR...,0,"[0, 388, 4085, 8344, 20102, 30, 450, 14107, 46...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,{{MENTION}} - #Apple is the classiest fascist ...,0,"[0, 61450, 61450, 15757, 31521, 57362, 3030, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [13]:
import torch

def bert_pred(model, tokenizer, df, text_col):
    # moving model to GPU and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    batch_size = 32
    num_batches = len(df) // batch_size + 1
    predictions = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch_texts = df[text_col][start_idx:end_idx].tolist()

        batch_tokens = tokenizer(batch_texts, padding="max_length", truncation=True, return_tensors="pt", max_length=128)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(batch_predictions.cpu().numpy())

    df["bert_pred_sa"] = predictions

bert_pred(bertweet_corpus, bertweet_corpus_t, corpus_test_df, "text")

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def performance_metrics(df, label, prediction):
    accuracy = accuracy_score(df[label], df[prediction])
    precision = precision_score(df[label], df[prediction], average="weighted")
    recall = recall_score(df[label], df[prediction], average="weighted")
    f1 = f1_score(df[label], df[prediction], average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    label_names = ["negative", "neutral", "positive"]
    report = classification_report(df[label], df[prediction], target_names=label_names, digits=4)
    print(report)

performance_metrics(corpus_test_df, "labels", "bert_pred_sa")

Accuracy: 0.8163
Precision: 0.8165
Recall: 0.8163
F1-Score: 0.8151

Classification Report:
              precision    recall  f1-score   support

    negative     0.8083    0.8740    0.8398       960
     neutral     0.8175    0.7315    0.7721       998
    positive     0.8231    0.8447    0.8337      1030

    accuracy                         0.8163      2988
   macro avg     0.8163    0.8167    0.8152      2988
weighted avg     0.8165    0.8163    0.8151      2988



### applying model to case study data

In [15]:
import pandas as pd

df_dell = pd.read_json("./dell_cs_after_ner.json", orient="records")
df_dell.head()

Unnamed: 0,Datetime,text,tokens,covid_bert_pred
0,2022-09-30 23:29:15,Logitech Apple Google Microsoft Dell Lenovo #W...,"[Logitech, Apple, Google, Microsoft, Dell, Len...","[0, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10,..."
1,2022-09-30 21:46:35,{{MENTION}} {{MENTION}} {{MENTION}} {{MENTION}...,"[{{MENTION}}, {{MENTION}}, {{MENTION}}, {{MENT...","[10, 10, 10, 10, 10, 10, 10, 10, 4, 10, 10, 10..."
2,2022-09-30 21:18:02,As {{MENTION}} celebrates its 40th anniversary...,"[As, {{MENTION}}, celebrates, its, 40th, anniv...","[10, 0, 10, 10, 10, 10, 3, 8, 10, 3, 10, 10, 1..."
3,2022-09-30 20:05:24,Dell your customer service is horrible especia...,"[Dell, your, customer, service, is, horrible, ...","[0, 10, 10, 10, 10, 10, 10, 10, 3, 10, 10, 10,..."
4,2022-09-30 20:03:17,{{MENTION}} Dell Dellcares Dell give the man w...,"[{{MENTION}}, Dell, Dellcares, Dell, give, the...","[10, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10]"


In [16]:
bert_pred(bertweet_corpus, bertweet_corpus_t, df_dell, "text")

In [17]:
df_dell.to_json("./dell_cs_after_predictions.json", orient="records")

# Twitter Roberta base sentiment

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
roberta_sentiment_t = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

roberta_sentiment = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you e

In [None]:
# testing model with portion of sentiment corpus to see performance on diverse data
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc_10ksample = df_sc.sample(10000, random_state=42)
df_sc_10ksample.head()

Unnamed: 0,text,labels
4618,"{{MENTION}} \nGood news, we got fixed.",2
15595,Have you Embraced the Grace?? Listen to Fr. Bo...,2
7203,{{MENTION}} DAL is due for sleet Sun. eve-didn...,1
3999,{{MENTION}} I'm a UA 1k. I think it's {{MENTIO...,1
16849,Thanks {{MENTION}} {{MENTION}} had a question ...,1


In [None]:
# applying preprocessing from model's hf page to data before predicting def preprocess(text):
# def preprocess_text(text):
#     new_text = []
#     for t in text.split(" "):
#         t = '@user' if t == "{{MENTION}}" else t
#         t = 'http' if t == "{{URL}}" else t
#         new_text.append(t)
#     return " ".join(new_text)

# df_sc_10ksample["text"] = df_sc_10ksample["text"].apply(preprocess_text)

# didnt improve performance

In [None]:
bert_pred(roberta_sentiment, roberta_sentiment_t, df_sc_10ksample, "text")

In [None]:
performance_metrics(df_sc_10ksample, "labels", "bert_pred_sa")

Accuracy: 0.6315
Precision: 0.6551255252262623
Recall: 0.6315
F1-Score: 0.6326440294565018


In [None]:
df_sc_10ksample["labels"].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
0,3341
2,3339
1,3320


In [None]:
df_sc_10ksample["bert_pred_sa"].value_counts()

Unnamed: 0_level_0,count
bert_pred_sa,Unnamed: 1_level_1
1,4016
2,3724
0,2260


In [None]:
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
s_classifier = pipeline("sentiment-analysis", model= model_path, tokenizer=model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you e

In [None]:
df_sc_10ksample["prediction_pipeline"] = df_sc_10ksample["text"].apply(lambda x: s_classifier(x))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
df_sc_10ksample["pipeline_label"] = df_sc_10ksample["prediction_pipeline"].apply(lambda x: x[0]["label"])
df_sc_10ksample["pipeline_label"] = df_sc_10ksample["pipeline_label"].replace({"negative":0, "neutral":1, "positive":2})
df_sc_10ksample.head()

  df_sc_10ksample["pipeline_label"] = df_sc_10ksample["pipeline_label"].replace({"negative":0, "neutral":1, "positive":2})


Unnamed: 0,text,labels,prediction_pipeline,pipeline_label
9387,State transition restriction rules for Azure B...,0,"[{'label': 'neutral', 'score': 0.8305568695068...",1
19731,{{MENTION}} pilot says we expect a choppy land...,1,"[{'label': 'neutral', 'score': 0.5290448665618...",1
18736,{{MENTION}} Any idea why TSA Pre-Check isn't s...,1,"[{'label': 'negative', 'score': 0.651662945747...",0
4420,RT {{MENTION}} Westside SEO develops | impleme...,2,"[{'label': 'neutral', 'score': 0.6808052659034...",1
1692,"The recent movie ""Walk. Ride. Rodeo."" is based...",0,"[{'label': 'neutral', 'score': 0.9155369400978...",1


In [None]:
performance_metrics(df_sc_10ksample, "labels", "pipeline_label")

Accuracy: 0.6315
Precision: 0.6551255252262623
Recall: 0.6315
F1-Score: 0.6326440294565018


## fine-tuning with corpus

In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

Unnamed: 0,text,labels
0,{{MENTION}} {{MENTION}} {{MENTION}} 226 x #tes...,1
1,{{MENTION}} Our fleet's on fleek. {{URL}} LMFA...,1
2,So many kitchen blenders are missing dick. #T...,0
3,#tesla. Not too rare anymore. {{URL}},1
4,iPhone users at #SXSW - any of you have your G...,0


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

def tokenize_function(example):
    return roberta_sentiment_t(example['text'], truncation=True,
                       padding='max_length', max_length=128, return_tensors="pt")

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/19915 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir='./twitter-roberta-sentiment',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=roberta_sentiment,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=roberta_sentiment_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.520776,0.803937,0.806775,0.803937,0.801077
2,0.604800,0.505232,0.818504,0.818591,0.818504,0.81752


TrainOutput(global_step=900, training_loss=0.5133906724717882, metrics={'train_runtime': 310.4808, 'train_samples_per_second': 92.676, 'train_steps_per_second': 2.899, 'total_flos': 1892706370426368.0, 'train_loss': 0.5133906724717882, 'epoch': 2.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.5228689312934875,
 'eval_accuracy': 0.8075635876840697,
 'eval_precision': 0.8083219465917031,
 'eval_recall': 0.8075635876840697,
 'eval_f1': 0.8060765964846164,
 'eval_runtime': 9.3503,
 'eval_samples_per_second': 319.563,
 'eval_steps_per_second': 10.053,
 'epoch': 2.0}

In [None]:
trainer.save_model("./twitter-roberta-sentiment-after-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

twitter_roberta_corpus_t = AutoTokenizer.from_pretrained("./twitter-roberta-sentiment-after-corpus")

twitter_roberta_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./twitter-roberta-sentiment-after-corpus",
    num_labels=3)

In [None]:
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

Unnamed: 0,text,labels,input_ids,attention_mask
0,iPhone battery is going quickly. Guy behind me...,0,"[0, 43688, 3822, 16, 164, 1335, 4, 6959, 639, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Oops that's going to kill many people's dreams...,0,"[0, 673, 5090, 14, 18, 164, 7, 3549, 171, 82, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,RT {{MENTION}} #BREAKING:Are they SERIOUSLY st...,0,"[0, 13963, 47517, 12613, 7744, 46961, 849, 375...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Get Death Stranding with NVIDIA RTX NOW! {{UR...,0,"[0, 14181, 11644, 5997, 20515, 19, 31146, 4081...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,{{MENTION}} - #Apple is the classiest fascist ...,0,"[0, 49452, 12613, 7744, 46961, 111, 849, 20770...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
bert_pred(twitter_roberta_corpus, twitter_roberta_corpus_t, df_corpus_test, "text")

In [None]:
performance_metrics(df_corpus_test, "labels", "bert_pred_sa")

Accuracy: 0.8076
Precision: 0.8083
Recall: 0.8076
F1-Score: 0.8061

Classification Report:
              precision    recall  f1-score   support

    negative     0.7953    0.8740    0.8328       960
     neutral     0.8172    0.7124    0.7612       998
    positive     0.8119    0.8379    0.8247      1030

    accuracy                         0.8076      2988
   macro avg     0.8081    0.8081    0.8062      2988
weighted avg     0.8083    0.8076    0.8061      2988



# fine-tuning Bert-base-uncased with tweets bigtech

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_t = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

bert_base = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# using 25k rows of Tweets-BigTech-data to finetune
import pandas as pd
df_tweets_bigtech_25ksample = pd.read_csv("./tweets_bigtech_25ksample.csv")
df_tweets_bigtech_25ksample.head()

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,sentiment
0,0,2020-07-12 09:24:26,AMD,25,114,AMD,United Kingdom,0.0,moffphcgaming,#AMD,Been on holiday so back now. Gonna try get som...,1.282244e+18,🕹MoffPHC Gaming🕹,-0.3102,Technology,AMD,0
1,1,2020-07-12 08:16:45,AMD,1719,1,AMD,digitalocean,1.0,LinuxDreams,#AMD,RT @LinuxReviews: #Linux architect Linus Torva...,1.282227e+18,LinuxDreams,-0.3612,Technology,AMD,0
2,2,2020-07-12 08:11:41,AMD,69,135,AMD,Amsterdam,1.0,LinuxReviews,#AMD,"#Linux architect Linus Torvalds: AVX512 Is ""A ...",1.282226e+18,LinuxReviews,-0.3612,Technology,AMD,0
3,3,2020-07-12 02:22:50,AMD,34,155,AMD,San Francisco,0.0,NdrewGarcia,#AMD,#AMD stuck in a range box chart https://t.co/5...,1.282138e+18,Encino_Man,-0.25,Technology,AMD,0
4,4,2020-07-11 23:58:44,AMD,802,730,AMD,"New Jersey, USA",0.0,Roger_Clinton1,#AMD,$AMD Epyc Milan Leak – Three early Genesis sam...,1.282102e+18,Roger Ocasio-Clinton,-0.34,Technology,AMD,0


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_tweets_bigtech_25ksample)

In [None]:
def tokenize_function(example):
    return bert_base_t(example['text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column('sentiment', 'labels')
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./bert-base-uncased-sa',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=bert_base,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=bert_base_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3611,0.287039,0.9066,0.906535,0.9066,0.90648
2,0.1933,0.285319,0.9118,0.911907,0.9118,0.911652


TrainOutput(global_step=1000, training_loss=0.27716554260253906, metrics={'train_runtime': 348.0426, 'train_samples_per_second': 91.943, 'train_steps_per_second': 2.873, 'total_flos': 2104907341824000.0, 'train_loss': 0.27716554260253906, 'epoch': 2.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.28531941771507263,
 'eval_accuracy': 0.9118,
 'eval_precision': 0.9119073638565798,
 'eval_recall': 0.9118,
 'eval_f1': 0.9116519515593955,
 'eval_runtime': 16.3735,
 'eval_samples_per_second': 305.372,
 'eval_steps_per_second': 9.589,
 'epoch': 2.0}

In [None]:
bert_base.save_pretrained("bert-base-uncased-sa")
bert_base_t.save_pretrained("bert-base-uncased-sa")

('bert-base-uncased-sa/tokenizer_config.json',
 'bert-base-uncased-sa/special_tokens_map.json',
 'bert-base-uncased-sa/vocab.txt',
 'bert-base-uncased-sa/added_tokens.json',
 'bert-base-uncased-sa/tokenizer.json')

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_ft_t = AutoTokenizer.from_pretrained("bert-base-uncased-sa")

bert_base_ft = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased-sa",
    num_labels=3)

In [None]:
df_tweets_bigtech_test = test_dataset.to_pandas()
df_tweets_bigtech_test.head()

Unnamed: 0.1,Unnamed: 0,created_at,file_name,followers,friends,group_name,location,retweet_count,screenname,search_query,text,twitter_id,username,polarity,partition_0,partition_1,labels,input_ids,token_type_ids,attention_mask
0,19923,2020-07-18 00:12:32,Amazon,394,169,Amazon,"Hartford, CT",1.0,meetKushan,#Amazon OR #AWS,Amazon CloudFront announces new TLS1.2 securit...,1.28428e+18,Kushan,0.4767,Technology,Amazon,2,"[101, 9733, 6112, 12792, 17472, 2047, 1056, 48...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,18453,2020-07-12 04:16:01,Microsoft,5966,2315,Microsoft,Planet Earth,0.0,crpietschmann,#Microsoft,Top 11 #AzureIoT Announcements from Build 2020...,1.282167e+18,Chris Pietschmann,0.2023,Technology,Microsoft,2,"[101, 2327, 2340, 1001, 24296, 25185, 25674, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,12236,2020-07-08 21:38:05,Microsoft,2513,0,Microsoft,Raspberry Pi,2.0,LlnuxBot,#Microsoft,"RT @techdudeinc: via #BleepingComputer Blog: ""...",1.280979e+18,LinuxBot,0.0,Technology,Microsoft,1,"[101, 19387, 1030, 6627, 8566, 3207, 2378, 227...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,16941,2020-07-09 04:58:01,Tesla,400,677,Tesla,United Kingdom,4.0,appthisway,#Tesla,RT @mvollmer1: Here are six reasons why #hydro...,1.28109e+18,Appthisway®,0.5719,Technology,Tesla,2,"[101, 19387, 1030, 19842, 14511, 5017, 2487, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,8218,2020-08-29 15:37:00,Apple,544,356,Apple,"Kansas, USA",1.0,DuncsUnlimited,#Apple OR #Iphone,Episode 2 is here - special guest Race Car Dri...,1.299733e+18,delly,-0.2942,Technology,Apple,0,"[101, 2792, 1016, 2003, 2182, 1011, 2569, 4113...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# move model parameters to GPU if not loaded correctly
# param_device = next(model.parameters()).device

In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_tweets_bigtech_test, "text")

In [None]:
performance_metrics(df_tweets_bigtech_test, "labels", "bert_pred")

Accuracy: 0.912
Precision: 0.9121157265185696
Recall: 0.912
F1-Score: 0.9118522077610691


In [None]:
# also evaluating model on Brand-SA-data
df_brd_sa_p = pd.read_csv("./brand_sentiment_analysis_preprocessed.csv")
df_brd_sa_p.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,0
1,@jessedee Know about {@fludapp@} ? Awesome iPa...,Positive emotion,2
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,2
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,2


In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_brd_sa_p, "tweet_text")

In [None]:
performance_metrics(df_brd_sa_p, "sentiment", "bert_pred")

Accuracy: 0.5525151374010246
Precision: 0.6335749784291748
Recall: 0.5525151374010246
F1-Score: 0.5743773762349313


In [None]:
# testing with Tweets Sentiment Classification data (only negative and positive)
df_tweets_sc = pd.read_csv("./tweets_sentiment_classification_preprocessed.csv")
df_tweets_sc.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test {{URL}} #android ...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,2,What amazing service! Apple won't even talk to...


In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_tweets_sc, "tweet")

In [None]:
performance_metrics(df_tweets_sc, "label", "bert_pred")

Accuracy: 0.12121212121212122
Precision: 0.2681728937897897
Recall: 0.12121212121212122
F1-Score: 0.15126459593881567


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# performance for posts where model didnt predict neutral
df_tweets_sc_np = df_tweets_sc[df_tweets_sc["bert_pred"] != 1]
performance_metrics(df_tweets_sc_np, "label", "bert_pred")

Accuracy: 0.21080368906455862
Precision: 0.25069564105360503
Recall: 0.21080368906455862
F1-Score: 0.2176650264958071


# fine-tuning bert base uncased with corpus

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_t = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

bert_base = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

Unnamed: 0,text,labels
0,{{MENTION}} {{MENTION}} {{MENTION}} 226 x #tes...,1
1,{{MENTION}} Our fleet's on fleek. {{URL}} LMFA...,1
2,So many kitchen blenders are missing dick. #T...,0
3,#tesla. Not too rare anymore. {{URL}},1
4,iPhone users at #SXSW - any of you have your G...,0


In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

In [None]:
def tokenize_function(example):
    return bert_base_t(example['text'], truncation=True,
                       padding='max_length', max_length=128, return_tensors="pt")

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/19915 [00:00<?, ? examples/s]

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir='./bert-base-uncased-sa',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=bert_base,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bert_base_t,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhausbichler-georg[0m ([33mhausbichler-georg-wirtschaftsuniversit-t-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.5428,0.791732,0.79936,0.791732,0.787968
2,0.656000,0.489832,0.814567,0.815469,0.814567,0.813388


TrainOutput(global_step=900, training_loss=0.5461560397677951, metrics={'train_runtime': 306.1459, 'train_samples_per_second': 93.988, 'train_steps_per_second': 2.94, 'total_flos': 1892706370426368.0, 'train_loss': 0.5461560397677951, 'epoch': 2.0})

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.5272891521453857,
 'eval_accuracy': 0.8028781793842035,
 'eval_precision': 0.8034445750980845,
 'eval_recall': 0.8028781793842035,
 'eval_f1': 0.8014900010509911,
 'eval_runtime': 10.1147,
 'eval_samples_per_second': 295.411,
 'eval_steps_per_second': 9.293,
 'epoch': 2.0}

In [None]:
# bert_base.save_pretrained("bert-base-sentiment-corpus")
# bert_base_t.save_pretrained("bert-base-sentiment-corpus")
trainer.save_model("bert-base-sentiment-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

bert_base_corpus_t = AutoTokenizer.from_pretrained("./bert-base-sentiment-corpus")

bert_base_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./bert-base-sentiment-corpus",
    num_labels=3)

In [None]:
param_device = next(bert_base_corpus.parameters()).device

In [None]:
df_corpus_test = test_dataset.to_pandas()

In [None]:
import torch

# version where tokenization happens at CPU and prediction on GPU,
# bert tokenizer cant be moved to GPU
def bert_pred(model, tokenizer, df, text_col):
    # moving model to GPU and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    batch_size = 32
    num_batches = len(df) // batch_size + 1
    predictions = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch_texts = df[text_col][start_idx:end_idx].tolist()

        batch_tokens = tokenizer(batch_texts, padding="max_length", truncation=True,
                                 return_tensors="pt", max_length=128)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
            batch_predictions = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(batch_predictions.cpu().numpy())

    df["bert_pred_sa"] = predictions

In [None]:
bert_pred(bert_base_corpus, bert_base_corpus_t, df_corpus_test, "text")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def performance_metrics(df, label, prediction):
    accuracy = accuracy_score(df[label], df[prediction])
    precision = precision_score(df[label], df[prediction], average="weighted")
    recall = recall_score(df[label], df[prediction], average="weighted")
    f1 = f1_score(df[label], df[prediction], average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    label_names = ["negative", "neutral", "positive"]
    report = classification_report(df[label], df[prediction], target_names=label_names, digits=4)
    print(report)

In [None]:
performance_metrics(df_corpus_test, "labels", "bert_pred_sa")

Accuracy: 0.8029
Precision: 0.8034
Recall: 0.8029
F1-Score: 0.8015

Classification Report:
              precision    recall  f1-score   support

    negative     0.7902    0.8750    0.8304       960
     neutral     0.8073    0.7134    0.7574       998
    positive     0.8121    0.8223    0.8172      1030

    accuracy                         0.8029      2988
   macro avg     0.8032    0.8036    0.8017      2988
weighted avg     0.8034    0.8029    0.8015      2988



In [None]:
df_tbt_app = pd.read_json("./tweets_bigtech_10k_application_afterNER.json", orient="records")
df_tbt_app.head()

Unnamed: 0,text,tokens,labels,bert_pred
0,Microsoft Build 2020: Empowering developers to...,"[Microsoft, Build, 2020:, Empowering, develope...",1,"[0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
1,I will do photoshop editing retouching documen...,"[I, will, do, photoshop, editing, retouching, ...",1,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
2,#Laney #Amps 🎶 Will Steal Your #Face Right Off...,"[#Laney, #Amps, 🎶, Will, Steal, Your, #Face, R...",0,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1..."
3,RT @ophierian_vp: Geralt &amp; Aerondight Silv...,"[RT, @ophierian_vp:, Geralt, &amp;, Aerondight...",1,"[10, 10, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
4,"#Google, Amazon funnel at least $25 million to...","[#Google,, Amazon, funnel, at, least, $25, mil...",0,"[10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."


In [None]:
bert_pred(bert_base_corpus, bert_base_corpus_t, df_tbt_app, "text")

In [None]:
performance_metrics(df_tbt_app, "labels", "bert_pred_sa")

NameError: name 'df_tbt_app' is not defined

In [None]:
df_tbt_app.to_json("tweets_bigtech_10k_application_afterSA.json", orient="records")