In [1]:
# installations for colab
#!pip install transformers accelerate torch huggingface_hub datasets emoji

# BERTweet Sentiment Analysis

In [None]:
# using bertweet-sentiment analysis
# https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis

from transformers import AutoTokenizer, AutoModelForSequenceClassification
bertweet_t = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

bertweet = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    device_map="auto",
    num_labels=3)

In [None]:
import torch

def sentiment_score(text, tokenizer, model):
    """Predict the sentiment class index for a single text.

    Args:
        text: Raw string to classify.
        tokenizer: Object exposing ``encode(text, return_tensors='pt', ...)``
            that returns a tensor of token ids.
        model: Callable sequence classifier whose output has a ``logits``
            attribute.

    Returns:
        int: Index of the highest-scoring logit.
    """
    # Ask the tokenizer to truncate over-long inputs to its configured maximum.
    tokens = tokenizer.encode(text, return_tensors='pt', truncation=True)
    # The model may have been placed on a GPU (e.g. device_map="auto");
    # the token ids have to live on the same device as its weights.
    device = getattr(model, "device", None)
    if device is not None:
        tokens = tokens.to(device)
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits))

In [None]:
import torch

def bert_pred(model, tokenizer, df, text_col, pred_col="bert_pred", batch_size=32, max_length=128):
    """Run batched sentiment inference and store the predictions in ``df``.

    Args:
        model: Sequence classifier; called as ``model(**batch)`` and expected
            to return an object with a ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (``return_tensors="pt"``).
        df: DataFrame holding the texts; modified in place.
        text_col: Name of the column containing the texts.
        pred_col: Name of the column that receives the predicted class ids.
        batch_size: Number of texts per forward pass.
        max_length: Token limit passed to the tokenizer for truncation.

    Returns:
        None. The argmax class id per row is written to ``df[pred_col]``.
    """
    # moving model to GPU (if available) and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    texts = df[text_col].tolist()
    predictions = []

    # Stepping by batch_size never produces an empty trailing batch, even when
    # len(df) is an exact multiple of batch_size (or zero).
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]
        batch_tokens = tokenizer(batch_texts, padding=True, truncation=True,
                                 return_tensors="pt", max_length=max_length)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
        batch_predictions = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(batch_predictions.cpu().tolist())

    df[pred_col] = predictions

In [None]:
import pandas as pd

#df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")

# read in data after NER csv
df_brd_sa = pd.read_csv("./data after NER.csv")

df_brd_sa.head()

In [None]:
df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].unique()

In [None]:
df_brd_sa.dropna(inplace=True)

In [None]:
df_brd_sa["sentiment_prediction"] = df_brd_sa["tweet_text"].apply(lambda x: sentiment_score(x[:500], bertweet_t, bertweet))

In [None]:
df_brd_sa["sentiment_prediction"].unique()

In [None]:
# 0 negative
# 1 neutral
# 2 positive
df_brd_sa["sentiment"] = df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].replace({"Negative emotion" : 0, "Positive emotion" : 2, "No emotion toward brand or product" : 1, "I can't tell" : 1})
df_brd_sa.head()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def performance_metrics(df, label, prediction):
    """Print accuracy and weighted precision / recall / F1 for two columns.

    Args:
        df: DataFrame holding ground-truth and predicted class ids.
        label: Name of the ground-truth column.
        prediction: Name of the predicted-class column.
    """
    y_true = df[label]
    y_pred = df[prediction]

    accuracy = accuracy_score(y_true, y_pred)
    # Same three weighted scores, computed in the same order as before.
    precision, recall, f1 = (
        metric(y_true, y_pred, average="weighted")
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

In [None]:
performance_metrics(df_brd_sa, "sentiment", "sentiment_prediction")

In [None]:
df_brd_sa.to_csv("./data after SA.csv")

In [None]:
bert_pred(bertweet, bertweet_t, df_brd_sa, "tweet_text")

In [None]:
df_brd_sa.head()

In [None]:
# bert_pred (called two cells above) stores its output in the "bert_pred" column
performance_metrics(df_brd_sa, "sentiment", "bert_pred")

## testing with sentiment corpus

In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

In [None]:
bert_pred(bertweet, bertweet_t, df_sc, "text")

In [None]:
performance_metrics(df_sc, "labels", "bert_pred")

## bertweet fine-tuning

In [None]:
# trying to fine-tune this model (goal of accuracy > 72,89% and precision > 88,1%)
from datasets import Dataset

dataset = Dataset.from_pandas(df_brd_sa)

In [None]:
def tokenize_function(example):
    """Tokenize the ``tweet_text`` field, padded/truncated to 128 tokens.

    Uses ``bertweet_t``, the tokenizer loaded at the top of the notebook;
    no variable named ``tokenizer`` is assigned in the visible notebook.
    """
    return bertweet_t(example['tweet_text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column('sentiment', 'labels')
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Returns (precision, recall, f1, support); support is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    metrics.update(zip(("precision", "recall", "f1"), weighted))
    return metrics

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, # adjust depending on resources
    per_device_eval_batch_size=4,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

In [None]:
# Fine-tune the BERTweet model loaded at the top of the notebook.
# `bertweet` / `bertweet_t` are used explicitly: no `model` / `tokenizer`
# variables are assigned in the visible notebook, so the previous names would
# fail on a fresh kernel.
trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
results = trainer.evaluate(test_dataset)
print(results)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory (`model` / `tokenizer` are not defined in
# the visible notebook; the loaded objects are `bertweet` / `bertweet_t`).
bertweet.save_pretrained('./models/bertweetSA_after_inital_finetuning')
bertweet_t.save_pretrained('./models/bertweetSA_after_inital_finetuning')

# Testing fine-tuned BERTweet-SA

In [None]:
#loading and trying the saved model
tokenizer2 = AutoTokenizer.from_pretrained("./models/bertweetSA_after_inital_finetuning")

model2 = AutoModelForSequenceClassification.from_pretrained("./models/bertweetSA_after_inital_finetuning")

In [None]:
df_brd_sa["sentiment_prediction_finetuned"] = df_brd_sa["tweet_text"].apply(lambda x: sentiment_score(x[:500], tokenizer2, model2))

In [None]:
# very high results, but may not be too telling because it was tested on same data it was finetuned on
performance_metrics(df_brd_sa, "sentiment", "sentiment_prediction_finetuned")

## preprocessing Tweets Big Tech data

In [None]:
import pandas as pd
df_tweets_bigtech_10ksample = pd.read_csv("tweets_bigtech_10ksample.csv")

In [None]:
# creating small test dataset with random sampling for later
# DataFrame.sample takes `random_state` (there is no `seed` keyword).
# NOTE(review): `df_tweets_bigtech` is not assigned in the visible notebook
# (only `df_tweets_bigtech_10ksample` is loaded above) — confirm which frame
# the test sample should be drawn from.
df_tweets_bigtech_test = df_tweets_bigtech.sample(n=4000, random_state=42)
df_tweets_bigtech_test.to_csv("./tweets_bigtech_test.csv")

In [None]:
df_tweets_bigtech_10ksample["sentiment_prediction_finetuned"] = df_tweets_bigtech_10ksample["text"].apply(lambda x: sentiment_score(x[:500], tokenizer2, model2))

In [None]:
df_tweets_bigtech_10ksample.head()

In [None]:
performance_metrics(df_tweets_bigtech_10ksample, "sentiment", "sentiment_prediction_finetuned")

In [None]:
# comparison with not fine-tuned model on tweets-bigtech-data
# NOTE(review): `tokenizer` and `model` are not assigned in the visible notebook
# (the loaded objects are `bertweet_t` / `bertweet`). Confirm which checkpoint
# this cell scores: if it is the same object that was passed to the Trainer
# above, it is no longer the un-fine-tuned model.
df_tweets_bigtech_10ksample["sentiment_prediction"] = df_tweets_bigtech_10ksample["text"].apply(lambda x: sentiment_score(x[:500], tokenizer, model))

In [None]:
performance_metrics(df_tweets_bigtech_10ksample, "sentiment", "sentiment_prediction")

# Second Fine-tuning of BERTweet-SA with big tech data

In [None]:
# finetuning on tweets_bigtech_sample
import pandas as pd
df_tweets_bigtech_sample = pd.read_json("./tweets_bigtech_sample.json", orient="records")
df_tweets_bigtech_sample.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_tweets_bigtech_sample)

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field with ``bertweet_t``, padded/truncated to 128 tokens."""
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return bertweet_t(example["text"], **tokenizer_options)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
#tokenized_datasets = tokenized_datasets.rename_column('sentiment', 'labels')
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
train_test_split1 = tokenized_datasets.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    """Return accuracy plus weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true classes) and ``predictions``
            (scores; argmax over the last axis gives the predicted class).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    true_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(-1)
    # Fourth element (support) is intentionally dropped.
    prf = precision_recall_fscore_support(true_ids, pred_ids, average='weighted')[:3]
    result = {"accuracy": accuracy_score(true_ids, pred_ids)}
    for key, value in zip(("precision", "recall", "f1"), prf):
        result[key] = value
    return result

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

In [None]:
trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16, # adjust depending on resources
    per_device_eval_batch_size=16,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

In [None]:
# Re-run of the big-tech fine-tuning with the smaller batch size configured above.
# - `bertweet` / `bertweet_t`: no `model` / `tokenizer` variables are assigned
#   in the visible notebook.
# - `val_dataset` drives the per-epoch evaluation (and best-checkpoint
#   selection via load_best_model_at_end), so `test_dataset` stays untouched
#   until the final `trainer.evaluate` call.
trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
trainer.save_model("./models/bertweet-sa-after-tweetsbigtech")
bertweet_t.save_pretrained("./models/bertweet-sa-after-tweetsbigtech")

In [None]:
model_after_tweetsbigtech = AutoModelForSequenceClassification.from_pretrained(
    "./models/bertweet-sa-after-tweetsbigtech",
    num_labels=3)
tokenizer_after_tweetsbigtech = AutoTokenizer.from_pretrained("./models/bertweet-sa-after-tweetsbigtech")

In [None]:
bert_pred(model_after_tweetsbigtech, tokenizer_after_tweetsbigtech, df_sc_10ksample, "text")

In [None]:
performance_metrics(df_sc_10ksample, "labels", "bert_pred_sa")

In [None]:
df_tweets_bigtech_test = pd.read_csv("./tweets_bigtech_test.csv")
df_tweets_bigtech_test.head()

In [None]:
import torch

def sentiment_score(text, tokenizer, model):
    """Predict the sentiment class index for a single text.

    Args:
        text: Raw string to classify.
        tokenizer: Object exposing ``encode(text, return_tensors='pt', ...)``
            that returns a tensor of token ids.
        model: Callable sequence classifier whose output has a ``logits``
            attribute.

    Returns:
        int: Index of the highest-scoring logit.
    """
    # A single sequence needs no padding. `encode` returns only the token ids
    # (no attention mask), so padding here would hand the model pad tokens it
    # cannot tell apart from real input. Truncation is capped at 128 tokens,
    # the same limit used for tokenization elsewhere in this notebook.
    tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=128)
    # Keep the ids on the same device as the model weights.
    device = getattr(model, "device", None)
    if device is not None:
        tokens = tokens.to(device)
    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits))

In [None]:
df_tweets_bigtech_test["sentiment_prediction_finetuned"] = df_tweets_bigtech_test["text"].apply(lambda x: sentiment_score(x[:500], tokenizer_after_tweetsbigtech, model_after_tweetsbigtech))

In [None]:
performance_metrics(df_tweets_bigtech_test, "sentiment", "sentiment_prediction_finetuned")

# finetuning bertweet sa with corpus

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bertweet_t = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

bertweet = AutoModelForSequenceClassification.from_pretrained(
    "finiteautomata/bertweet-base-sentiment-analysis",
    device_map="auto",
    num_labels=3)

In [3]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

In [4]:
# examining length of tokens in the dataset to determine maxlength
df_sc["text"].str.split().str.len().agg(["mean","max","std"])

In [5]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

def tokenize_function(example):
    """Tokenize the ``text`` field with ``bertweet_t``; pad and truncate to 128 tokens."""
    texts = example["text"]
    return bertweet_t(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [6]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [7]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score
from transformers import Trainer, TrainingArguments

def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Returns (precision, recall, f1, support); support is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    metrics.update(zip(("precision", "recall", "f1"), weighted))
    return metrics

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bertweet_t,
    compute_metrics=compute_metrics
)

In [8]:
trainer.train()

In [9]:
trainer.evaluate(test_dataset)

In [10]:
trainer.save_model("./bertweet-sa-corpus")

## applying the model

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

bertweet_corpus_t = AutoTokenizer.from_pretrained("./bertweet-sa-corpus")

bertweet_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./bertweet-sa-corpus",
    device_map="auto",
    num_labels=3)

In [12]:
corpus_test_df = test_dataset.to_pandas()
corpus_test_df.head()

In [13]:
import torch

def bert_pred(model, tokenizer, df, text_col, pred_col="bert_pred_sa", batch_size=32, max_length=128):
    """Run batched sentiment inference and store the predictions in ``df``.

    Args:
        model: Sequence classifier; called as ``model(**batch)`` and expected
            to return an object with a ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (``return_tensors="pt"``).
        df: DataFrame holding the texts; modified in place.
        text_col: Name of the column containing the texts.
        pred_col: Name of the column that receives the predicted class ids.
        batch_size: Number of texts per forward pass.
        max_length: Length every text is padded/truncated to.

    Returns:
        None. The argmax class id per row is written to ``df[pred_col]``.
    """
    # moving model to GPU (if available) and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    texts = df[text_col].tolist()
    predictions = []

    # Stepping by batch_size never produces an empty trailing batch, even when
    # len(df) is an exact multiple of batch_size (or zero).
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]

        batch_tokens = tokenizer(batch_texts, padding="max_length", truncation=True,
                                 return_tensors="pt", max_length=max_length)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
        batch_predictions = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(batch_predictions.cpu().tolist())

    df[pred_col] = predictions

bert_pred(bertweet_corpus, bertweet_corpus_t, corpus_test_df, "text")

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def performance_metrics(df, label, prediction):
    """Print overall weighted metrics and a per-class classification report.

    Args:
        df: DataFrame holding ground-truth and predicted class ids.
        label: Name of the ground-truth column.
        prediction: Name of the predicted-class column.

    Class ids follow the encoding used in this notebook:
    0 = negative, 1 = neutral, 2 = positive.
    """
    y_true, y_pred = df[label], df[prediction]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    label_names = ["negative", "neutral", "positive"]
    # Pin the class ids: without `labels`, the classes are derived from the
    # data, and the report raises when fewer than three distinct classes occur
    # (e.g. a two-class dataset with neutral predictions filtered out).
    report = classification_report(y_true, y_pred, labels=[0, 1, 2],
                                   target_names=label_names, digits=4)
    print(report)

performance_metrics(corpus_test_df, "labels", "bert_pred_sa")

### applying model to case study data

In [15]:
import pandas as pd

df_dell = pd.read_json("./dell_cs_after_ner.json", orient="records")
df_dell.head()

In [16]:
bert_pred(bertweet_corpus, bertweet_corpus_t, df_dell, "text")

In [17]:
df_dell.to_json("./dell_cs_after_predictions.json", orient="records")

# Twitter Roberta base sentiment

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
roberta_sentiment_t = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

roberta_sentiment = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=3)

In [None]:
# testing model with portion of sentiment corpus to see performance on diverse data
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc_10ksample = df_sc.sample(10000, random_state=42)
df_sc_10ksample.head()

In [None]:
# applying the preprocessing from the model's HF page to the data before predicting
# def preprocess_text(text):
#     new_text = []
#     for t in text.split(" "):
#         t = '@user' if t == "{{MENTION}}" else t
#         t = 'http' if t == "{{URL}}" else t
#         new_text.append(t)
#     return " ".join(new_text)

# df_sc_10ksample["text"] = df_sc_10ksample["text"].apply(preprocess_text)

# didn't improve performance

In [None]:
bert_pred(roberta_sentiment, roberta_sentiment_t, df_sc_10ksample, "text")

In [None]:
performance_metrics(df_sc_10ksample, "labels", "bert_pred_sa")

In [None]:
df_sc_10ksample["labels"].value_counts()

In [None]:
df_sc_10ksample["bert_pred_sa"].value_counts()

In [None]:
from transformers import pipeline

model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
s_classifier = pipeline("sentiment-analysis", model= model_path, tokenizer=model_path)

In [None]:
df_sc_10ksample["prediction_pipeline"] = df_sc_10ksample["text"].apply(lambda x: s_classifier(x))

In [None]:
df_sc_10ksample["pipeline_label"] = df_sc_10ksample["prediction_pipeline"].apply(lambda x: x[0]["label"])
df_sc_10ksample["pipeline_label"] = df_sc_10ksample["pipeline_label"].replace({"negative":0, "neutral":1, "positive":2})
df_sc_10ksample.head()

In [None]:
performance_metrics(df_sc_10ksample, "labels", "pipeline_label")

## fine-tuning with corpus

In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

def tokenize_function(example):
    """Tokenize the ``text`` field with ``roberta_sentiment_t``; pad and truncate to 128 tokens.

    No ``return_tensors`` is requested: the result is stored by ``Dataset.map``
    and the ``set_format(type='torch', ...)`` call below this function handles
    tensor conversion, matching the other tokenize functions in this notebook.
    """
    return roberta_sentiment_t(example['text'], truncation=True,
                               padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    """Return accuracy plus weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true classes) and ``predictions``
            (scores; argmax over the last axis gives the predicted class).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    true_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(-1)
    # Fourth element (support) is intentionally dropped.
    prf = precision_recall_fscore_support(true_ids, pred_ids, average='weighted')[:3]
    result = {"accuracy": accuracy_score(true_ids, pred_ids)}
    for key, value in zip(("precision", "recall", "f1"), prf):
        result[key] = value
    return result

training_args = TrainingArguments(
    output_dir='./twitter-roberta-sentiment',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=roberta_sentiment,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=roberta_sentiment_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
trainer.save_model("./twitter-roberta-sentiment-after-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

twitter_roberta_corpus_t = AutoTokenizer.from_pretrained("./twitter-roberta-sentiment-after-corpus")

twitter_roberta_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./twitter-roberta-sentiment-after-corpus",
    num_labels=3)

In [None]:
df_corpus_test = test_dataset.to_pandas()
df_corpus_test.head()

In [None]:
bert_pred(twitter_roberta_corpus, twitter_roberta_corpus_t, df_corpus_test, "text")

In [None]:
performance_metrics(df_corpus_test, "labels", "bert_pred_sa")

# fine-tuning Bert-base-uncased with tweets bigtech

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_t = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

bert_base = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=3)

In [None]:
# using 25k rows of Tweets-BigTech-data to finetune
import pandas as pd
df_tweets_bigtech_25ksample = pd.read_csv("./tweets_bigtech_25ksample.csv")
df_tweets_bigtech_25ksample.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_tweets_bigtech_25ksample)

In [None]:
def tokenize_function(example):
    return bert_base_t(example['text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column('sentiment', 'labels')
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./bert-base-uncased-sa',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True
)

# Per-epoch evaluation (and best-checkpoint selection, since
# load_best_model_at_end=True) runs on the validation split created above;
# the test split is reserved for the final `trainer.evaluate(test_dataset)`.
trainer = Trainer(
    model=bert_base,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bert_base_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
bert_base.save_pretrained("bert-base-uncased-sa")
bert_base_t.save_pretrained("bert-base-uncased-sa")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_ft_t = AutoTokenizer.from_pretrained("bert-base-uncased-sa")

bert_base_ft = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased-sa",
    num_labels=3)

In [None]:
df_tweets_bigtech_test = test_dataset.to_pandas()
df_tweets_bigtech_test.head()

In [None]:
# move model parameters to GPU if not loaded correctly
# param_device = next(model.parameters()).device

In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_tweets_bigtech_test, "text")

In [None]:
# the bert_pred definition in effect at this point writes to "bert_pred_sa"
performance_metrics(df_tweets_bigtech_test, "labels", "bert_pred_sa")

In [None]:
# also evaluating model on Brand-SA-data
df_brd_sa_p = pd.read_csv("./brand_sentiment_analysis_preprocessed.csv")
df_brd_sa_p.head()

In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_brd_sa_p, "tweet_text")

In [None]:
# the bert_pred definition in effect at this point writes to "bert_pred_sa"
performance_metrics(df_brd_sa_p, "sentiment", "bert_pred_sa")

In [None]:
# testing with Tweets Sentiment Classification data (only negative and positive)
df_tweets_sc = pd.read_csv("./tweets_sentiment_classification_preprocessed.csv")
df_tweets_sc.head()

In [None]:
bert_pred(bert_base_ft, bert_base_ft_t, df_tweets_sc, "tweet")

In [None]:
# the bert_pred definition in effect at this point writes to "bert_pred_sa"
performance_metrics(df_tweets_sc, "label", "bert_pred_sa")

In [None]:
# performance for posts where the model didn't predict neutral (class id 1)
# the bert_pred definition in effect at this point writes to "bert_pred_sa"
df_tweets_sc_np = df_tweets_sc[df_tweets_sc["bert_pred_sa"] != 1]
performance_metrics(df_tweets_sc_np, "label", "bert_pred_sa")

# fine-tuning bert base uncased with corpus

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
bert_base_t = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

bert_base = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=3)

In [None]:
import pandas as pd
df_sc = pd.read_json("sentiment_corpus.json", orient="records")
df_sc.head()

In [None]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_sc)

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field with ``bert_base_t``; pad and truncate to 128 tokens.

    No ``return_tensors`` is requested: the result is stored by ``Dataset.map``
    and the ``set_format(type='torch', ...)`` call below this function handles
    tensor conversion, matching the other tokenize functions in this notebook.
    """
    return bert_base_t(example['text'], truncation=True,
                       padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
train_test_split1 = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_val_dataset = train_test_split1['train']
test_dataset = train_test_split1['test']

train_test_split2 = train_val_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split2['train']
val_dataset = train_test_split2['test']

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score

def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is the argmax over the last axis of ``predictions``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Returns (precision, recall, f1, support); support is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    metrics.update(zip(("precision", "recall", "f1"), weighted))
    return metrics

training_args = TrainingArguments(
    output_dir='./bert-base-uncased-sa',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32, # adjust depending on resources
    per_device_eval_batch_size=32,
    num_train_epochs=2, # adjust depending on resources
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=bert_base,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=bert_base_t,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate(test_dataset)

In [None]:
# bert_base.save_pretrained("bert-base-sentiment-corpus")
# bert_base_t.save_pretrained("bert-base-sentiment-corpus")
trainer.save_model("bert-base-sentiment-corpus")

## applying the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

bert_base_corpus_t = AutoTokenizer.from_pretrained("./bert-base-sentiment-corpus")

bert_base_corpus = AutoModelForSequenceClassification.from_pretrained(
    "./bert-base-sentiment-corpus",
    num_labels=3)

In [None]:
param_device = next(bert_base_corpus.parameters()).device

In [None]:
df_corpus_test = test_dataset.to_pandas()

In [None]:
import torch

# version where tokenization happens on the CPU and prediction on the GPU;
# the BERT tokenizer itself can't be moved to the GPU, only its output tensors
def bert_pred(model, tokenizer, df, text_col, pred_col="bert_pred_sa", batch_size=32, max_length=128):
    """Run batched sentiment inference and store the predictions in ``df``.

    Args:
        model: Sequence classifier; called as ``model(**batch)`` and expected
            to return an object with a ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (``return_tensors="pt"``).
        df: DataFrame holding the texts; modified in place.
        text_col: Name of the column containing the texts.
        pred_col: Name of the column that receives the predicted class ids.
        batch_size: Number of texts per forward pass.
        max_length: Length every text is padded/truncated to.

    Returns:
        None. The argmax class id per row is written to ``df[pred_col]``.
    """
    # moving model to GPU (if available) and setting it to evaluation-mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    texts = df[text_col].tolist()
    predictions = []

    # Stepping by batch_size never produces an empty trailing batch, even when
    # len(df) is an exact multiple of batch_size (or zero).
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]

        batch_tokens = tokenizer(batch_texts, padding="max_length", truncation=True,
                                 return_tensors="pt", max_length=max_length)
        batch_tokens = {key: value.to(device) for key, value in batch_tokens.items()}

        with torch.no_grad():
            outputs = model(**batch_tokens)
        batch_predictions = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(batch_predictions.cpu().tolist())

    df[pred_col] = predictions

In [None]:
bert_pred(bert_base_corpus, bert_base_corpus_t, df_corpus_test, "text")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def performance_metrics(df, label, prediction):
    """Print overall weighted metrics and a per-class classification report.

    Args:
        df: DataFrame holding ground-truth and predicted class ids.
        label: Name of the ground-truth column.
        prediction: Name of the predicted-class column.

    Class ids follow the encoding used in this notebook:
    0 = negative, 1 = neutral, 2 = positive.
    """
    y_true, y_pred = df[label], df[prediction]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    print("\nClassification Report:")
    label_names = ["negative", "neutral", "positive"]
    # Pin the class ids: without `labels`, the classes are derived from the
    # data, and the report raises when fewer than three distinct classes occur.
    report = classification_report(y_true, y_pred, labels=[0, 1, 2],
                                   target_names=label_names, digits=4)
    print(report)

In [None]:
performance_metrics(df_corpus_test, "labels", "bert_pred_sa")

In [None]:
df_tbt_app = pd.read_json("./tweets_bigtech_10k_application_afterNER.json", orient="records")
df_tbt_app.head()

In [None]:
bert_pred(bert_base_corpus, bert_base_corpus_t, df_tbt_app, "text")

In [None]:
performance_metrics(df_tbt_app, "labels", "bert_pred_sa")

In [None]:
df_tbt_app.to_json("tweets_bigtech_10k_application_afterSA.json", orient="records")