In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed
from sklearn.metrics import classification_report
import datasets
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Load data
df_orig = pd.read_csv('twitter_validation.csv', names=["Twitter ID","Topic","Sentiment","Text"])
df = df_orig.iloc[0:100]

# Preprocessing (delete username and url)
def preprocess(text):
    temp = []

    for t in text.split(" "): # split a sentence into words by spaces " ".
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        temp.append(t)
    return " ".join(temp)

# Change Irrelevant label into Neutral as mentioned in the data set description.
def adjust_ori_sentiment(sentiment):
    if sentiment == "Irrelevant":
        temp_str = "Neutral"
        return temp_str
    else:
        return sentiment

df['Text'] = df['Text'].apply(preprocess)
df['Sentiment'] = df['Sentiment'].apply(adjust_ori_sentiment)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Text'] = df['Text'].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = df['Sentiment'].apply(adjust_ori_sentiment)


Unnamed: 0,Twitter ID,Topic,Sentiment,Text
0,3364,Facebook,Neutral,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@user Why do I pay for WORD when it functions ...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
95,9456,Overwatch,Negative,@user so when i try to buy overwatch with a cr...
96,11687,Verizon,Negative,@user Can you waive some data overage charges?...
97,1589,Battlefield,Negative,No one buy battlefield 3 on steam! It doesn’t ...
98,3526,Facebook,Neutral,Our #HISAPerth #OBIawards ceremony is taking p...


In [None]:
LR = 2e-5
EPOCHS = 30
BATCH_SIZE = 64
MODEL = "cardiffnlp/twitter-roberta-base-2021-124m" # use this to finetune the language model
#MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest" # use this to finetune the sentiment classifier
MAX_TRAINING_EXAMPLES = 7500 # set this to -1 if you want to use the whole training set

In [None]:
# set transformers seed
seed = 223
set_seed(seed)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',                   # output directory
    num_train_epochs=EPOCHS,                  # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
    warmup_steps=100,                         # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                        # strength of weight decay
    logging_dir='./logs',                     # directory for storing logs
    logging_steps=160,                        # when to print log
    evaluation_strategy='steps',              # evaluate every n number of steps.
    eval_steps=160,                           # how often to evaluate. If not set defaults to number of logging_steps
    load_best_model_at_end=True,              # to load or not the best model at the end
    save_steps=160,                           # create a checkpoint every time we evaluate,
    seed=seed                                 # seed for consistent results

)


num_labels = len(set(train_dataset['labels'])) if 'labels' in train_dataset.features.keys() else len(set(train_dataset['label']))

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

In [None]:
trainer = Trainer(
    model=model,                               # the instantiated 🤗 Transformers model to be trained
    tokenizer=tokenizer,                       # tokenizer to be used to pad the inputs
    args=training_args,                        # training arguments, defined above
    train_dataset=train_dataset,               # training dataset
    eval_dataset=val_dataset,                  # evaluation dataset
    callbacks = [EarlyStoppingCallback(3, 0.001)], # early stopping which stops the training after 3 evaluation calls with no improvement of performance of at least 0.001
)

trainer.train()

In [None]:
trainer.save_model("./results/best_model") # save best model

In [None]:
# for every prediction the model ouptuts logits where largest value indicates the predicted class
test_preds_raw, test_labels , _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
print(classification_report(test_labels, test_preds, digits=3))

In [None]:
from scipy.special import softmax

scores = softmax(test_preds_raw, axis=1)
scores

In [None]:
def get_predictions(tweets):
    """ wrapper function to predict sentiment of tweets"""
    with torch.no_grad():
        encoded_input = tokenizer(
            tweets, padding=True, truncation=True, return_tensors='pt'
        )

        # set model on evaluation mode to deactivate Dropout
        trainer.model.eval()
        # pass encoded text to model
        output = trainer.model(**{k: v.to('cuda') for k, v in encoded_input.items()})
        # get logits and move them to cpu to get the predictions
        output = output.logits.detach().cpu().numpy()
        predictions = np.argmax(output, axis=1)

    return predictions

tweets = ["RT @UKLabour: Britain is facing the biggest rail strike in a generation but @GrantShapps hasn’t spent a single second in talks to avert it…",
          "Good news in today’s jobs stats: the number of employees on payrolls increased again in March.",
          "I'm #live in Gladstone with my Labor team: https://t.co/chWrHtumLc"]

# get predictions
predictions = get_predictions(tweets)
print(predictions)

# map predictions to negative/neutral/positive
sentiment_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

predictions = [sentiment_mapping[x] for x in predictions]
print(predictions)

In [None]:
# read data into a dataframe and only keep the tweet_id (id), text,  author username, and date of tweet (created_at)
df = pd.read_json('workshop_tweets.json', lines=True)
df['username'] = df['author'].apply(lambda x: x['username'])
df = df[['id', 'text', 'username', 'created_at']]

In [None]:
# convert pandas to huggingface Dataset & tokenize
df = datasets.Dataset.from_pandas(df)
df = df.map(lambda e: tokenizer(e['text'], truncation=True), batched=True)

In [None]:
# make predicitons
output = trainer.predict(df)
predictions = np.argmax(output.predictions, axis=1)

In [None]:
# recast to pandas for easier visualizations
df = df.to_pandas()
df['sentiment'] = predictions

In [None]:
# consider only UK
df_uk = df[df['username'].isin(['BorisJohnson', 'Keir_Starmer'])]

plot_uk = df_uk.groupby('sentiment')['username'].value_counts()
for idx in plot_uk.index:
    user_count = len(df_uk[df_uk['username'] == idx[1]])
    plot_uk.loc[idx] = (plot_uk.loc[idx]/user_count) * 100

ax = plot_uk.unstack().plot(figsize=(12,8), kind='bar',  xlabel='', legend=True, ylabel='Tweets %',  width=0.4)
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'],rotation=0)

In [None]:
# consider only Australia
df_aus = df[df['username'].isin(['AlboMP', 'ScottMorrisonMP'])]

plot_aus = df_aus.groupby('sentiment')['username'].value_counts()
for idx in plot_aus.index:
    user_count = len(df_aus[df_aus['username'] == idx[1]])
    plot_aus.loc[idx] = (plot_aus.loc[idx]/user_count) * 100

ax = plot_aus.unstack().plot(figsize=(12,8), kind='bar',  xlabel='', legend=True, ylabel='Tweets %',  width=0.4)
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'],rotation=0)

In [None]:
df_aus['month'] = df_aus['created_at'].dt.strftime('%m')

In [None]:
# Consider only negative and positive sentiments
to_plot = (df_aus.groupby(['month','username'])['sentiment'].value_counts(normalize=True)*100).unstack().unstack().fillna(0)
to_plot[[(0, 'AlboMP'), (0, 'ScottMorrisonMP'), (2, 'AlboMP'), (2, 'ScottMorrisonMP')]].plot(figsize=(19,12),
                                                                                             color = ['red', 'red', 'blue', 'blue'],
                                                                                             style=['-','--','-','--'],
                                                                                             ylabel='Tweets %')

plt.legend(title='',labels=['AlboMP: negative', 'ScottMorrisonMP: negative', 'AlboMP: positive','ScottMorrisonMP: positive'])