In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import *
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import tqdm
import joblib

In [2]:
df_news = pd.read_csv('data/general-news-lines.csv')
df_NGO = pd.read_csv('data/general-ngo-lines.csv')
df_IGO = pd.read_csv('data/general-igo-lines.csv')

In [3]:
def cleanData(df, stopwordList= []):
    # convert 'Content' to 'Snippet'
    df.rename(columns={'Content': 'Snippet'}, inplace=True)

    # Remove rows with NaN in 'Snippet' column
    df = df.dropna(subset=['Snippet'])

    # Convert 'Snippet' column to strings in case there are any non-string types (wordnet breaks otherwise)
    df['Snippet'] = df['Snippet'].astype(str)

    # Lowercase all words
    df['Snippet'] = df['Snippet'].str.lower()

    # Remove html/markdown tags like \n etc
    df['Snippet'] = df['Snippet'].str.replace(r'<[^>]*>', '')

    # Remove punctuation
    df['Snippet'] = df['Snippet'].str.replace('[{}]'.format(string.punctuation), '')

    # Remove special characters
    df['Snippet'] = df['Snippet'].str.replace('[^A-Za-z ]+', '')

    # Remove single characters
    df['Snippet'] = df['Snippet'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')

    # Remove stopwords
    stop_words = set(stopwords.words('english'))

    # Add excluded words to stopwords
    stop_words.update(['said', 'would', 'also', 'could', 'year', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'])
    # Add custom stopwords
    stop_words.update(stopwordList)

    df['Snippet'] = df['Snippet'].apply(lambda x: ' '.join([item for item in x.split() if item.lower() not in stop_words]))

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    df['Snippet'] = df['Snippet'].apply(lambda x: ' '.join([lemmatizer.lemmatize(item) for item in x.split()]))
    
    return df

In [5]:
df_news = cleanData(df_news)
df_NGO = cleanData(df_NGO)
df_IGO = cleanData(df_IGO)

  df['Snippet'] = df['Snippet'].str.replace(r'<[^>]*>', '')
  df['Snippet'] = df['Snippet'].str.replace('[{}]'.format(string.punctuation), '')
  df['Snippet'] = df['Snippet'].str.replace('[^A-Za-z ]+', '')
  df['Snippet'] = df['Snippet'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
  df['Snippet'] = df['Snippet'].str.replace(r'<[^>]*>', '')
  df['Snippet'] = df['Snippet'].str.replace('[{}]'.format(string.punctuation), '')
  df['Snippet'] = df['Snippet'].str.replace('[^A-Za-z ]+', '')
  df['Snippet'] = df['Snippet'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
  df['Snippet'] = df['Snippet'].str.replace(r'<[^>]*>', '')
  df['Snippet'] = df['Snippet'].str.replace('[{}]'.format(string.punctuation), '')
  df['Snippet'] = df['Snippet'].str.replace('[^A-Za-z ]+', '')
  df['Snippet'] = df['Snippet'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')


Unnamed: 0.1,Unnamed: 0,Source,Snippet
0,0,FT-2008-1.txt,financial timeslondon edcompanies uk tuesday j...
1,1,FT-2008-1.txt,page pv crystalox solar capitalise climate cha...
2,2,FT-2008-1.txt,company biggest manufacturer silicon component...
3,3,FT-2008-1.txt,earnings interest tax excluding onetime cost l...
4,4,FT-2008-1.txt,december


In [None]:
# Ensure CUDA (GPU support) is available and enabled.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model.
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Model evaluation mode, dropout layers are disabled. Used when making predictions and using the pretrained model.
# Pytorch assumes the model is in training mode by default.
model.eval()

def classify_emotion(snippet):
    # Tokenize the batch of snippets.
    inputs = tokenizer(snippet, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Make predictions.
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted class index.
    predictions = torch.argmax(outputs.logits, dim=-1)
    return predictions.cpu().numpy()

def getSentiment(df, batch_size = 16):
    # Process the snippets in batches.
    batch_size = batch_size 
    sentiments = []

    for i in tqdm(range(0, len(df['Snippet']), batch_size), desc="Classifying"):
        batch_snippets = df['Snippet'][i:i + batch_size].tolist()
        batch_predictions = classify_emotion(batch_snippets)
        sentiments.extend(batch_predictions)

    # Assign the predictions to the DataFrame.
    df['Sentiment'] = sentiments

    return df

# Get the sentiment
df_news = getSentiment(df_news)
df_NGO = getSentiment(df_NGO)
df_IGO = getSentiment(df_IGO)