In [None]:
import os
import re
from glob import glob
import pickle
import torch
from transformers import pipeline, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from scipy.special import softmax
import glove
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import warnings
warnings.filterwarnings("ignore")

# Define new functions

In [None]:
def preprocess(text):
    """Normalize a raw tweet before it is passed to the sentiment model.

    Steps, in order:
      1. Split on single spaces and rewrite each token: a mention (``@``
         followed by at least one character) becomes ``@user``, a token
         starting with ``http`` becomes ``http``, and the retweet marker
         ``RT`` becomes an empty string.
      2. Remove e-mail-like substrings.
      3. Remove every run of three dots, then every remaining pair of dots.
      4. Remove ``#`` characters (the hashtag word itself is kept).

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens and
        e-mail addresses can leave consecutive or leading/trailing spaces.
    """
    new_text = []

    for t in text.split(" "):
        # A lone "@" is not a mention and is kept as-is
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        t = "" if t == "RT" else t
        new_text.append(t)

    sentence = " ".join(new_text)
    sentence = re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', sentence)
    # Raw strings: "\." inside a normal string literal is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12)
    sentence = re.sub(r"\.\.\.", '', sentence)
    sentence = re.sub(r"\.\.", '', sentence)
    sentence = sentence.replace("#", '')

    return sentence


# Get file info

In [None]:
# List every per-day CSV file in the raw "Full" folder, in sorted order
results = glob('../raw/Full/*.csv')
results.sort()

# Load the full corpus; the context manager closes the file even if the JSON
# is malformed and json.load raises
with open("corpus_full.json") as f:
    corpus_full = json.load(f)

# Empty container; nothing in this cell populates it
full_raw = {}

# BERT

In [None]:
# Load the pre-trained sentiment model identified by `model_path`.
# NOTE(review): the name `sentiment` bound to the pipeline below is rebound to a
# dict by the pickle-loading cell that follows, so the pipeline is no longer
# reachable under that name afterwards -- confirm whether it is still needed.

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, use_fast = True)

# Load the tokenizer, config and classification model separately; the scoring
# loop calls `tokenizer` and `model` directly and reads label names from
# `config.id2label`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Saves the model under a local path that is the same string as the model id.
# NOTE(review): presumably later `from_pretrained(model_path)` calls then resolve
# to that local directory; only the model (not the tokenizer) is saved here --
# confirm this is intended.
model.save_pretrained(model_path)

In [None]:
# Restore the per-day sentence-level results written by the saving cell
with open('bert_sentiment.pickle', 'rb') as handle:
    sentiment = pickle.load(handle)

# Restore the per-day overall labels written by the saving cell
with open('bert_sentiment_overall.pickle', 'rb') as handle:
    sentiment_overall = pickle.load(handle)


In [None]:
# Vote contributed by the model's predicted label; a label that is not in this
# table contributes 0, i.e. it is treated like "neutral".
LABEL_VOTES = {"negative": -0.5, "neutral": 0, "positive": 0.5}

# Resume from the last day already present in `sentiment`; that day is
# recomputed from scratch below. The index is clamped at 0 because with an
# empty dict a start index of -1 would silently select only the final file.
# NOTE(review): assumes the stored days are exactly the first len(sentiment)
# entries of the sorted `results` list -- confirm.
resume_from = max(len(sentiment) - 1, 0)

for r in results[resume_from:]:
    # The 10 characters before the 4-character extension identify the day
    # (presumably a YYYY-MM-DD stamp -- confirm against the file names)
    day = r[-14:-4]
    print(day)

    # Reset this day's lists so a re-run does not append to stale results
    day_record = sentiment.setdefault(day, {})
    day_record['sentence'] = []
    day_record['sentiment'] = []
    overall = 0

    df = pd.read_csv(r, lineterminator='\n')
    df = df[df.lemma_length > 0]
    corpus = df.text.tolist()

    for sentence in corpus:
        sentence = preprocess(sentence)
        day_record['sentence'].append(sentence)

        encoded_input = tokenizer(sentence, return_tensors='pt')
        # Inference only: skip building the autograd graph
        with torch.no_grad():
            output = model(**encoded_input)
        # Scores for the single encoded sentence, normalised to probabilities
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        # Sentence sentiment is the vote of the highest-scoring label.
        # Summing the votes of *all* labels (ignoring `scores`) always gives
        # -0.5 + 0 + 0.5 = 0 and would mark every sentence as neutral.
        top_label = str(config.id2label[int(np.argmax(scores))]).lower()
        total = LABEL_VOTES.get(top_label, 0)

        if total > 0:
            day_record['sentiment'].append("POS")
            overall += 0.5
        elif total < 0:
            day_record['sentiment'].append("NEG")
            overall += -0.5
        else:
            day_record['sentiment'].append("NEU")

    # Day-level label: sign of the summed sentence votes, i.e. POS when there
    # are more positive than negative sentences, NEG for the reverse
    if overall > 0:
        sentiment_overall[day] = "POS"
    elif overall < 0:
        sentiment_overall[day] = "NEG"
    else:
        sentiment_overall[day] = "NEU"


In [None]:
# Persist the sentence-level results so a later session can reload them
with open('bert_sentiment.pickle', 'wb') as handle:
    pickle.dump(sentiment, handle)

# Persist the per-day overall labels alongside them
with open('bert_sentiment_overall.pickle', 'wb') as handle:
    pickle.dump(sentiment_overall, handle)

In [None]:
# Wide table with one column per (day, field) pair, taken from the nested
# `sentiment` dict. Each list is wrapped in a Series so that days with
# different numbers of sentences align on the row index and shorter columns
# are padded with NaN; passing the raw lists makes pandas raise
# "All arrays must be of the same length" as soon as two days differ in size.
sentiment_df = pd.DataFrame.from_dict(
    {
        (day, field): pd.Series(values)
        for day, fields in sentiment.items()
        for field, values in fields.items()
    },
    orient="columns",
)

In [None]:
# One row per key of `sentiment_overall`: the keys start out as the index,
# are moved into a regular column by reset_index(), and the two resulting
# columns are then named "day" and "sentiment"
sentiment_overall_df = (
    pd.DataFrame
    .from_dict(sentiment_overall, orient="index")
    .reset_index()
)
sentiment_overall_df.columns = ["day", "sentiment"]