In [1]:
import glob

import numpy as np
import pandas as pd
import torch
import tqdm
from datasets import load_dataset, load_from_disk
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline


In [2]:

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. Every other token is kept as-is, and the tokens
    are re-joined with single spaces.
    """
    def _mask(token):
        # Mention check first, then link check, mirroring the token rules above.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))
# Checkpoint name passed to each from_pretrained call below.
# Plain string literal: there are no placeholders, so no f-string is needed.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT (PyTorch) sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

def sentiment_scoring(text):
    """Score ``text`` with the sentiment model, print the ranking, and return it.

    The text is passed through ``preprocess``, tokenized, and run through
    ``model``. The first row of the model's first output (presumably the class
    logits for the single input) is turned into probabilities with softmax.

    Args:
        text: The raw string to score.

    Returns:
        dict mapping each label from ``config.id2label`` to its softmax score
        as a float, ordered from highest to lowest score. The same ranking is
        printed as ``"<rank>) <label> <score rounded to 4 places>"``.
    """
    text = preprocess(text)
    # truncation=True (as in tokenize_function) asks the tokenizer to cut
    # over-long inputs to its configured maximum instead of passing them whole.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: no_grad skips building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = softmax(output[0][0].detach().numpy())

    # Class indices ordered from highest to lowest score.
    ranking = np.argsort(scores)[::-1]
    results = {}
    for rank, idx in enumerate(ranking, start=1):
        label = config.id2label[int(idx)]
        score = float(scores[idx])
        print(f"{rank}) {label} {np.round(score, 4)}")
        results[label] = score
    return results

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Keep one row per distinct `text` value from the cleaned 2021 export and
# write only the `text` and `date` columns.
csv_2021_full = pd.read_csv('data/CES_tweets_2021_fully_cleaned_with_dates.csv')
csv_2021_no_dups = csv_2021_full.drop_duplicates(subset=['text'])
# index=False: otherwise the pandas row index is written as an unnamed first
# column, which comes back as `Unnamed: 0` when the file is read again.
csv_2021_no_dups[['text', 'date']].to_csv('data/2021_scraped.csv', index=False)


In [None]:
# All `*_scraped.csv` files except the last match. The forward-slash pattern
# works on every OS (the backslash form only matched on Windows), and sorted()
# makes the dropped file deterministic because glob order is unspecified.
# NOTE(review): [:-1] presumably excludes an earlier combined file that sorts
# last — confirm which file is meant to be skipped.
csv_files = sorted(glob.glob('data/*_scraped.csv'))[:-1]

# Read the two shared columns from each file and concatenate once.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
frames = [pd.read_csv(file)[['text', 'date']] for file in csv_files]
# With no input files, fall back to an empty frame (pd.concat rejects an empty list).
df_append = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# NOTE(review): the row index is still written here (it reloads as `Unnamed: 0`),
# because a later cell removes columns by that name — confirm before changing.
df_append.to_csv('data/collected_scraped2.csv')

In [5]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Intended for ``map(..., batched=True)``, where ``examples["text"]`` is a
    list of strings. Sequences are padded to the longest one in the batch and
    truncated to the tokenizer's maximum length.

    Plain Python lists are returned rather than PyTorch tensors: ``map`` stores
    the result in Arrow, so building tensors here only adds a conversion.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch (the dataset gains ``input_ids``
        and ``attention_mask`` columns from it).
    """
    return tokenizer(examples["text"], padding=True, truncation=True)

In [None]:
# Load the combined CSV into a pandas DataFrame.
# NOTE(review): `loaded_dataset` is not referenced by any later cell visible
# here — confirm it is still needed before loading the whole file into memory.
loaded_dataset = pd.read_csv('data/collected_scraped2.csv')

In [None]:
# Load the combined CSV with the `datasets` CSV loader.
loaded_full_scraped = load_dataset("csv", data_files='data/collected_scraped2.csv')

In [None]:
# Run tokenize_function over the dataset in batches; the result carries the
# tokenizer's `input_ids` and `attention_mask` columns next to the originals.
tokenized_dataset = loaded_full_scraped.map(tokenize_function, batched = True)

In [None]:
# Persist the tokenized dataset so later sessions can reload it with
# load_from_disk instead of re-tokenizing.
tokenized_dataset.save_to_disk("tokenized_dataset2.hf")


In [9]:
# Display the dataset summary (splits, feature names, row count).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'text', 'date', 'input_ids', 'attention_mask'],
        num_rows: 38988087
    })
})

In [2]:
# Reload a previously saved tokenized dataset from disk.
# NOTE(review): the save cell in this notebook writes "tokenized_dataset2.hf",
# while this reads "tokenized_dataset.hf" — confirm which artifact is intended.
ds = load_from_disk("tokenized_dataset.hf")

In [4]:
# Drop the two `Unnamed: ...` columns (presumably pandas row indexes carried in
# from the CSV). remove_columns does this directly; map() with no function
# iterated over every example just to discard the columns.
drop_col_mapping = ds.remove_columns(['Unnamed: 0.1', 'Unnamed: 0'])

Map:   0%|          | 0/38988087 [00:00<?, ? examples/s]

In [5]:
# Persist the column-pruned dataset under its own directory.
drop_col_mapping.save_to_disk("tokenized_dataset_dropped.hf")


Saving the dataset (0/82 shards):   0%|          | 0/38988087 [00:00<?, ? examples/s]

In [40]:
# Rebind `ds` to the column-pruned dataset saved under
# "tokenized_dataset_dropped.hf"; from here on `ds` refers to that version.
ds = load_from_disk("tokenized_dataset_dropped.hf")

In [27]:
# Collect the distinct `feature` values from every `data/CES_*.csv` file and
# write their de-duplicated union as a one-column `keyword` table.
feature_list = []
for file in tqdm.tqdm(glob.glob('data/CES_*.csv')):
    # usecols: only the `feature` column is used, so skip parsing the rest of
    # each file.
    feature_list.append(pd.unique(pd.read_csv(file, usecols=['feature'])['feature']))
# pd.unique keeps first-seen order, so keyword order follows the file order
# returned by glob.
pd.DataFrame(list(pd.unique(np.concatenate(feature_list))), columns=['keyword']).to_csv('data/initial_keywords.csv')

100%|██████████| 5/5 [07:52<00:00, 94.42s/it]
