## Google Colaboratory Notebook

In [None]:
# Install the third-party packages imported by later cells:
# transformers (AutoTokenizer / AutoModelForSequenceClassification) and emoji (demojize)
!pip install transformers
!pip install emoji

In [None]:
# Normalisation for BertTweet
# The same normalization was applied during training 
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# https://huggingface.co/vinai/bertweet-base
def normalizeToken(token):
    """Normalise a single token the way the BERTweet normaliser does.

    Rules, checked in this order:
      * a token starting with "@" becomes "@USER";
      * a token starting with "http" or "www" (case-insensitive) becomes "HTTPURL";
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "...";
      * any other single-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    Args:
        token: one token (str) produced by the tweet tokenizer.

    Returns:
        The normalised token as a str.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Both characters below have length 1, so they must be tested before the
    # single-character branch; checking them afterwards made these branches
    # unreachable. The tweet-level helpers in this notebook already replace
    # these characters before tokenising, so their output is not affected.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token

def normalizeTweet(tweet):
    """Tokenise and normalise a whole text for BERTweet.

    The text is tokenised with NLTK's TweetTokenizer, every token goes through
    ``normalizeToken``, and the re-joined string is then patched with a fixed,
    ordered list of literal and regex substitutions (contractions, a.m./p.m.,
    digit groups split around ",", "/" and "-"). Whitespace is collapsed at the end.

    Args:
        tweet: raw text (str).

    Returns:
        The normalised text as a single-space separated str.
    """
    tweet_tokenizer = TweetTokenizer()
    straightened = tweet.replace("’", "'").replace("…", "...")
    result = " ".join(normalizeToken(piece) for piece in tweet_tokenizer.tokenize(straightened))

    # Literal substitutions; the order matters because later entries act on
    # the output of earlier ones.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rewrites:
        result = result.replace(old, new)

    # Re-attach digit groups that tokenisation separated with spaces.
    pattern_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in pattern_rewrites:
        result = re.sub(pattern, replacement, result)

    return " ".join(result.split())

def replaceInTweet(tweet):
    """Apply only the token-level replacements of ``normalizeToken`` to a text.

    Unlike ``normalizeTweet``, no contraction, a.m./p.m. or digit-group
    rewriting is done afterwards.

    Args:
        tweet: raw text (str).

    Returns:
        The tokens, individually normalised, joined by single spaces.
    """
    tweet_tokenizer = TweetTokenizer()
    straightened = tweet.replace("’", "'").replace("…", "...")
    replaced = [normalizeToken(piece) for piece in tweet_tokenizer.tokenize(straightened)]
    joined = " ".join(replaced)
    # Collapse any run of whitespace left inside or around the tokens
    return " ".join(joined.split())


In [None]:
# Import file to classify
from google.colab import drive

# Absolute location where Google Drive is mounted
MOUNT_POINT = '/content/gdrive/'
drive.mount(MOUNT_POINT)
# Build the data folder from the mount point so the path stays valid even if
# the notebook's working directory is changed (a relative path only resolves
# while the working directory is /content).
home = MOUNT_POINT + "MyDrive/pfe/" # Add your path if data is on your google drive or you can import it

In [None]:
import pandas as pd

# Reading the reddit posts
reddit_posts = pd.read_csv(home + 'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates.csv', index_col=0)
reddit_posts.reset_index(inplace=True)
# Empty cells are read as NaN; concatenating NaN with a string yields NaN, and
# the normalisation step later calls str methods on every value. Replace
# missing titles/bodies with empty strings so every row holds a real string.
post_titles = reddit_posts['title'].fillna("").astype(str)
post_bodies = reddit_posts['selftext'].fillna("").astype(str)
reddit_posts['titled_selftext'] = post_titles + " " + post_bodies

In [None]:
# Defining help functions for the model
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

class RedditDataSet(torch.utils.data.Dataset):
    """Dataset exposing tokenizer encodings one sample at a time.

    Args:
        encodings: mapping from field name (e.g. "input_ids") to an indexable
            collection with one entry per sample. Entries may be tensors or
            plain Python sequences.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict mapping every field name to a fresh tensor for sample ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            entry = val[idx]
            if torch.is_tensor(entry):
                # Copy an existing tensor with clone().detach(); calling
                # torch.tensor() on a tensor emits a copy-construct UserWarning.
                item[key] = entry.clone().detach()
            else:
                item[key] = torch.tensor(entry)
        return item

    def __len__(self):
        """Return the number of samples, taken from the first field (0 if there are no fields)."""
        first_field = next(iter(self.encodings.values()), None)
        if first_field is None:
            # Previously fell through and returned None, which makes len() raise.
            return 0
        return len(first_field)

def proba_to_category(row):
    """Turn one row of two class scores into a 0/1 label.

    Args:
        row: pandas Series whose first two positions hold the scores of
            class 0 and class 1.

    Returns:
        1 when the first score is below 0.5 and the second is at least 0.5,
        otherwise 0.
    """
    first_score = row.iloc[0]
    second_score = row.iloc[1]
    is_positive = first_score < 0.5 and second_score >= 0.5
    return 1 if is_positive else 0

# Use the first CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print("device: {}".format(device))

In [None]:
# Importing tokenizer and model

# Tokenizer of the base BERTweet checkpoint
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
# Classifier loaded from the data folder, then moved to the selected device
model = AutoModelForSequenceClassification.from_pretrained(home + "/classifieur_personnel")
model = model.to(device)

In [None]:
import torch.nn.functional as F

# Title + body of every post as a plain Python list
reddit_posts_list = reddit_posts["titled_selftext"].tolist()
# Same posts after BERTweet normalisation, in the same order
normalized_text = list(map(normalizeTweet, reddit_posts_list))

In [None]:
# Splitting posts into smaller portions of text similarly to the text samples the model was trained on
def _pair_sentences(text):
  """Split `text` on '.' and return the pieces re-joined two by two.

  Consecutive pieces are glued back together with a '.'; when the number of
  pieces is odd the last portion holds a single piece. Portions of length 0
  or 1 are dropped.
  """
  sentences = text.split('.')
  portions = (".".join(sentences[start:start + 2]) for start in range(0, len(sentences), 2))
  return [portion for portion in portions if len(portion) > 1]

# List of (original post index, text portion) pairs, in post order
split_reddit_titled_posts = [
  (post_index, portion)
  for post_index, text in enumerate(normalized_text)
  for portion in _pair_sentences(text)
]

In [None]:
# Creating a dataframe containing the portions of text and the original post index
reddit_posts_indices = [post_index for post_index, _ in split_reddit_titled_posts]
reddit_posts_splits = [portion for _, portion in split_reddit_titled_posts]
reddit_split_posts = pd.DataFrame({'text': reddit_posts_splits, 'file_index': reddit_posts_indices})

# Preparing the text portions to be fed to the model
# NOTE(review): the portions were cut out of text that had already been through
# normalizeTweet, so this is a second normalisation pass - confirm this matches
# how the training samples were prepared.
normalized_text = [normalizeTweet(portion) for portion in reddit_split_posts['text']]
# truncation=True cuts every sequence down to the model's maximum input length and
# padding=True pads all sequences to the same length
# => allows feeding batches of sequences
reddit_posts_encodings = tokenizer(normalized_text, truncation=True, padding=True, return_tensors="pt")
reddit_posts_encodings = reddit_posts_encodings.to(device)
reddit_posts_DataSet = RedditDataSet(reddit_posts_encodings)

In [None]:
# Loading the prepared input into the model
# Put the model in evaluation mode before running inference
model.eval()
# Serve the encoded text portions in batches of 32
reddit_posts_Loader = DataLoader(dataset=reddit_posts_DataSet, batch_size=32)
print("len reddit_posts_Loader: {}".format(len(reddit_posts_Loader)))

In [None]:
# Computing the prediction results
# Per-batch label Series are collected in a list and concatenated once at the end:
# Series.append was removed in pandas 2.0, and growing a Series batch by batch
# copies all previous results on every iteration.
batch_predictions = []
# Inference only: no gradients are needed
with torch.no_grad():
  for (i, batch) in enumerate(reddit_posts_Loader):
    if i % 2000 == 0 : print(i)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    # outputs[0] holds the logits; normalise over the class dimension (explicit dim,
    # the implicit form is deprecated)
    proba = F.softmax(outputs[0], dim=-1).cpu().numpy()  # get probabilities from output
    predicted_labels = pd.DataFrame(proba).apply(proba_to_category, axis=1) # get predicted class (highest proba)
    batch_predictions.append(predicted_labels)

if batch_predictions:
  predicted = pd.concat(batch_predictions, ignore_index=True)
else:
  # Nothing to classify: keep `predicted` a valid, empty Series
  predicted = pd.Series(dtype="int64")

print("predicted: {}".format(predicted.shape))
print(predicted.value_counts())