In [None]:
!pip install transformers
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Normalisation for BertTweet
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

# https://huggingface.co/vinai/bertweet-base
def normalizeToken(token):
    """Map one token to its BERTweet-style normalised form.

    Args:
        token: a single token string, as produced by the tweet tokenizer.

    Returns:
        "@USER" for tokens starting with "@", "HTTPURL" for tokens starting
        with "http" or "www" (case-insensitive), an ASCII replacement for the
        typographic apostrophe and ellipsis, the demojized text for any other
        single-character token, and the token unchanged otherwise.
    """
    # These two checks must run before the single-character branch: both
    # characters have length 1, so with the previous ordering they were
    # swallowed by the demojize branch and never replaced.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if token.startswith("@"):
        return "@USER"
    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"
    if len(token) == 1:
        # A single character may be an emoji; demojize turns it into its text alias.
        return demojize(token)
    return token

def normalizeTweet(tweet):
    """Normalise a whole text the way the BERTweet model card describes.

    The text is tokenized, each token is mapped through ``normalizeToken``,
    and the re-joined string then goes through a fixed, ordered list of
    literal and regex rewrites (contractions, "a.m."/"p.m.", spaced numbers).
    Runs of whitespace are collapsed to single spaces in the result.
    """
    cleaned = tweet.replace("’", "'").replace("…", "...")
    pieces = [normalizeToken(tok) for tok in TweetTokenizer().tokenize(cleaned)]
    normTweet = " ".join(pieces)

    # Order matters: later rewrites rely on the spacing produced by earlier ones.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rewrites:
        normTweet = normTweet.replace(old, new)

    # Re-attach number groups that tokenization separated with spaces.
    pattern_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in pattern_rewrites:
        normTweet = re.sub(pattern, replacement, normTweet)

    return " ".join(normTweet.split())

def replaceInTweet(tweet):
    """Apply only the token-level normalisation to a text.

    Same tokenization and ``normalizeToken`` mapping as ``normalizeTweet``,
    but without the contraction / time / number rewrites. Whitespace runs are
    collapsed to single spaces in the result.
    """
    cleaned = tweet.replace("’", "'").replace("…", "...")
    mapped = map(normalizeToken, TweetTokenizer().tokenize(cleaned))
    return " ".join(" ".join(mapped).split())


In [None]:
#Import file to classify
from google.colab import drive
# Mount Google Drive at /content/gdrive/ inside the Colab runtime.
drive.mount('/content/gdrive/')
# Base folder for every input and output file of this notebook. The path is relative, so it is
# resolved against the current working directory.
# NOTE(review): presumably the working directory is /content, so this points inside the mount — confirm.
home = "gdrive/MyDrive/pfe/" # Change this to the folder that holds your data on Google Drive.

Mounted at /content/gdrive/


In [None]:
import pandas as pd

# Load the exported Reddit posts; the first CSV column is used as the DataFrame index.
reddit_posts = pd.read_csv(home + 'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates.csv', index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Replace the index read from the CSV with a fresh 0..n-1 index; the old index is kept as a column.
# NOTE(review): this mutates reddit_posts in place, so re-running the cell is not idempotent —
# reload the CSV before executing it a second time.
reddit_posts.reset_index(inplace=True)

In [None]:
# Build the "title + body" text that is classified in the "Including titles" section.
# Missing values are treated as empty strings: without fillna a single NaN title or body turns
# the whole concatenation into NaN, which the string-based normalisation cannot process.
reddit_posts['titled_selftext'] = reddit_posts['title'].fillna("") + " " + reddit_posts['selftext'].fillna("")

In [None]:
reddit_posts['selftext']

0        I got sick in March 2020, was not hospitalized...
1         Have any of you gotten worse at around 8 months?
2        A pretty personal question, but a very strange...
3        It amazes me how something so small can cause ...
4        I looked back at my records since i've been si...
                               ...                        
26440    I thought I had my long Covid (since December)...
26441     Hi all 44m never tested positive for covid bu...
26442    This might be useful for anyone struggling wit...
26443    It’s been a few weeks since I have felt awful ...
26444    You may have read about the trial with a probi...
Name: selftext, Length: 26445, dtype: object

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

class RedditDataSet(torch.utils.data.Dataset):
    """Label-free dataset over tokenizer output, used for inference.

    ``encodings`` is a mapping from field name (for example ``input_ids``) to
    an indexable container with one entry per example; the entries may be
    tensors or plain Python lists.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per field."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct UserWarning on every
                # access; clone().detach() is the recommended way to make the same copy.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Number of examples, read from the first field; 0 when there are no fields."""
        for val in self.encodings.values():
            return len(val)
        # Without this fallback an empty mapping returned None and len() raised TypeError.
        return 0

def proba_to_category(row):
    """Turn one row of two class probabilities into a 0/1 label.

    Returns 1 only when the first value is below 0.5 and the second value is
    at least 0.5; every other case (including NaN values) yields 0.
    """
    first_score = row.iloc[0]
    second_score = row.iloc[1]
    is_positive = first_score < 0.5 and second_score >= 0.5
    return 1 if is_positive else 0

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("device: {}".format(device))
# The tokenizer comes from the public BERTweet checkpoint; the classifier is loaded from the
# "classifieur_personnel" folder under `home` and moved to the selected device.
# NOTE(review): `home` already ends with "/", so the resulting path contains "//" — confirm it resolves as intended.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
model = AutoModelForSequenceClassification.from_pretrained(home +  "/classifieur_personnel").to(device)

device: cuda:0


Downloading config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading bpe.codes:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Including titles

In [None]:
import torch.nn.functional as F

# One "title + body" string per post, then the fully normalised version of each.
reddit_posts_list = reddit_posts["titled_selftext"].tolist()
normalized_text = list(map(normalizeTweet, reddit_posts_list))

In [None]:
# Cut every normalised post into segments made of two consecutive '.'-delimited pieces and
# record, for each segment, the position of the post it came from.
split_reddit_titled_posts = []

for post_index, text in enumerate(normalized_text):
    sentences = text.split('.')
    # Walk the pieces two at a time; an odd piece left at the end forms a segment on its own.
    for start in range(0, len(sentences), 2):
        text_portion = ".".join(sentences[start:start + 2])
        # Drop segments of at most one character (for example a lone "." from two empty pieces).
        if len(text_portion) > 1:
            split_reddit_titled_posts.append((post_index, text_portion))
 

In [None]:
# Same segmentation as for the fully normalised text, applied to the lightly normalised
# version (token-level replacements only) of each post.
half_normalized_text = [replaceInTweet(post) for post in reddit_posts_list]
split_reddit_half_normalized_titled_posts = []

for post_index, text in enumerate(half_normalized_text):
    sentences = text.split('.')
    # Walk the pieces two at a time; an odd piece left at the end forms a segment on its own.
    for start in range(0, len(sentences), 2):
        text_portion = ".".join(sentences[start:start + 2])
        # Drop segments of at most one character (for example a lone "." from two empty pieces).
        if len(text_portion) > 1:
            split_reddit_half_normalized_titled_posts.append((post_index, text_portion))

In [None]:
split_reddit_half_normalized_titled_posts

In [None]:
# Separate the (post index, segment) pairs into two parallel lists.
reddit_posts_indices = [post_index for post_index, _ in split_reddit_titled_posts]
reddit_posts_splits = [segment for _, segment in split_reddit_titled_posts]

In [None]:
# Same unpacking for the lightly normalised segments.
reddit_posts_indices_2 = [post_index for post_index, _ in split_reddit_half_normalized_titled_posts]
reddit_posts_splits_2 = [segment for _, segment in split_reddit_half_normalized_titled_posts]

In [None]:
import functools 

# Sanity check: both segmentations must yield exactly the same post index for every segment,
# because the two segment lists are later placed side by side in one DataFrame.
# Plain list equality also compares lengths; the previous map/reduce form stopped at the end
# of the shorter list, so lists of different lengths could still be reported as identical.
if reddit_posts_indices == reddit_posts_indices_2:
    print ("The lists l1 and l2 are the same")
else:
    print ("The lists l1 and l2 are not the same")

The lists l1 and l2 are the same


In [None]:
# One row per segment: its text and the position of the post it belongs to.
reddit_split_posts = pd.DataFrame({
    'text': reddit_posts_splits,
    'file_index': reddit_posts_indices,
})

In [None]:
# Same layout for the lightly normalised segments.
reddit_split_posts_2 = pd.DataFrame({
    'text': reddit_posts_splits_2,
    'file_index': reddit_posts_indices_2,
})

In [None]:
# Apply normalizeTweet to every segment and collect the results in a plain list for the tokenizer.
# This rebinds `normalized_text`: before this cell it holds one string per post, afterwards one
# string per segment. (Truncation and padding are configured where the tokenizer is called.)
normalized_text = reddit_split_posts.text.map(normalizeTweet).to_list()

In [None]:
for t in normalized_text[:1000]:
  print(t)

In [None]:
# Tokenize all segments in one call. truncation=True caps each sequence at the model's maximum
# input length and padding=True pads to the longest sequence, so the result is rectangular.
# The encodings are kept on the CPU: the inference loop moves each batch to `device`, so
# sending the entire encoded corpus to the GPU here only consumed device memory.
reddit_posts_encodings = tokenizer(normalized_text, truncation=True, padding=True, return_tensors="pt")
reddit_posts_DataSet = RedditDataSet(reddit_posts_encodings)

In [None]:
model.eval()
reddit_posts_Loader = DataLoader(reddit_posts_DataSet, batch_size=32)
print("len reddit_posts_Loader: {}".format(len(reddit_posts_Loader)))

# One Series of 0/1 labels per batch, concatenated once after the loop. Growing a Series with
# Series.append copied all previous results on every batch, and that method no longer exists
# in recent pandas versions.
batch_predictions = []
# Inference only: no_grad stops autograd from recording a graph for every forward pass.
with torch.no_grad():
    for (i, batch) in enumerate(reddit_posts_Loader):
        # Light progress indicator (the full batch is no longer printed: it flooded the output).
        if i % 2000 == 0 : print(i)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Explicit dim=1 normalises across the class scores of each example and avoids the
        # implicit-dimension deprecation warning.
        proba = F.softmax(outputs[0], dim=1).cpu().numpy()
        # Label 1 when the second class probability is at least 0.5, else 0.
        predicted_labels = pd.DataFrame(proba).apply(proba_to_category, axis=1)
        batch_predictions.append(predicted_labels)

if batch_predictions:
    predicted = pd.concat(batch_predictions, ignore_index=True)
else:
    # Empty loader: keep `predicted` a Series so the summary below still works.
    predicted = pd.Series(dtype="int64")

print("predicted: {}".format(predicted.shape))
print(predicted.value_counts())

len reddit_posts_Loader: 3816
{'input_ids': tensor([[    0,  2250,  8665,  ...,     1,     1,     1],
        [    0,  2023,   400,  ...,     1,     1,     1],
        [    0, 10548,    15,  ...,     1,     1,     1],
        ...,
        [    0,   569,  3697,  ...,     1,     1,     1],
        [    0,   569,  1298,  ...,     1,     1,     1],
        [    0,  1890,    66,  ...,     1,     1,     1]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
0


  """
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
{'input_ids': tensor([[    0,   213,    48,  ...,     1,     1,     1],
        [    0, 11504,  9226,  ...,     1,     1,     1],
        [    0,  1050,   484,  ...,     1,     1,     1],
        ...,
        [    0,   126,    90,  ...,     1,     1,     1],
        [    0,  5529,   164,  ...,     1,     1,     1],
        [    0,   122,   121,  ...,     1,     1,     1]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],

In [None]:
# One row per segment: fully normalised text, lightly normalised text, position of the
# originating post, and the predicted label.
# NOTE(review): assumes the three lists and `predicted` all have the same length and that
# `predicted` carries a default 0..n-1 index — confirm before relying on the row alignment.
reddit_titled_posts_split_predicted = pd.DataFrame({
                                            'sentences': reddit_posts_splits,
                                            'half_processed_sentences': reddit_posts_splits_2,
                                            'posts_indices': reddit_posts_indices,
                                            'prediction': predicted})

In [None]:
# Collapse back to one row per post: each of the three value columns becomes a list holding
# that post's per-segment values.
# The result is bound to the same name as the input, so re-running this cell would operate on
# the already pivoted frame.
reddit_titled_posts_split_predicted = pd.pivot_table(reddit_titled_posts_split_predicted,
                    values=['sentences', 'half_processed_sentences', 'prediction'],
                    index='posts_indices',
                    aggfunc={'sentences': list,
                             'half_processed_sentences': list,
                             'prediction': list})

In [None]:
reddit_titled_posts_split_predicted['final_prediction'] = reddit_titled_posts_split_predicted['prediction'].apply(lambda x: 1 in x)

In [None]:
# Keep the posts whose final_prediction is True. The explicit .copy() makes this an independent
# DataFrame, so adding a column to it later no longer raises SettingWithCopyWarning.
personal_reddit_titled_posts = reddit_titled_posts_split_predicted[reddit_titled_posts_split_predicted['final_prediction'] == True].copy()

In [None]:
personal_reddit_titled_posts.shape

(20377, 4)

In [None]:
#(20377, 3)

In [None]:
personal_reddit_titled_posts

Unnamed: 0_level_0,half_processed_sentences,prediction,sentences,final_prediction
posts_indices,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,[Post Covid gastrointestinal symptoms I got si...,[1],[Post Covid gastrointestinal symptoms I got si...,True
1,[Has anyone gotten worse at 8 months ? Have an...,[1],[Has anyone gotten worse at 8 months ? Have an...,True
2,[Bit of an odd sexual question . A pretty pers...,"[0, 0, 1, 0]",[Bit of an odd sexual question . A pretty pers...,True
4,[Timeline I looked back at my records since i'...,"[1, 0]",[Timeline I looked back at my records since i ...,True
5,[Workout Intolerance ? Plenty of times I've he...,"[1, 1, 1, 0]",[Workout Intolerance ? Plenty of times I 've h...,True
...,...,...,...,...
26438,[Recommendation for best OTC antihistamine for...,"[1, 0]",[Recommendation for best OTC antihistamine for...,True
26440,[Starting a trial I thought I had my long Covi...,"[1, 1, 0, 0, 1, 1, 0, 0, 1]",[Starting a trial I thought I had my long Covi...,True
26441,[Suspected long covid Hi all 44m never tested ...,[1],[Suspected long covid Hi all 44m never tested ...,True
26442,[Scottish study looking for participants for r...,"[0, 0, 0, 0, 1, 0, 0]",[Scottish study looking for participants for r...,True


In [None]:
predictions_list_of_lists = personal_reddit_titled_posts.prediction.to_list()

In [None]:
sentences_list = personal_reddit_titled_posts.half_processed_sentences.to_list()

In [None]:
positive_predictions_indices = [[i for i, x in enumerate(l) if x == 1] for l in predictions_list_of_lists]

In [None]:
# For every post, keep only the segments whose position is listed in that post's
# positive-prediction indices.
positive_predictions_sentences = [
    [post_sentences[position] for position in kept_positions]
    for post_sentences, kept_positions in zip(sentences_list, positive_predictions_indices)
]

In [None]:
positive_predictions_sentences

[['Post Covid gastrointestinal symptoms I got sick in March 2020 , was not hospitalized and because of that was not able to get tested so it is not confirmed Covid , but the symptoms seem to align with long haul Covid . Has anyone else had severe GI issues such as acid reflux , SIBO , food intolerances , and new sensitivity to any supplements / medications ? If so , has it resolved ? What helped you ?'],
 ['Has anyone gotten worse at 8 months ? Have any of you gotten worse at around 8 months ?'],
 [" Every time would ejaculate , the pain would radiate around my body , most intensely in my face . Not sure if this is a COVID symptom , or something else entirely , but I'd thought I'd ask "],
 ["Timeline I looked back at my records since i've been sick on august 21 . Sick for 13 days Good for 11 days Sick for 19 days Good for 21 days Sick for 30 days Good for 20 days And today im not feeling the best but i will see how it goes "],
 ["Workout Intolerance ? Plenty of times I've heard that us

In [None]:
positive_predictions_sentences_concatenated = ['.'.join(l) for l in positive_predictions_sentences]

In [None]:
positive_predictions_sentences_concatenated

In [None]:
personal_reddit_titled_posts['concatenated_sentences'] = positive_predictions_sentences_concatenated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
personal_reddit_titled_posts

Unnamed: 0_level_0,half_processed_sentences,prediction,sentences,final_prediction,concatenated_sentences
posts_indices,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,[Post Covid gastrointestinal symptoms I got si...,[1],[Post Covid gastrointestinal symptoms I got si...,True,Post Covid gastrointestinal symptoms I got sic...
1,[Has anyone gotten worse at 8 months ? Have an...,[1],[Has anyone gotten worse at 8 months ? Have an...,True,Has anyone gotten worse at 8 months ? Have any...
2,[Bit of an odd sexual question . A pretty pers...,"[0, 0, 1, 0]",[Bit of an odd sexual question . A pretty pers...,True,"Every time would ejaculate , the pain would r..."
4,[Timeline I looked back at my records since i'...,"[1, 0]",[Timeline I looked back at my records since i ...,True,Timeline I looked back at my records since i'v...
5,[Workout Intolerance ? Plenty of times I've he...,"[1, 1, 1, 0]",[Workout Intolerance ? Plenty of times I 've h...,True,Workout Intolerance ? Plenty of times I've hea...
...,...,...,...,...,...
26438,[Recommendation for best OTC antihistamine for...,"[1, 0]",[Recommendation for best OTC antihistamine for...,True,Recommendation for best OTC antihistamine for ...
26440,[Starting a trial I thought I had my long Covi...,"[1, 1, 0, 0, 1, 1, 0, 0, 1]",[Starting a trial I thought I had my long Covi...,True,Starting a trial I thought I had my long Covid...
26441,[Suspected long covid Hi all 44m never tested ...,[1],[Suspected long covid Hi all 44m never tested ...,True,Suspected long covid Hi all 44m never tested p...
26442,[Scottish study looking for participants for r...,"[0, 0, 0, 0, 1, 0, 0]",[Scottish study looking for participants for r...,True,but personally between being so physically li...


In [None]:
non_personal_reddit_titled_posts = reddit_titled_posts_split_predicted[reddit_titled_posts_split_predicted['final_prediction'] == False]

In [None]:
len(non_personal_reddit_titled_posts)

6068

In [None]:
#len(non_personal_reddit_titled_posts)

In [None]:
len(personal_reddit_titled_posts)

20377

In [None]:
#len(personal_reddit_titled_posts)

In [None]:
reddit_posts.iloc[non_personal_reddit_titled_posts.index.to_list()].to_csv(home + 'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates_non_personnal_corrected.csv')

In [None]:
# Select the original rows of the posts kept above (positional lookup). The explicit .copy()
# makes this an independent DataFrame, so adding the concatenated_sentences column afterwards
# no longer raises SettingWithCopyWarning.
personal_df = reddit_posts.iloc[personal_reddit_titled_posts.index.to_list()].copy()

In [None]:
personal_df['concatenated_sentences'] = personal_reddit_titled_posts['concatenated_sentences'].to_list()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
personal_df

Unnamed: 0,level_0,index,_id,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,...,distinguished,discussion_type,suggested_sort,call_to_action,category,event_end,event_is_live,event_start,titled_selftext,concatenated_sentences
0,0,0,62d6699037075c826c2d1c0b,[],0.0,Nb827618,,[],,text,...,,,,,,,,,Post Covid gastrointestinal symptoms I got sic...,Post Covid gastrointestinal symptoms I got sic...
1,1,1,62d6699037075c826c2d1c0c,[],False,TheChaosBringer993,,[],,text,...,,,,,,,,,Has anyone gotten worse at 8 months? Have any ...,Has anyone gotten worse at 8 months ? Have any...
2,2,2,62d6699037075c826c2d1c0d,[],False,sunflower_1970,,[],,text,...,,,,,,,,,Bit of an odd sexual question. A pretty person...,"Every time would ejaculate , the pain would r..."
4,4,4,62d6699037075c826c2d1c0f,[],False,goalan2112,,[],,text,...,,,,,,,,,Timeline I looked back at my records since i'v...,Timeline I looked back at my records since i'v...
5,5,5,62d6699037075c826c2d1c10,[],False,DarkHeros01,,[],,text,...,,,,,,,,,Workout Intolerance ? Plenty of times I’ve he...,Workout Intolerance ? Plenty of times I've hea...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26438,27208,27208,62d6bf9037075c826c2d8653,[],0.0,Obvious-Explorer7211,,[],,text,...,,,,,,,,,Recommendation for best OTC antihistamine for ...,Recommendation for best OTC antihistamine for ...
26440,27210,27210,62d6bf9037075c826c2d8655,[],0.0,Monkeyboogaloo,,[],,text,...,,,,,,,,,Starting a trial I thought I had my long Covid...,Starting a trial I thought I had my long Covid...
26441,27211,27211,62d6bf9037075c826c2d8656,[],0.0,Apprehensive-Pay3364,,[],,text,...,,,,,,,,,Suspected long covid Hi all 44m never tested ...,Suspected long covid Hi all 44m never tested p...
26442,27212,27212,62d6bf9037075c826c2d8657,[],0.0,Spiderweb12,,[],,text,...,,,,,,,,,Scottish study looking for participants for re...,but personally between being so physically li...


In [None]:
personal_df.to_csv(home + 'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates_personnal_corrected.csv')

## Posts without titles

In [None]:
# Cut every raw post body into segments made of two consecutive '.'-delimited pieces and
# record, for each segment, the position of the post it came from.
split_reddit_posts = []

for post_index, text in enumerate(reddit_posts.selftext):
    sentences = text.split('.')
    # Walk the pieces two at a time; an odd piece left at the end forms a segment on its own.
    for start in range(0, len(sentences), 2):
        text_portion = ".".join(sentences[start:start + 2])
        # Drop segments of at most one character (for example a lone "." from two empty pieces).
        if len(text_portion) > 1:
            split_reddit_posts.append((post_index, text_portion))

In [None]:
# Separate the (post index, segment) pairs into two parallel lists.
# These names are shared with the "Including titles" section and are rebound here.
reddit_posts_indices = [post_index for post_index, _ in split_reddit_posts]
reddit_posts_splits = [segment for _, segment in split_reddit_posts]

In [None]:
reddit_posts_splits

['I got sick in March 2020, was not hospitalized and because of that was not able to get tested so it is not confirmed Covid, but the symptoms seem to align with long haul Covid. Has anyone else had severe GI issues such as acid reflux, SIBO, food intolerances, and new sensitivity to any supplements/medications? If so, has it resolved? What helped you?',
 'Have any of you gotten worse at around 8 months?',
 'A pretty personal question, but a very strange health symptom, that I feel must be nervous system/vascular related.\n\nSomething I remember happening last year, one day, after I started feeling sick, I ejaculated/orgasmed, I felt pain around my body as it happened, almost rhythmically to the ejaculation process',
 " Has anybody had such an odd, weird feeling like that post-COVID infection? I don't remember it happening again, but it was like something that had never happened before. Every time would ejaculate, the pain would radiate around my body, most intensely in my face",
 "\n\

In [None]:
# One row per segment: its text and the position of the post it belongs to.
reddit_split_posts = pd.DataFrame({
    'text': reddit_posts_splits,
    'file_index': reddit_posts_indices,
})

In [None]:
reddit_split_posts.text.to_list()

['I got sick in March 2020, was not hospitalized and because of that was not able to get tested so it is not confirmed Covid, but the symptoms seem to align with long haul Covid. Has anyone else had severe GI issues such as acid reflux, SIBO, food intolerances, and new sensitivity to any supplements/medications? If so, has it resolved? What helped you?',
 'Have any of you gotten worse at around 8 months?',
 'A pretty personal question, but a very strange health symptom, that I feel must be nervous system/vascular related.\n\nSomething I remember happening last year, one day, after I started feeling sick, I ejaculated/orgasmed, I felt pain around my body as it happened, almost rhythmically to the ejaculation process',
 " Has anybody had such an odd, weird feeling like that post-COVID infection? I don't remember it happening again, but it was like something that had never happened before. Every time would ejaculate, the pain would radiate around my body, most intensely in my face",
 "\n\

In [None]:
import torch.nn.functional as F

# Normalise every segment, then tokenize them in one call. truncation=True caps each sequence
# at the model's maximum input length and padding=True pads to the longest sequence, so the
# sequences can be fed to the model in batches.
# The encodings are kept on the CPU: the inference loop moves each batch to `device`, so
# sending the entire encoded corpus to the GPU here only consumed device memory.
reddit_posts_encodings = tokenizer(reddit_split_posts.text.map(normalizeTweet).values.tolist(), truncation=True, padding=True, return_tensors="pt")
reddit_posts_DataSet = RedditDataSet(reddit_posts_encodings)

TypeError: ignored

In [None]:
print(reddit_posts_encodings)

In [None]:
model.eval()
reddit_posts_Loader = DataLoader(reddit_posts_DataSet, batch_size=32)
print("len reddit_posts_Loader: {}".format(len(reddit_posts_Loader)))

# One Series of 0/1 labels per batch, concatenated once after the loop. Growing a Series with
# Series.append copied all previous results on every batch, and that method no longer exists
# in recent pandas versions.
batch_predictions = []
# Inference only: no_grad stops autograd from recording a graph for every forward pass.
with torch.no_grad():
    for (i, batch) in enumerate(reddit_posts_Loader):
        # Light progress indicator (the full batch is no longer printed: it flooded the output).
        if i % 2000 == 0 : print(i)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Explicit dim=1 normalises across the class scores of each example and avoids the
        # implicit-dimension deprecation warning.
        proba = F.softmax(outputs[0], dim=1).cpu().numpy()
        # Label 1 when the second class probability is at least 0.5, else 0.
        predicted_labels = pd.DataFrame(proba).apply(proba_to_category, axis=1)
        batch_predictions.append(predicted_labels)

if batch_predictions:
    predicted = pd.concat(batch_predictions, ignore_index=True)
else:
    # Empty loader: keep `predicted` a Series so the summary below still works.
    predicted = pd.Series(dtype="int64")

print("predicted: {}".format(predicted.shape))
print(predicted.value_counts())

In [None]:
# One row per segment: raw segment text, position of the originating post, predicted label.
# NOTE(review): assumes both lists and `predicted` have the same length and that `predicted`
# carries a default 0..n-1 index — confirm before relying on the row alignment.
reddit_posts_split_predicted = pd.DataFrame({'sentences': reddit_posts_splits,
                                            'posts_indices': reddit_posts_indices,
                                            'prediction': predicted})

In [None]:
# Collapse back to one row per post: the segment texts and the predicted labels of each post
# are gathered into lists.
pt = pd.pivot_table(reddit_posts_split_predicted,
                    values=['sentences', 'prediction'],
                    index='posts_indices',
                    aggfunc={'sentences': list,
                             'prediction': list})

In [None]:
pt

In [None]:
pt['final_prediction'] = pt['prediction'].apply(lambda x: 1 in x)

In [None]:
personal_reddit_posts = pt[pt['final_prediction'] == True]

In [None]:
reddit_posts.iloc[personal_reddit_posts.index.to_list()].title.to_list()

In [None]:
non_personal_reddit_posts = pt[pt['final_prediction'] == False]
reddit_posts.iloc[non_personal_reddit_posts.index.to_list()].title.to_list()

In [None]:
len(personal_reddit_posts)