In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from transformers import pipeline

In [2]:
#Cleaned File from the Reddit data collection File
df = pd.read_pickle('df_reddit_cleaned.pkl')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [5]:
df = df.drop_duplicates()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93 entries, 0 to 99
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  93 non-null     object
dtypes: object(1)
memory usage: 1.5+ KB


In [17]:
#Reducing the size of comments to only 250 characters as the models are restricted to around 256 characters 
#for input
df['comments'] = df['comments'].apply(lambda x:x[:250])

In [18]:
Data = df['comments'].to_list()

In [19]:
Data[1:10]

['for me i try to think of it at a very simple fundamental level why do currencies and centralized systems exist what problem needed to be solved at one point people could just trade goods but what if you have cow and want to trade for a chicken a cow ',
 'crypto is actually harder to launder than paper money because all of the transactions are recorded and public it is made itself useful for criminal purposes like ransomware because the criminals are either based in a country that does not carewill no',
 'one more the world was a changed place the effects of climate change were everywhere from the endless droughts in the midwest to the rising sea levels that threatened coastal cities but in the midst of all this chaos a new technology had emerged bitc',
 'nobody has time to fact check all incorrect or misleading information so i will just talk about gov can shut down the internet just like they can shut down gold mines or any other enterprise it does not change the asset they cannot d

In [20]:
#Using Default Model from Transformers(distilbert-base-uncasesd)
sentiment_pipeline = pipeline("sentiment-analysis")
sentiment_pipeline(Data)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9829002022743225},
 {'label': 'NEGATIVE', 'score': 0.9963653087615967},
 {'label': 'NEGATIVE', 'score': 0.9979150891304016},
 {'label': 'POSITIVE', 'score': 0.7807169556617737},
 {'label': 'NEGATIVE', 'score': 0.9995717406272888},
 {'label': 'NEGATIVE', 'score': 0.9853954911231995},
 {'label': 'NEGATIVE', 'score': 0.9953818917274475},
 {'label': 'NEGATIVE', 'score': 0.9863816499710083},
 {'label': 'NEGATIVE', 'score': 0.9866582751274109},
 {'label': 'POSITIVE', 'score': 0.9024071097373962},
 {'label': 'NEGATIVE', 'score': 0.9957551956176758},
 {'label': 'NEGATIVE', 'score': 0.9983192086219788},
 {'label': 'NEGATIVE', 'score': 0.9894406199455261},
 {'label': 'NEGATIVE', 'score': 0.9992349147796631},
 {'label': 'NEGATIVE', 'score': 0.9994063377380371},
 {'label': 'NEGATIVE', 'score': 0.9885151982307434},
 {'label': 'NEGATIVE', 'score': 0.9977388381958008},
 {'label': 'NEGATIVE', 'score': 0.9594597220420837},
 {'label': 'POSITIVE', 'score': 0.999508380889

In [23]:
#Reducing the comment size to 64 to be able to work with the berttweet model.
df['comments'] = df['comments'].apply(lambda x:x[:64])
Data2 = df['comments'].to_list()

In [24]:
#Using berttweet Model from Transformers which was trained on twitter data
model2 = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")
model2(Data2)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


[{'label': 'NEU', 'score': 0.9797791242599487},
 {'label': 'NEU', 'score': 0.9167025089263916},
 {'label': 'NEG', 'score': 0.7816319465637207},
 {'label': 'NEU', 'score': 0.9501328468322754},
 {'label': 'NEG', 'score': 0.9398619532585144},
 {'label': 'NEU', 'score': 0.919883668422699},
 {'label': 'NEU', 'score': 0.9793980717658997},
 {'label': 'NEU', 'score': 0.9410568475723267},
 {'label': 'NEG', 'score': 0.9632286429405212},
 {'label': 'POS', 'score': 0.8840638995170593},
 {'label': 'NEG', 'score': 0.9572646617889404},
 {'label': 'NEU', 'score': 0.6719188690185547},
 {'label': 'NEU', 'score': 0.9320522546768188},
 {'label': 'NEU', 'score': 0.7747042775154114},
 {'label': 'NEG', 'score': 0.5834870934486389},
 {'label': 'NEU', 'score': 0.810034453868866},
 {'label': 'NEG', 'score': 0.8714921474456787},
 {'label': 'NEU', 'score': 0.8571383953094482},
 {'label': 'POS', 'score': 0.9457829594612122},
 {'label': 'NEG', 'score': 0.962552547454834},
 {'label': 'NEU', 'score': 0.95958226919174

In [22]:
#Using Reoberta Model from Transformers 
model2 = pipeline(model="roberta-large-mnli")
model2(Data)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'CONTRADICTION', 'score': 0.5425612330436707},
 {'label': 'NEUTRAL', 'score': 0.6877266764640808},
 {'label': 'NEUTRAL', 'score': 0.7620202898979187},
 {'label': 'NEUTRAL', 'score': 0.9658099412918091},
 {'label': 'NEUTRAL', 'score': 0.8688977360725403},
 {'label': 'NEUTRAL', 'score': 0.6635711193084717},
 {'label': 'NEUTRAL', 'score': 0.6886267066001892},
 {'label': 'NEUTRAL', 'score': 0.6029835343360901},
 {'label': 'NEUTRAL', 'score': 0.9607135653495789},
 {'label': 'NEUTRAL', 'score': 0.660656750202179},
 {'label': 'NEUTRAL', 'score': 0.9216315150260925},
 {'label': 'NEUTRAL', 'score': 0.8821895122528076},
 {'label': 'NEUTRAL', 'score': 0.5924975872039795},
 {'label': 'NEUTRAL', 'score': 0.9567058086395264},
 {'label': 'ENTAILMENT', 'score': 0.5282959938049316},
 {'label': 'ENTAILMENT', 'score': 0.4489707052707672},
 {'label': 'NEUTRAL', 'score': 0.6935535073280334},
 {'label': 'NEUTRAL', 'score': 0.6955918073654175},
 {'label': 'NEUTRAL', 'score': 0.5245481133460999},
 