In [4]:
# for text processing and cleaning 
import re
import nltk 
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string 
string.punctuation 
nltk.download('averaged_perceptron_tagger')

# remove warnings 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# for sentiment analysis 
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator

# for topic modeling using LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel 

# plotting tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
from plotly.subplots import make_subplots
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

print('‚úîÔ∏è Libraries Imported!')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


‚úîÔ∏è Libraries Imported!


## 1. NLTK with Vader
[NLTK Vader Documentation](https://www.nltk.org/howto/sentiment.html)

### 1.1 Understand the Raw Dataset

In [6]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
plt.style.use('ggplot')
import nltk 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


In [7]:
df = pd.read_csv("/Users/apple/Desktop/Quantilope/Reddit_api data.csv")
df.shape

(886, 7)

In [8]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Title,Subreddit,Post ID,Post URL,Comments
0,0,0.0,Brought to you by Nike . Oh wait,Nbamemes,ymumjy,https://i.redd.it/kx6pa20g15y91.jpg,Kanye Irving ladies and gentlemen
1,1,1.0,Brought to you by Nike . Oh wait,Nbamemes,ymumjy,https://i.redd.it/kx6pa20g15y91.jpg,Black man tweets link to amazon. Never said a ...
2,2,2.0,Brought to you by Nike . Oh wait,Nbamemes,ymumjy,https://i.redd.it/kx6pa20g15y91.jpg,Spot the kike
3,3,3.0,Summertime Beauty [Plymouth by nike],AzureLane,yjou5n,https://i.redd.it/ebg5800bbfx91.jpg,Plymouth is beautiful. I know Ohisashiburi mig...
4,4,4.0,Summertime Beauty [Plymouth by nike],AzureLane,yjou5n,https://i.redd.it/ebg5800bbfx91.jpg,[Sauce](https://www.pixiv.net/en/artworks/1024...


In [9]:
text_raw = df['Comments']
text_raw.head()

0                    Kanye Irving ladies and gentlemen
1    Black man tweets link to amazon. Never said a ...
2                                        Spot the kike
3    Plymouth is beautiful. I know Ohisashiburi mig...
4    [Sauce](https://www.pixiv.net/en/artworks/1024...
Name: Comments, dtype: object

### 1.2 Text Preprocessing 

In [10]:
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [11]:
# Function for converting emojis into word
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    return text

# Example
text1 = "Hilarious üòÇ. The feeling of making a sale üòé, The feeling of actually fulfilling orders üòí"
convert_emojis(text1)

'Hilarious face_with_tears_of_joy. The feeling of making a sale smiling_face_with_sunglasses, The feeling of actually fulfilling orders unamused_face'

In [12]:
# Function for converting emoticons into word
from emot.emo_unicode import EMOTICONS_EMO
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, EMOTICONS_EMO[emot].replace(" ","_"))
    return text


text = "Hello :-) :-)"
convert_emoticons(text)

'Hello Happy_face_smiley Happy_face_smiley'

In [13]:
def clean_text_lemma_show(var):
    """
    Function for text preprocessing with Lemmatizing with POS tag.
    """
    # remove the stop words 
    sw = set(stopwords.words('english'))
    my_text = [word for word in str(var).split() if word not in sw]
    my_text = " ".join(my_text)
    print(f'after removing stop words: {my_text}')

    # lowercase 
    my_text = my_text.lower()
    print(f'after lower case: {my_text}')

    # removal of URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    my_text = url_pattern.sub(r'', my_text)
    print(f'after removing urls: {my_text}')

    # removal of HTML Tags
    my_text = BeautifulSoup(my_text, "lxml").text
    print(f'after removing HTMLs: {my_text}')

    # tokenize the word using nltk  
    my_text = nltk.word_tokenize(my_text)
    print(f'after tokenize: {my_text}')

    # lemmatizing and using grouped word chuncks 
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN,"V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    # lemmatizing 
    pos_tagged_text = nltk.pos_tag(my_text) 
    my_text = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    print(f'after lemma: {my_text}')
    
    # change emojis into words 
    for emot in UNICODE_EMOJI:
        my_text = my_text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    print(f'after changing emojis: {my_text}')

    # remove not english characters, lower case and split the text 
    my_text = re.sub('[^A-Za-z0-9]+', " ", my_text).lower().strip() 
    print(f'after removing mentions: {my_text}')

    # convert the text to list as the vectorized words  
    ## my_text = my_text.split(" ")

    return my_text

In [14]:
test = text_raw[10:15]
test

10    we are like manjuus. they might work hard righ...
11                                 Plymouth; Just do it
12                                            [deleted]
13    I want this image to be the last image I see b...
14    Just a heads up: this post has been locked, as...
Name: Comments, dtype: object

In [15]:

# snippet of the text preprocessing results
test.apply(lambda test: clean_text_lemma_show(test))

after removing stop words: like manjuus. might work hard right, enjoy, right now. manjuu cultures, I say
after lower case: like manjuus. might work hard right, enjoy, right now. manjuu cultures, i say
after removing urls: like manjuus. might work hard right, enjoy, right now. manjuu cultures, i say
after removing HTMLs: like manjuus. might work hard right, enjoy, right now. manjuu cultures, i say
after tokenize: ['like', 'manjuus', '.', 'might', 'work', 'hard', 'right', ',', 'enjoy', ',', 'right', 'now', '.', 'manjuu', 'cultures', ',', 'i', 'say']
after lemma: like manjuus . might work hard right , enjoy , right now . manjuu culture , i say
after changing emojis: like manjuus . might work hard right , enjoy , right now . manjuu culture , i say
after removing mentions: like manjuus might work hard right enjoy right now manjuu culture i say
after removing stop words: Plymouth; Just
after lower case: plymouth; just
after removing urls: plymouth; just
after removing HTMLs: plymouth; just
a

10    like manjuus might work hard right enjoy right...
11                                        plymouth just
12                                               delete
13    i want image last image i see i die i imagine ...
14    just head up post lock post r dirtyr4r be this...
Name: Comments, dtype: object

In [16]:
def clean_text_lemma(var):
    """
    Function for text preprocessing with Lemmatizing with POS tag.
    """
    # remove the stop words 
    sw = set(stopwords.words('english'))
    my_text = [word for word in str(var).split() if word not in sw]
    my_text = " ".join(my_text)

    # lowercase 
    my_text = my_text.lower()

    # removal of URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    my_text = url_pattern.sub(r'', my_text)

    # removal of HTML Tags
    my_text = BeautifulSoup(my_text, "lxml").text

    # tokenize the word using nltk  
    my_text = nltk.word_tokenize(my_text)

    # lemmatizing and using grouped word chuncks 
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN,"V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    # lemmatizing 
    pos_tagged_text = nltk.pos_tag(my_text) 
    my_text = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    
    # change emojis into words 
    for emot in UNICODE_EMOJI:
        my_text = my_text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))

    # remove not english characters, lower case and split the text 
    my_text = re.sub('[^A-Za-z0-9]+', " ", my_text).lower().strip() 

    # convert the text to list as the vectorized words  
    ## my_text = my_text.split(" ")

    return my_text

In [17]:
# implement on the whole text files 
text_clean_object = text_raw.apply(lambda text: clean_text_lemma(text))

In [18]:
len(text_clean_object)

886

In [19]:
text_clean = pd.DataFrame(text_clean_object)
text_clean['Id'] = [num for num in range(0,len(text_clean_object))]
text_clean.head()

Unnamed: 0,Comments,Id
0,kanye irving lady gentleman,0
1,black man tweet link amazon never say word giv...,1
2,spot kike,2
3,plymouth beautiful i know ohisashiburi might a...,3
4,sauce u repostsleuthbot,4


In [20]:
text_raw_df = pd.DataFrame(text_raw)
text_raw_df['Id'] = [num for num in range(0,len(text_raw_df))]
text_raw_df.head()

Unnamed: 0,Comments,Id
0,Kanye Irving ladies and gentlemen,0
1,Black man tweets link to amazon. Never said a ...,1
2,Spot the kike,2
3,Plymouth is beautiful. I know Ohisashiburi mig...,3
4,[Sauce](https://www.pixiv.net/en/artworks/1024...,4


### 1.3 VADER Sentiment Scoring 
VADER (Valence Aware Dictionary and Sentiment Reasoner) : Bag of words approach 

*__Notes:__* <br>
This model does not include relationship in words 

*__Steps:__* <br>
1. Stop words are removed.
2. Each word is scored and combined to a total score. 

In [21]:
# nltk packages 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# progress bar tracker for some loops 
from tqdm.notebook import tqdm 

In [22]:
# create the sentiment analysis object, the returning scores are from 0 to 1
sia = SentimentIntensityAnalyzer()

# show case of the returns, the final `compound` result is from -1 to 1 
print(sia.polarity_scores("I'm so happy"))
print(sia.polarity_scores("This is the worst thing ever"))

{'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6115}
{'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}


In [23]:
res = {}
for i, row in tqdm(text_clean.iterrows(), total = len(text_clean)):
    text = row['Comments']
    myid = row['Id']
    res[myid] = sia.polarity_scores(text)

  0%|          | 0/886 [00:00<?, ?it/s]

In [24]:
# combine the sentiment score back to the dataframe
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index':'Id'})
vaders = vaders.merge(text_clean, how = 'left')

# now we have sentiment score and metadata
vaders.head()

Unnamed: 0,Id,neg,neu,pos,compound,Comments
0,0,0.0,1.0,0.0,0.0,kanye irving lady gentleman
1,1,0.278,0.584,0.138,-0.7906,black man tweet link amazon never say word giv...
2,2,0.0,1.0,0.0,0.0,spot kike
3,3,0.0,0.653,0.347,0.8779,plymouth beautiful i know ohisashiburi might a...
4,4,0.0,1.0,0.0,0.0,sauce u repostsleuthbot


In [25]:
# a snippet of how it performs 
pd.set_option('display.max_colwidth', None)
vaders[['compound','Comments']][50:53]

Unnamed: 0,compound,Comments
50,0.6908,lol good make youtube platform might nothing
51,0.2516,oh goodness really obsessed
52,0.0,be anyway legit check pair the toebox panel look really thick


## 2. Deep Learning Models with Hugging Face
[Reference from HuggingFace](https://huggingface.co/blog/sentiment-analysis-python) <br>
HuggingFace: A hub that provides the collection of pre-trained models, and some are state of art. <br>

In [26]:
# import hugging face library Transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax 

In [2]:
# use pre-trained models with the Roberta 
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [27]:
# run example for VADER moel 
example = 'i need tht nike ski mask shit looks fire'
print(sia.polarity_scores(example))

{'neg': 0.5, 'neu': 0.5, 'pos': 0.0, 'compound': -0.7184}


In [28]:
# run example on roBERTa model 

# return the 0 or 1 tensors that embeding models will understand
encoded_text = tokenizer(example, return_tensors = 'pt') # pt for PyTorch
# the output is a tensor 
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
# apply softmax to turn into 0 to 1 range 
scores = softmax(scores)
scores_dict = {'roberta_neg': scores[0],
                'roberta_neu': scores[1],
                'roberta_pos': scores[2]}

print(scores_dict)

{'roberta_neg': 0.013050259, 'roberta_neu': 0.13767216, 'roberta_pos': 0.8492776}


In [29]:
def polarity_scores_roerta( var ):
    encoded_text = tokenizer(var, return_tensors = 'pt') # pt for PyTorch
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {'roberta_neg': scores[0],
                    'roberta_neu': scores[1],
                    'roberta_pos': scores[2]}
    return scores_dict

In [31]:
res = {}
for i, row in tqdm(text_clean.iterrows(), total = len(text_raw_df)):
    try: 
        text = row['Comments']
        myid = row['Id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
                vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roerta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both 
    except RuntimeError: 
        print(f'Broke for id{myid}')

  0%|          | 0/886 [00:00<?, ?it/s]

Broke for id57
Broke for id454
Broke for id462
Broke for id665
Broke for id760


In [32]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index':'Id'})
results_df = results_df.merge(text_raw_df, how = 'left')

In [33]:
results_df.head()

Unnamed: 0,Id,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Comments
0,0,0.0,1.0,0.0,0.0,0.026723,0.778464,0.194813,Kanye Irving ladies and gentlemen
1,1,0.278,0.584,0.138,-0.7906,0.546833,0.423505,0.029663,Black man tweets link to amazon. Never said a word. Given every label in the book. \n\nWhite man that profits off the book and movie. No mass outrage for the white man. \n\nThe outrage is fake. They want another black man to say ‚Äútoby‚Äù.
2,2,0.0,1.0,0.0,0.0,0.125244,0.73286,0.141896,Spot the kike
3,3,0.0,0.653,0.347,0.8779,0.002503,0.056462,0.941035,"Plymouth is beautiful. I know Ohisashiburi might not be able to use this exact design for a possible swimsuit skin, but I‚Äôd love it if they got the chance to make an official one.\n\nEspecially if she gets to keep the hat."
4,4,0.0,1.0,0.0,0.0,0.408988,0.552274,0.038739,[Sauce](https://www.pixiv.net/en/artworks/102444636)\n\n&#x200B;\n\nu/RepostSleuthBot


### 2.2 bertweet-base-sentiment-analysis <br> 
- [Model](https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis?text=I+like+you.+I+love+you) trained with SemEval 2017 corpus (around ~40k tweets). Base model is BERTweet, a RoBERTa model trained on _**English tweets**_.

- Uses POS, NEG, NEU labels.

In [34]:
# use pre-trained models with the Roberta in English 
MODEL = f"finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [35]:
# run example on RoBERTa model 
example = 'i need tht nike ski mask shit looks fire'

# return the 0 or 1 tensors that embeding models will understand
encoded_text = tokenizer(example, return_tensors = 'pt') # pt for PyTorch
# the output is a tensor 
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
# apply softmax to turn into 0 to 1 range 
scores = softmax(scores)
scores_dict = {'roberta_neg': scores[0],
                'roberta_neu': scores[1],
                'roberta_pos': scores[2]}

print(scores_dict)

{'roberta_neg': 0.0065631904, 'roberta_neu': 0.17408511, 'roberta_pos': 0.8193516}


### 2.3 Flair 
This is the large 18-class NER(named entity recognition) model for English that ships with Flair.
[Reference](https://huggingface.co/flair/ner-english-ontonotes-large?text=On+September+1st+George+won+1+dollar+while+watching+Game+of+Thrones.)

In [38]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")

# make example sentence
sentence = Sentence("On September 1st George won 1 dollar while watching Game of Thrones.")

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)


Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2022-11-06 22:27:58,460 loading file /Users/apple/.flair/models/ner-english-ontonotes-large/2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1


Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2022-11-06 22:28:44,908 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
Sentence: "On September 1st George won 1 dollar while watching Game of Thrones ." ‚Üí ["September 1st"/DATE, "George"/PERSON, "1 dollar"/MONEY, "Game of Thrones"/WORK_OF_ART]
The following NER tags are found:
Span[1:3]: "September 1st" ‚Üí DATE (1.0)
Span[3:4]: "George" ‚Üí PERSON (1.0)
Span[5:7]: "1 dollar" ‚Üí MONEY (1.0)
Span[9:12]: "Game of Thrones" ‚Üí WORK_OF_ART (1.0)


## 3.Compare Model Performance on Benchmark Labeled Dataset

In [50]:
sentiment_score = results_df[['roberta_neg','roberta_neu','roberta_pos','Comments']]
sentiment_score.to_csv('c:\\Users\\hs324\\OneDrive\\Desktop\\Class_Files\\06_2022Fall\\04_Practicum\\Quantilope_Core\\data\\sentiment_score_text.csv')
sentiment_score

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos,Comments
0,0.026723,0.778464,0.194813,Kanye Irving ladies and gentlemen
1,0.546833,0.423505,0.029663,Black man tweets link to amazon. Never said a word. Given every label in the book. \n\nWhite man that profits off the book and movie. No mass outrage for the white man. \n\nThe outrage is fake. They want another black man to say ‚Äútoby‚Äù.
2,0.125244,0.732860,0.141896,Spot the kike
3,0.002503,0.056462,0.941035,"Plymouth is beautiful. I know Ohisashiburi might not be able to use this exact design for a possible swimsuit skin, but I‚Äôd love it if they got the chance to make an official one.\n\nEspecially if she gets to keep the hat."
4,0.408988,0.552274,0.038739,[Sauce](https://www.pixiv.net/en/artworks/102444636)\n\n&#x200B;\n\nu/RepostSleuthBot
...,...,...,...,...
876,0.236839,0.710536,0.052625,So far only the player can use in the normal way Super tier magic. But i still ask myself if the Elf King who is half-player can not use Super tier magic or doesn't know is existance and so has never learned a Super tier spell in the first place.\n\nThis is just speculation from my part in the same ways that i ask myself if Ainz skill Dark Wisdom would work on a player child (especially if this is a child between 2 players).
877,0.047229,0.839690,0.113081,The corps of the Abbys have told us that they have supposedly reach up to 6-9tier magic so I am going to go ahead n say you will need to be a long living race to accomplish super tier magic.
878,0.018320,0.090529,0.891151,Great story! It made my pants shrink a little
879,0.122516,0.803214,0.074270,"Please make a top-level comment detailing your desired roleplay scenario. Once done, reply to this comment and your submission will be approved. \n\nDo not reply to this comment unless you have done as instructed. A top-level comment is a comment on this post that is **NOT** a reply to another comment."
