# Sentiment Analysis on User Text 

In [1]:
# for text processing and cleaning 
import re
import nltk 
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string 
string.punctuation

# remove warnings 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# for sentiment analysis 
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator

# for topic modeling using LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel 

# plotting tools
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
from plotly.subplots import make_subplots
import pyLDAvis
# import pyLDAvis.gensim 
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

print('‚úîÔ∏è Libraries Imported!')

‚úîÔ∏è Libraries Imported!


## 1. NLTK with Vader
[NLTK Vader Documentation](https://www.nltk.org/howto/sentiment.html)

### 1.1 Understand the Raw Dataset 

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
plt.style.use('ggplot')
import nltk 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Load data into dataframe, skip the error lines
df = pd.read_csv('c:\\Users\\hs324\\OneDrive\\Desktop\\Class_Files\\06_2022Fall\\04_Practicum\\Quantilope_Core\\data\\five_brands_text.csv')
df.shape

(11648, 12)

In [4]:
df.head()

Unnamed: 0,author_id,username,author_followers,author_tweets,author_description,author_location,text,created_at,retweets,replies,likes,quote_count
0,87144412,GarrettKGray,376,12086,Land Economist & Economic Development Speciali...,"Coos Bay, OR",@ShaneDaleAZ Totally. The Nike uniforms since ...,2022-09-30 23:46:59+00:00,0,0,0,0
1,492330913,LockDown_Lopes,470,97876,"@nicekicks, sports, & memes | University of Ar...","Scottsdale, AZ",Hats off to Tom Sachs and the marketing team a...,2022-09-30 23:43:45+00:00,0,0,0,0
2,37706001,RyanGensler,6683,13623,315 Born and Raised: Assistant Basketball Coac...,"Champaign, IL",The look on @makiracook face! üòÇ \n\nThanks @Ni...,2022-09-30 23:38:15+00:00,1,1,27,0
3,17417435,ShellzBoss,564,22093,"#TeamLibra #TeamLesbian Hibernating, should be...","Maryland, Michigan",Check out my new pickup from Nike‚Å† SNKRS: http...,2022-09-30 23:35:28+00:00,0,1,0,0
4,853714067692806144,DJKingJam,395,3774,Jordan Shoe collector || DJ Jamez || Music Pro...,"Seattle, WA",@jameslfreelance @Jumpman23 @Nike @nikestore O...,2022-09-30 23:15:57+00:00,0,0,2,0


In [8]:
text_raw = df['text']
text_raw.head()

0    @ShaneDaleAZ Totally. The Nike uniforms since ...
1    Hats off to Tom Sachs and the marketing team a...
2    The look on @makiracook face! üòÇ \n\nThanks @Ni...
3    Check out my new pickup from Nike‚Å† SNKRS: http...
4    @jameslfreelance @Jumpman23 @Nike @nikestore O...
Name: text, dtype: object

### 1.2 Text Preprocessing 

#### 1.2.1 Transform emojis 

In [4]:
# import library for emoji handling 
from emot.emo_unicode import UNICODE_EMOJI # For emojis
from emot.emo_unicode import EMOTICONS_EMO # For EMOTICONS

In [7]:
# Function for converting emojis into word
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    return text

# Example
text1 = "Hilarious üòÇ. The feeling of making a sale üòé, The feeling of actually fulfilling orders üòí"
convert_emojis(text1)

'Hilarious face_with_tears_of_joy. The feeling of making a sale smiling_face_with_sunglasses, The feeling of actually fulfilling orders unamused_face'

In [8]:
# Function for converting emoticons into word
from emot.emo_unicode import EMOTICONS_EMO
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, EMOTICONS_EMO[emot].replace(" ","_"))
    return text


text = "Hello :-) :-)"
convert_emoticons(text)

'Hello Happy_face_smiley Happy_face_smiley'

As __*lemmatization*__ will return the words to its original form based on its semantic meaning, while __*stemming*__ basically will chop off the suffix and sometimes loose information. <br><br>
Although that stemming being a rule-based approach, it runs faster than lemmatization, considering our corpuses size isn't too big and the more accurate nature of the canonical dictionary-based approach of lemma, we'll use lemmatization in this case. 

In [9]:
def clean_text_lemma_show(var):
    """
    Function for text preprocessing with Lemmatizing with POS tag.
    """
    # remove the stop words 
    sw = set(stopwords.words('english'))
    my_text = [word for word in str(var).split() if word not in sw]
    my_text = " ".join(my_text)
    print(f'after removing stop words: {my_text}')

    # lowercase 
    my_text = my_text.lower()
    print(f'after lower case: {my_text}')

    # removal of URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    my_text = url_pattern.sub(r'', my_text)
    print(f'after removing urls: {my_text}')

    # removal of HTML Tags
    my_text = BeautifulSoup(my_text, "lxml").text
    print(f'after removing HTMLs: {my_text}')

    # tokenize the word using nltk  
    my_text = nltk.word_tokenize(my_text)
    print(f'after tokenize: {my_text}')

    # lemmatizing and using grouped word chuncks 
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN,"V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    # lemmatizing 
    pos_tagged_text = nltk.pos_tag(my_text) 
    my_text = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    print(f'after lemma: {my_text}')
    
    # change emojis into words 
    for emot in UNICODE_EMOJI:
        my_text = my_text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    print(f'after changing emojis: {my_text}')

    # remove not english characters, lower case and split the text 
    my_text = re.sub('[^A-Za-z0-9]+', " ", my_text).lower().strip() 
    print(f'after removing mentions: {my_text}')

    # convert the text to list as the vectorized words  
    ## my_text = my_text.split(" ")

    return my_text

In [10]:
# try the data in small bulk 
test = text_raw[10:15]
test

10    Whatcha think of these? #Leggings #scrunchy #F...
11    Sheesh. These Dunks go crazy @nike #justdoit h...
12    I'm at Nike Soho - @nikenyc in New York https:...
13    NEW KICKS üëÄ‚ô•Ô∏èüòçü´†üí∏\n#Nike #AF1 #airforce1 @ Orla...
14    ‚ÄúGood Times‚Äù last night at City Parks Foundati...
Name: text, dtype: object

In [11]:
# snippet of the text preprocessing results
test.apply(lambda test: clean_text_lemma_show(test))

after removing stop words: Whatcha think these? #Leggings #scrunchy #FridayVibes #Leggingsass #onlyfansgirl #onlyfans #nike #kicks #fire #AllNaturalBeauty #thick #smile #workmode #BREAKING #amazon #wishlist #AmazonWishList https://t.co/7OzpZ4jdEU
after lower case: whatcha think these? #leggings #scrunchy #fridayvibes #leggingsass #onlyfansgirl #onlyfans #nike #kicks #fire #allnaturalbeauty #thick #smile #workmode #breaking #amazon #wishlist #amazonwishlist https://t.co/7ozpz4jdeu
after removing urls: whatcha think these? #leggings #scrunchy #fridayvibes #leggingsass #onlyfansgirl #onlyfans #nike #kicks #fire #allnaturalbeauty #thick #smile #workmode #breaking #amazon #wishlist #amazonwishlist 
after removing HTMLs: whatcha think these? #leggings #scrunchy #fridayvibes #leggingsass #onlyfansgirl #onlyfans #nike #kicks #fire #allnaturalbeauty #thick #smile #workmode #breaking #amazon #wishlist #amazonwishlist 
after tokenize: ['whatcha', 'think', 'these', '?', '#', 'leggings', '#', 'scru

10    whatcha think these legging scrunchy fridayvib...
11             sheesh these dunk go crazy nike justdoit
12                       i m nike soho nikenyc new york
13    new kick eyesheart suit smiling face with hear...
14    good time last night city park foundation gala...
Name: text, dtype: object

In [5]:
def clean_text_lemma(var):
    """
    Function for text preprocessing with Lemmatizing with POS tag.
    """
    # remove the stop words 
    sw = set(stopwords.words('english'))
    my_text = [word for word in str(var).split() if word not in sw]
    my_text = " ".join(my_text)

    # lowercase 
    my_text = my_text.lower()

    # removal of URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    my_text = url_pattern.sub(r'', my_text)

    # removal of HTML Tags
    my_text = BeautifulSoup(my_text, "lxml").text

    # tokenize the word using nltk  
    my_text = nltk.word_tokenize(my_text)

    # lemmatizing and using grouped word chuncks 
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N": wordnet.NOUN,"V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    # lemmatizing 
    pos_tagged_text = nltk.pos_tag(my_text) 
    my_text = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])
    
    # change emojis into words 
    for emot in UNICODE_EMOJI:
        my_text = my_text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))

    # remove not english characters, lower case and split the text 
    my_text = re.sub('[^A-Za-z0-9]+', " ", my_text).lower().strip() 

    # convert the text to list as the vectorized words  
    ## my_text = my_text.split(" ")

    return my_text

In [9]:
# implement on the whole text files 
text_clean_object = text_raw.apply(lambda text: clean_text_lemma(text))

In [68]:
len(text_clean_object)

11648

In [10]:
# convert the clean dataset into dataframe 
text_clean = pd.DataFrame(text_clean_object)
text_clean['Id'] = [num for num in range(0,len(text_clean_object))]
text_clean.head()

Unnamed: 0,text,Id
0,shanedaleaz totally the nike uniform since rep...,0
1,hat tom sachs marketing team nike they release...,1
2,the look makiracook face face with tears of jo...,2
3,check new pickup nike snkrs,3
4,jameslfreelance jumpman23 nike nikestore oooo ...,4


In [20]:
# maybe try to put the raw data in frist 
text_raw_df = pd.DataFrame(text_raw)
text_raw_df['Id'] = [num for num in range(0,len(text_raw_df))]
text_raw_df.head()

Unnamed: 0,text,Id
0,@ShaneDaleAZ Totally. The Nike uniforms since ...,0
1,Hats off to Tom Sachs and the marketing team a...,1
2,The look on @makiracook face! üòÇ \n\nThanks @Ni...,2
3,Check out my new pickup from Nike‚Å† SNKRS: http...,3
4,@jameslfreelance @Jumpman23 @Nike @nikestore O...,4


### 1.3 VADER Sentiment Scoring 
VADER (Valence Aware Dictionary and Sentiment Reasoner) : Bag of words approach 

*__Notes:__* <br>
This model does not include relationship in words 

*__Steps:__* <br>
1. Stop words are removed.
2. Each word is scored and combined to a total score. 

In [17]:
# nltk packages 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# progress bar tracker for some loops 
from tqdm.notebook import tqdm 

In [22]:
# create the sentiment analysis object, the returning scores are from 0 to 1
sia = SentimentIntensityAnalyzer()

# show case of the returns, the final `compound` result is from -1 to 1 
print(sia.polarity_scores("I'm so happy"))
print(sia.polarity_scores("This is the worst thing ever"))

{'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6115}
{'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}


In [111]:
# sentiment analysis using the cleaned text file 
res = {}
for i, row in tqdm(text_clean.iterrows(), total = len(text_clean)):
    text = row['text']
    myid = row['Id']
    res[myid] = sia.polarity_scores(text)

  0%|          | 0/11648 [00:00<?, ?it/s]

In [74]:
# combine the sentiment score back to the dataframe
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index':'Id'})
vaders = vaders.merge(text_clean, how = 'left')

# now we have sentiment score and metadata
vaders.head()

Unnamed: 0,Id,neg,neu,pos,compound,text
0,0,0.0,1.0,0.0,0.0,shanedaleaz totally the nike uniform since rep...
1,1,0.211,0.657,0.131,-0.0772,hat tom sachs marketing team nike they release...
2,2,0.114,0.482,0.404,0.7003,the look makiracook face face with tears of jo...
3,3,0.0,1.0,0.0,0.0,check new pickup nike snkrs
4,4,0.083,0.623,0.294,0.7269,jameslfreelance jumpman23 nike nikestore oooo ...


In [75]:
# a snippet of how it performs 
pd.set_option('display.max_colwidth', None)
vaders[['compound','text']][50:53]

Unnamed: 0,compound,text
50,-0.7184,need tht nike ski mask shit look fire
51,0.0,williamfeltner9 these come local shop pair nike hasn t ship yet either
52,0.6705,ethalorian nike yeah fine you see stock tech investment look like high last year lol recession hit everyone point


__*Analysis*__

The above three lines of test texts shows that since VADER doesn't take the relationship between words into consideration, in the 50th line that it made the wrong judgement of the sentiment text. 

## 2. Deep Learning Models with Hugging Face
[Reference from HuggingFace](https://huggingface.co/blog/sentiment-analysis-python) <br>
HuggingFace: A hub that provides the collection of pre-trained models, and some are state of art. <br>

### 2.1 twitter-roberta-base-sentiment <br>

[Reference for NLTK's VADER and Hugging Face Transformers using Roberta](https://www.youtube.com/watch?v=QpzMWQvxXWk)

- Bert : Transformer based deep learning models
- [roBERTa-based model](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment?text=I+like+you.+I+love+you): trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. This model is suitable for English.

In [11]:
# import hugging face library Transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax 

In [12]:
# use pre-trained models with the Roberta 
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# run example for VADER moel 
example = 'i need tht nike ski mask shit looks fire'
print(sia.polarity_scores(example))

In [23]:
# run example on roBERTa model 

# return the 0 or 1 tensors that embeding models will understand
encoded_text = tokenizer(example, return_tensors = 'pt') # pt for PyTorch
# the output is a tensor 
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
# apply softmax to turn into 0 to 1 range 
scores = softmax(scores)
scores_dict = {'roberta_neg': scores[0],
                'roberta_neu': scores[1],
                'roberta_pos': scores[2]}

print(scores_dict)

{'roberta_neg': 0.013050234, 'roberta_neu': 0.13767199, 'roberta_pos': 0.8492778}


__*Analysis*__

Yay! roBERTa correctly classified the text of 
> i need tht nike ski mask shit looks fire 

as being positive, much more powerful when taking sementaic meaning into consideration. 

In [14]:
def polarity_scores_roerta( var ):
    encoded_text = tokenizer(var, return_tensors = 'pt') # pt for PyTorch
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {'roberta_neg': scores[0],
                    'roberta_neu': scores[1],
                    'roberta_pos': scores[2]}
    return scores_dict

In [23]:
res = {}
for i, row in tqdm(text_clean.iterrows(), total = len(text_raw_df)):
    try: 
        text = row['text']
        myid = row['Id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
                vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roerta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both 
    except RuntimeError: 
        print(f'Broke for id{myid}')

  0%|          | 0/11648 [00:00<?, ?it/s]

In [24]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index':'Id'})
results_df = results_df.merge(text_raw_df, how = 'left')

In [25]:
results_df.head()

Unnamed: 0,Id,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,text
0,0,0.0,1.0,0.0,0.0,0.029508,0.537855,0.432637,@ShaneDaleAZ Totally. The Nike uniforms since ...
1,1,0.211,0.657,0.131,-0.0772,0.071209,0.369753,0.559038,Hats off to Tom Sachs and the marketing team a...
2,2,0.114,0.482,0.404,0.7003,0.002428,0.077772,0.919799,The look on @makiracook face! üòÇ \n\nThanks @Ni...
3,3,0.0,1.0,0.0,0.0,0.041717,0.864787,0.093497,Check out my new pickup from Nike‚Å† SNKRS: http...
4,4,0.083,0.623,0.294,0.7269,0.001843,0.033002,0.965156,@jameslfreelance @Jumpman23 @Nike @nikestore O...


### 2.2 bertweet-base-sentiment-analysis <br> 
- [Model](https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis?text=I+like+you.+I+love+you) trained with SemEval 2017 corpus (around ~40k tweets). Base model is BERTweet, a RoBERTa model trained on _**English tweets**_.

- Uses POS, NEG, NEU labels.

In [26]:
# use pre-trained models with the Roberta in English 
MODEL = f"finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [27]:
# run example on RoBERTa model 
example = 'i need tht nike ski mask shit looks fire'

# return the 0 or 1 tensors that embeding models will understand
encoded_text = tokenizer(example, return_tensors = 'pt') # pt for PyTorch
# the output is a tensor 
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
# apply softmax to turn into 0 to 1 range 
scores = softmax(scores)
scores_dict = {'roberta_neg': scores[0],
                'roberta_neu': scores[1],
                'roberta_pos': scores[2]}

print(scores_dict)

{'roberta_neg': 0.00656319, 'roberta_neu': 0.17408508, 'roberta_pos': 0.8193517}


### 2.3 Flair 
This is the large 18-class NER(named entity recognition) model for English that ships with Flair.
[Reference](https://huggingface.co/flair/ner-english-ontonotes-large?text=On+September+1st+George+won+1+dollar+while+watching+Game+of+Thrones.)

In [33]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-english-ontonotes-large")

# make example sentence
sentence = Sentence("On September 1st George won 1 dollar while watching Game of Thrones.")

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)


Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2022-10-25 11:13:57,871 loading file C:\Users\hs324\.flair\models\ner-english-ontonotes-large\2da6c2cdd76e59113033adf670340bfd820f0301ae2e39204d67ba2dc276cc28.ec1bdb304b6c66111532c3b1fc6e522460ae73f1901848a4d0362cdf9760edb1


Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2022-10-25 11:14:12,415 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
Sentence: "On September 1st George won 1 dollar while watching Game of Thrones ." ‚Üí ["September 1st"/DATE, "George"/PERSON, "1 dollar"/MONEY, "Game of Thrones"/WORK_OF_ART]
The following NER tags are found:
Span[1:3]: "September 1st" ‚Üí DATE (1.0)
Span[3:4]: "George" ‚Üí PERSON (1.0)
Span[5:7]: "1 dollar" ‚Üí MONEY (1.0)
Span[9:12]: "Game of Thrones" ‚Üí WORK_OF_ART (1.0)


## 3.Compare Model Performance on Benchmark Labeled Dataset

### 3.1 Load benchmark labeled data

In [98]:
# import labeled dataset to test on 
labeled_df = pd.read_csv('c:\\Users\\hs324\\OneDrive\\Desktop\\Class_Files\\06_2022Fall\\04_Practicum\\Quantilope_Core\\data\\twitter_labeled_data.csv') 
labeled_df['Id'] = [num for num in range(0,len(labeled_df))]
labeled_df.rename(columns={'clean_text':'text'},inplace = True)

# have a look at the labeled dataframe
print(labeled_df.shape)
labeled_df.head()

(162980, 3)


Unnamed: 0,text,category,Id
0,when modi promised ‚Äúminimum government maximum...,-1.0,0
1,talk all the nonsense and continue all the dra...,0.0,1
2,what did just say vote for modi welcome bjp t...,1.0,2
3,asking his supporters prefix chowkidar their n...,1.0,3
4,answer who among these the most powerful world...,1.0,4


In [99]:
# implement on the whole text files 
labeled_text_clean_object = labeled_df['text'].apply(lambda text: clean_text_lemma(text)) 

# convert the clean dataset into dataframe 
labeled_text_clean = pd.DataFrame(labeled_text_clean_object)
labeled_text_clean['Id'] = [num for num in range(0,len(labeled_text_clean))]
labeled_text_clean.head()

Unnamed: 0,text,Id
0,modi promise minimum government maximum govern...,0
1,talk nonsense continue drama vote modi,1
2,say vote modi welcome bjp tell rahul main camp...,2
3,ask supporter prefix chowkidar name modi great...,3
4,answer among powerful world leader today trump...,4


In [47]:
# remove infinite and null values 
def remove_inf(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [113]:
# merge the cleaned dataset with the labeled sentiment score
labeled_clean_score =  labeled_text_clean.merge(labeled_df,how = 'left',on = 'Id')\
    .drop(columns={'text_y'}).\
        rename(columns={'text_x':'text'})
labeled_clean_score.head()

Unnamed: 0,text,Id,category
0,modi promise minimum government maximum govern...,0,-1.0
1,talk nonsense continue drama vote modi,1,0.0
2,say vote modi welcome bjp tell rahul main camp...,2,1.0
3,ask supporter prefix chowkidar name modi great...,3,1.0
4,answer among powerful world leader today trump...,4,1.0


### 3.2 Building Up Baseline Model
I'll use NLTK's VADER as the baseline model.

In [106]:
# Machine Learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import learning_curve

# evaluation metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, auc, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix
import scikitplot as skplt

import warnings
warnings.filterwarnings("ignore")

print('‚úîÔ∏è Libraries Imported!')

‚úîÔ∏è Libraries Imported!


In [117]:
# split the dataset into training and testing
y=labeled_clean_score['category']
train_df, test_df = train_test_split(labeled_clean_score, test_size=0.2, random_state=42, stratify = y)

# split the train and set into X_train and y_train sets
X_train = train_df.drop(columns='category')
y_train = train_df['category']

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(f'Train set: {train_df.shape[0]} rows x {train_df.shape[1]} columns')
print(f'Test set: {test_df.shape[0]} rows x {test_df.shape[1]} columns')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [116]:
X_train

Unnamed: 0,text
124424,asia politics indian modi visit hugged whole w...
112795,luxury car own nirav modi auction economic time
39997,legitimate arrest antony make nirav modi vijay...
27583,respect sir sar madam kindly vaddi successfull...
45216,envision india think act two step ahead curren...
...,...
138908,shawl bjp symbol lotus batch like modi time
119194,thing do modi bhakts would waah modi waah mast...
86731,complete analysis pure facts employment growth...
64318,thats india need modi


In [87]:
# run the vader model over training set 
res = {}
for i, row in tqdm(X_train.iterrows(), total = len(X_train)):
    text = row['text']
    myid = row['Id']
    res[myid] = sia.polarity_scores(text)

  0%|          | 0/130378 [00:00<?, ?it/s]

KeyError: 'Id'

In [85]:
res

{11647: {'neg': 0.0, 'neu': 0.698, 'pos': 0.302, 'compound': 0.5994}}

In [84]:
# combine the sentiment score back to the dataframe
vader_labeled = pd.DataFrame(res).T
vader_labeled = vader_labeled.reset_index().rename(columns={'index':'Id'})
# vader_labeled = vaders.merge(text_clean, how = 'left')

# now we have sentiment score and metadata
vader_labeled

Unnamed: 0,Id,compound,neg,neu,pos
0,11647,0.5994,0.0,0.698,0.302


_**Note:**_ <br>
Currently run into some errors and need more work on the performance comparision and building evaluation matrixs.

## 4.Export Sentiment Score

Before coming up with the evaluation for different model performance, I'll use RoBerta model and its output. 

In [124]:
sentiment_score = results_df[['roberta_neg','roberta_neu','roberta_pos','text']]
sentiment_score.to_csv('c:\\Users\\hs324\\OneDrive\\Desktop\\Class_Files\\06_2022Fall\\04_Practicum\\Quantilope_Core\\data\\sentiment_score_text.csv')
sentiment_score

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos,text
0,0.029508,0.537855,0.432637,@ShaneDaleAZ Totally. The Nike uniforms since ...
1,0.071209,0.369753,0.559038,Hats off to Tom Sachs and the marketing team a...
2,0.002428,0.077772,0.919799,The look on @makiracook face! üòÇ \n\nThanks @Ni...
3,0.041717,0.864787,0.093497,Check out my new pickup from Nike‚Å† SNKRS: http...
4,0.001843,0.033002,0.965156,@jameslfreelance @Jumpman23 @Nike @nikestore O...
...,...,...,...,...
11643,0.283802,0.640393,0.075805,"Armpit musty, Reebok crusty https://t.co/KjJ3z..."
11644,0.018777,0.809956,0.171267,@DoubleOhNegatve I believe that was the Reebok...
11645,0.253581,0.710993,0.035426,If you ever find yourself talking shit on will...
11646,0.046945,0.874524,0.078531,"Think the 4,5 shoes gon be Reebok, Doc‚Äôs and s..."
