In [48]:
# Import necessary libraries
import nltk
import pandas as pd
from collections import Counter
import unrar

In [49]:
# Load the csv file
df = pd.read_csv('DailyComments.csv')
df

Unnamed: 0,Day of Week,comments
0,Monday,"Hello, how are you?"
1,Tuesday,Today is a good day!
2,Wednesday,It's my birthday so it's a really special day!
3,Thursday,Today is neither a good day or a bad day!
4,Friday,I'm having a bad day.
5,Saturday,There' s nothing special happening today.
6,Sunday,Today is a SUPER good day!


***Looking at the comments, I've decided to use sentiment analysis, following the example of genderization in chapter 1, 
to determine whether each comment is positive or negative and return positive/negative percentages for the corpus as a 
whole.***

In [50]:
# To make this easy, I will concatenate all the strings into one text
text = df['comments'].str.cat(sep=' ')
text

"Hello, how are you? Today is a good day! It's my birthday so it's a really special day! Today is neither a good day or a bad day! I'm having a bad day. There' s nothing special happening today. Today is a SUPER good day!"

In [19]:
# I want to use the Hu and Lui Opinion Lexicon list of positive and negative words for my analysis, so I have to download it
!pip install pyunpack
!pip install patool
from pyunpack import Archive
Archive("opinion-lexicon-English.rar").extractall('C:/Users/myraw/Jupyter/DSC550')

Collecting patool
  Downloading patool-1.12-py2.py3-none-any.whl (77 kB)
Installing collected packages: patool
Successfully installed patool-1.12


In [51]:
# Read in the positive words file
f = open("positive-words.txt", "r")
pos_text = f.read()

In [52]:
# Split the positive words into a list
pos_list = pos_text.split()
pos_list

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation',
 'accolade',
 'accolades',
 'accommodative',
 'accomodative',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accomplishments',
 'accurate',
 'accurately',
 'achievable',
 'achievement',
 'achievements',
 'achievible',
 'acumen',
 'adaptable',
 'adaptive',
 'adequate',
 'adjustable',
 'admirable',
 'admirably',
 'admiration',
 'admire',
 'admirer',
 'admiring',
 'admiringly',
 'adorable',
 'adore',
 'adored',
 'adorer',
 'adoring',
 'adoringly',
 'adroit',
 'adroitly',
 'adulate',
 'adulation',
 'adulatory',
 'advanced',
 'advantage',
 'advantageous',
 'advantageously',
 'advantages',
 'adventuresome',
 'adventurous',
 'advocate',
 'advocated',
 'advocates',
 'affability',
 'affable',
 'affably',
 'affectation',
 'affection',
 'affectionate',
 'affinity',
 'affirm',
 'affirmation',
 'affirmative',
 'affluence',
 'affluent',
 'afford',
 'affordable',
 'af

In [53]:
# Convert the list to a set
pos_words = set(pos_list)

In [54]:
# Now I will do the same for the list of negative words
d = open("negative-words.txt", "r")
neg_text = d.read()
neg_list = neg_text.split()
neg_words = set(neg_list)

In [55]:
# Create a function that will identify positive and negative words in text
POSITIVE = 'positive'
NEGATIVE = 'negative'
UNKNOWN = 'unknown'
BOTH = 'both'

def wordSentiment(words):
    poslen = len(pos_words.intersection(words))
    neglen = len(neg_words.intersection(words))
    
    if poslen > 0 and neglen == 0:
        return POSITIVE
    elif poslen == 0 and neglen > 0:
        return NEGATIVE
    elif poslen > 0 and neglen > 0:
        return BOTH
    else:
        return UNKNOWN

In [56]:
# Create a function that will count the frequency of positive/negative words and sentences within a corpus

def countSentiment(sentences):
    sents = Counter()
    words = Counter()
    
    for sentence in sentences:
        sentiment = wordSentiment(sentence)
        sents[sentiment] += 1
        words[sentiment] += len(sentence)
        
    return sents, words

In [57]:
# Create a function that will parse/tokenize the text then pass the tokenized text to the sentiment counter

def parseSentiment(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    
    sents, words = countSentiment(sentences)
    total = sum(words.values())
    
    for sentiment, count in words.items():
        pcent = (count/total) * 100
        nsents = sents[sentiment]
        
        print("{}% {} ({} sentences)".format(pcent, sentiment, nsents))

In [58]:
# Run sentiment analysis on the text
parseSentiment(text)

45.614035087719294% unknown (3 sentences)
22.807017543859647% positive (2 sentences)
19.298245614035086% both (1 sentences)
12.280701754385964% negative (1 sentences)


***So the results are actually pretty accurate.
If I manually read them, my results would be: 2 unks, 3 pos, 1 both, 1 neg. 
Therefore using Hu & Lui Opionion Lexicon was only one off. 
If I were to make a manual list of positive/negative words based off the words in the text, I would have as follows:
positive words = special, super,  & good.
negative words = bad.
This manual method, would have returned the result: 1 unk, 4 pos, 1 both, 1 neg. Also, just one off, but with a false positive versus a positive marked as an unknown, which is less accurate.***

***Extra Credit***

In [59]:
# Load in my keys from a json file
import json

with open('twitter_keys.json') as f:
    keys = json.load(f)
    consumer_key = keys['consumer_key']
    consumer_secret = keys['consumer_secret']
    access_token_key = keys['access_token_key']
    access_token_secret = keys['access_token_secret']

In [60]:
# import twitter library using my keys & token
import twitter
api = twitter.Api(consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token_key=access_token_key,
    access_token_secret=access_token_secret)

In [83]:
# Conduct a search for latest 10 tweets that contained the term "Bellevue University", since Dec 1, 2020.
tweets = api.GetSearch(term='Bellevue University', since=2020-12-1, count=10)
tweets

[Status(ID=1345694201648996357, ScreenName=wmbua, Created=Sun Jan 03 11:31:14 +0000 2021, Text='@Ma3Route @KeNHAKenya @eonditi Along olesereni it will pass under along bellevue it will be elevated all the way to university rounderbout'),
 Status(ID=1345584850816872449, ScreenName=69NiceBot69, Created=Sun Jan 03 04:16:42 +0000 2021, Text="RT @viterbovhawks: Men's Basketball - Lost 84-69 to Bellevue University"),
 Status(ID=1345584243666264064, ScreenName=viterbovhawks, Created=Sun Jan 03 04:14:18 +0000 2021, Text="Men's Basketball - Lost 84-69 to Bellevue University"),
 Status(ID=1345584207591075842, ScreenName=viterbovhawks, Created=Sun Jan 03 04:14:09 +0000 2021, Text="Women's Basketball - Lost 72-64 to Bellevue University"),
 Status(ID=1345218883121262594, ScreenName=ViterboWBB, Created=Sat Jan 02 04:02:29 +0000 2021, Text='RT @LaCrosseTribune: The V-Hawks’ men’s and women’s teams open NSAA play Saturday at Bellevue University (Neb.), the teams’ first action si…'),
 Status(ID=1345161

In [81]:
# Pull the text element from the tweets
text_tweets = [[tweet.text] for tweet in tweets]
text_tweets

[['@Ma3Route @KeNHAKenya @eonditi Along olesereni it will pass under along bellevue it will be elevated all the way to university rounderbout'],
 ["RT @viterbovhawks: Men's Basketball - Lost 84-69 to Bellevue University"],
 ["Men's Basketball - Lost 84-69 to Bellevue University"],
 ["Women's Basketball - Lost 72-64 to Bellevue University"],
 ['RT @LaCrosseTribune: The V-Hawks’ men’s and women’s teams open NSAA play Saturday at Bellevue University (Neb.), the teams’ first action si…'],
 ['The V-Hawks’ men’s and women’s teams open NSAA play Saturday at Bellevue University (Neb.), the teams’ first action… https://t.co/NmKPGsRpkc'],
 ['Every Anime Girl went to Bellevue University'],
 ["RT @BellevueU: #HappyNewYear! Here's hoping that 2021 is as wholesome as 2016, the year that Bellevue University students absolutely owned…"],
 ["#HappyNewYear! Here's hoping that 2021 is as wholesome as 2016, the year that Bellevue University students absolute… https://t.co/aSHZJhCttx"],
 ['University Colle

In [89]:
# Convert my list of comments into a dataframe
t = pd.DataFrame(text_tweets, columns=['comments'])
t

Unnamed: 0,comments
0,@Ma3Route @KeNHAKenya @eonditi Along olesereni...
1,RT @viterbovhawks: Men's Basketball - Lost 84-...
2,Men's Basketball - Lost 84-69 to Bellevue Univ...
3,Women's Basketball - Lost 72-64 to Bellevue Un...
4,RT @LaCrosseTribune: The V-Hawks’ men’s and wo...
5,The V-Hawks’ men’s and women’s teams open NSAA...
6,Every Anime Girl went to Bellevue University
7,RT @BellevueU: #HappyNewYear! Here's hoping th...
8,#HappyNewYear! Here's hoping that 2021 is as w...
9,"University College London\nSchool of Science, ..."


In [90]:
# concatenate all the strings into one text
comments = t['comments'].str.cat(sep=' ')
comments

"@Ma3Route @KeNHAKenya @eonditi Along olesereni it will pass under along bellevue it will be elevated all the way to university rounderbout RT @viterbovhawks: Men's Basketball - Lost 84-69 to Bellevue University Men's Basketball - Lost 84-69 to Bellevue University Women's Basketball - Lost 72-64 to Bellevue University RT @LaCrosseTribune: The V-Hawks’ men’s and women’s teams open NSAA play Saturday at Bellevue University (Neb.), the teams’ first action si… The V-Hawks’ men’s and women’s teams open NSAA play Saturday at Bellevue University (Neb.), the teams’ first action… https://t.co/NmKPGsRpkc Every Anime Girl went to Bellevue University RT @BellevueU: #HappyNewYear! Here's hoping that 2021 is as wholesome as 2016, the year that Bellevue University students absolutely owned… #HappyNewYear! Here's hoping that 2021 is as wholesome as 2016, the year that Bellevue University students absolute… https://t.co/aSHZJhCttx University College London\nSchool of Science, University of Buenos Aires

In [91]:
# Run the sentiment parser on the comments
parseSentiment(comments)

40.4040404040404% negative (1 sentences)
26.767676767676768% unknown (2 sentences)
32.82828282828283% positive (2 sentences)


***Seems suspect. I'm not a twitter user and I would have to look more into how to parse out comments that are @ an account, so that they don't show up multiple times. Also, how to parse out hashtags. I'm not sure it that effected the results, but I did pull 10 comments and the results only show 5 sentences, but I see there are duplicate comments. But, this was my best shot without investing a significant amount of time into it.