In [94]:
import praw
import datetime
import json
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nltk.download('vader_lexicon')
import string
import pandas as pd
import collections
from pprint import pprint
from nltk.tokenize import word_tokenize, RegexpTokenizer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bruce/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
file = open('reddit_info.json')
info = json.load(file)

In [3]:
#Basically logs into the bot? I think
def create_reddit_object():
    reddit = praw.Reddit(client_id = info['client_id'],
                        client_secret = info['client_secret'],
                        user_agent = info['user_agent'],
                        username = info['username'],
                        password = info['password'])
    return reddit

In [59]:
def compile_popular():
    headlines = set()

    reddit = create_reddit_object()

    popular = reddit.subreddit("popular")
    popular_year = popular.top("year", limit=10)

    for submission in popular_year:
        headlines.add(submission.title)

    return headlines


In [60]:
def compile_news():
    headlines = set()

    # reddit = create_reddit_object()

    news = reddit.subreddit("science")
    news_year = news.top("year", limit=10)

    for submission in news_year:
        headlines.add(submission.title)

    return headlines


In [61]:
print(compile_popular())
print()
print(compile_news())

{'Please make this go viral. I am begging you. Police and National Guard patrolling neighborhood and shooting civilians on their own property. Make America see this, I beg you. [Minneapolis]', 'I’ve found a few funny memories during lockdown. This is from my 1st tour in 89, backstage in Vegas.', 'If this is you: Fuck you', 'Leaked Drone footage of shackled and blindfolded Uighur Muslims led from trains. As a German this is especially chilling.', 'Meet the newest member of the family, Dutch!', 'At a protest in Arizona', 'She did her best ok?', 'A short story', 'Joe Biden elected president of the United States', '"Everybody\'s trying to shame us"'}

{'LED lights found to kill coronavirus efficiently, quickly, and cheaply, a global first in fight against COVID-19. The finding suggests the UV-LEDs can be installed in air conditioning and water systems. It requires less than half a minute to destroy more than 99.9% of coronaviruses.', 'The first severe COVID-19 patient successfully treated 

In [71]:
def compile_subreddits(subreddits):
    headlines = set()

    reddit = create_reddit_object()

    for i in range(len(subreddits)):
        subreddit = reddit.subreddit(subreddits[i])
        top_headlines_in_year = subreddit.top("year", limit=None)

        for submission in top_headlines_in_year:
            headlines.add(submission.title)
    
    return headlines


# print(compile_subreddits(["popular", "news", "all", "politics", "worldnews"]))




In [87]:
def sentiment_analyzer(headlines):
    sia = SIA()
    results = []

    for line in headlines:
        pol_score = sia.polarity_scores(line)
        pol_score['headline'] = line
        results.append(pol_score)

    df = pd.DataFrame.from_records(results)

    return df
    # pprint(results[:3], width=100)

In [82]:
# all_headlines = compile_subreddits(["popular", "news", "all", "politics", "worldnews"])


In [95]:
df = sentiment_analyzer(all_headlines)
tokenizer = RegexpTokenizer(r'\w+')

df['label'] = 0
df.loc[df['compound'] > 0.2, 'label'] = 1
df.loc[df['compound'] < -0.2, 'label'] = -1
df.head()

df2 = df[['headline', 'label']]

print("Positive headlines:\n")
pprint(list(df[df['label'] == 1].headline)[:5], width=200)

print("\nNegative headlines:\n")
pprint(list(df[df['label'] == -1].headline)[:5], width=200)

print(df.label.value_counts())

print(df.label.value_counts(normalize=True) * 100)

Positive headlines:

['It was fun doe',
 'LAPD shoots “less than lethal” rounds directly at an unarmed homeless man who was not protesting.',
 'The sun shining through my fish tank aligned perfectly on each knob',
 'Decorated Afghan pilot who protected US airmen in hiding after Pentagon reverses approval to come to US.',
 'Bollywood Actors Called Out For Supporting Black Lives Matter Movement While Promoting Skin Lightening Products']

Negative headlines:

['Prince Andrew has given ‘zero cooperation’ in Jeffrey Epstein sex crime investigation, federal prosecutors say',
 'Cincinnati church wipes out $46.5 million in medical debt for 45,000 families',
 'WHO sounds alarm as coronavirus cases rise by one million in five days',
 'Alabama blocked a man from voting because he owed $4',
 "Australia to pass surveillance bill that'll allow children as young as 14 to be interrogated by government agents and could see journalists jailed for 5 years for refusing to reveal sources. "
 'Authorities c

In [96]:
def process_text(headlines):
    tokens = []
    for line in headlines:
        toks = tokenizer.tokenize(line)
        toks = [t.lower() for t in toks if t.lower() not in stopwords.words('english')]
        tokens.extend(toks)
    
    return tokens

In [102]:
df[df.label==1].head()

# pos_lines = list(df[df.label == 1].headline)

# pos_tokens = process_text(pos_lines)
# pos_freq = nltk.FreqDist(pos_tokens)

# pos_freq.most_common(100)

Unnamed: 0,neg,neu,pos,compound,headline,label
2,0.0,0.476,0.524,0.5106,It was fun doe,1
13,0.0,0.865,0.135,0.3252,LAPD shoots “less than lethal” rounds directly...,1
20,0.0,0.724,0.276,0.6369,The sun shining through my fish tank aligned p...,1
32,0.099,0.631,0.27,0.5859,Decorated Afghan pilot who protected US airmen...,1
41,0.0,0.649,0.351,0.6705,Bollywood Actors Called Out For Supporting Bla...,1


In [100]:
neg_lines = list(df2[df2.label == -1].headline)

neg_tokens = process_text(neg_lines)
neg_freq = nltk.FreqDist(neg_tokens)

neg_freq.most_common(100)

[('trump', 333),
 ('police', 153),
 ('says', 147),
 ('coronavirus', 134),
 ('covid', 113),
 ('us', 92),
 ('19', 88),
 ('people', 75),
 ('man', 66),
 ('protesters', 63),
 ('president', 59),
 ('death', 59),
 ('u', 58),
 ('black', 56),
 ('white', 53),
 ('arrested', 49),
 ('charged', 49),
 ('new', 49),
 ('one', 47),
 ('world', 45),
 ('election', 43),
 ('china', 43),
 ('say', 41),
 ('house', 41),
 ('years', 40),
 ('officer', 38),
 ('biden', 38),
 ('000', 37),
 ('killed', 37),
 ('year', 37),
 ('protests', 37),
 ('officers', 36),
 ('dead', 35),
 ('donald', 35),
 ('woman', 35),
 ('two', 34),
 ('fired', 34),
 ('stop', 34),
 ('calls', 33),
 ('said', 33),
 ('video', 33),
 ('first', 31),
 ('protest', 31),
 ('died', 31),
 ('floyd', 30),
 ('state', 29),
 ('ban', 29),
 ('pandemic', 29),
 ('family', 28),
 ('old', 28),
 ('anti', 28),
 ('news', 27),
 ('war', 27),
 ('george', 27),
 ('health', 27),
 ('could', 26),
 ('day', 26),
 ('fraud', 26),
 ('law', 26),
 ('would', 26),
 ('prison', 26),
 ('government',

In [11]:

''' Remove punctuation and digits from a string. '''
def _process_string(s):
    no_digits = []

    # Convert string to lowercase, remove punctuation.
    raw_string = s.lower()
    clean_string = raw_string.translate(str.maketrans('','', string.punctuation))

    # Remove all digits from string.
    for letter in clean_string:
        if not letter.isdigit():
            no_digits.append(letter)

    # Create the final string.
    result = ''.join(no_digits)

    return result

In [32]:
# /r/popular, /r/news, /r/all, /r/science, /r/politics, /r/worldnews
def find_common_words():

    reddit = create_reddit_object()


    #chooses subreddit
    subreddit = reddit.subreddit("popular")
    hot = subreddit.top("year", limit = None)

    headlines = set()

        

#     wordcounts = {}

#     for submission in hot:
#         for word in _process_string(submission.title).split():
#             if word not in stopwords.words('english') and len(word) > 2:
#                 if word not in wordcounts:
#                     wordcounts[word] = 1
#                 else:
#                     wordcounts[word] += 1
#     return wordcounts

# word_counter = collections.Counter(find_common_words())
# for word, count in word_counter.most_common(500):
#     print(word, ":", count)

find_common_words()


988
