# Final Project for ISA

In [58]:
# ---------------------------------------------
# Imports
# ---------------------------------------------
from time import sleep
from datetime import datetime
import tweepy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import creds
# CSV Files
import csv
import os
import requests
# Reddit
import praw
from praw.models import MoreComments
import numpy

import warnings
warnings.filterwarnings("ignore")

# config = configparser.RawConfigParser()
# config.read('config.py')
# ---------------------------------------------
# IBM Credentials for emotion
# ---------------------------------------------
ibm_url  = "https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/d6058b89-d39d-464c-a756-50658dd3124b"
ibm_api_key    = creds.IBM_KEY
# print(ibm_api_key)
# test = config.get("default", "IBM_KEY")

# print(test)
# print(config.get("default", "IBM_KEY"))


# ---------------------------------------------
# Twitter API
# ---------------------------------------------
consumer_key = creds.CONSUMER_KEY
consumer_secret = creds.CONSUMER_SECRET
access_token = creds.ACCESS_TOKEN
access_token_secret = creds.ACCESS_TOKEN_SECRET

# Authenticating
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# ---------------------------------------------
# Loading the Positive and Negative Words
# ---------------------------------------------

# Positive Words
file = open('positive-words.txt', 'r')
positive_words = file.read().splitlines() 

# Negative Words
file = open('negative-words.txt', 'r')
negative_words = file.read().splitlines() 


# ---------------------------------------------
# Querying Tweets
# ---------------------------------------------

# Create tracking variables for tweet iteration
tweets = []

# Topic to be searched for
topic = "facebook"

# Get and store the 100 latest tweets
all_tweets = api.search_tweets(q=f"{topic} -filter:retweets", lang="en", count=100)
all_tweets = [tweet.text.lower() for tweet in all_tweets]

# 2-d array storing sentiment and emotion
sentiment = numpy.empty((100, 2), numpy.str)

# Keep track of score of sentiment for source
num_pos_tweets = 0
num_neg_tweets = 0
num_neutral_tweets = 0
count = 0
emotionScore = {
    'sadness':0,
    'joy':0,
    'fear':0,
    'disgust':0,
    'anger':0
}

# Iterate through last 100 tweets
for tweet in all_tweets:
    key_values = {'version': '2021-08-01', 'text': tweet, 'features':'sentiment,emotion'}
    response = requests.get(ibm_url+"/v1/analyze", key_values, auth = ('apikey', ibm_api_key))
    #print(response.json())
    #  Make sure that tweet is able to be analyzed
    if response.json()["language"] == "en":
        # see if sentiment is positive or negative
        if response.json()["sentiment"]["document"]["label"] == "positive":
            num_pos_tweets += 1
            sentiment[count][0] = "positive"
        else:
            num_neg_tweets += 1
            sentiment[count][0] = "negative"

        # determine strongest emotion and store it
        sadness = response.json()["emotion"]["document"]["emotion"]["sadness"]
        joy = response.json()["emotion"]["document"]["emotion"]["joy"]
        fear = response.json()["emotion"]["document"]["emotion"]["joy"]
        disgust = response.json()["emotion"]["document"]["emotion"]["disgust"]
        anger = response.json()["emotion"]["document"]["emotion"]["anger"]
        emotionMax = max(sadness, joy, fear, disgust, anger)
        if emotionMax == sadness:
            emotionScore['sadness'] = emotionScore['sadness'] + 1
            sentiment[count][1] = 'sadness'
        elif emotionMax == joy:
            emotionScore['joy'] = emotionScore['joy'] + 1
            sentiment[count][1] = 'joy'
        elif emotionMax == fear:
            emotionScore['fear'] = emotionScore['fear'] + 1
            sentiment[count][1] = 'fear'
        elif emotionMax == disgust:
            emotionScore['disgust'] = emotionScore['disgust'] + 1
            sentiment[count][1] = 'disgust'
        else:
            emotionScore['anger'] = emotionScore['anger'] + 1
            sentiment[count][1] = 'anger'

        count += 1

# Once we are done analyzing the tweets, print overall score
print(f"{'Tweet Analysis:':<15} Score: {num_pos_tweets-num_neg_tweets:<4} Positive Tweets: {num_pos_tweets:<4} Negative Tweets: {num_neg_tweets:<4} Total Tweets: {count}")
print(emotionScore)


Tweet Analysis: Score: -57  Positive Tweets: 21   Negative Tweets: 78   Total Tweets: 99
{'sadness': 15, 'joy': 79, 'fear': 0, 'disgust': 3, 'anger': 2}


In [19]:
# ---------------------------------------------
# Querying Reddit
# ---------------------------------------------

# Reddit API Key Data
my_client_id = "hO52jacJFobA_5P5KyIw8A"
my_client_secret = "KHTg5c1aqtih_tj2qBjUdRM0GV9qiQ"
my_user_agent = "Scraping ISA414"

# CNBC Article
cnbcArticle = "qht57x"
# CNN Article
cnnArticle = "quv6cr"
# Verge Article
vergeArticle = "qjbmtn"
# IGN  Article
ignArticle = "qmij5v"
# Business Insider Article
biArticle = "qhweln"
# Associated Press Article
apArticle = "qla1aw"
# NBC News Article
nbcArticle = "qht8mn"


reddit = praw.Reddit(
    client_id = my_client_id,
    client_secret = my_client_secret,
    user_agent = my_user_agent,
)

# Define the reddit scraping method
def redditScraping(redditAPI, url):

    # Variables
    count = 0
    topcomments = []
    submission = reddit.submission(url)
    submission.comments.replace_more(limit=0)
    # Loop through the top comments and grab the text
    for top_level_comment in submission.comments:
        topcomments.append(top_level_comment.body)
    
    return topcomments


# Sources
companies = ["CNBC", "CNN", "The Verge", "IGN", "Business Insider", "Associated Press", "NBC News"]

sources_dict = dict.fromkeys(companies, [])


# Save the comments  
sources_dict["CNBC"].append(redditScraping(reddit, cnbcArticle))
sources_dict["CNN"].append(redditScraping(reddit, cnnArticle))
sources_dict["The Verge"].append(redditScraping(reddit, vergeArticle))
sources_dict["IGN"].append(redditScraping(reddit, ignArticle))
sources_dict["Business Insider"].append(redditScraping(reddit, biArticle))
sources_dict["Associated Press"].append(redditScraping(reddit, apArticle))
sources_dict["NBC News"].append(redditScraping(reddit, nbcArticle))

# Print the Dict
print("Data Stored!")
print(len(sources_dict["CNN"][0][0]))




# ---------------------------------------------
# Reddit Sentiment & Emotion
# ---------------------------------------------

# Keep track of score of sentiment for source
num_pos_reddit = 0
num_neg_reddit = 0
num_neutral_reddit = 0
count = 0
ibmpos = 0
ibmneg = 0
emotionScore = {
    'sadness':0,
    'joy':0,
    'fear':0,
    'disgust':0,
    'anger':0
}

# Iterate through the reddit data

# for source in sources_dict:
#     for data in source:
#         key_values = {'version': '2021-08-01', 'text': data, 'features':'sentiment,emotion'}
#         response = requests.get(ibm_url+"/v1/analyze", key_values, auth = ('apikey', ibm_api_key))
        
#         # make sure that tweet is analyzable
#         if response.json()["language"] == "en":
#             # see if sentiment is positive or negative
#             if response.json()["sentiment"]["document"]["label"] == "positive":
#                 num_pos_reddit += 1
#             else:
#                 num_neg_reddit += 1

#             # determine strongest emotion
#             sadness = response.json()["emotion"]["document"]["emotion"]["sadness"]
#             joy = response.json()["emotion"]["document"]["emotion"]["joy"]
#             fear = response.json()["emotion"]["document"]["emotion"]["joy"]
#             disgust = response.json()["emotion"]["document"]["emotion"]["disgust"]
#             anger = response.json()["emotion"]["document"]["emotion"]["anger"]
#             emotionMax = max(sadness, joy, fear, disgust, anger)
#             if emotionMax == sadness:
#                 emotionScore['sadness'] = emotionScore['sadness'] + 1
#             elif emotionMax == joy:
#                 emotionScore['joy'] = emotionScore['joy'] + 1
#             elif emotionMax == fear:
#                 emotionScore['fear'] = emotionScore['fear'] + 1
#             elif emotionMax == disgust:
#                 emotionScore['disgust'] = emotionScore['disgust'] + 1
#             else:
#                 emotionScore['anger'] = emotionScore['anger'] + 1

#             count += 1

# # Print the results
# print(f"{'Tweet Analysis:':<15} Score: {num_pos_reddit-num_neg_reddit:<4} Positive Tweets: {num_pos_reddit:<4} Negative Tweets: {num_neg_reddit:<4} Total Tweets: {count}")
# print(emotionScore)



Version 7.5.0 of praw is outdated. Version 7.6.0 was released Tuesday May 10, 2022.


KeyboardInterrupt: 

In [14]:
# ---------------------------------------------
# Outputting Data
# ---------------------------------------------

# --------------------------------------------- Twitter

output = ["Test1", "Test2"]
# File name for the data
twitterFile = 'twitter_data.csv'

# Delete current CSV file, if it exists
if(os.path.exists(twitterFile) and os.path.isfile(twitterFile)):
  os.remove(twitterFile)

# Writing to csv File
header = ["Tweet", "Sentiment", "Emotion"]

# Open the CSV to write to
with open(twitterFile, 'w', encoding='UTF8', newline='') as f:
    # Create the csv writer
    writer = csv.writer(f)

    # Create the header
    writer.writerow(header)

    # Write the data
    for i in range(0, len(all_tweets)):
      writer.writerow([all_tweets[i], sentiment[i][0], sentiment[i][1]])



# --------------------------------------------- Reddit

# File name for the data
redditFile = 'reddit_data.csv'

# Delete current CSV file, if it exists
if(os.path.exists(redditFile) and os.path.isfile(redditFile)):
  os.remove(redditFile)


# Writing to CSV file
header = ["News Source", "Tweet"]

# Open the CSV to write to
with open(redditFile, 'w', encoding='UTF8', newline='') as f:
    # Create the csv writer
    writer = csv.writer(f)

    # Create the header
    writer.writerow(header)

    # Write the data
    for key, val in sources_dict.items():
      for i in val: 
        writer.writerow([key, i])

In [15]:
# ---------------------------------------------
# Analyzing Data
# ---------------------------------------------

