In [7]:
import os
import pandas as pd
from pathlib import Path
import requests
import json
from newsapi import NewsApiClient
#from theguardian import theguardian_content
from dotenv import load_dotenv
load_dotenv()
from collections import Counter
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from pmaw import PushshiftAPI
import praw

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# NewsAPI key read from the environment (load_dotenv() runs in the imports cell);
# os.getenv returns None when NEWSAPI_KEY is not set.
api_key = os.getenv("NEWSAPI_KEY")

In [3]:
# Shared NewsAPI client; get_news() below calls it.
newsapi = NewsApiClient(api_key=api_key)

In [4]:
def get_news(keywords):
    """Fetch English-language articles about ``keywords`` from NewsAPI.

    Args:
        keywords: Search expression passed to NewsAPI as the ``q`` parameter.

    Returns:
        Whatever ``newsapi.get_everything`` returns, unchanged.
    """
    # NOTE(review): page=5 requests the fifth page of results rather than the
    # first -- confirm this is intentional.
    search_options = {
        'q': keywords,
        'language': 'en',
        'sort_by': 'relevancy',
        'page': 5,
    }
    return newsapi.get_everything(**search_options)

In [5]:
#Creates dataframe of the articles chosen
def form_df(keywords):
    """Build a DataFrame from the articles get_news() returns for ``keywords``.

    Args:
        keywords: Search expression forwarded to get_news().

    Returns:
        pandas.DataFrame with the columns ``title``, ``description``, ``text``,
        ``date`` (first 10 characters of ``publishedAt``) and ``language``.
        The columns are present even when no article is returned.
    """
    columns = ['title', 'description', 'text', 'date', 'language']
    news = get_news(keywords)['articles']

    articles = []
    for article in news:
        try:
            articles.append({
                'title': article['title'],
                'description': article['description'],
                'text': article['content'],
                # Keep only the leading YYYY-MM-DD part of the timestamp.
                'date': article['publishedAt'][:10],
                'language': 'en',
            })
        except (KeyError, TypeError):
            # A record with a missing field raises KeyError and a null
            # publishedAt raises TypeError when sliced; skip such records
            # instead of aborting the whole frame.
            continue

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(articles, columns=columns)

In [6]:
#newsapi.get_everything(q = 'Microsoft', language='en')

In [7]:
# Smoke test: run get_news on a sample query and display the raw response.
get_news("Microsoft")

{'status': 'ok',
 'totalResults': 15154,
 'articles': [{'source': {'id': None, 'name': 'Android Central'},
   'author': 'andrew.myrick@futurenet.com (Andrew Myrick)',
   'title': 'Android apps for Windows get their most significant update yet',
   'description': 'A new update to the Windows 11 Dev Channel brings an update to the Windows Subsystem for Android.',
   'url': 'https://www.androidcentral.com/apps-software/windows-11-android-12l-apps',
   'urlToImage': 'https://cdn.mos.cms.futurecdn.net/thimyQpk6bpV82doUaSJ4T-1200-80.jpg',
   'publishedAt': '2022-05-23T14:38:22Z',
   'content': '<ul><li>Microsoft has updated the Windows Subsystem for Android to Android 12L.\xa0</li><li>When the Subsystem was originally launched, it relied on Android 11.\xa0</li><li>The update is currently availabl… [+3377 chars]'},
  {'source': {'id': None, 'name': 'Digital Trends'},
   'author': 'Fionna Agomuoh',
   'title': 'Microsoft has finally brought Teams to its own app store',
   'description': 'Micro

In [8]:
# Inspect only the list stored under the 'articles' key of the response.
get_news("Microsoft")['articles']

[{'source': {'id': None, 'name': 'Android Central'},
  'author': 'andrew.myrick@futurenet.com (Andrew Myrick)',
  'title': 'Android apps for Windows get their most significant update yet',
  'description': 'A new update to the Windows 11 Dev Channel brings an update to the Windows Subsystem for Android.',
  'url': 'https://www.androidcentral.com/apps-software/windows-11-android-12l-apps',
  'urlToImage': 'https://cdn.mos.cms.futurecdn.net/thimyQpk6bpV82doUaSJ4T-1200-80.jpg',
  'publishedAt': '2022-05-23T14:38:22Z',
  'content': '<ul><li>Microsoft has updated the Windows Subsystem for Android to Android 12L.\xa0</li><li>When the Subsystem was originally launched, it relied on Android 11.\xa0</li><li>The update is currently availabl… [+3377 chars]'},
 {'source': {'id': None, 'name': 'Digital Trends'},
  'author': 'Fionna Agomuoh',
  'title': 'Microsoft has finally brought Teams to its own app store',
  'description': 'Microsoft Teams is now available for download in the Microsoft Store a

In [9]:
# Smoke test: build the article DataFrame for a sample query and display it.
form_df("Microsoft")

Unnamed: 0,title,description,text,date,language
0,Android apps for Windows get their most signif...,A new update to the Windows 11 Dev Channel bri...,<ul><li>Microsoft has updated the Windows Subs...,2022-05-23,en
1,Microsoft has finally brought Teams to its own...,Microsoft Teams is now available for download ...,Microsoft has now brought the Microsoft Teams ...,2022-05-17,en
2,JPM says Meta will become Broadcom's next bill...,"Facebook parent Meta Platforms Inc <a href=""ht...",May 31 (Reuters) - Facebook parent Meta Platfo...,2022-05-31,en
3,Xbox plans to launch a streaming puck and Sams...,Xbox cloud streaming is going to grow beyond t...,Did you miss a session from GamesBeat Summit 2...,2022-05-06,en
4,"This Week In Security: Follina, Open Redirect ...","Depending on who you ask, there’s either 2 vul...","Depending on who you ask, there’s either 2 vul...",2022-06-03,en
5,This is What It's Actually Like to Work a 4-Da...,We talked to founders of three companies that ...,Image Credit: Nicolás Ortega\r\nAt first it so...,2022-05-11,en
6,"Fuck yeah, passwordless logins! Google, Apple,...",The FIDO Alliance has been at the forefront of...,The FIDO Alliance has been at the forefront of...,2022-05-06,en
7,The metaverse could lead to an urban exodus,"Several companies, including Apple and Microso...","Several companies, including Apple and Microso...",2022-05-24,en
8,Microsoft Edge just got a new way to protect y...,Microsoft has officially announced a new way f...,Microsoft Edge just got even more secure. Afte...,2022-05-13,en
9,Microsoft has new tools to encourage the trans...,"At Microsoft Build 2022, Microsoft detailed Pr...",The transition to ARM chips on Windows has bee...,2022-05-24,en


In [10]:
# Same helper with a compound query string.
form_df('BlackRock AND BLK')

In [11]:
# Module-level WordNet lemmatizer instance used by process_text().
lemmatizer = WordNetLemmatizer()

In [12]:
def process_text(article):
    """Turn raw text into a list of lower-cased, lemmatized, non-stop-word tokens.

    Args:
        article: The text to process (str).

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    sw = set(stopwords.words('english'))
    # Strip everything except ASCII letters and whitespace. Keeping *all*
    # whitespace (not only spaces) stops words separated by a newline or tab
    # from being fused into one token.
    regex = re.compile(r"[^a-zA-Z\s]")
    re_clean = regex.sub('', article)
    # Lower-case first: the WordNet lemmatizer does not normalise case, so
    # capitalised forms would otherwise be left un-lemmatized.
    words = [word.lower() for word in word_tokenize(re_clean)]
    # Drop stop words before lemmatizing: the default (noun) lemmatizer can
    # rewrite a stop word (e.g. "was" -> "wa") into a form the stop-word list
    # no longer matches.
    return [lemmatizer.lemmatize(word) for word in words if word not in sw]

In [16]:
# Pushshift client (pmaw) used below to search Reddit comments.
api = PushshiftAPI()

In [17]:
import datetime as dt

# Search window as Unix timestamps. The datetimes are pinned to UTC because a
# naive datetime's .timestamp() is interpreted in the local timezone, which
# would shift the window depending on the machine running the notebook.
before = int(dt.datetime(2022, 2, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
after = int(dt.datetime(2021, 12, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())

In [18]:
# Retrieve up to 10,000 comments from the StockMarket subreddit, bounded by the
# `after` and `before` timestamps.
comments = api.search_comments(subreddit= 'StockMarket', limit = 10000, before=before, after=after)

In [None]:
comments_df = pd.DataFrame(comments)
# A bare `comments_df.dropna()` used to sit here; its result was discarded, so
# it never changed the frame and has been removed.
# NOTE(review): if null filtering is wanted, assign the result and restrict it
# with `subset=[...]` -- several columns appear to be mostly empty, so dropping
# rows with *any* missing value may remove nearly everything.
comments_df.head()


Unnamed: 0,all_awardings,archived,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,subreddit_id,subreddit_name_prefixed,subreddit_type,top_awarded_type,total_awards_received,treatment_tags,unrepliable_reason,awarders,retrieved_on,author_cakeday
0,[],False,,Mentalilnes,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
1,[],False,,aurora4000,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
2,[],False,,3p1cBm4n9669,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
3,[],False,,Banabak,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
4,[],False,,Sergent1969,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,


In [None]:
# Reddit API credentials are read from the environment (loaded from .env by
# load_dotenv() in the imports cell) instead of being hard-coded in the notebook.
# Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running this cell.
# NOTE(review): the id/secret that were previously committed here should be
# treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent='Trade app bu u/SignificantRange8761',
)

In [None]:
def get_posts(sub):
    """Collect the "hot" listing of subreddit ``sub`` into a DataFrame.

    Args:
        sub: Subreddit name passed to ``reddit.subreddit``.

    Returns:
        pandas.DataFrame with one row per submission (at most 1000 are
        requested) and the columns title, score, id, subreddit, url,
        num_comments, body (the submission's selftext) and created.
    """
    column_names = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created']
    rows = [
        [
            submission.title,
            submission.score,
            submission.id,
            submission.subreddit,
            submission.url,
            submission.num_comments,
            submission.selftext,
            submission.created,
        ]
        for submission in reddit.subreddit(sub).hot(limit=1000)
    ]
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Hot posts from the StockMarket subreddit as a DataFrame.
stock_market = get_posts('StockMarket')

In [None]:
# Preview the first rows of the collected posts.
stock_market.head()

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,Interesting.,236,v4o24m,StockMarket,https://i.redd.it/jojcfokqnl391.png,36,,1654346000.0
1,Interesting.,2101,v41hv2,StockMarket,https://i.redd.it/hgfyuinj9f391.png,513,,1654269000.0
2,Most Anticipated Earnings Releases for the wee...,26,v4o3eb,StockMarket,https://i.redd.it/n9qasqm5ol391.png,4,,1654347000.0
3,"Market close - Friday, June 3 2022 🟥🩸",128,v47rxg,StockMarket,https://i.redd.it/tz0zlfj9qg391.png,55,,1654287000.0
4,Elon Musk wants to cut Tesla 10% of jobs and h...,627,v3wmin,StockMarket,https://www.msn.com/en-us/autos/news/elon-musk...,267,,1654253000.0


In [None]:
# Write the posts to test.csv, resolved relative to the current working directory.
stock_market.to_csv('test.csv')

In [47]:
#tokenizer
def tokenizer(df):
    """Tokenize every entry of ``df['text']`` into one flat list of tokens.

    Each string is cleaned with process_text() (non-letters stripped,
    tokenized, lemmatized, lower-cased, stop words removed) and the per-row
    results are concatenated in row order. Previously the loop overwrote its
    result on every iteration, so only the last row's tokens were returned.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        list[str]: Tokens from all rows; ``[]`` when there is no text.
        Entries that are not strings (e.g. missing values) are skipped.
    """
    texts = [text for text in df['text'] if isinstance(text, str)]
    if not texts:
        return []

    tokens = []
    for text in texts:
        # Reuse the shared cleaning routine instead of duplicating it here.
        tokens.extend(process_text(text))
    return tokens

In [None]:
#Binning
def binner(df):
    """Label each row's ``compound`` score as negative, neutral or positive.

    Bins: [-1, -0.1] -> 'negative', (-0.1, 0.1] -> 'neutral',
    (0.1, 1] -> 'positive'.

    Args:
        df: DataFrame with a numeric ``compound`` column. It is modified in
            place: a categorical ``Vader Sentiment`` column is added.

    Returns:
        The same DataFrame object, for convenient display or chaining.
    """
    bins = [-1, -0.1, 0.1, 1]
    labels = ['negative', 'neutral', 'positive']
    # include_lowest=True closes the first interval on the left, so a score of
    # exactly -1 is labelled 'negative' instead of becoming NaN.
    df['Vader Sentiment'] = pd.cut(df["compound"], bins, labels=labels, include_lowest=True)
    return df

In [None]:
# Load the news sentiment scores; the first CSV column is used as the index.
news_df = pd.read_csv("../Notebooks/Data/Cleaned_Data/news_sentiment.csv", index_col=[0])
news_df.head()

Unnamed: 0,date,ticker,pos,neg,neu,compound
0,2022-05-06,ABNB,0.0,0.0,1.0,0.0
1,2022-05-11,ABNB,0.075167,0.026,0.898667,0.199033
2,2022-05-12,ABNB,0.21,0.159,0.631,0.4912
3,2022-05-14,ABNB,0.169,0.0,0.831,0.70995
4,2022-05-16,ABNB,0.0,0.0,1.0,0.0


In [None]:
# Adds the 'Vader Sentiment' column to news_df (in place) and displays the frame.
binner(news_df)

Unnamed: 0,date,ticker,pos,neg,neu,compound,Vader Sentiment
0,2022-05-06,ABNB,0.000000,0.000,1.000000,0.000000,neutral
1,2022-05-11,ABNB,0.075167,0.026,0.898667,0.199033,positive
2,2022-05-12,ABNB,0.210000,0.159,0.631000,0.491200,positive
3,2022-05-14,ABNB,0.169000,0.000,0.831000,0.709950,positive
4,2022-05-16,ABNB,0.000000,0.000,1.000000,0.000000,neutral
...,...,...,...,...,...,...,...
185,2022-05-13,UBER,0.000000,0.167,0.833000,-0.636900,negative
186,2022-05-14,UBER,0.081000,0.062,0.857000,0.202300,positive
187,2022-05-16,UBER,0.042667,0.000,0.957333,0.195600,positive
188,2022-05-17,UBER,0.000000,0.048,0.952000,-0.241100,negative


In [19]:
# Guardian API test: key from the environment plus the URL of the "sections" endpoint.
apikey = os.getenv("GUARDIAN_KEY")
url = "https://content.guardianapis.com/sections?" + f"api-key={apikey}"

In [20]:
# Request the sections endpoint; uncomment the print to inspect the JSON payload.
r = requests.get(url)
#print(r.json())

In [21]:
def pull_guardian(query):
    """Query the Guardian "tags" endpoint for ``query`` and print the JSON reply.

    Uses the module-level ``apikey`` for authentication.

    Args:
        query: Search term sent as the ``q`` parameter.

    Returns:
        The ``requests`` response object, so callers can check the status or
        read the payload (previously nothing was returned).
    """
    query_fields = 'body'
    section = "news"
    from_date = "2013-05-05"
    # Two fixes to the URL:
    #  * https instead of http, so the api-key is not sent in clear text;
    #  * the section filter is introduced by "&section=" -- it had been garbled
    #    to "§ion=", which glued it onto the query-fields value, so the filter
    #    was never applied.
    query_url = f"https://content.guardianapis.com/tags?" \
            f"api-key={apikey}" \
            f"&q={query}" \
            f"&query-fields={query_fields}" \
            f"&section={section}" \
            f"&from-date={from_date}"

    r = requests.get(query_url)
    print(r.json())
    return r

In [22]:
# Try the helper with a sample search term; it prints the JSON reply.
pull_guardian('netflix')

{'response': {'status': 'ok', 'userTier': 'developer', 'total': 6, 'startIndex': 1, 'pageSize': 10, 'currentPage': 1, 'pages': 1, 'results': [{'id': 'media/netflix', 'type': 'keyword', 'sectionId': 'media', 'sectionName': 'Media', 'webTitle': 'Netflix', 'webUrl': 'https://www.theguardian.com/media/netflix', 'apiUrl': 'https://content.guardianapis.com/media/netflix'}, {'id': 'netflix-love/netflix-love', 'type': 'keyword', 'sectionId': 'netflix-love', 'sectionName': 'Netflix Love', 'webTitle': 'Netflix Love', 'webUrl': 'https://www.theguardian.com/netflix-love/netflix-love', 'apiUrl': 'https://content.guardianapis.com/netflix-love/netflix-love'}, {'id': 'netflix-family-time/netflix-family-time', 'type': 'keyword', 'webTitle': 'Netflix family time', 'webUrl': 'https://www.theguardian.com/netflix-family-time/netflix-family-time', 'apiUrl': 'https://content.guardianapis.com/netflix-family-time/netflix-family-time'}, {'id': 'netflix-love/series/blog', 'type': 'series', 'sectionId': 'netflix-

In [23]:
def pull_pages(query):
    """Call pull_guardian() for ``query`` once per page the API reports.

    The page count is read from the ``response.pages`` field of each reply
    instead of a hard-coded guess, and the page counter now advances inside
    the loop (it used to sit after the loop, so the loop never terminated).

    Args:
        query: Search term forwarded to pull_guardian().

    Raises:
        SystemExit: if a request fails or returns an HTTP error status.
    """
    current_page = 1
    total_pages = 1  # placeholder until the first reply reports the real count
    while current_page <= total_pages:
        try:
            # NOTE(review): pull_guardian is called with the query only, so
            # every iteration requests the same page -- thread the page number
            # through once pull_guardian accepts one.
            r = pull_guardian(query)
            if r is None:
                # No response object to inspect, so the page count is unknown.
                break
            r.raise_for_status()
        except requests.RequestException as err:
            # Actually raise: a bare `SystemExit()` expression only builds the
            # exception object and then discards it.
            raise SystemExit(err) from err
        total_pages = r.json()['response']['pages']
        current_page += 1

In [14]:
query = "netflix"
query_fields = "body"
section = "news"  # https://open-platform.theguardian.com/documentation/section
tag = "world/extreme-weather"  # https://open-platform.theguardian.com/documentation/tag
from_date = "2022-05-20"
# Every filter must be introduced by "&". The section filter had been garbled to
# "§ion=", which made it part of the query-fields value instead of a parameter.
query_url = f"https://content.guardianapis.com/search?" \
            f"api-key={apikey}" \
            f"&q={query}" \
            f"&query-fields={query_fields}" \
            f"&section={section}" \
            f"&tag={tag}" \
            f"&from-date={from_date}" \
            f"&show-fields=headline,byline,starRating,shortUrl"

r = requests.get(query_url)
print(r.json())

{'response': {'status': 'ok', 'userTier': 'developer', 'total': 0, 'startIndex': 0, 'pageSize': 10, 'currentPage': 1, 'pages': 0, 'orderBy': 'relevance', 'results': []}}


In [38]:
# Load the wallstreetbets comments CSV (columns shown below: datetime, text).
df_wall = pd.read_csv('../Notebooks/Data/Cleaned_Data/wallstreetbets_comments.csv')

In [39]:
# Preview the last rows of the comments frame.
df_wall.tail()

Unnamed: 0,datetime,text
995,2021-05-16 14:21:57,Nice
996,2021-05-16 14:21:50,I have July 195 and I'm not selling. Straight ...
997,2021-05-16 14:21:46,This sub is honestly worse than qanon i think....
998,2021-05-16 14:21:43,Im holding too but investors will only look at...
999,2021-05-16 14:21:39,I get my news from they guy at the bowling all...


In [55]:
#Takes the text column and turns it into lists of words to iterate over and analyze
def text_splitter(df):
    """Add a ``text splitted`` column holding each row's lower-cased words.

    Punctuation (anything that is neither a word character nor whitespace) is
    removed before splitting on whitespace.

    Args:
        df: DataFrame with a string ``text`` column. It is modified in place.

    Returns:
        The same DataFrame object with the new ``text splitted`` column.
    """
    df["text splitted"] = (
        df["text"]
        .str.lower()
        # regex=True is explicit: the pattern is a regular expression, and
        # relying on the default triggers the FutureWarning seen in the outputs
        # (newer pandas would treat the pattern as a literal string).
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.split()
    )
    # The per-row Counter sum that used to follow was computed and then
    # discarded, so it has been dropped.
    return df

In [57]:
# Adds the 'text splitted' column to df_wall (in place) and displays the frame.
text_splitter(df_wall)

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


Unnamed: 0,datetime,text,text splitted
0,2013-01-15 00:50:40,[deleted],[deleted]
1,2013-01-15 00:48:12,\r\n[**@wallstreetbets**](http://twitter.com/w...,"[wallstreetbetshttptwittercomwallstreetbets, g..."
2,2013-01-10 11:08:19,"Simple, you haven't been on irc in a while...o...","[simple, you, havent, been, on, irc, in, a, wh..."
3,2013-01-10 08:32:49,*Nice.*,[nice]
4,2013-01-10 03:39:18,haha! yea talon!,"[haha, yea, talon]"
...,...,...,...
995,2021-05-16 14:21:57,Nice,[nice]
996,2021-05-16 14:21:50,I have July 195 and I'm not selling. Straight ...,"[i, have, july, 195, and, im, not, selling, st..."
997,2021-05-16 14:21:46,This sub is honestly worse than qanon i think....,"[this, sub, is, honestly, worse, than, qanon, ..."
998,2021-05-16 14:21:43,Im holding too but investors will only look at...,"[im, holding, too, but, investors, will, only,..."


In [87]:
#Counts occurrences of a word in the text to see how much a certain stock is talked about
def word_occurence(df, word):
    """Count how many times ``word`` appears across all of ``df['text']``.

    The text is split with text_splitter() (which adds a ``text splitted``
    column to ``df``), and whole-word matches are counted case-insensitively.

    Args:
        df: DataFrame with a ``text`` column.
        word: Word (e.g. a ticker symbol) to look for.

    Returns:
        int: Total number of matching words; 0 when there are none.
    """
    df = text_splitter(df)
    # text_splitter lower-cases the text, so compare against the lower-cased
    # search term; otherwise a ticker typed as "NFLX" could never match.
    target = word.lower()
    return sum(
        row.count(target)
        for row in df['text splitted']
        if isinstance(row, list)  # skip rows that are not word lists (e.g. missing text)
    )

In [93]:
# Count how often the word 'nflx' appears in the comments.
word_occurence(df_wall,'nflx')

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


7