In [1]:
import os
import pandas as pd
from pathlib import Path
import requests
import json
from newsapi import NewsApiClient
#from theguardian import theguardian_content
from dotenv import load_dotenv
load_dotenv()
from collections import Counter
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from pmaw import PushshiftAPI
import praw

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# NewsAPI key read from the environment; os.getenv returns None when NEWSAPI_KEY is unset.
api_key = os.getenv("NEWSAPI_KEY")

In [3]:
# Module-level NewsAPI client; get_news() reads this global.
newsapi = NewsApiClient(api_key=api_key)

In [4]:
# Get news articles on a topic based on keywords
def get_news(keywords, page=5, language='en', sort_by='relevancy'):
    """Query the NewsAPI "everything" endpoint through the module-level client.

    Args:
        keywords: Search phrase passed as the ``q`` parameter.
        page: Result page to request. Defaults to 5, the value that was
            previously hard-coded, so existing calls behave the same.
        language: Language filter passed to the API (default ``'en'``).
        sort_by: Sort order passed to the API (default ``'relevancy'``).

    Returns:
        The response returned by ``newsapi.get_everything`` (a dict with
        ``status``, ``totalResults`` and ``articles`` keys in the recorded output).
    """
    return newsapi.get_everything(
        q=keywords, language=language, sort_by=sort_by, page=page
    )

In [5]:
# Creates a dataframe of the articles returned for a keyword search
def form_df(keywords):
    """Build a DataFrame of article text and publication date for ``keywords``.

    Args:
        keywords: Search phrase forwarded to ``get_news``.

    Returns:
        pandas.DataFrame with columns ``text`` (the article ``content`` field),
        ``date`` (first 10 characters of ``publishedAt``, i.e. ``YYYY-MM-DD``)
        and ``language`` (always ``'en'``). The columns are present even when
        no usable article is returned.
    """
    news = get_news(keywords)['articles']

    articles = []
    for article in news:
        try:
            text = article['content']
            date = article['publishedAt'][:10]
        except (KeyError, TypeError):
            # A missing field raises KeyError and a null timestamp raises
            # TypeError; the previous `except AttributeError` caught neither.
            continue

        if text is None:
            # Articles without content cannot be scored downstream.
            continue

        articles.append({
            'text': text,
            'date': date,
            'language': 'en'
        })

    return pd.DataFrame(articles, columns=['text', 'date', 'language'])

In [6]:
# Smoke test: check that the NewsAPI query returns a response
get_news("Microsoft")

{'status': 'ok',
 'totalResults': 15944,
 'articles': [{'source': {'id': None, 'name': 'Fast Company'},
   'author': 'Jared Newman',
   'title': 'There’s a big problem with Apple and Google’s plans to nix passwords',
   'description': 'Eliminating the password means trusting that big tech companies won’t lock you into their platforms.\nApple, Google, and Microsoft have big plans to eliminate the password.Read Full Story',
   'url': 'https://www.fastcompany.com/90755838/theres-a-big-problem-with-apple-and-googles-plans-to-nix-passwords',
   'urlToImage': 'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/wp-cms/uploads/2022/05/poster-companies-to-eliminate-passwords.jpg',
   'publishedAt': '2022-05-27T00:00:48Z',
   'content': 'Apple, Google, and Microsoft have big plans to eliminate the password.\r\nWorking with a standards group called the FIDO Alliance, all three companies are backing a system in which your phone or comput… [+8278 chars]'},
  {'source': {'id':

In [7]:
# Inspect only the list stored under the 'articles' key of the response
get_news("Microsoft")['articles']

[{'source': {'id': None, 'name': 'Fast Company'},
  'author': 'Jared Newman',
  'title': 'There’s a big problem with Apple and Google’s plans to nix passwords',
  'description': 'Eliminating the password means trusting that big tech companies won’t lock you into their platforms.\nApple, Google, and Microsoft have big plans to eliminate the password.Read Full Story',
  'url': 'https://www.fastcompany.com/90755838/theres-a-big-problem-with-apple-and-googles-plans-to-nix-passwords',
  'urlToImage': 'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/wp-cms/uploads/2022/05/poster-companies-to-eliminate-passwords.jpg',
  'publishedAt': '2022-05-27T00:00:48Z',
  'content': 'Apple, Google, and Microsoft have big plans to eliminate the password.\r\nWorking with a standards group called the FIDO Alliance, all three companies are backing a system in which your phone or comput… [+8278 chars]'},
 {'source': {'id': None, 'name': 'VentureBeat'},
  'author': 'Ashleigh Hollowell

In [8]:
# Smoke test: check that form_df turns the articles into a DataFrame
form_df("Microsoft")

Unnamed: 0,text,date,language
0,"Apple, Google, and Microsoft have big plans to...",2022-05-27,en
1,We are excited to bring Transform 2022 back in...,2022-05-17,en
2,Microsoft employees will be free to seek jobs ...,2022-06-09,en
3,Magic Leap's top two executives believe their ...,2022-05-27,en
4,Microsoft shares remained near their breakeven...,2022-05-20,en
5,Content Warning: Please note that we address t...,2022-05-31,en
6,We thank our sponsor for making this content p...,2022-05-17,en
7,<ul><li>Microsoft has updated the Windows Subs...,2022-05-23,en
8,Microsoft has now brought the Microsoft Teams ...,2022-05-17,en
9,May 31 (Reuters) - Facebook parent Meta Platfo...,2022-05-31,en


In [9]:
# One article frame per company search phrase
apple = form_df('apple')
netflix = form_df('netflix')
facebook = form_df('facebook')
uber = form_df('uber')
microchip = form_df('microchip technology')
airbnb = form_df('airbnb')
diamondback = form_df('diamond back energy')
marathon = form_df('marathon oil corp')
devon = form_df('devon energy corp')
sunpower = form_df('sunpower corp')
renewable = form_df('renewable energy group inc')
mckinsey = form_df('mckinsey & company')
blackrock = form_df('blackrock')
paypal = form_df('paypal')
mercadolibre = form_df('mercadolibre inc')
sofi = form_df('sofi')

# Each label sits next to its frame so the concat keys cannot drift out of order
labelled_frames = [
    ('apple', apple),
    ('netflix', netflix),
    ('facebook', facebook),
    ('uber', uber),
    ('microchip', microchip),
    ('airbnb', airbnb),
    ('diamondback', diamondback),
    ('marathon', marathon),
    ('devon', devon),
    ('sunpower', sunpower),
    ('renewable', renewable),
    ('mckinsey', mckinsey),
    ('blackrock', blackrock),
    ('paypal', paypal),
    ('mercadolibre', mercadolibre),
    ('sofi', sofi),
]

# Stack the frames under an outer index level named 'Company'
all_df = pd.concat(
    [frame for _, frame in labelled_frames],
    keys=[label for label, _ in labelled_frames],
    names=['Company'],
    axis=0,
)
all_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,date,language
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apple,0,Apple plans to launch a new version of the App...,2022-05-13,en
apple,1,"A long time ago, smartphones were small. Like,...",2022-05-20,en
apple,2,The drive to protect children online will soon...,2022-05-18,en
apple,3,The Apple Watch Series 8 could feature an all-...,2022-05-18,en
apple,4,"Every spring, Google has shown off some hardwa...",2022-05-11,en


In [10]:
# Move the index levels ('Company' and the per-frame row number) into regular columns.
# NOTE: this rebinds all_df, so re-running the cell resets the index a second time.
all_df = all_df.reset_index()


In [11]:
# Check the last rows after the index was flattened
all_df.tail()

Unnamed: 0,Company,level_1,text,date,language
235,sofi,15,<ul><li>SoFi Technologies (SOFI) discloses tha...,2022-05-16,en
236,sofi,16,The following slide deck was published by SoFi...,2022-05-11,en
237,sofi,17,The Issa Brothers and TDR Capital are some of ...,2022-06-09,en
238,sofi,18,"Now that the 2022 schedule has been announced,...",2022-05-13,en
239,sofi,19,SoFi Technologies (NASDAQ:SOFI) shares fell 12...,2022-05-13,en


In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the lexicon before constructing a VADER analyzer
nltk.download('vader_lexicon')
# NOTE(review): vader_analyzer() creates its own local analyzer, so it does not use this module-level instance
analyzer = SentimentIntensityAnalyzer()

def vader_analyzer(df):
    """Add VADER sentiment scores to ``df`` and normalise its ``date`` column.

    Each text is scored once (previously every row was scored four times, once
    per output column).

    Args:
        df: DataFrame with a ``text`` column of strings and a ``date`` column
            parseable by ``pd.to_datetime``. It is modified in place.

    Returns:
        The same DataFrame with ``compound``, ``neg``, ``neu`` and ``pos``
        columns added and ``date`` converted to ``datetime.date`` objects
        (parsed as UTC).
    """
    analyzer = SentimentIntensityAnalyzer()
    scores = [analyzer.polarity_scores(text) for text in df['text']]
    for key in ('compound', 'neg', 'neu', 'pos'):
        df[key] = [score[key] for score in scores]

    # `infer_datetime_format` is deprecated in pandas 2.x and is not needed for parsing.
    df['date'] = pd.to_datetime(df['date'], utc=True).dt.date

    return df

def daily_sentiment(df):
    """Average the VADER scores per company (or ticker) and calendar day.

    Args:
        df: DataFrame accepted by ``vader_analyzer`` that also has a ``ticker``
            or ``Company`` column identifying the company. It is modified in
            place by ``vader_analyzer``.

    Returns:
        DataFrame with columns ``date``, ``ticker``, ``pos``, ``neg``, ``neu``
        and ``compound``, one row per (ticker, date). When only ``Company`` is
        present it is renamed to ``ticker``; the previous code selected a
        ``ticker`` column that was never created and raised KeyError.
    """
    score_cols = ['pos', 'neg', 'neu', 'compound']
    vader_df = vader_analyzer(df)

    group_col = 'ticker' if 'ticker' in vader_df.columns else 'Company'
    daily = (
        # Select with a list: tuple-style selection after groupby is unsupported in
        # current pandas, and the string grouping key must not be averaged.
        vader_df.groupby([group_col, 'date'])[score_cols]
        .mean()
        .reset_index()
        .rename(columns={group_col: 'ticker'})
    )
    return daily[['date', 'ticker'] + score_cols]


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [13]:
# Module-level lemmatizer used by process_text()
lemmatizer = WordNetLemmatizer()

In [14]:
def process_text(article):
    """Clean one article into a list of lower-case, lemmatized, non-stop-word tokens.

    Args:
        article: Article text as a string.

    Returns:
        list[str]: Tokens after removing every character that is not an ASCII
        letter or a space, tokenizing, lower-casing, lemmatizing and dropping
        English stop words.
    """
    sw = set(stopwords.words('english'))
    letters_only = re.sub("[^a-zA-Z ]", '', article)
    words = word_tokenize(letters_only)
    # Lower-case BEFORE lemmatizing: WordNet lookup is case-sensitive, so
    # capitalised words (e.g. a sentence-initial "Stocks") were left unlemmatized.
    lemmas = (lemmatizer.lemmatize(word.lower()) for word in words)
    return [lemma for lemma in lemmas if lemma not in sw]

In [15]:
# pmaw Pushshift client used below to search Reddit comments
api = PushshiftAPI()

In [16]:
import datetime as dt

# Search window as epoch seconds. The datetimes are pinned to UTC: a naive
# datetime's .timestamp() is interpreted in the machine's local timezone, which
# made the window shift depending on where the notebook was run.
before = int(dt.datetime(2022, 2, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())
after = int(dt.datetime(2021, 12, 1, 0, 0, tzinfo=dt.timezone.utc).timestamp())

In [17]:
# Retrieve up to 10,000 comments from r/StockMarket, bounded by the `after`/`before` epoch timestamps
comments = api.search_comments(subreddit= 'StockMarket', limit = 10000, before=before, after=after)

In [18]:
# Materialise the Pushshift results as a DataFrame.
# The previous bare `comments_df.dropna()` discarded its result, so it did nothing.
# Only rows that are completely empty are dropped here: the default how='any'
# would remove every row, because many columns (e.g. the author_flair_* fields
# shown in the output) are empty for all comments.
comments_df = pd.DataFrame(comments).dropna(how='all')
comments_df.head()


Unnamed: 0,all_awardings,archived,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,subreddit_id,subreddit_name_prefixed,subreddit_type,top_awarded_type,total_awards_received,treatment_tags,unrepliable_reason,author_cakeday,awarders,retrieved_on
0,[],False,,bossOnothin,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
1,[],False,,Embarrassed_Tax_9534,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
2,[],False,,MCKarum,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
3,[],False,,Investing1cash,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,
4,[],False,,EasternPrint8,,,[],,,,...,t5_2qjuv,r/StockMarket,public,,0,[],,,,


In [19]:
# Reddit credentials are read from the environment (.env is loaded by load_dotenv()
# in the imports cell) instead of being committed in the notebook.
# NOTE(review): the client id/secret that used to be hard-coded on this line should
# be treated as leaked and rotated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT', 'Trade app bu u/SignificantRange8761'),
)

In [20]:
def get_posts(sub, limit=1000):
    """Collect the current "hot" posts of a subreddit into a DataFrame.

    Args:
        sub: Subreddit name passed to ``reddit.subreddit``.
        limit: Maximum number of posts requested from the hot listing.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        pandas.DataFrame with columns ``title``, ``score``, ``id``,
        ``subreddit``, ``url``, ``num_comments``, ``body`` (the post's
        ``selftext``) and ``created``. The columns are present even when the
        listing is empty.
    """
    columns = ['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created']
    rows = [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in reddit.subreddit(sub).hot(limit=limit)
    ]
    return pd.DataFrame(rows, columns=columns)

In [21]:
# Hot posts of r/StockMarket (network call through the module-level `reddit` client)
stock_market = get_posts('StockMarket')

In [22]:
# Preview the first posts
stock_market.head()

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created
0,“Unexpected” to Bloomberg.,704,v9caeo,StockMarket,https://i.redd.it/rkp2t6jhrt491.jpg,82,,1654880000.0
1,"Market close - Friday, June 10 2022 🩸🩸💉💉",125,v9ggpu,StockMarket,https://i.redd.it/iykj7b6wpu491.png,44,,1654892000.0
2,If you didn't break your browser's refresh but...,414,v97mza,StockMarket,https://i.redd.it/qqrqpmvbos491.gif,55,,1654867000.0
3,Average stock market return for the last 20 ye...,146,v9csfw,StockMarket,https://i.redd.it/uitf5md8vt491.png,13,,1654882000.0
4,BREAKING NEWS: Tesla files for 3-for-1 stock s...,42,v9gpdv,StockMarket,https://www.cnbc.com/2022/06/10/tesla-files-fo...,34,,1654893000.0


In [23]:
# Write the posts to test.csv in the current working directory (the index is written too, since index= is left at its default)
stock_market.to_csv('test.csv')

In [24]:
# Tokenizer
def tokenizer(df):
    """Tokenize every row of ``df['text']`` into one flat list of cleaned tokens.

    The previous implementation overwrote ``tokens`` on each iteration, so only
    the last row's tokens were returned, and an empty frame raised NameError.
    Tokens from all rows are now accumulated; for a single-row frame the
    result has the same shape as before.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        list[str]: Lower-case, lemmatized tokens with English stop words
        removed, in row order. Empty list when ``df`` has no rows.
    """
    # Loop-invariant helpers are built once instead of once per row.
    non_letters = re.compile("[^a-zA-Z ]")
    lemmatizer = WordNetLemmatizer()
    sw = set(stopwords.words('english'))

    tokens = []
    for row in df['text']:
        # Remove everything except ASCII letters and spaces, then split into words
        words = word_tokenize(non_letters.sub('', row))

        # Lower-case before lemmatizing: WordNet lookup is case-sensitive
        lemmas = (lemmatizer.lemmatize(word.lower()) for word in words)

        # Keep everything that is not a stop word
        tokens.extend(lemma for lemma in lemmas if lemma not in sw)

    return tokens

In [25]:
# Binning
def binner(df):
    """Label each row's VADER compound score as negative, neutral or positive.

    Bins are ``[-1, -0.1]`` -> negative, ``(-0.1, 0.1]`` -> neutral and
    ``(0.1, 1]`` -> positive. ``include_lowest=True`` closes the first bin on
    the left; without it a compound score of exactly -1.0 fell outside every
    bin and was labelled NaN.

    Args:
        df: DataFrame with a numeric ``compound`` column. It is modified in place.

    Returns:
        The same DataFrame with a categorical ``Vader Sentiment`` column added.
    """
    bins = [-1, -0.1, 0.1, 1]
    labels = ['negative', 'neutral', 'positive']
    df['Vader Sentiment'] = pd.cut(
        df["compound"], bins, labels=labels, include_lowest=True
    )
    return df

In [26]:
# Load the saved daily news sentiment; the first CSV column is used as the index
news_df = pd.read_csv("../Notebooks/Data/Cleaned_Data/news_sentiment.csv", index_col=[0])
news_df.head()

Unnamed: 0,date,ticker,pos,neg,neu,compound
0,2022-05-06,ABNB,0.0,0.0,1.0,0.0
1,2022-05-11,ABNB,0.075167,0.026,0.898667,0.199033
2,2022-05-12,ABNB,0.21,0.159,0.631,0.4912
3,2022-05-14,ABNB,0.169,0.0,0.831,0.70995
4,2022-05-16,ABNB,0.0,0.0,1.0,0.0


In [27]:
# Adds the 'Vader Sentiment' column to news_df in place and displays the result
binner(news_df)

Unnamed: 0,date,ticker,pos,neg,neu,compound,Vader Sentiment
0,2022-05-06,ABNB,0.000000,0.000,1.000000,0.000000,neutral
1,2022-05-11,ABNB,0.075167,0.026,0.898667,0.199033,positive
2,2022-05-12,ABNB,0.210000,0.159,0.631000,0.491200,positive
3,2022-05-14,ABNB,0.169000,0.000,0.831000,0.709950,positive
4,2022-05-16,ABNB,0.000000,0.000,1.000000,0.000000,neutral
...,...,...,...,...,...,...,...
185,2022-05-13,UBER,0.000000,0.167,0.833000,-0.636900,negative
186,2022-05-14,UBER,0.081000,0.062,0.857000,0.202300,positive
187,2022-05-16,UBER,0.042667,0.000,0.957333,0.195600,positive
188,2022-05-17,UBER,0.000000,0.048,0.952000,-0.241100,negative


In [28]:
# Load the cleaned r/wallstreetbets comments (datetime and text columns in the output below)
df_wall = pd.read_csv('../Notebooks/Data/Cleaned_Data/wallstreetbets_comments.csv')

In [29]:
# Preview the last comments
df_wall.tail()

Unnamed: 0,datetime,text
995,2021-05-16 14:21:57,Nice
996,2021-05-16 14:21:50,I have July 195 and I'm not selling. Straight ...
997,2021-05-16 14:21:46,This sub is honestly worse than qanon i think....
998,2021-05-16 14:21:43,Im holding too but investors will only look at...
999,2021-05-16 14:21:39,I get my news from they guy at the bowling all...


In [30]:
# Takes the text column and turns it into lists of words to iterate over and analyze
def text_splitter(df):
    """Add a ``text splitted`` column holding each comment as a list of words.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and the result is split on whitespace.

    ``regex=True`` is passed explicitly: without it pandas emitted the
    FutureWarning seen in the recorded output, and on pandas >= 2.0 the pattern
    is treated as a literal string, so punctuation is no longer stripped. The
    Counter aggregation that used to follow was computed and then discarded,
    so it has been removed.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same DataFrame with the ``text splitted`` column added.
    """
    df["text splitted"] = (
        df["text"].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.split()
    )
    return df

In [31]:
# text_splitter adds the column to df_wall and returns that same object,
# so split_df and df_wall refer to one DataFrame.
split_df = text_splitter(df_wall)
split_df.head()

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


Unnamed: 0,datetime,text,text splitted
0,2013-01-15 00:50:40,[deleted],[deleted]
1,2013-01-15 00:48:12,\r\n[**@wallstreetbets**](http://twitter.com/w...,"[wallstreetbetshttptwittercomwallstreetbets, g..."
2,2013-01-10 11:08:19,"Simple, you haven't been on irc in a while...o...","[simple, you, havent, been, on, irc, in, a, wh..."
3,2013-01-10 08:32:49,*Nice.*,[nice]
4,2013-01-10 03:39:18,haha! yea talon!,"[haha, yea, talon]"


In [32]:
# Counts how often a given word appears in the text, to gauge how much a stock is talked about
def word_occurence(df, word):
    """Return how many tokens across ``df['text splitted']`` equal ``word``.

    ``text_splitter`` is called first, so ``df`` gains (or refreshes) its
    ``text splitted`` column as a side effect. Every occurrence is counted,
    including repeats within one comment.
    """
    tokenised = text_splitter(df)
    return sum(
        1
        for tokens in tokenised['text splitted']
        for token in tokens
        if token == word
    )

In [33]:
# Number of tokens equal to 'nflx' in the wallstreetbets comments (tokens are lower-cased by text_splitter)
word_occurence(df_wall,'nflx')

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


7

In [34]:
# Save the frame, including the word-list column, to a CSV in the current working directory
split_df.to_csv('wallstreetbets_comments_splitted.csv')

In [35]:
import requests
def pull_articles(keyword, page_size=50, max_pages=1):
    """Fetch and print news search results for ``keyword`` from the RapidAPI endpoint.

    The previous loop condition ``'pageNumber' <= 'pageSize'`` compared two
    string literals, which is always True, so the function printed the same
    response forever. Pages are now requested one by one up to ``max_pages``.

    Args:
        keyword: Search phrase sent as the ``q`` parameter.
        page_size: Results per page (default 50, the previously hard-coded value).
        max_pages: Number of pages to request, starting at page 1 (default 1).

    Returns:
        list[str]: The raw response body of each requested page, in order.
        Each body is also printed, as before.
    """
    url = "https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/search/NewsSearchAPI"

    headers = {
        "X-RapidAPI-Key": os.getenv('RapidAPI_Key'),
        "X-RapidAPI-Host": os.getenv('RapidAPI_Host')
    }

    pages = []
    for page_number in range(1, max_pages + 1):
        querystring = {
            "q": keyword,
            "pageNumber": str(page_number),
            "pageSize": str(page_size),
            "autoCorrect": "true",
            "fromPublishedDate": "2012-06-01",
            "toPublishedDate": "2022-06-01",
        }
        # The timeout keeps a stalled connection from hanging the notebook
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        print(response.text)
        pages.append(response.text)

    return pages

In [36]:
#pull_articles('netflix')

In [37]:
# Testing The Guardian API: build the URL for the /sections endpoint
apikey = os.getenv("GUARDIAN_KEY")
# NOTE(review): the API key is embedded in the query string, so avoid printing or logging `url`
url = f"https://content.guardianapis.com/sections?" \
            f"api-key={apikey}" 

In [38]:
# Request the sections endpoint; the response is kept in `r` for manual inspection
r = requests.get(url)
#print(r.json())

In [39]:
def pull_guardian(query):
    """Query The Guardian content API for ``query`` and print the JSON response.

    The hand-built URL contained ``§ion=`` where ``&section=`` was meant (the
    ``&sect`` prefix had been turned into the section sign), so the section
    filter was never sent as its own parameter. Parameters are now passed via
    ``params`` so that requests encodes them, and the request uses https like
    the other Guardian call in this notebook.

    Args:
        query: Search phrase sent as the ``q`` parameter.

    Returns:
        The decoded JSON response, which is also printed as before.
    """
    params = {
        "api-key": apikey,
        "q": query,
        "query-fields": "body",
        "section": "news",
        "from-date": "2013-05-05",
    }
    # NOTE(review): this still targets /tags as the original did; confirm this endpoint
    # accepts query-fields/from-date, or whether /search was intended.
    r = requests.get("https://content.guardianapis.com/tags", params=params, timeout=30)
    payload = r.json()
    print(payload)
    return payload

In [40]:
# Fresh read of the same CSV that df_wall was loaded from
wallstreet_df = pd.read_csv('../Notebooks/Data/Cleaned_Data/wallstreetbets_comments.csv')
wallstreet_df.tail()

Unnamed: 0,datetime,text
995,2021-05-16 14:21:57,Nice
996,2021-05-16 14:21:50,I have July 195 and I'm not selling. Straight ...
997,2021-05-16 14:21:46,This sub is honestly worse than qanon i think....
998,2021-05-16 14:21:43,Im holding too but investors will only look at...
999,2021-05-16 14:21:39,I get my news from they guy at the bowling all...


In [41]:
def keyword_filter(df, keywords):
    """Keep the rows of ``df`` whose text contains at least one keyword.

    The previous implementation collected matching texts into a list and
    concatenated it positionally with ``df['datetime']``, so the k matches
    were paired with the first k timestamps rather than their own. A text
    matching several keywords was also added once per keyword. Rows are now
    selected with a boolean mask, so each match keeps its own timestamp and
    appears once.

    Args:
        df: DataFrame with ``datetime`` and ``text`` columns.
        keywords: Iterable of substrings to look for (case-sensitive). A single
            string is treated as one keyword instead of being iterated per
            character.

    Returns:
        DataFrame with columns ``datetime`` and ``text`` holding the matching
        rows in their original order, with rows containing NaN dropped and the
        index renumbered from 0.
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords = list(keywords)

    # str(text) mirrors the original matching, which also coerced non-string cells
    matches = df['text'].map(
        lambda text: any(keyword in str(text) for keyword in keywords)
    ).astype(bool)

    filtered_df = df.loc[matches, ['datetime', 'text']].dropna().reset_index(drop=True)

    return filtered_df

In [42]:
# NOTE: despite its name, `filtered` holds the integer count of 'uber' tokens, not a filtered DataFrame
filtered = word_occurence(wallstreet_df,'uber')
filtered

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


3

In [43]:
def pull_comments(df, word, all_matches=False):
    """Find comments whose word list contains ``word``.

    ``text_splitter`` is called first, so ``df`` gains (or refreshes) its
    ``text splitted`` column as a side effect.

    Args:
        df: DataFrame with a ``text`` column.
        word: Token to look for (compared against the lower-cased tokens).
        all_matches: When False (default, the previous behaviour) return only
            the first matching comment. When True return every matching comment.

    Returns:
        With ``all_matches=False``: the word list of the first matching
        comment, or None when nothing matches. With ``all_matches=True``: a
        list of word lists (empty when nothing matches).
    """
    text_splitter(df)
    matching = (tokens for tokens in df['text splitted'] if word in tokens)
    if all_matches:
        return list(matching)
    return next(matching, None)

In [44]:
# Word list of the first wallstreetbets comment that contains the token 'apple'
apple_com = pull_comments(wallstreet_df, 'apple')
apple_com

  df["text splitted"] = df.text.str.lower().str.replace('[^\w\s]','').str.split()


['samsung',
 'seeing',
 'apple',
 'and',
 'ibms',
 'partnership',
 'could',
 'view',
 'a',
 'partnership',
 'with',
 'blackberry',
 'for',
 'enterprise',
 'as',
 'a',
 'next',
 'major',
 'step']

In [45]:
# Load the cleaned security-analysis comments (datetime and text columns in the output below)
security_df = pd.read_csv('../Notebooks/Data/Cleaned_Data/securityanalysis_comments.csv')
security_df.head()

Unnamed: 0,datetime,text
0,2015-02-15 01:08:48,I'm surprised anyone else has heard about EIF....
1,2015-02-14 21:06:46,I recommend Apple's financial statements. I a...
2,2015-02-14 20:31:32,"IMO, one of the better value blogs out there. ..."
3,2015-02-14 00:07:56,Wal-mart has their old annual reports from ear...
4,2015-02-13 18:29:30,"mediums a great platform, i think they already..."


In [46]:
# Takes the text column and turns it into lists of words to iterate over and analyze
def text_splitter_2(df):
    """Add a ``text splitted`` column holding each comment as a list of words.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and the result is split on whitespace. The
    pattern is now a raw string: ``'\\w'`` inside a normal string literal is an
    invalid escape sequence that newer Python versions warn about.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same DataFrame with the ``text splitted`` column added.
    """
    df["text splitted"] = (
        df["text"].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.split()
    )
    return df

In [47]:
# Adds the 'text splitted' column to security_df in place and displays the returned frame
text_splitter_2(security_df)

Unnamed: 0,datetime,text,text splitted
0,2015-02-15 01:08:48,I'm surprised anyone else has heard about EIF....,"[im, surprised, anyone, else, has, heard, abou..."
1,2015-02-14 21:06:46,I recommend Apple's financial statements. I a...,"[i, recommend, apples, financial, statements, ..."
2,2015-02-14 20:31:32,"IMO, one of the better value blogs out there. ...","[imo, one, of, the, better, value, blogs, out,..."
3,2015-02-14 00:07:56,Wal-mart has their old annual reports from ear...,"[walmart, has, their, old, annual, reports, fr..."
4,2015-02-13 18:29:30,"mediums a great platform, i think they already...","[mediums, a, great, platform, i, think, they, ..."
...,...,...,...
10000,2017-01-20 11:44:25,Question. This company claims ~210m under mana...,"[question, this, company, claims, 210m, under,..."
10001,2017-01-20 11:38:23,[Pershing Square](https://assets.pershingsquar...,"[pershing, squarehttpsassetspershingsquarehold..."
10002,2017-01-20 11:20:39,I gotcha man. I just feel you asked a bunch of...,"[i, gotcha, man, i, just, feel, you, asked, a,..."
10003,2017-01-20 11:04:08,"I would say to avoid retail, unless you have s...","[i, would, say, to, avoid, retail, unless, you..."


In [51]:
def count_comments(df,keywords):
    df = text_splitter_2(df)
    df = df.groupby('datetime')
    print(pd.DataFrame(df['text splitted'].value_count()[keywords]))

In [52]:
# Run the keyword count on the security-analysis comments; keywords is passed as a single string
count_comments(security_df, keywords='uber')

AttributeError: 'SeriesGroupBy' object has no attribute 'value_count'