In [13]:
import os
import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv
load_dotenv()
from collections import Counter
from nltk.corpus import reuters, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from pmaw import PushshiftAPI
import praw

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krist\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
api_key = os.getenv("NEWSAPI_KEY")

In [34]:
newsapi = NewsApiClient(api_key=api_key)

In [35]:
#Get news articles on certain topic based on keywords
def get_news(keywords):  
    news_article = newsapi.get_everything(
            q = keywords, language='en', sort_by= 'relevancy'
    )
    return news_article

In [36]:
#Creates dataframe of the articles chosen 
def form_df(keywords):
    news = get_news(keywords)['articles']

    articles = []
    for article in news:
        try:
            title = article['title']
            description = article['description']
            text = article['content']
            date = article['publishedAt'][:10]

            articles.append({
                'title' : title,
                'description' : description,
                'text' : text,
                'date' : date,
                'language' : 'en'
            })
        except AttributeError:
            pass
    
    return pd.DataFrame(articles)

In [1]:
#newsapi.get_everything(q = 'Microsoft', language='en')

In [47]:
#Test to see if works
get_news("Microsoft")

{'status': 'ok',
 'totalResults': 16596,
 'articles': [{'source': {'id': 'engadget', 'name': 'Engadget'},
   'author': 'Mariella Moon',
   'title': 'Microsoft open sources the code for 3D Movie Maker',
   'description': 'Microsoft has open sourced the code for the 27-year-old program 3D Movie Maker — and it\'s all thanks to someone who asked on Twitter. Foone, whose Twitter profile says they\'re a "hardware and software necromancer," asked the tech giant for the source code to …',
   'url': 'https://www.engadget.com/microsoft-open-sources-3d-movie-maker-code-190053035.html',
   'urlToImage': 'https://s.yimg.com/os/creatr-uploaded-images/2022-05/94baa250-ce1f-11ec-b3ff-45068bb80318',
   'publishedAt': '2022-05-07T19:00:53Z',
   'content': 'Microsoft has open sourced the code for the 27-year-old program 3D Movie Maker and it\'s all thanks to someone who asked on Twitter. Foone, whose Twitter profile says they\'re a "hardware and software … [+1217 chars]'},
  {'source': {'id': 'engadget',

In [48]:
get_news("Microsoft")['articles']

[{'source': {'id': 'engadget', 'name': 'Engadget'},
  'author': 'Mariella Moon',
  'title': 'Microsoft open sources the code for 3D Movie Maker',
  'description': 'Microsoft has open sourced the code for the 27-year-old program 3D Movie Maker — and it\'s all thanks to someone who asked on Twitter. Foone, whose Twitter profile says they\'re a "hardware and software necromancer," asked the tech giant for the source code to …',
  'url': 'https://www.engadget.com/microsoft-open-sources-3d-movie-maker-code-190053035.html',
  'urlToImage': 'https://s.yimg.com/os/creatr-uploaded-images/2022-05/94baa250-ce1f-11ec-b3ff-45068bb80318',
  'publishedAt': '2022-05-07T19:00:53Z',
  'content': 'Microsoft has open sourced the code for the 27-year-old program 3D Movie Maker and it\'s all thanks to someone who asked on Twitter. Foone, whose Twitter profile says they\'re a "hardware and software … [+1217 chars]'},
 {'source': {'id': 'the-verge', 'name': 'The Verge'},
  'author': 'Emma Roth',
  'title': 'H

In [48]:
#Test to see if works
form_df("Microsoft")

Unnamed: 0,title,description,text,date,language
0,Microsoft open sources the code for 3D Movie M...,Microsoft has open sourced the code for the 27...,Microsoft has open sourced the code for the 27...,2022-05-07,en
1,Microsoft Edge will soon include a free built-...,Microsoft has consistently tried to get more p...,Microsoft has consistently tried to get more p...,2022-04-30,en
2,"How to watch AMD, Nvidia, and Microsoft’s Comp...",With Computex 2022 kicking off in a matter of ...,Where and when to watch each keynote\r\nComput...,2022-05-22,en
3,Clippy is in Halo Infinite,"Clippy, the (in)famous Microsoft Office assist...","The smugness remains.\r\n\n \n\n Clippy, the b...",2022-05-04,en
4,Amazon re-awarded $10 billion NSA cloud contra...,Microsoft failed in its attempt to challenge A...,Microsoft failed in its attempt to challenge A...,2022-04-28,en
5,Microsoft confirms it's taking a 'new approach...,"Earlier this month, a rumor suggested that Mic...","Earlier this month, a rumor suggested that Mic...",2022-05-27,en
6,Windows 11 will get third-party widgets later ...,Windows 11\r\n widgets could become more usefu...,Windows 11\r\n widgets could become more usefu...,2022-05-24,en
7,"Apple, Google, and Microsoft will soon impleme...","On World Password Day, Apple, Google, and Micr...",Illustration by Alex Castro / The Verge\r\n\n ...,2022-05-05,en
8,Microsoft’s Xbox streaming stick and TV app ar...,Microsoft is working on a Roku-esque Xbox stre...,I dont think anything is going to stop us from...,2022-05-06,en
9,Microsoft teams up with VW to make HoloLens wo...,Microsoft has announced a new “moving platform...,Moving vehicles have previously broken the AR ...,2022-05-05,en


In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
def process_text(article):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', article)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return output

In [28]:
#Word counter code
def word_counter(categories):
    ids = reuters.fileids(categories) 
    corpus = [reuters.raw(i) for i in ids] 
    joined_string = " ".join(corpus)
    process = process_text(joined_string)
    counter = dict(Counter(process).most_common(10))
    return pd.DataFrame(list(counter.items()), columns = ['word', 'count']) 


In [84]:
#Code that counts number of articles present about a topic
def news_density(keywords):
    count = 0 
    for dict in get_news(keywords)['articles']:
        count += 1
    return count

In [85]:
news_density('Microsoft')

20

In [4]:
#Use pmaw for reddit apis
api = PushshiftAPI()

In [2]:
import datetime as dt
before = int(dt.datetime(2022,2,1,0,0).timestamp())
after = int(dt.datetime(2021,12,1,0,0).timestamp())

In [6]:
#Retrieve comments 
comments = api.search_comments(subreddit= 'StockMarket', limit = 1000, before=before, after=after)

In [10]:
comments_df = pd.DataFrame(comments)


In [14]:
reddit = praw.Reddit(client_id='my_client_id', client_secret='my_client_secret', user_agent='my_user_agent')

In [15]:
def get_posts(sub):
    posts = []
    ml_subreddit = reddit.subreddit(sub)
    for post in ml_subreddit.hot(limit=10):
        posts.append([post.title, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
    posts = pd.DataFrame(posts,columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
    print(posts)

In [18]:
get_posts('StockMarket')

ResponseException: received 401 HTTP response