In [1]:
import pandas as pd
import datetime as dt
import pmaw
import newsapi
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newsapi import NewsApiClient
from dotenv import load_dotenv

In [2]:
def articles_pull(keywords):

    newsapi_response = newsapi.get_everything(q = keywords, language = 'en', sort_by = 'publishedAt')['articles']
    articles_list = []
    for article in newsapi_response:
        try:
            title = article['title']
            description = article['description']
            text = article['content']
            date = article['publishedAt']
            articles_list.append({'date' : date, 'text' : text})
            articles = pd.DataFrame(articles_list).rename(columns = {'date': 'datetime'}).sort_values('datetime')
            articles['datetime'] = pd.to_datetime(articles['datetime'], infer_datetime_format = True, errors = 'coerce')
        except AttributeError:
            pass
    
    return articles

In [3]:
def subreddit_pull(subreddit, limit, after, before):

    pushshift = pmaw.PushshiftAPI()
    comments_response = pushshift.search_comments(subreddit = subreddit, limit = limit, after = after, before = before)
    comments_original = pd.DataFrame(comments_response)
    comments_original['datetime'] = comments_original.apply(lambda row : dt.datetime.fromtimestamp(row['created_utc']), axis = 1)
    comments_original['datetime'] = pd.to_datetime(comments_original['datetime'])
    comments = comments_original[['datetime', 'body']].rename(columns = {'body': 'text'}).set_index('datetime')
    
    return comments

In [4]:
def keyword_filter(df, keywords):

    filtered_list = []
    for keyword in keywords:
        for text in df['text']:
            if keyword in str(text):
                filtered_list.append(text)

    filtered_df = pd.concat([df['datetime'], pd.DataFrame(filtered_list).rename(columns = {0: 'text'})], axis = 1).dropna()
    return filtered_df

In [5]:
def articles_vader_analyzer(df):
    
    analyzer = SentimentIntensityAnalyzer()
    df['articles_compound_sentiment'] = [analyzer.polarity_scores(x)['compound'] for x in df['text']]
    df['articles_positive_sentiment'] = [analyzer.polarity_scores(x)['pos'] for x in df['text']]
    df['articles_neutral_sentiment'] = [analyzer.polarity_scores(x)['neu'] for x in df['text']]
    df['articles_negative_sentiment'] = [analyzer.polarity_scores(x)['neg'] for x in df['text']]
    
    return df

In [6]:
def reddit_vader_analyzer(subreddit, df):
    
    analyzer = SentimentIntensityAnalyzer()
    df[f'{subreddit}_compound_sentiment'] = [analyzer.polarity_scores(x)['compound'] for x in df['text']]
    df[f'{subreddit}_positive_sentiment'] = [analyzer.polarity_scores(x)['pos'] for x in df['text']]
    df[f'{subreddit}_neutral_sentiment'] = [analyzer.polarity_scores(x)['neu'] for x in df['text']]
    df[f'{subreddit}_negative_sentiment'] = [analyzer.polarity_scores(x)['neg'] for x in df['text']]
    
    return df

In [7]:
def daily_mean(df):

    df['datetime'] = pd.to_datetime(df['datetime'])
    daily_mean_df = df.set_index('datetime').groupby(pd.Grouper(freq='d')).mean()

    return daily_mean_df

In [8]:
load_dotenv()

True

In [9]:
newsapi_key = os.getenv('NEWSAPI_KEY')
newsapi = NewsApiClient(api_key = newsapi_key)

In [10]:
apple_articles = articles_pull('AAPL OR Apple OR apple')
apple_articles.to_csv('./Data/Cleaned_Data/apple_articles.csv')

In [11]:
apple_articles_sentiment = daily_mean(articles_vader_analyzer(apple_articles))

In [12]:
#after = int(dt.datetime(2012, 1, 1, 0, 0).timestamp())
#before = int(dt.datetime(2022, 6, 1, 0, 0).timestamp())

#stockmarket_comments = subreddit_pull('stockmarket', limit = 10000, after = after, before = before)
#stockmarket_comments.to_csv('./Data/Cleaned_Data/stockmarket_comments.csv')

#securityanalysis_comments = subreddit_pull('securityanalysis', limit = 10000, after = after, before = before)
#securityanalysis_comments.to_csv('./Data/Cleaned_Data/securityanalysis_comments.csv')

#algotrading_comments = subreddit_pull('algotrading', limit = 10000, after = after, before = before)
#algotrading_comments.to_csv('./Data/Cleaned_Data/algotrading_comments.csv')

#wallstreetbets_comments = subreddit_pull('wallstreetbets', limit = 10000, after = after, before = before)
#wallstreetbets_comments.to_csv('./Data/Cleaned_Data/wallstreetbets_comments.csv')

In [13]:
stockmarket_comments = pd.read_csv('./Data/Cleaned_Data/stockmarket_comments.csv')
securityanalysis_comments = pd.read_csv('./Data/Cleaned_Data/securityanalysis_comments.csv')
algotrading_comments = pd.read_csv('./Data/Cleaned_Data/algotrading_comments.csv')
wallstreetbets_comments = pd.read_csv('./Data/Cleaned_Data/wallstreetbets_comments.csv')

In [14]:
stockmarket_apple_comments = keyword_filter(stockmarket_comments, ['AAPL', 'Apple', 'apple'])
securityanalysis_apple_comments = keyword_filter(securityanalysis_comments, ['AAPL', 'Apple', 'apple'])
algotrading_apple_comments = keyword_filter(algotrading_comments, ['AAPL', 'Apple', 'apple'])
wallstreetbets_apple_comments = keyword_filter(wallstreetbets_comments, ['AAPL', 'Apple', 'apple'])

In [15]:
stockmarket_apple_daily_sentiment = daily_mean(reddit_vader_analyzer('stockmarket', stockmarket_apple_comments))
securityanalysis_apple_daily_sentiment = daily_mean(reddit_vader_analyzer('securityanalysis', securityanalysis_apple_comments))
algotrading_apple_daily_sentiment = daily_mean(reddit_vader_analyzer('algotrading', algotrading_apple_comments))
wallstreetbets_apple_sentiment = daily_mean(reddit_vader_analyzer('wallstreetbets', wallstreetbets_apple_comments))

In [16]:
apple_sentiment = pd.concat([apple_articles_sentiment, stockmarket_apple_daily_sentiment, securityanalysis_apple_daily_sentiment, algotrading_apple_daily_sentiment, wallstreetbets_apple_sentiment], axis = 1)
apple_sentiment.to_csv('./Data/Cleaned_Data/apple_sentiment.csv')
apple_sentiment

Unnamed: 0_level_0,articles_compound_sentiment,articles_positive_sentiment,articles_neutral_sentiment,articles_negative_sentiment,stockmarket_compound_sentiment,stockmarket_positive_sentiment,stockmarket_neutral_sentiment,stockmarket_negative_sentiment,securityanalysis_compound_sentiment,securityanalysis_positive_sentiment,securityanalysis_neutral_sentiment,securityanalysis_negative_sentiment,algotrading_compound_sentiment,algotrading_positive_sentiment,algotrading_neutral_sentiment,algotrading_negative_sentiment,wallstreetbets_compound_sentiment,wallstreetbets_positive_sentiment,wallstreetbets_neutral_sentiment,wallstreetbets_negative_sentiment
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-01,,,,,-0.319520,0.045600,0.838600,0.115800,,,,,,,,,,,,
2013-01-02,,,,,0.252993,0.070267,0.919000,0.010667,,,,,,,,,,,,
2013-01-03,,,,,0.212671,0.106000,0.815857,0.078000,,,,,,,,,,,,
2013-01-04,,,,,0.573991,0.184636,0.744364,0.071000,,,,,,,,,,,,
2013-01-05,,,,,0.033250,0.066500,0.860500,0.073000,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-02-11,,,,,,,,,,,,,,,,,,,,
2015-02-12,,,,,,,,,,,,,,,,,,,,
2015-02-13,,,,,0.105544,0.074563,0.847313,0.078125,,,,,,,,,,,,
2015-02-14,,,,,0.284350,0.114167,0.825133,0.060800,,,,,,,,,,,,
