In [105]:
import requests
import pandas as pd
import regex as re
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt

from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

load_dotenv()

True

In [106]:
def clean_tokenize(raw_text):
    lemmatizer = WordNetLemmatizer()
    cleanr = re.compile('\.+|<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_text).replace('\r', ' ').replace('\t', ' ').replace('\n', ' ').strip()
    new_content = ' '.join(cleantext.split())
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', new_content)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ' '.join(output)

In [107]:
def get_news(tickers, range_of_dates):
    news_df = pd.DataFrame()
    ticker_string = ','.join(tickers)
    for date in range_of_dates:
        url = f'https://api.benzinga.com/api/v2/news?token=6b65009cb3594ca380ef509da6e1deee&date={str(date.date())}&tickers={ticker_string}&displayOutput=full'
        data = requests.get(url, headers={'accept': 'application/json'})
        news_df = news_df.append(data.json())
    news_df = news_df.drop(['id', 'author', 'updated', 'teaser', 'url', 'image', 'channels', 'tags'], axis=1)
    news_df['created'] = pd.to_datetime(pd.to_datetime(news_df['created']).dt.date).dt.normalize()
    news_df = news_df.rename(columns={'created': 'date'})
    return news_df
    

In [108]:
news_df = get_news(['AZN', 'JNJ', 'PFE', 'MRNA'], pd.date_range('2020-09-01', '2021-09-15'))
news_df['word_tokens'] = news_df['body'].apply(clean_tokenize)
news_df['word_tokens'] += news_df['title'].apply(clean_tokenize)


In [109]:
def assign_ticker(x):
    if 'AZN' in  [value for elem in x for value in elem.values()]:
        return 'AZN'
    if 'MRNA' in  [value for elem in x for value in elem.values()]:
        return 'MRNA'
    if 'JNJ' in  [value for elem in x for value in elem.values()]:
        return 'JNJ'
    if 'PFE' in  [value for elem in x for value in elem.values()]:
        return 'PFE'

news_df['symbol'] = news_df['stocks'].apply(assign_ticker)

In [110]:
news_df

Unnamed: 0,date,title,body,stocks,word_tokens,symbol
0,2020-09-16,AstraZeneca Trial Participant Adverse Events L...,<p><strong>AstraZeneca Plc&rsquo;s </strong>(N...,"[{'name': 'AZN'}, {'name': 'MRNA'}, {'name': '...",astrazeneca plcs nyseazn vaccine didnt likely ...,AZN
1,2020-09-17,Sinovac Plans To Expand COVID-19 Vaccine Trial...,<p><strong>Sinovac Biotech</strong> has plans ...,"[{'name': 'AZN'}, {'name': 'BNTX'}, {'name': '...",sinovac biotech ha plan expand clinical trial ...,AZN
2,2020-09-17,"5 Stocks To Watch For September 17, 2020",<p>Some of the stocks that may grab investor f...,"[{'name': 'APOG'}, {'name': 'CHWY'}, {'name': ...",stock may grab investor focus today wall stree...,MRNA
3,2020-09-17,BioNTech Buys German Biotech Production Site f...,"BioNTech SE (NASDAQ:<a class=""ticker"" href=""/s...","[{'name': 'BNTX'}, {'name': 'NVS'}, {'name': '...",biontech se nasdaqbntx today announced signing...,PFE
4,2020-09-17,AstraZeneca Acquires Oral PCSK9 Inhibitor Prog...,"AstraZeneca (NYSE:<a class=""ticker"" href=""/sto...",[{'name': 'AZN'}],astrazeneca nyseazn ha entered agreement dogma...,AZN
...,...,...,...,...,...,...
10,2021-09-15,FDA Declines To Take Stance On Pfizer's COVID-...,<p>-Reuters</p>,"[{'name': 'BNTX'}, {'name': 'PFE'}]",reutersfda declines take stance pfizers covid ...,PFE
11,2021-09-15,Pfizer Builds Case For COVID-19 Booster As Pro...,<ul>\r\n\t<li><strong>Pfizer Inc</strong>&nbsp...,"[{'name': 'BNTX'}, {'name': 'PFE'}]",pfizer incnysepfe said us regulator approve bo...,PFE
12,2021-09-15,Moderna Highlights New Clinical Data On COVID-...,<h1>Moderna Highlights New Clinical Data on it...,[{'name': 'MRNA'}],moderna highlights new clinical data covid vac...,MRNA
13,2021-09-15,UPDATE: Moderna Says 'mRNA-1273 remains highly...,,[{'name': 'MRNA'}],update moderna says mrna remains highly effect...,MRNA


In [111]:
big_string = ' '.join(news_df['word_tokens'].values)
wc = WordCloud().generate(big_string)
plt.imshow(wc)

<matplotlib.image.AxesImage at 0x1b556afab48>

In [112]:
from collections import Counter
from nltk import ngrams

In [113]:
news_bigrams = ngrams([word for value in news_df['word_tokens'].values for word in value.split()], n=2)
list(news_bigrams)

[('astrazeneca', 'plcs'),
 ('plcs', 'nyseazn'),
 ('nyseazn', 'vaccine'),
 ('vaccine', 'didnt'),
 ('didnt', 'likely'),
 ('likely', 'cause'),
 ('cause', 'adverse'),
 ('adverse', 'event'),
 ('event', 'suspended'),
 ('suspended', 'latestage'),
 ('latestage', 'human'),
 ('human', 'trial'),
 ('trial', 'drug'),
 ('drug', 'according'),
 ('according', 'oxford'),
 ('oxford', 'university'),
 ('university', 'happened'),
 ('happened', 'volunteers'),
 ('volunteers', 'british'),
 ('british', 'drugmakers'),
 ('drugmakers', 'chadox'),
 ('chadox', 'ncov'),
 ('ncov', 'vaccine'),
 ('vaccine', 'trial'),
 ('trial', 'developed'),
 ('developed', 'unexplained'),
 ('unexplained', 'neurological'),
 ('neurological', 'symptom'),
 ('symptom', 'including'),
 ('including', 'changed'),
 ('changed', 'sensation'),
 ('sensation', 'weakness'),
 ('weakness', 'limb'),
 ('limb', 'according'),
 ('according', 'participant'),
 ('participant', 'information'),
 ('information', 'document'),
 ('document', 'published'),
 ('published

In [114]:
news_word_count = Counter([word for value in news_df['word_tokens'].values for word in value.split()])
news_word_count

Counter({'astrazeneca': 1358,
         'plcs': 74,
         'nyseazn': 163,
         'vaccine': 8754,
         'didnt': 80,
         'likely': 1229,
         'cause': 286,
         'adverse': 464,
         'event': 628,
         'suspended': 49,
         'latestage': 252,
         'human': 303,
         'trial': 2732,
         'drug': 1706,
         'according': 702,
         'oxford': 211,
         'university': 201,
         'happened': 237,
         'volunteers': 20,
         'british': 69,
         'drugmakers': 60,
         'chadox': 19,
         'ncov': 19,
         'developed': 342,
         'unexplained': 33,
         'neurological': 57,
         'symptom': 201,
         'including': 965,
         'changed': 140,
         'sensation': 20,
         'weakness': 88,
         'limb': 36,
         'participant': 782,
         'information': 551,
         'document': 93,
         'published': 181,
         'codeveloping': 24,
         'independent': 162,
         'review': 636,
     

In [115]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

In [118]:
news_doc = nlp(big_string[:10000])
displacy.render(news_doc, style='ent')

In [119]:
[ent for ent in news_doc.ents]

[oxford university,
 british,
 oxford,
 first,
 reuters,
 united kingdom,
 india,
 south africa,
 united states food drug administration,
 stephen hahn,
 moderna inc nysemrna pfizer inc nysepfe,
 microsoft,
 nasdaqmsft,
 october,
 cnbc price,
 wednesday,
 later month,
 chinese,
 age three,
 national library medicine,
 hebei,
 reuters,
 two,
 german,
 united states food drug administration,
 year old,
 moderna inc nasdaqmrna,
 late october early november,
 new york times,
 oxford university,
 british,
 united kingdom,
 india,
 south africa,
 apogee enterprises inc nasdaqapog,
 quarterly,
 million,
 bell apogee share,
 herman miller inc nasdaqmlhr,
 fiscal first quarter,
 miller,
 cantel medical corp nysecmd report,
 quarterly,
 million,
 bell cantel,
 moderna inc nasdaqmrna,
 million,
 september,
 today,
 ag,
 switzerland,
 marburg germany,
 fourth quarter,
 million,
 first,
 half,
 past five year,
 one hour,
 frankfurt,
 mainz germany,
 bnt bnt,
 five,
 europe south america,
 china,
 o

In [120]:
def clean_html(raw_html):
    cleanr = re.compile('\.+|<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html).replace('\r', ' ').replace('\t', ' ').replace('\n', ' ').strip()
    return ' '.join(cleantext.split())

In [121]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

news_features_df = pd.DataFrame()
news_features_df['date'] = news_df['date']
news_features_df['symbol'] = news_df['symbol']

news_features_df['compound_b'] = [analyzer.polarity_scores(v)['compound'] for v in news_df['body'].apply(clean_html)]
news_features_df['neg_b'] = [analyzer.polarity_scores(v)['neg'] for v in news_df['body'].apply(clean_html)]
news_features_df['neu_b'] = [analyzer.polarity_scores(v)['neu'] for v in news_df['body'].apply(clean_html)]
news_features_df['pos_b'] = [analyzer.polarity_scores(v)['pos'] for v in news_df['body'].apply(clean_html)]

news_features_df['compound_t'] = [analyzer.polarity_scores(v)['compound'] for v in news_df['title'].apply(clean_html)]
news_features_df['neg_t'] = [analyzer.polarity_scores(v)['neg'] for v in news_df['title'].apply(clean_html)]
news_features_df['neu_t'] = [analyzer.polarity_scores(v)['neu'] for v in news_df['title'].apply(clean_html)]
news_features_df['pos_t'] = [analyzer.polarity_scores(v)['pos'] for v in news_df['title'].apply(clean_html)]

In [122]:
news_features_df = news_features_df.groupby(['date', 'symbol']).sum()

In [125]:
news_features_df.to_csv('../Data/news_features.csv')