In [1]:
import os
import datetime
import string
import re
import numpy as np
import pandas as pd
import gdown
import nltk
nltk.download('stopwords')
from gensim.parsing.preprocessing import remove_stopwords
from textblob import TextBlob

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

from PIL import Image

[nltk_data] Downloading package stopwords to /Users/andre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Google Drive folder the notebook's data is downloaded from.
data_url = "https://drive.google.com/drive/folders/1zQGlgh5kHTXSq7eoyXf6i_uAWYFJ5xzx?usp=share_link"

# The data lives in a 'genie_data' folder directly under the working directory.
current_path = os.getcwd()
data_folder_path = os.path.join(current_path, 'genie_data')

In [4]:
# Download the dataset only when no local copy is present. An empty folder is
# treated as "missing" too: the folder is created before the download starts,
# so a failed or interrupted download would otherwise be mistaken for a
# completed one on every later run.
if not os.path.isdir(data_folder_path) or not os.listdir(data_folder_path):
    os.makedirs(data_folder_path, exist_ok=True)
    print(f'Downloading data from Google Drive to {data_folder_path}')
    gdown.download_folder(data_url, output=data_folder_path, quiet=True, use_cookies=False)
else:
    print(f'Directory already exists: {data_folder_path}')

Downloading data from Google Drive to /Users/andre/dev/AIAgentPortfolio/test-env/genie_data


In [5]:
# Bounds of the analysis window, as ISO-formatted date strings.
start_date, end_date = '2005-01-01', '2020-12-31'

In [7]:
# Load the partner headlines from the data folder. The path is built from
# data_folder_path so the read targets the same folder the download cell
# populates, instead of repeating the folder name as a literal.
headlines_data = pd.read_csv(os.path.join(data_folder_path, 'raw_partner_headlines.csv'))
headlines_data.head()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,2,Agilent Technologies Announces Pricing of $5……...,http://www.gurufocus.com/news/1153187/agilent-...,GuruFocus,2020-06-01 00:00:00,A
1,3,Agilent (A) Gears Up for Q2 Earnings: What's i...,http://www.zacks.com/stock/news/931205/agilent...,Zacks,2020-05-18 00:00:00,A
2,4,J.P. Morgan Asset Management Announces Liquida...,http://www.gurufocus.com/news/1138923/jp-morga...,GuruFocus,2020-05-15 00:00:00,A
3,5,"Pershing Square Capital Management, L.P. Buys ...",http://www.gurufocus.com/news/1138704/pershing...,GuruFocus,2020-05-15 00:00:00,A
4,6,Agilent Awards Trilogy Sciences with a Golden ...,http://www.gurufocus.com/news/1134012/agilent-...,GuruFocus,2020-05-12 00:00:00,A


In [None]:
# (rows, columns) of the headlines table at this point, before any filtering.
headlines_data.shape

(1845559, 6)

In [9]:
# Parse the 'date' column into datetimes and expose the calendar year as its
# own column for the year-based filtering that follows.
parsed_dates = pd.to_datetime(headlines_data['date'])
headlines_data['date'] = parsed_dates
headlines_data['year'] = parsed_dates.dt.year


In [10]:
# Keep only headlines whose year lies inside the configured analysis window.
# The year bounds are derived from start_date / end_date so the window is
# defined in one place rather than repeated here as literal years.
first_year = pd.to_datetime(start_date).year
last_year = pd.to_datetime(end_date).year
headlines_data = headlines_data[
    (headlines_data['year'] >= first_year) & (headlines_data['year'] <= last_year)
]

In [11]:
# (rows, columns) after restricting the table to the 2005-2020 year range.
headlines_data.shape

(1845558, 7)

In [12]:
# Collapse rows that repeat the same (headline, stock, date) combination,
# retaining the first occurrence of each.
duplicate_keys = ['headline', 'stock', 'date']
headlines_data = headlines_data.drop_duplicates(subset=duplicate_keys, keep='first')

In [13]:
# (rows, columns) after dropping duplicate (headline, stock, date) rows.
headlines_data.shape

(1822715, 7)

In [14]:
def filter_records(df, stock, possible_phrases):
    """
    Return the headlines relevant to one company, ordered by date.

    A row is kept when its ``stock`` column equals ``stock`` or when its
    ``headline`` mentions any of ``possible_phrases``. Phrases are matched
    literally, case-insensitively, and only as whole words/phrases (so an
    alias such as "DEL" does not match inside "Delivery").

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least ``stock``, ``headline`` and ``date`` columns.
    stock : str
        Ticker symbol compared for equality against ``df["stock"]``.
    possible_phrases : iterable of str
        Names/aliases to look for in the headline text. May be empty, in
        which case only the ticker comparison applies.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding the ``date`` and ``headline`` columns of
        the matching rows, sorted by ``date``.
    """
    keep = df["stock"] == stock

    # Escape each phrase so characters such as '.' or '&' are matched literally.
    # Empty phrases are dropped: an empty alternative would match every row.
    escaped_phrases = [re.escape(phrase) for phrase in possible_phrases if phrase]
    if escaped_phrases:
        # Lookarounds rather than \b so phrases that start or end with a
        # non-word character (e.g. "Etsy Inc.") still match as whole phrases.
        pattern = r"(?<!\w)(?:" + "|".join(escaped_phrases) + r")(?!\w)"
        # Search the frame that was passed in (not a notebook-level global);
        # na=False makes rows with a missing headline count as "no match".
        mentions = df["headline"].str.contains(pattern, case=False, regex=True, na=False)
        keep = keep | mentions

    new_df = df.loc[keep].reset_index(drop=True).sort_values(by="date")
    # .copy() hands back a standalone frame, so callers can add columns to the
    # result without pandas treating it as a view of another frame.
    return new_df[["date", "headline"]].copy()




In [15]:
# Apply the filter to each ticker: a headline is kept when it is tagged with the
# ticker or mentions one of the listed company names/aliases.

ENPH_headlines    = filter_records(headlines_data,"ENPH",["Enphase Energy Inc","Enphase Energy","ENPH"])
KEY_headlines     = filter_records(headlines_data,"KEY",["KeyCorp","KeyBank"])
# The ticker alias is "DAL" (it was previously spelled "DEL", which does not match
# the ticker passed in); "Delta Air Lines" is included alongside the one-word spelling.
DAL_headlines     = filter_records(headlines_data,"DAL",["Delta Airlines Corp","Delta Airlines","Delta Air Lines","DAL"])
LNC_headlines     = filter_records(headlines_data,"LNC",["Lincoln National Corp","Lincoln National","LNC"])
ETSY_headlines    = filter_records(headlines_data,"ETSY",["Etsy Inc","Etsy"])

XEL_headlines     = filter_records(headlines_data,"XEL",["Xcel Energy Inc","Xcel Energy","XEL"])
PG_headlines      = filter_records(headlines_data,"PG",["P&G","Procter & Gamble Co","Procter & Gamble","PG"])
LNT_headlines     = filter_records(headlines_data,"LNT",["Alliant","Alliant Energy","Alliant Energy Corporation","LNT"])
PEP_headlines     = filter_records(headlines_data,"PEP",["PepsiCo"])
D_headlines       = filter_records(headlines_data,"D",["Dominion Energy Inc","Dominion Energy"])

In [16]:
def process_sentence(sentence):
    """
    Normalise a sentence for sentiment scoring.

    The text is lowercased, runs of digits and all ASCII punctuation
    characters are deleted, surrounding whitespace is trimmed, and the
    result is passed through ``remove_stopwords``.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)

    without_digits = re.sub(r'\d+', '', sentence.lower())
    cleaned = without_digits.translate(punctuation_table).strip()
    return remove_stopwords(cleaned)


In [17]:
def get_sentiment_polarity(sentence):
    """
    Compute the TextBlob sentiment polarity of a sentence.

    The sentence is first normalised with ``process_sentence``; the polarity
    that TextBlob reports for the cleaned text is returned.
    """
    cleaned_text = process_sentence(sentence)
    sentiment = TextBlob(cleaned_text).sentiment
    return sentiment.polarity

In [18]:
# Score every headline in each per-ticker frame; the scores are stored in a
# new "SentimentScore" column on that same frame.
for ticker_headlines in (
    ENPH_headlines, KEY_headlines, DAL_headlines, LNC_headlines, ETSY_headlines,
    XEL_headlines, PG_headlines, LNT_headlines, PEP_headlines, D_headlines,
):
    ticker_headlines["SentimentScore"] = ticker_headlines["headline"].apply(get_sentiment_polarity)

In [None]:
# Generate an intermediate dataframe with continuous dates in it.

dates_list = pd.date_range(start=start_date, end=end_date)