### Data collection
In this notebook, we pull stock and company financials data from alpha vantage.
First, we import the necessary packages below.

In [1]:
#import usual packages for data manipulation
import pandas as pd
import numpy as np

#import usual packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import datetime to get the current date
import datetime as dt

#import alpha vantage packages
from alpha_vantage.timeseries import TimeSeries #for stock data
from alpha_vantage.techindicators import TechIndicators #for technical indicators
from alpha_vantage.fundamentaldata import FundamentalData #for company overview
from alpha_vantage.alphaintelligence import AlphaIntelligence #for news sentiment

To use alpha vantage, we need an API key, which can be obtained at this link:
<https://www.alphavantage.co/support/#api-key>

Note: the free API key allows up to 25 queries per day.

After obtaining our personal key, we initialize the four classes from which we will pull data.

In [2]:
#Arvind's API key
my_key='DJ3QKKTFZ5J298QY'

ts = TimeSeries(key=my_key, output_format='pandas')
fd = FundamentalData(key=my_key, output_format='pandas')
ai = AlphaIntelligence(key=my_key, output_format='pandas')

Next, we write a few simple functions that will pull the data, apply some pre-processing, and then save to csv.

In [6]:
#function to get stock data and save to csv
def stock_to_csv(ticker):
    df, _ = ts.get_daily(symbol=ticker, outputsize='full')
    df.columns = ['open','high','low','close','volume']
    df.sort_index(inplace=True)
    df = df[df.index.year >= 2014]
    df.to_csv(f'../../data/{ticker}_stock.csv')

In [7]:
def news_grouper(df):
    df = pd.DataFrame({'date': pd.to_datetime(df['time_published']).dt.date,
                       'ticker_sentiment': df['ticker_sentiment']})
    return df.groupby('date').sum()

def news_cleaner(df):
    scores = [] #to store daily sentiment scores for aapl
    relevances = [] #to store daily relevance scores for aapl
    for i in range(len(df)):
        temp = pd.DataFrame(df['ticker_sentiment'].iloc[i])
        temp = temp[temp['ticker'] == ticker]
        wts = temp['relevance_score'].astype(float)
        raw_scores = temp['ticker_sentiment_score'].astype(float)
        #append the mean relevance score for the day
        relevances.append(wts.mean())
        #append the weighted average sentiment score for the day
        scores.append(np.dot(wts,raw_scores) / wts.sum())
    df[f'{ticker}_sentiment_score'] = np.array(scores)
    df[f'{ticker}_relevance_score'] = np.array(relevances)
    df.drop('ticker_sentiment', axis=1, inplace=True)
    return df

def news_to_csv(ticker, time_from, time_to):
    queries = []
    start_date = pd.to_datetime(time_from).date()
    count = 0
    while(count <= 10):
        df, _ = ai.get_news_sentiment(tickers=ticker, 
                                    topics='earnings',
                                    time_from=time_from, 
                                    time_to=time_to,
                                    limit=1000)
        df.reset_index(inplace=True, drop=True)
        queries.append(df)
        earliest_date = pd.to_datetime(df['time_published'].iloc[-1]).date()
        if earliest_date <= start_date:
            break
        time_to = pd.to_datetime(df['time_published'].iloc[-1]).strftime('%Y%m%dT%H%M')
    df = pd.concat(queries)
    df.to_csv(f'../../data/{ticker}_news_data_raw.csv')
    df = news_grouper(df)
    df = news_cleaner(df)
    df.to_csv(f'../../data/{ticker}_news.csv')
    return df

In [None]:
news_to_csv('TSLA', time_from='20230101T0130', time_to='20240724T0130')

In [None]:
news_to_csv('META', time_from='20230101T0130', time_to='20240803T0130')

In [None]:
news_to_csv('GOOG', time_from='20230101T0130', time_to='20240815T0130')

In [8]:
news_to_csv('AAPL',time_from='20230101T0130', time_to='20240709T0130')

NameError: name 'ticker' is not defined

In [None]:
news_to_csv('DIS', time_from='20230101T0130', time_to='20240815T0130')

In [None]:
news_to_csv('AMZN', time_from='20230101T0130', time_to='20240626T0130')

In [None]:
news_to_csv('NFLX', time_from='20230101T0130', time_to='20240625T0130')

In [9]:
def financial_cleaner(df):
    df['date'] = pd.to_datetime(df['fiscalDateEnding'])
    df.drop(columns='fiscalDateEnding',inplace=True)
    df.set_index('date',inplace=True)
    df.sort_index(inplace=True)
    df = df[df.index.year >= 2014]
    return df

def financials_to_csv(ticker):
    df_e, _ = fd.get_earnings_quarterly(ticker)
    df_b, _ = fd.get_balance_sheet_quarterly(ticker)
    df_i, _ = fd.get_income_statement_quarterly(ticker)
    df_e.drop(columns=['reportTime'],inplace=True)
    df_b.drop(columns=['reportedCurrency'], inplace=True)
    df_i.drop(columns=['reportedCurrency'], inplace=True)
    #cleaning the data
    df_b = financial_cleaner(df_b)
    df_i = financial_cleaner(df_i)
    df_e = financial_cleaner(df_e)
    #concatenating the dataframes
    df_financials = pd.concat([df_b,df_i,df_e],axis=1)
    df_financials.to_csv(f'../data/{ticker}_financials.csv')

To read in the csv's, some care is needed to ensure that the dtypes of the columns are correctly interpreted. For convenience, we wrap this in a simple function that pulls the data for a given ticker.

In [13]:
def get_data(ticker):
    df_stock = pd.read_csv(f'../data/{ticker}_stock.csv', index_col='date',parse_dates=True,dtype=float)
    df_news = pd.read_csv(f'../data/{ticker}_news_data.csv', index_col='date',parse_dates=True,dtype=float)
    df_financials = pd.read_csv(f'../data/{ticker}_financials.csv', 
                                index_col='date',
                                parse_dates=True,
                                dtype={col: 'float' for col in pd.read_csv(f'../data/{ticker}_financials.csv', nrows=1).columns if col!='reportedDate'},
                                na_values='None')
    return {'stock': df_stock, 
            'news' : df_news, 
            'financials' : df_financials}