### Data collection
In this notebook, we pull stock and company financials data from alpha vantage.
First, we import the necessary packages below.

In [2]:
#import usual packages for data manipulation
import pandas as pd
import numpy as np

#import usual packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#import datetime to get the current date
import datetime as dt

#import alpha vantage packages
from alpha_vantage.timeseries import TimeSeries #for stock data
from alpha_vantage.techindicators import TechIndicators #for technical indicators
from alpha_vantage.fundamentaldata import FundamentalData #for company overview
from alpha_vantage.alphaintelligence import AlphaIntelligence #for news sentiment

To use alpha vantage, we need an API key, which can be obtained at this link:
<https://www.alphavantage.co/support/#api-key>

Note: the free API key allows up to 25 queries per day.

After obtaining our personal key, we initialize the four classes from which we will pull data.

In [6]:
#Arvind's API key
my_key='DJ3QKKTFZ5J298QY'

ts = TimeSeries(key=my_key, output_format='pandas')
fd = FundamentalData(key=my_key, output_format='pandas')
ai = AlphaIntelligence(key=my_key, output_format='pandas')

Next, we write a few simple functions that will pull the data, apply some pre-processing, and then save to csv.

In [7]:
#function to get stock data and save to csv
def stock_to_csv(ticker):
    df, _ = ts.get_daily(symbol=ticker, outputsize='full')
    df.columns = ['open','high','low','close','volume']
    df.sort_index(inplace=True)
    df = df[df.index.year >= 2014]
    df.to_csv(f'../data/{ticker}_stock.csv')

In [8]:
def ticker_news_to_csv(ticker):
    df, _ = ai.get_news_sentiment(tickers = ticker, time_from='20220810T0130', limit=1000)
    df.reset_index(inplace=True, drop=True)
    df.to_csv(f'../data/{ticker}_news_data_raw.csv')
    #choose only the date and news sentiment score columns
    df = pd.DataFrame({'date': pd.to_datetime(df['time_published']).dt.date,
                       'ticker_sentiment': df['ticker_sentiment']})
    df = df.groupby('date').sum()
    scores = [] #to store daily sentiment scores for aapl
    relevances = [] #to store daily relevance scores for aapl
    for i in range(len(df)):
        temp = pd.DataFrame(df['ticker_sentiment'].iloc[i])
        temp = temp[temp['ticker'] == ticker]
        wts = temp['relevance_score'].astype(float)
        raw_scores = temp['ticker_sentiment_score'].astype(float)
        #append the mean relevance score for the day
        relevances.append(wts.mean())
        #append the weighted average sentiment score for the day
        scores.append(np.dot(wts,raw_scores) / wts.sum())
    df[f'{ticker}_sentiment_score'] = np.array(scores)
    df[f'{ticker}_relevance_score'] = np.array(relevances)
    df.drop('ticker_sentiment', axis=1, inplace=True)
    df.to_csv(f'../data/{ticker}_news_data.csv')

In [9]:
def financial_cleaner(df):
    df['date'] = pd.to_datetime(df['fiscalDateEnding'])
    df.drop(columns='fiscalDateEnding',inplace=True)
    df.set_index('date',inplace=True)
    df.sort_index(inplace=True)
    df = df[df.index.year >= 2014]
    return df

def financials_to_csv(ticker):
    df_e, _ = fd.get_earnings_quarterly(ticker)
    df_b, _ = fd.get_balance_sheet_quarterly(ticker)
    df_i, _ = fd.get_income_statement_quarterly(ticker)
    df_e.drop(columns=['reportTime'],inplace=True)
    df_b.drop(columns=['reportedCurrency'], inplace=True)
    df_i.drop(columns=['reportedCurrency'], inplace=True)
    #cleaning the data
    df_b = financial_cleaner(df_b)
    df_i = financial_cleaner(df_i)
    df_e = financial_cleaner(df_e)
    #concatenating the dataframes
    df_financials = pd.concat([df_b,df_i,df_e],axis=1)
    df_financials.to_csv(f'../data/{ticker}_financials.csv')

Let's check that it all works by setting the ticker to NVDA.

In [10]:
ticker = 'NVDA'
stock_to_csv(ticker)
ticker_news_to_csv(ticker)
financials_to_csv(ticker)

To read in the csv's, some care is needed to ensure that the dtypes of the columns are correctly interpreted. For convenience, we wrap this in a simple function that pulls the data for a given ticker.

In [13]:
def get_data(ticker):
    df_stock = pd.read_csv(f'../data/{ticker}_stock.csv', index_col='date',parse_dates=True,dtype=float)
    df_news = pd.read_csv(f'../data/{ticker}_news_data.csv', index_col='date',parse_dates=True,dtype=float)
    df_financials = pd.read_csv(f'../data/{ticker}_financials.csv', 
                                index_col='date',
                                parse_dates=True,
                                dtype={col: 'float' for col in pd.read_csv(f'../data/{ticker}_financials.csv', nrows=1).columns if col!='reportedDate'},
                                na_values='None')
    return {'stock': df_stock, 
            'news' : df_news, 
            'financials' : df_financials}

In [14]:
nvda = get_data('NVDA')

In [18]:
nvda['financials']

Unnamed: 0_level_0,totalAssets,totalCurrentAssets,cashAndCashEquivalentsAtCarryingValue,cashAndShortTermInvestments,inventory,currentNetReceivables,totalNonCurrentAssets,propertyPlantEquipment,accumulatedDepreciationAmortizationPPE,intangibleAssets,...,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome,reportedDate,reportedEPS,estimatedEPS,surprise,surprisePercentage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-26,7.250894e+09,5.624711e+09,1.152000e+09,4.671810e+09,3.877650e+08,,1.626183e+09,5.827400e+08,7.000800e+08,9.391910e+08,...,6.192420e+08,1.460460e+08,1.761160e+08,2.308060e+08,1.469170e+08,,,,,
2014-01-31,,,,,,,,,,,...,,,,,,2014-02-12,0.006,0.005,0.001,20.0000
2014-04-27,6.864663e+09,5.273798e+09,5.090000e+08,4.347817e+09,3.932800e+08,,1.590865e+09,5.708020e+08,,9.207090e+08,...,1.365160e+08,1.380000e+08,1.740000e+08,2.290000e+08,1.370000e+08,,,,,
2014-04-30,,,,,,,,,,,...,,,,,,2014-05-06,0.006,0.004,0.002,50.0000
2014-07-27,6.933719e+09,5.377586e+09,5.150000e+08,4.386054e+09,3.874340e+08,,1.556133e+09,5.569110e+08,,9.037920e+08,...,1.279760e+08,1.260000e+08,1.670000e+08,2.230000e+08,1.280000e+08,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-31,,,,,,,,,,,...,,,,,,2023-11-21,0.402,0.336,0.066,19.6429
2024-01-28,6.572800e+10,4.434500e+10,7.280000e+09,2.598400e+10,5.282000e+09,9.999000e+09,2.289900e+10,3.914000e+09,3.509000e+09,5.512000e+09,...,1.228500e+10,1.240000e+10,1.416900e+10,1.431200e+10,1.228500e+10,,,,,
2024-01-31,,,,,,,,,,,...,,,,,,2024-02-21,0.516,0.461,0.055,11.9306
2024-04-28,7.707200e+10,5.372900e+10,7.587000e+09,3.143800e+10,5.864000e+09,1.236500e+10,2.509300e+10,4.006000e+09,,5.439000e+09,...,1.488100e+10,1.474500e+10,1.734300e+10,1.748600e+10,1.488100e+10,,,,,


In [19]:
ticker = 'TSLA'
stock_to_csv(ticker)
ticker_news_to_csv(ticker)
financials_to_csv(ticker)

In [20]:
tsla = get_data('TSLA')

In [32]:
tsla['financials']

Unnamed: 0_level_0,totalAssets,totalCurrentAssets,cashAndCashEquivalentsAtCarryingValue,cashAndShortTermInvestments,inventory,currentNetReceivables,totalNonCurrentAssets,propertyPlantEquipment,accumulatedDepreciationAmortizationPPE,intangibleAssets,...,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome,reportedDate,reportedEPS,estimatedEPS,surprise,surprisePercentage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-03-31,4500410000.0,3156047000.0,2393908000.0,2584068000.0,450730000.0,72380000.0,892634000.0,849389000.0,169335000.0,13662000.0,...,-49800000.0,-49800000.0,-37108000.0,7160000.0,-49800000.0,2014-05-07,0.01,0.1,-0.09,-90.0
2014-06-30,5054463000.0,3441694000.0,2674910000.0,2686624000.0,596927000.0,96607000.0,1081543000.0,1035822000.0,204690000.0,13541000.0,...,-61900000.0,-61900000.0,-29512000.0,25203000.0,-61900000.0,2014-07-31,0.01,0.04,-0.03,-75.0
2014-09-30,5437533000.0,3362914000.0,2370735000.0,2370735000.0,752492000.0,156889000.0,1470296000.0,1404326000.0,243447000.0,13420000.0,...,-74708000.0,-74708000.0,-41919000.0,23053000.0,-74708000.0,2014-11-05,0.02,-0.01,0.03,300.0
2014-12-31,5830667000.0,3180073000.0,1905713000.0,1905713000.0,953675000.0,226604000.0,1883850000.0,1829267000.0,292590000.0,,...,261697000.0,-107651000.0,-75208000.0,-7232000.0,-107630000.0,2015-02-11,-0.01,0.02,-0.03,-150.0
2015-03-31,6120030000.0,2921417000.0,1510076000.0,1510076000.0,1054840000.0,200052000.0,2286552000.0,2224191000.0,342386000.0,,...,-154181000.0,-170124000.0,-124567000.0,-47455000.0,-154181000.0,2015-05-06,-0.36,-0.5,0.14,28.0
2015-06-30,6468185000.0,2628621000.0,1150673000.0,1150673000.0,1212279000.0,138648000.0,2719318000.0,2646017000.0,403869000.0,,...,-184227000.0,-183066000.0,-156708000.0,-65319000.0,-184227000.0,2015-08-05,-0.03,-0.6,0.57,95.0
2015-09-30,7547497000.0,2998795000.0,1426036000.0,1426036000.0,1293717000.0,119964000.0,3187977000.0,3103811000.0,472769000.0,,...,-229858000.0,-239039000.0,-198766000.0,-88400000.0,-229858000.0,2015-11-03,-0.04,-0.04,0.0,0.0
2015-12-31,8067939000.0,2782006000.0,1196908000.0,1196908000.0,1277838000.0,168965000.0,3416156000.0,3403334000.0,571126000.0,12816000.0,...,218564000.0,-299968000.0,-276732000.0,-133009000.0,-320397000.0,2016-02-10,-0.87,0.1,-0.97,-970.0
2016-03-31,9191702000.0,3239543000.0,1441789000.0,1441789000.0,1301961000.0,318056000.0,3707949000.0,3593014000.0,673089000.0,,...,-282267000.0,-265146000.0,-237796000.0,-81336000.0,-282267000.0,2016-05-04,-0.04,-0.05,0.01,20.0
2016-06-30,11868950000.0,5203705000.0,3246301000.0,3246301000.0,1609607000.0,178594000.0,4145721000.0,3993250000.0,783986000.0,,...,-293188000.0,-272560000.0,-243171000.0,-59939000.0,-293188000.0,2016-08-03,-0.07,-0.04,-0.03,-75.0


In [26]:
ticker = 'META'
stock_to_csv(ticker)
ticker_news_to_csv(ticker)
financials_to_csv(ticker)

In [27]:
meta = get_data('META')

In [30]:
meta['news']

Unnamed: 0_level_0,META_sentiment_score,META_relevance_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-08-04,0.034152,0.087244
2024-08-05,0.106548,0.129432
2024-08-06,0.216477,0.124139
2024-08-07,0.190669,0.10972
2024-08-08,0.059341,0.076973
2024-08-09,0.124186,0.094386
2024-08-10,0.019719,0.095671
2024-08-11,0.11852,0.094934
2024-08-12,0.185444,0.113785
2024-08-13,0.229597,0.112566


In [31]:
ticker = 'NFLX'
stock_to_csv(ticker)
ticker_news_to_csv(ticker)
financials_to_csv(ticker)

In [33]:
nflx = get_data('NFLX')

In [36]:
nflx['news']

Unnamed: 0_level_0,NFLX_sentiment_score,NFLX_relevance_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-06-26,0.176093,0.03646
2024-06-27,0.230301,0.187146
2024-06-28,0.134642,0.164873
2024-06-29,0.092957,0.140604
2024-06-30,0.114813,0.155233
2024-07-01,-0.043832,0.203937
2024-07-02,0.14328,0.209403
2024-07-03,0.116522,0.239829
2024-07-04,0.123569,0.241946
2024-07-05,0.093334,0.164383
