In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import requests
import time
import json
import csv
import re
import os
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv() 

True

In [2]:
# Dataset locations. The two large datasets live outside the repository, so their
# paths can be overridden with the ACL19_DIR / SP1500_DIR environment variables
# (or the .env file loaded by load_dotenv() in the import cell) instead of editing
# this cell. The defaults are the original local paths.
ACL19_dir = os.getenv('ACL19_DIR', r"D:\ACL19_Release")  # too big for GitHub https://github.com/GeminiLn/EarningsCall_Dataset
MAEC_dir = 'data/MAEC'  # https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction
sp1500_dir = os.getenv('SP1500_DIR', r"D:\sp1500")
# There is a link for the audio data in the MAEC GitHub repository, but it does not work.
# I emailed the authors, and they sent another link.
# There are about half a million files, but they total only 19 GB.
# https://drive.google.com/file/d/1m1GRCHgKn9Vz9IFMC_SpCog6uP3-gFgY/view?usp=drive_link

# Original dataset
- [Paper](https://aclanthology.org/P19-1038.pdf)
- [GitHub](https://github.com/GeminiLn/EarningsCall_Dataset)

In [4]:
# Loop through the directory: each entry represents one earnings conference call
# and is named "CompanyName_YYYYMMDD" (anything after a '.' in the date part is ignored).
call_records = []
skipped_entries = []
for entry_name in sorted(os.listdir(ACL19_dir)):  # sorted: os.listdir returns entries in arbitrary order
    company_name, separator, date_str = entry_name.rpartition('_')
    date_str = date_str.split('.')[0]
    try:
        if not separator:
            raise ValueError("no '_' in name")
        date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
    except ValueError:
        # A stray file must not abort the whole scan; remember it and report it below.
        skipped_entries.append(entry_name)
        continue
    call_records.append([company_name, date])
if skipped_entries:
    print(f"Skipped {len(skipped_entries)} entries not named CompanyName_YYYYMMDD: {skipped_entries}")

filename_data = pd.DataFrame(call_records, columns=["Company", "Date"]).drop_duplicates()

# Special file I made to match the company name to the ticker
company_ticker = pd.read_csv('data/company_ticker.csv')
filename_data = filename_data.merge(company_ticker, on="Company", how="left")

# A left merge keeps companies that have no row in company_ticker.csv with a missing
# Ticker; surface them here because every later cell looks prices up by ticker.
unmatched_companies = filename_data.loc[filename_data['Ticker'].isna(), 'Company'].unique().tolist()
if unmatched_companies:
    print(f"No ticker found for {len(unmatched_companies)} companies: {unmatched_companies}")

filename_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Company  572 non-null    object
 1   Date     572 non-null    object
 2   Ticker   572 non-null    object
dtypes: object(3)
memory usage: 13.5+ KB


# Yahoo

To compare apples to apples, we need 2017 stock data for each ticker. In the original paper, the loss function is the MSE on the standard deviation over the n days after the call (n = 3, 7, 15, 30).

In [6]:
# Download 2017 daily prices from Yahoo! for every ticker that still has data there.
tickers = filename_data.Ticker.unique().tolist()
# 9 tickers do not have data on Yahoo! (No longer trading)
tickers_to_remove = ['GGP', 'CA', 'STI', 'FLT', 'NLSN', 'WRK','RTN', 'UTX', 'DISH']
tickers = [ticker for ticker in tickers if ticker not in tickers_to_remove]
yahoo_data = yf.download(
    tickers,
    start="2017-01-01",
    end="2017-12-31",
    group_by="ticker",
    # Be explicit: the Yahoo target cell reads the (ticker, 'Adj Close') column, and
    # newer yfinance releases default to auto_adjust=True, which drops that column.
    auto_adjust=False,
)
# Strip any timezone so the index can be sliced with the naive call dates.
yahoo_data.index = yahoo_data.index.tz_localize(None)
yahoo_data.to_csv('data/yahoo_data.csv')
yahoo_data.info(verbose=True)

[*********************100%***********************]  265 of 265 completed


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2017-01-03 to 2017-12-29
Data columns (total 1590 columns):
 #     Column              Dtype  
---    ------              -----  
 0     (REGN, Open)        float64
 1     (REGN, High)        float64
 2     (REGN, Low)         float64
 3     (REGN, Close)       float64
 4     (REGN, Adj Close)   float64
 5     (REGN, Volume)      int64  
 6     (NEM, Open)         float64
 7     (NEM, High)         float64
 8     (NEM, Low)          float64
 9     (NEM, Close)        float64
 10    (NEM, Adj Close)    float64
 11    (NEM, Volume)       int64  
 12    (LRCX, Open)        float64
 13    (LRCX, High)        float64
 14    (LRCX, Low)         float64
 15    (LRCX, Close)       float64
 16    (LRCX, Adj Close)   float64
 17    (LRCX, Volume)      int64  
 18    (IPGP, Open)        float64
 19    (IPGP, High)        float64
 20    (IPGP, Low)         float64
 21    (IPGP, Close)       float64
 22    (IPGP, Adj Close)   float64

# Alpha Vantage

I broke down and went with the Alpha Vantage API. It requires an API key.

In [8]:
API_KEY = os.getenv('API_KEY') # used with www.alphavantage.co
alpha_url = 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={}&apikey={}&outputsize=full&datatype=json' #.format(ticker,API_KEY)
alpha_dir = 'data/alpha_data/{}.csv' #.format(ticker) # raw per-ticker cache, so the download does not have to be repeated
alpha_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Dividend Amount', 'Split Coefficient'] # same as Yahoo
ALPHA_TIMEOUT = 30           # seconds before an unanswered request is abandoned
ALPHA_PAUSE = 0.8            # seconds between requests; this API has a limit per minute
ALPHA_FORCE_REFRESH = False  # set True to ignore the cached CSVs and download everything again


def _redact_key(text):
    """Return str(text) with the API key masked, so it never lands in the notebook output."""
    text = str(text)
    return text.replace(API_KEY, '***') if API_KEY else text


def _download_alpha_daily(ticker):
    """Request the full daily adjusted history for `ticker`.

    Returns a numeric DataFrame indexed by date whose columns are named
    "<ticker>_<name>" for each name in alpha_columns.
    Raises RuntimeError when no API key is configured, requests exceptions on
    network/HTTP failures, and ValueError when the response has no daily series.
    """
    if not API_KEY:
        raise RuntimeError("API_KEY is not set (expected in the environment or the .env file)")
    response = requests.get(alpha_url.format(ticker, API_KEY), timeout=ALPHA_TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    if 'Time Series (Daily)' not in payload:
        # Report only the keys that came back: as a precaution the message text itself
        # is not printed, in case it echoes the API key.
        raise ValueError(f"no 'Time Series (Daily)' in response; keys: {list(payload)}")
    ticker_df = pd.DataFrame(payload['Time Series (Daily)']).T
    ticker_df.columns = [f"{ticker}_{col}" for col in alpha_columns]
    ticker_df.index = pd.to_datetime(ticker_df.index)
    return ticker_df.apply(pd.to_numeric, errors='coerce')


os.makedirs(os.path.dirname(alpha_dir), exist_ok=True)  # the cache folder may not exist on a fresh checkout

adj_close_2017 = []  # one 2017 adjusted-close Series per ticker, joined once after the loop
tickers = filename_data.Ticker.unique().tolist()
for ticker in tqdm(tickers):
    cache_path = alpha_dir.format(ticker)
    adj_col = f"{ticker}_Adj Close"
    used_network = False
    try:
        ticker_df = None
        if not ALPHA_FORCE_REFRESH and os.path.exists(cache_path):
            cached = pd.read_csv(cache_path, index_col=0, parse_dates=True)
            # Only trust a cache file that has the column layout written by this cell.
            if adj_col in cached.columns:
                ticker_df = cached
        if ticker_df is None:
            used_network = True
            ticker_df = _download_alpha_daily(ticker)
            ticker_df.to_csv(cache_path)
        in_2017 = ticker_df.index.year == 2017
        adj_close_2017.append(ticker_df.loc[in_2017, adj_col].rename(ticker))
    except KeyboardInterrupt:
        break
    except Exception as e:
        print(ticker, type(e).__name__, _redact_key(e))
    if used_network:
        # Pause after every request, including failed ones, so that a rate-limit
        # error is not followed by an immediate burst of further requests.
        time.sleep(ALPHA_PAUSE)

# Join all tickers in one step and sort by date, so the date-range slices in the
# target cell see an ascending index.
alpha_data = pd.concat(adj_close_2017, axis=1).sort_index() if adj_close_2017 else pd.DataFrame()

alpha_data.to_csv('data/alpha_data.csv')
alpha_data.info(verbose=True)

100%|██████████| 273/273 [06:38<00:00,  1.46s/it]

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2017-01-03 to 2017-12-29
Data columns (total 273 columns):
 #    Column  Dtype  
---   ------  -----  
 0    MMM     float64
 1    AOS     float64
 2    ABT     float64
 3    ABBV    float64
 4    MSFT    float64
 5    ADBE    float64
 6    AAP     float64
 7    AMD     float64
 8    AES     float64
 9    AET     float64
 10   A       float64
 11   AKAM    float64
 12   ALK     float64
 13   AZN     float64
 14   ALGN    float64
 15   ALLE    float64
 16   BFH     float64
 17   MO      float64
 18   AMZN    float64
 19   AEE     float64
 20   AXP     float64
 21   AMT     float64
 22   COR     float64
 23   AME     float64
 24   AMGN    float64
 25   ELV     float64
 26   AON     float64
 27   APA     float64
 28   ADM     float64
 29   T       float64
 30   ADSK    float64
 31   ADP     float64
 32   AVY     float64
 33   BALL    float64
 34   BAX     float64
 35   BDX     float64
 36   BIIB    float64
 37   BA      flo




# Add standard deviation (TARGET of the regression)


In [14]:
def add_n_day(row, n_day, prices=None):
    """Standard deviation of a ticker's price over the n_day calendar days after a call.

    Parameters
    ----------
    row : mapping or Series with 'Ticker' and 'Date' entries (a row of filename_data).
    n_day : int
        Window length in calendar days. The slice is inclusive at both ends, so it
        runs from the call date through call date + n_day.
    prices : DataFrame, optional
        Date-indexed table with one price column per ticker. Defaults to the
        global alpha_data (2017 adjusted closes from Alpha Vantage).

    Returns
    -------
    float
        The sample standard deviation of the prices in the window;
        0 when it is undefined (fewer than two observations in the window);
        inf when `prices` has no column for the ticker, the same sentinel the
        Yahoo version of this function uses for tickers without data.

    NOTE(review): the window is in calendar days, so the number of trading days it
    contains varies with weekends and holidays, and this is the std of price levels
    rather than of returns. Confirm both against the paper's target definition.
    """
    if prices is None:
        prices = alpha_data
    ticker = row['Ticker']
    # The download loop skips tickers whose request failed, so a column can be missing.
    if ticker not in prices.columns:
        return float('inf')
    start = pd.to_datetime(row['Date'])
    end = start + pd.Timedelta(days=n_day)
    std_dev = prices.loc[start:end, ticker].std()
    return 0 if pd.isna(std_dev) else std_dev

# One target column per horizon (in days after the call), computed row by row
# with add_n_day as defined in this cell.
for horizon in (3, 7, 15, 30):
    filename_data[f'{horizon}_day'] = filename_data.apply(add_n_day, axis=1, args=(horizon,))
filename_data.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Company  572 non-null    object 
 1   Date     572 non-null    object 
 2   Ticker   572 non-null    object 
 3   3_day    572 non-null    float64
 4   7_day    572 non-null    float64
 5   15_day   572 non-null    float64
 6   30_day   572 non-null    float64
dtypes: float64(4), object(3)
memory usage: 31.4+ KB


In [16]:
def add_n_day(row, n_day, prices=None):
    """Standard deviation of a ticker's Yahoo 'Adj Close' over the n_day calendar days after a call.

    NOTE: this redefines the add_n_day from the Alpha Vantage target cell; once this
    cell has run, the name refers to this Yahoo version.

    Parameters
    ----------
    row : mapping or Series with 'Ticker' and 'Date' entries (a row of filename_data).
    n_day : int
        Window length in calendar days. The slice is inclusive at both ends, so it
        runs from the call date through call date + n_day.
    prices : DataFrame, optional
        Date-indexed table with (ticker, field) columns. Defaults to the global
        yahoo_data.

    Returns
    -------
    float
        The sample standard deviation of the prices in the window;
        0 when it is undefined (fewer than two observations in the window);
        inf when the ticker has no Yahoo data.

    NOTE(review): the window is in calendar days, so the number of trading days it
    contains varies with weekends and holidays. Confirm this matches the paper.
    """
    if prices is None:
        prices = yahoo_data
    ticker = row['Ticker']
    # Tickers known to have no data on Yahoo! (same list as in the download cell).
    no_yahoo_data = ('GGP', 'CA', 'STI', 'FLT', 'NLSN', 'WRK', 'RTN', 'UTX', 'DISH')
    # Also consult the table itself, so a ticker that is missing for any other
    # reason gets the sentinel instead of raising KeyError. Only the ticker level is
    # checked: a missing 'Adj Close' field still raises, because that points to a
    # problem with the download rather than with one ticker.
    if ticker in no_yahoo_data or ticker not in prices.columns.get_level_values(0):
        return float('inf')

    start = pd.to_datetime(row['Date'])
    end = start + pd.Timedelta(days=n_day)
    std_dev = prices.loc[start:end, (ticker, 'Adj Close')].std()
    return 0 if pd.isna(std_dev) else std_dev


# One Yahoo-based target column per horizon (in days after the call), then persist
# the table with every target column added so far.
for horizon in (3, 7, 15, 30):
    filename_data[f'{horizon}_day_Yahoo'] = filename_data.apply(add_n_day, axis=1, args=(horizon,))
filename_data.info(verbose=True)
filename_data.to_csv('data/targets.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company       572 non-null    object 
 1   Date          572 non-null    object 
 2   Ticker        572 non-null    object 
 3   3_day         572 non-null    float64
 4   7_day         572 non-null    float64
 5   15_day        572 non-null    float64
 6   30_day        572 non-null    float64
 7   3_day_Yahoo   572 non-null    float64
 8   7_day_Yahoo   572 non-null    float64
 9   15_day_Yahoo  572 non-null    float64
 10  30_day_Yahoo  572 non-null    float64
dtypes: float64(8), object(3)
memory usage: 49.3+ KB


Here I compare the Yahoo-based values with the Alpha Vantage-based ones. Why do they differ? (This is where I was when the meeting started.)

In [25]:
# Compare the two data sources for every horizon.
# Note the opposite sign conventions: *_diff is Alpha Vantage minus Yahoo, while
# *_pct_change is (Yahoo minus Alpha Vantage) relative to Alpha Vantage, in percent.
horizon_columns = ('3_day', '7_day', '15_day', '30_day')

# Absolute differences first, then percentage changes, which keeps the column order.
for base in horizon_columns:
    filename_data[f'{base}_diff'] = filename_data[base] - filename_data[f'{base}_Yahoo']

for base in horizon_columns:
    alpha_std = filename_data[base]
    yahoo_std = filename_data[f'{base}_Yahoo']
    filename_data[f'{base}_pct_change'] = ((yahoo_std - alpha_std) / alpha_std) * 100

# Largest 30-day discrepancy first.
sorted_data = filename_data.sort_values(by='30_day_pct_change', ascending=False)

# Save the sorted comparison for inspection.
sorted_data.to_csv('data/temp.csv', index=False)

In [23]:
# Stopper cell: neither `Stop` nor `scratch` is defined, so this line raises
# NameError. Presumably deliberate, to halt "Run All" before the scratch cells
# below. Confirm before removing.
Stop (scratch)

NameError: name 'Stop' is not defined

In [None]:
# Scratch: display filename_data for inspection.
filename_data

In [None]:
# Scratch variant of the Alpha Vantage download that keeps every column (not only
# the adjusted close) for 2017, side by side for all tickers.
tickers = filename_data.Ticker.unique().tolist()
frames_2017 = []  # one 2017 frame per ticker, joined once after the loop
for ticker in tqdm(tickers):
    try:
        # timeout: do not let one unanswered request hang the whole loop
        response = requests.get(alpha_url.format(ticker, API_KEY), timeout=30).json()
        ticker_df = pd.DataFrame(response['Time Series (Daily)']).T
        # Prefix the columns before saving, so the cached CSV has the same layout as
        # the one written by the main Alpha Vantage cell to the same path.
        ticker_df.columns = [f"{ticker}_{col}" for col in alpha_columns]
        ticker_df.index = pd.to_datetime(ticker_df.index)
        ticker_df = ticker_df.apply(pd.to_numeric, errors='coerce')
        ticker_df.to_csv(alpha_dir.format(ticker))
        frames_2017.append(ticker_df[ticker_df.index.year == 2017])
    except KeyboardInterrupt:
        break
    except Exception as e:
        # Mask the API key: request errors can include the URL it is embedded in.
        message = str(e)
        if API_KEY:
            message = message.replace(API_KEY, '***')
        print(ticker, message)
    # Pause after every request, including failed ones (this API has a limit per minute).
    time.sleep(0.8)

if frames_2017:
    alpha_advantage = pd.concat(frames_2017, axis=1).sort_index()
else:
    alpha_advantage = pd.DataFrame(columns=alpha_columns)

alpha_advantage.info(verbose=True)
# Keep the index when saving: it holds the dates, which index=False would discard.
alpha_advantage.to_csv('data/alpha_advantage.csv')

In [None]:
filename_data[filename_data['Ticker']=='FLT']CPAY
filename_data[filename_data['Ticker']=='GPS']GAP
Alpha Vantage uses different tickers:
	Company	Date	Ticker	otherTicker
197	FleetCor Technologies Inc	2017-08-03	CPAY	FLT
208	Gap Inc.	2017-02-23	GAP	GPS
209	Gap Inc.	2017-05-18	GAP	GPS

UTX
RTN
Raytheon Co.
United Technologies
RTX

In [None]:
# Show the calls that belong to each of these tickers, one table per ticker.
for _missing_ticker in ['GGP', 'CA', 'STI', 'FLT', 'NLSN', 'WRK']:
    print(filename_data[filename_data['Ticker'] == _missing_ticker])