# Libraries

In [1]:
import time
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
import yfinance as yf
import datetime
from statsmodels.tsa.stattools import grangercausalitytests
import warnings as wrn
import os

# Setting up params

In [2]:
# Ticker symbols, grouped by asset category (ten per category).
general_stocks = ['KO', 'PFE', 'WMT', 'PG', 'JNJ', 'DIS', 'PEP', 'MCD', 'T', 'VZ']
tech_stocks = ['AAPL', 'AMZN', 'MSFT', 'GOOGL', 'NVDA', 'TSLA', 'META', 'INTC', 'IBM', 'AMD']
finance_stocks = ['GS', 'BAC', 'WFC', 'USB', 'JPM', 'MA', 'V', 'AXP', 'C', 'BLK']
decentralized_currencies = ['BTC', 'ETH', 'ADA', 'SOL', 'XRP', 'XMR', 'LTC', 'DOT', 'LINK', 'XTZ']

# Human-readable names, index-aligned with the ticker lists above; the
# functions below look a name up by the ticker's position and use it as the
# Google Trends search keyword and in plot titles.
general_stocks_names = ['Coca-Cola', 'Pfizer', 'Walmart', 'Procter & Gamble', 'Johnson & Johnson', 'Disney', 'Pepsi', 'McDonalds', 'AT&T', 'Verizon']
tech_stocks_names = ['Apple', 'Amazon', 'Microsoft', 'Google', 'Nvidia', 'Tesla', 'Meta', 'Intel', 'IBM', 'AMD']
finance_stocks_names = ['Goldman Sachs', 'Bank of America', 'Wells Fargo', 'US Bancorp', 'JPMorgan Chase', 'Mastercard', 'Visa', 'American Express', 'Citigroup', 'BlackRock']
decentralized_currencies_names = ['Bitcoin', 'Ethereum', 'Cardano', 'Solana', 'Ripple', 'Monero', 'Litecoin', 'Polkadot', 'Chainlink', 'Tezos']

# Plot colour per asset category.
color_map = {
    'general': 'deepskyblue',
    'tech': 'limegreen',
    'finance': 'darkorchid',
    'crypto': 'red'
}

# Analysis window ('YYYY-MM-DD') passed to getNormalizedData in the cells below.
start = '2019-06-30'
end = '2024-07-01'
# NOTE(review): `stock` and `ticker` are not read by any cell visible here — confirm they are still needed.
stock = 'Bitcoin'
ticker = 'BTC'
# Largest lag passed as `maxlag` to the Granger causality tests.
max_lags = 60

# Functions
## get_trends_data
gets the trend data using pytrends, given a certain timeframe

In [3]:
def get_trends_data(keyword, 
                    timeframe=None,
                    retries=5, 
                    backoff_factor=1.0,
                    verbose=True):
    """Download Google Trends interest-over-time data for a keyword.

    Args:
        keyword: List of search terms handed to pytrends; only the first
            term's column is renamed to 'Trend'.
        timeframe: 'YYYY-MM-DD YYYY-MM-DD' (start, then end). When omitted,
            the 269 days ending today are used, computed at call time.
        retries: Maximum number of attempts when the request is rate limited.
        backoff_factor: Base delay in seconds; attempt ``i`` waits
            ``backoff_factor * 2 ** i`` seconds after a rate-limit error.
        verbose: Print progress messages when True.

    Returns:
        A DataFrame with a 'Date' column (midnight timestamps) and a 'Trend'
        column, plus whatever other columns pytrends returned; or None when
        the response was empty or every retry was rate limited.

    Raises:
        RuntimeError: If the request fails for a reason other than an
            HTTP 429 rate-limit response.
    """
    if timeframe is None:
        # Computed per call (a default argument would be frozen at definition
        # time) and ordered start-then-end.
        today = datetime.date.today()
        window_start = today - datetime.timedelta(days = 269)
        timeframe = window_start.strftime('%Y-%m-%d') + ' ' + today.strftime('%Y-%m-%d')

    pytrends = TrendReq(hl='en-US', tz=360, timeout=(10,25), )

    for i in range(retries):
        try:
            # Building the payload is inside the retry loop so that a
            # rate-limit error raised here is retried as well.
            pytrends.build_payload(keyword, cat = 0, timeframe = timeframe, geo='')
            df = pytrends.interest_over_time()
        except Exception as e:
            if "429" not in str(e):
                raise RuntimeError(f"An error occurred: {e}") from e
            sleep_time = backoff_factor * (2 ** i)
            if verbose:
                print(f"Rate limit exceeded. Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)
            continue

        if df is None or df.empty:
            print("No data retrieved or DataFrame is empty.")
            return None

        if verbose:
            print(f"Trend Data for {keyword[0]} at timeframe {timeframe} retrieved successfully.")
        df.reset_index(inplace = True)
        df.rename(columns = {'date': 'Date', keyword[0]: 'Trend'}, inplace = True)
        # Round-trip through a date string to drop any time-of-day component.
        df['Date'] = pd.to_datetime(df['Date'].dt.strftime('%m/%d/%Y'))
        return df

    print("Failed to retrieve data after several retries.")
    return None

## get_stock_data
gets the prices of a certain stock in a certain timeframe

In [4]:
def get_stock_data(ticker, start, end, verbose = True):
    """Download the price history of `ticker` between `start` and `end` via yfinance.

    Args:
        ticker: Symbol passed to ``yf.Ticker`` (e.g. 'AAPL' or 'BTC-USD').
        start: Start date passed to ``history``.
        end: End date passed to ``history``.
        verbose: Print a confirmation message when True.

    Returns:
        The history as a DataFrame with the index reset into columns, without
        the 'Dividends', 'Stock Splits' and 'Repaired?' columns, and with
        'Date' converted to midnight timestamps.
    """
    history = yf.Ticker(ticker).history(repair = True, start = start, end = end, auto_adjust = False)
    # errors='ignore': do not fail with KeyError when one of these auxiliary
    # columns is missing from the returned frame.
    tickerDF = history.drop(columns = ['Dividends', 'Stock Splits', 'Repaired?'], errors = 'ignore').reset_index()
    if verbose:
        print(f"Stock Data for {ticker} retrieved successfully.")
    # Round-trip through a date string to drop any time-of-day component.
    tickerDF['Date'] = pd.to_datetime(tickerDF['Date'].dt.strftime('%m/%d/%Y'))
    return tickerDF

## trend_corr
Gets the trend and price data of the given stock for a certain time frame and calculates the correlation between the close price and the trend delayed by a given number of days.

In [5]:
def trend_corr(stock, days = 60, start = '2023-10-01', end = '2024-06-01', delay = 7):
    """Correlate an asset's close price with its delayed Google Trends score.

    Trend and price data are read from CSV caches under ./Data when present,
    otherwise downloaded and cached there.

    Args:
        stock: Ticker from one of the module-level ticker lists.
        days: Rolling window (in rows) for the 'Volatility' column.
        start: Start date, 'YYYY-MM-DD'.
        end: End date, 'YYYY-MM-DD'.
        delay: Number of rows the trend is shifted by before correlating.

    Returns:
        ``(rho, full_data)``: the Pearson correlation between 'Close' and
        'Delay_<delay>', and the merged frame with the added 'log_returns',
        'Volatility' and 'Delay_<i>' columns.

    Raises:
        ValueError: If `stock` is not in any of the ticker lists (only
            checked when the trend data has to be downloaded).
        Exception: If the trend data cannot be downloaded.
    """
    trends_path = f"./Data/{stock}_trends({start} - {end}).csv"
    prices_path = f"./Data/{stock}_Prices({start} - {end}).csv"
    # Make sure the cache directory exists before anything is written to it.
    os.makedirs('./Data', exist_ok = True)

    if os.path.exists(trends_path):
        t = pd.read_csv(trends_path)
    else:
        # The search keyword is the display name aligned with the ticker.
        for tickers, names in ((general_stocks, general_stocks_names),
                               (tech_stocks, tech_stocks_names),
                               (finance_stocks, finance_stocks_names),
                               (decentralized_currencies, decentralized_currencies_names)):
            if stock in tickers:
                name = names[tickers.index(stock)]
                break
        else:
            raise ValueError(f'{stock} is not in any of the known ticker lists.')
        t = get_trends_data([name], timeframe = f"{start} {end}")
        if t is None:
            raise Exception(f'Failed to retrieve Trend Data of {stock}.')
        t.to_csv(trends_path)

    if os.path.exists(prices_path):
        p = pd.read_csv(prices_path)
    else:
        # Crypto tickers are quoted against USD on Yahoo Finance.
        symbol = f'{stock}-USD' if stock in decentralized_currencies else stock
        p = get_stock_data(symbol, start = start, end = end)
        p.to_csv(prices_path)

    t['Date'] = pd.to_datetime(t['Date'])
    p['Date'] = pd.to_datetime(p['Date'])

    full_data = pd.merge(p, t, on='Date')

    full_data['log_returns'] = np.log(full_data.Close / full_data.Close.shift(1))
    full_data['Volatility'] = full_data['log_returns'].rolling(window=days).std() * np.sqrt(days)

    # Always provide Delay_1..Delay_7 (as before) and additionally the
    # requested delay, so any `delay` value has its column.
    for i in sorted(set(range(1, 8)) | {delay}):
        full_data[f'Delay_{i}'] = full_data['Trend'].shift(i)

    # Correlate just the two columns of interest; this does not depend on how
    # a whole-frame corr() treats non-numeric columns such as 'Date'.
    rho_c = full_data['Close'].corr(full_data[f'Delay_{delay}'])
    return rho_c, full_data

## plot_stock_data
gets the data of the trends and the prices of the stock given to it in a certain timeframe and plots its close and its delayed trend

In [6]:
def plot_stock_data(stock, days = 60, start = '2023-10-01', end = '2024-06-01', delay = 7, download = False):
    """Plot an asset's close price next to its delayed Google Trends score.

    Trend and price data are read from CSV caches under ./Data when present,
    otherwise downloaded and cached there.

    Args:
        stock: Ticker from one of the module-level ticker lists.
        days: Rolling window (in rows) for the 'Volatility' column.
        start: Start date, 'YYYY-MM-DD'.
        end: End date, 'YYYY-MM-DD'.
        delay: Number of rows the trend is shifted by before plotting.
        download: When True, also save the figure under ./Plots.

    Raises:
        ValueError: If `stock` is not in any of the ticker lists.
        Exception: If the trend data cannot be downloaded.
    """
    # Resolve category colour and display name once, up front.
    for category, tickers, names in (('general', general_stocks, general_stocks_names),
                                     ('tech', tech_stocks, tech_stocks_names),
                                     ('finance', finance_stocks, finance_stocks_names),
                                     ('crypto', decentralized_currencies, decentralized_currencies_names)):
        if stock in tickers:
            color = color_map[category]
            name = names[tickers.index(stock)]
            break
    else:
        raise ValueError(f'{stock} is not in any of the known ticker lists.')

    trends_path = f"./Data/{stock}_trends({start} - {end}).csv"
    prices_path = f"./Data/{stock}_Prices({start} - {end}).csv"
    os.makedirs('./Data', exist_ok = True)

    if os.path.exists(trends_path):
        t = pd.read_csv(trends_path)
    else:
        t = get_trends_data([name], timeframe = f"{start} {end}")
        if t is None:
            raise Exception(f'Failed to retrieve Trend Data of {stock}.')
        t.to_csv(trends_path)

    if os.path.exists(prices_path):
        p = pd.read_csv(prices_path)
    else:
        # Crypto tickers are quoted against USD on Yahoo Finance
        # (same convention as trend_corr).
        symbol = f'{stock}-USD' if stock in decentralized_currencies else stock
        p = get_stock_data(symbol, start = start, end = end)
        p.to_csv(prices_path)

    t['Date'] = pd.to_datetime(t['Date'])
    p['Date'] = pd.to_datetime(p['Date'])

    full_data = pd.merge(p, t, on='Date')

    full_data['log_returns'] = np.log(full_data.Close / full_data.Close.shift(1))
    full_data['Volatility'] = full_data['log_returns'].rolling(window=days).std() * np.sqrt(days)

    # Shift by the requested delay so the column matches its label.
    delay_col = f'Delay_{delay}'
    full_data[delay_col] = full_data.Trend.shift(delay)

    # Create subplots
    fig, axes = plt.subplots(1, 2, figsize=(16, 7))

    # Plot Close price
    axes[0].plot(full_data['Date'], full_data['Close'], label = 'Close Price', color = color)
    axes[0].set_xlabel('Date')
    axes[0].set_ylabel('Close Price')
    axes[0].set_title(f'{stock}: Close Price')
    legend = axes[0].legend(loc='upper left')
    legend.get_frame().set_alpha(0.3)

    # Plot the delayed trend
    axes[1].plot(full_data['Date'], full_data[delay_col], label = f'{delay}-Days Delayed Trend', color = 'black')
    axes[1].set_xlabel('Date')
    axes[1].set_ylabel(f'{delay}-Days Delayed Trend')
    axes[1].set_title(f'{stock}: {delay}-Days Delayed Trend')
    legend = axes[1].legend(loc='upper right')
    legend.get_frame().set_alpha(0.3)

    fig.suptitle(f'{name} ({stock})', fontsize=20, verticalalignment = 'bottom', fontweight = 'bold')
    plt.tight_layout(pad=2.0)
    plt.subplots_adjust(top=0.95)
    if download:
        os.makedirs('./Plots', exist_ok = True)
        plt.savefig(f"./Plots/{stock}_plot({start} - {end}).png", bbox_inches='tight')
    plt.show()

## time_jump
adds time in days to a given date that was accepted as string, returns as string

In [7]:
# shift a 'YYYY-MM-DD' date string by a number of days (negative moves backwards)
def time_jump(start, days = 7 * 38):
    """Return the 'YYYY-MM-DD' date lying `days` days after `start`."""
    date_format = '%Y-%m-%d'
    origin = datetime.datetime.strptime(start, date_format)
    shifted = origin + datetime.timedelta(days = days)
    return shifted.strftime(date_format)

## get_breakpoints
Calculates the breakpoints (chunk boundary dates) used to split the full time frame into several shorter requests, so that daily data can be collected over as long a period as possible.

In [8]:
def get_breakpoints(start, end, days = 7 * 38):
    """Split the span from `start` to `end` into chunks of at most `days` days.

    Args:
        start: First date, 'YYYY-MM-DD'.
        end: Last date, 'YYYY-MM-DD'.
        days: Chunk length in days; must be at least 1.

    Returns:
        A list of date strings beginning with `start`, advancing by `days`
        and finishing with `end`. When `start` is not before `end` the list
        is just ``[start]``.

    Raises:
        ValueError: If `days` is smaller than 1 (the dates would never
            advance and the loop would not terminate).
    """
    if days < 1:
        raise ValueError(f'days must be at least 1, got {days}.')

    date_format = '%Y-%m-%d'
    end_date = datetime.datetime.strptime(end, date_format)
    step = datetime.timedelta(days = days)

    breakpoints = [start]
    current = datetime.datetime.strptime(start, date_format)
    while current < end_date:
        # Truncate to whole days, since breakpoints are stored as date strings.
        candidate = datetime.datetime.strptime((current + step).strftime(date_format), date_format)
        if candidate >= end_date:
            breakpoints.append(end)
            break
        breakpoints.append(candidate.strftime(date_format))
        current = candidate
    return breakpoints

## connectNnormalizeTrends
Takes the daily trend data that was retrieved in separate parts, connects the parts, and rescales them using the weekly Glimpse absolute search volumes, which yields an approximation of the absolute number of daily searches.

In [9]:
def connectNnormalizeTrends(Dfs, stock):
    """Join per-period daily trend frames and rescale them with weekly absolute volumes.

    Args:
        Dfs: List of DataFrames, each with at least 'Date' and 'Trend'
            columns; they are concatenated in list order.
        stock: Name used to locate './Data/glimpse_<stock>_5Y.csv', which must
            have 'Time (week of)' and 'Absolute Google Search Volume' columns.

    Returns:
        A DataFrame with 'Date' (as 'YYYY-MM-DD' strings), 'Search_Volume',
        'check_ratio' and 'Normalized_Searches' (min-max scaled to 0-100),
        plus any other columns the input frames carried.

    Raises:
        Exception: If the Glimpse CSV for `stock` does not exist.
    """
    # setting up the data for step 1
    if not os.path.exists(f'./Data/glimpse_{stock}_5Y.csv'):
        raise Exception(f'Failed to retrieve Glimpse Data of {stock}.')
    glmpsDf = pd.read_csv(f'./Data/glimpse_{stock}_5Y.csv')
    glmpsDf.rename(columns={'Time (week of)': 'Date', 'Absolute Google Search Volume': 'Absolute_Volume'}, inplace=True)
    glmpsDf['Date'] = pd.to_datetime(glmpsDf['Date'])

    df_concat = pd.concat(Dfs).reset_index(drop = True)
    df_concat['Date'] = pd.to_datetime(df_concat['Date'])

    # calculating the mean trend for each week and merging it into glmpsDf
    # (the trailing 7-row mean shifted back by 6 rows gives, at each row, the
    # mean of that row and the 6 rows after it; the last 6 rows become NaN)
    df_concat['MeanTrend'] = df_concat['Trend'].rolling(window=7, min_periods=1).mean().shift(-6)
    glmpsDf = pd.merge(glmpsDf, df_concat, on='Date', how='left').drop(columns = ['Trend'])

    # setting up the data for step 2
    glmpsDf.rename(columns={'Date': 'Date_Week'}, inplace=True)
    df_concat = df_concat.drop(columns = ['MeanTrend'])
    # map every day to the Sunday on or before it (weekday: Monday=0 ... Sunday=6)
    # NOTE(review): assumes the Glimpse 'Time (week of)' dates are Sundays — confirm against the CSV
    df_concat['Date_Week'] = (df_concat['Date'] - pd.to_timedelta((df_concat['Date'].dt.weekday + 1) % 7, unit='d')).dt.strftime('%Y-%m-%d')
    df_concat['Date_Week'] = pd.to_datetime(df_concat['Date_Week'])

    # calculating the ratio and search volume for each week
    df_concat = pd.merge(df_concat, glmpsDf[['Date_Week', 'MeanTrend', 'Absolute_Volume']], on='Date_Week', how='left')
    # Ratio = the day's trend divided by the week's summed trend (mean * 7)
    df_concat['Ratio'] = df_concat['Trend'] / (df_concat['MeanTrend'] * 7)
    df_concat['Search_Volume'] = df_concat['Ratio'] * df_concat['Absolute_Volume']

    # adding to df_concat the check ratio to check validity of the data
    df_concat['check_ratio'] = df_concat['Search_Volume'] / df_concat['Trend']

    # renormalizing the data
    df_concat['Normalized_Searches'] = ((df_concat['Search_Volume'] - df_concat['Search_Volume'].min()) / (df_concat['Search_Volume'].max() - df_concat['Search_Volume'].min())) * 100

    # cleaning out unnecessary columns
    df_concat = df_concat.drop(columns = ['Trend', 'Date_Week', 'MeanTrend', 'Absolute_Volume', 'Ratio'])

    df_concat['Date'] = df_concat['Date'].dt.strftime('%Y-%m-%d')

    return df_concat

## getNormalizedData
gets the data normalized, and performs validity checks (optional)

In [10]:
def _load_trend_segments(stock, breakpoints, verbose):
    """Return one trends DataFrame per consecutive breakpoint pair, from the ./Data cache or Google Trends."""
    os.makedirs('./Data', exist_ok = True)
    segments = []
    for startTemp, next_start in zip(breakpoints, breakpoints[1:]):
        # Each segment ends the day before the next one starts.
        endTemp = time_jump(next_start, days=-1)
        path = f"./Data/{stock}_trends({startTemp} - {endTemp}).csv"
        if os.path.exists(path):
            t = pd.read_csv(path).drop(columns = ['Unnamed: 0'])
        else:
            t = get_trends_data([stock], timeframe = f"{startTemp} {endTemp}", verbose = verbose)
            # Check for failure before touching the frame.
            if t is None:
                raise Exception(f'Failed to retrieve Trend Data of {stock}.')
            t = t.drop(columns = ['isPartial'])
            t.to_csv(path)
        segments.append(t)
    return segments


def _print_check_ratio_stats(frame, segments, first_period):
    """Print mean, sd and mean/sd of 'check_ratio' for the rows of `frame` belonging to each segment."""
    bottom = 0
    for offset, segment in enumerate(segments):
        top = bottom + segment.shape[0]
        ratios = frame.iloc[bottom:top]['check_ratio']
        mean = ratios.mean()
        sd = ratios.std()
        print(f'period {first_period + offset}:\n=========\nmean: {mean:.4f}\nsd: {sd:.4f}\n\nmean/sd: {mean / sd:.4f}\n')
        bottom = top


def getNormalizedData(stock, ticker, start = '2019-06-23', end = '2024-06-01', weeks = 38, do_double = True, verbose = False):
    """Build a daily frame of prices and normalized search interest for one asset.

    Trend data is fetched (or read from the ./Data cache) in chunks of
    `weeks` weeks and stitched together with connectNnormalizeTrends. With
    `do_double`, a second chunking offset by half a chunk is processed the
    same way and the two normalized series are averaged.

    Args:
        stock: Search keyword / asset name (also used in cache file names).
        ticker: Ticker symbol; prices are fetched for '<ticker>-USD'.
        start: Start date, 'YYYY-MM-DD'.
        end: End date, 'YYYY-MM-DD'.
        weeks: Chunk length in weeks (capped at 38; made even for do_double).
        do_double: Average two staggered chunkings when True.
        verbose: Print download messages and per-chunk check_ratio statistics.

    Returns:
        The price data left-merged with the search data on 'Date', including
        'log_returns' and 'log_searches', with rows containing NaN dropped.

    Raises:
        ValueError: If the effective number of weeks is smaller than 1.
        Exception: If trend data for a chunk cannot be downloaded.
    """
    if weeks > 38:
        wrn.warn('The maximum number of weeks is 38. \nThe number of weeks will be set to 38.', category=Warning)
        weeks = 38
    if do_double and weeks % 2 != 0:
        wrn.warn('The number of weeks must be even to use the double method. \nThe number of weeks will be rounded down to the nearest even number.', category=Warning)
        weeks -= 1
    if weeks < 1:
        # A zero-length chunk would never advance the breakpoints.
        raise ValueError('The number of weeks must be at least 1 (at least 2 when do_double is True).')

    breakpoints = get_breakpoints(start, end, days = 7 * weeks)
    Dfs = _load_trend_segments(stock, breakpoints, verbose)
    df_concat = connectNnormalizeTrends(Dfs, stock)

    if do_double:
        # Half-chunk breakpoints minus the interior full-chunk ones give
        # chunks staggered by half a period. `weeks` is even here, so the
        # integer division is exact.
        interior = set(breakpoints[1:-1])
        breakpoints2 = [x for x in get_breakpoints(start, end, days = 7 * weeks // 2) if x not in interior]
        Dfs2 = _load_trend_segments(stock, breakpoints2, verbose)
        df_concat2 = connectNnormalizeTrends(Dfs2, stock)

        df_concat["Normalized_Searches"] = (df_concat["Normalized_Searches"] + df_concat2["Normalized_Searches"]) / 2

    prices_path = f"./Data/{stock}_Prices({start}-{end}).csv"
    if os.path.exists(prices_path):
        stockData = pd.read_csv(prices_path).drop(columns = ['Unnamed: 0'])
    else:
        stockData = get_stock_data(f'{ticker}-USD', start = start, end = end, verbose = verbose)
        stockData.to_csv(prices_path)

    # Replace zeros so the log ratio below is defined. Only the
    # Normalized_Searches cell is touched, not the whole row (which would
    # also overwrite 'Date' and the other columns).
    df_concat.loc[df_concat['Normalized_Searches'] == 0, 'Normalized_Searches'] = 0.1

    df_concat['log_searches'] = np.log(df_concat.Normalized_Searches / df_concat.Normalized_Searches.shift(1))
    stockData['log_returns'] = np.log(stockData.Close / stockData.Close.shift(1))

    # Bring both 'Date' columns to 'YYYY-MM-DD' strings, whether they arrive
    # as strings (from CSV) or as timestamps (fresh download).
    df_concat['Date'] = pd.to_datetime(df_concat['Date']).dt.strftime('%Y-%m-%d')
    stockData['Date'] = pd.to_datetime(stockData['Date']).dt.strftime('%Y-%m-%d')

    finalDf = pd.merge(stockData, df_concat, on='Date', how='left').dropna()

    if verbose:
        # data normalization validity check
        _print_check_ratio_stats(df_concat, Dfs, first_period = 1)
        if do_double:
            # The staggered chunks are checked against their own stitched frame.
            _print_check_ratio_stats(df_concat2, Dfs2, first_period = len(Dfs) + 1)
    return finalDf

# Collecting The Data

In [11]:
# Silence library warnings that would otherwise flood the output.
for ignored_category in (UserWarning, FutureWarning):
    wrn.filterwarnings('ignore', category=ignored_category)

coinsDFs = []
lags = []

# For every coin: build its normalized frame, run the Granger causality tests
# up to max_lags, and record the lag with the smallest ssr F-test p-value.
for coin_name, coin_ticker in zip(decentralized_currencies_names, decentralized_currencies):
    coin_df = getNormalizedData(coin_name, coin_ticker,
                                start = start, end = end,
                                do_double = True)
    coinsDFs.append(coin_df)

    cause = grangercausalitytests(coin_df[['log_returns', 'log_searches']],
                                  maxlag = max_lags,
                                  verbose = False)

    min_p_value, min_p_lag = float('inf'), None
    for lag, result in cause.items():
        p_value = result[0]['ssr_ftest'][1]
        if p_value < min_p_value:
            min_p_value, min_p_lag = p_value, lag

    lags.append(min_p_lag)

    print(f'{coin_name}: {min_p_lag} lags, p-value = {min_p_value:.4f}')

Bitcoin: 21 lags, p-value = 0.0466
Ethereum: 1 lags, p-value = 0.1807
Cardano: 2 lags, p-value = 0.0776
Solana: 57 lags, p-value = 0.0712
Ripple: 60 lags, p-value = 0.1876
Monero: 1 lags, p-value = 0.1025
Litecoin: 2 lags, p-value = 0.2933
Polkadot: 3 lags, p-value = 0.0056
Chainlink: 24 lags, p-value = 0.0076
Tezos: 3 lags, p-value = 0.4993


## Examples for checks code

In [None]:
# Asset to inspect: any name from the lists at the top, with its matching ticker.
stock_name = 'Bitcoin'
stock_ticker = 'BTC'

# Print the data validity checks for that asset.
getNormalizedData(stock_name, stock_ticker, start = start, end = end, do_double = True, verbose = True)

# Show the full Granger causality test results for that asset.
df = getNormalizedData(stock_name, stock_ticker, start = start, end = end, do_double = True)
granger_input = df[['log_returns', 'log_searches']]

grangercausalitytests(granger_input, maxlag = max_lags, verbose = True)

# Previously used chunks

In [4]:
# Correlation (first element of trend_corr's result) per asset, by category
volt_del_corr_general = [rho for rho, _ in map(trend_corr, general_stocks)]
volt_del_corr_tech = [rho for rho, _ in map(trend_corr, tech_stocks)]
volt_del_corr_finance = [rho for rho, _ in map(trend_corr, finance_stocks)]
volt_del_corr_crypto = [rho for rho, _ in map(trend_corr, decentralized_currencies)]

# All correlations in one list, in category order
volt_del_corr = [*volt_del_corr_general, *volt_del_corr_tech, *volt_del_corr_finance, *volt_del_corr_crypto]

# Matching tick labels for the scatter plot
labels = [*general_stocks, *tech_stocks, *finance_stocks, *decentralized_currencies]

Trend Data for ['Shiba Inu'] retrieved successfully.
Stock Data for SHIB-USD retrieved successfully.


In [None]:
# Scatter plot of the correlations, one colour per asset category
plt.figure(figsize=(16, 8))

category_series = [
    (general_stocks, volt_del_corr_general, color_map['general'], 'General Stocks'),
    (tech_stocks, volt_del_corr_tech, color_map['tech'], 'Tech Stocks'),
    (finance_stocks, volt_del_corr_finance, color_map['finance'], 'Finance Stocks'),
    (decentralized_currencies, volt_del_corr_crypto, color_map['crypto'], 'Decentralized Currencies'),
]

x_offset = 0
for category_tickers, category_corrs, category_color, category_label in category_series:
    x_positions = range(x_offset, x_offset + len(category_tickers))
    plt.scatter(x_positions, category_corrs, color = category_color, label = category_label)
    # Faint vertical guide line through every asset of the category
    for i in x_positions:
        plt.axvline(x = i, color = category_color, linestyle = ':', alpha = 0.3)
    x_offset += len(category_tickers)

# Zero-correlation reference line
plt.axhline(y = 0, color = 'black', linestyle = '--')
plt.xlabel('Assets')
plt.ylabel('Correlation with 7-Day Delayed Trend')
plt.title('Correlation of Close Price and 7-Day Delayed Trend')
legend = plt.legend()
legend.get_frame().set_alpha(0.3)
plt.xticks(range(len(labels)), labels, rotation = 60)
plt.tight_layout(pad = 2)
plt.savefig('Correlation_Scatter_Plot.png')
plt.show()

# Show the individual correlations followed by their mean
print(volt_del_corr, np.mean(volt_del_corr))

In [None]:
# Plot (and save) the figure for every stock and decentralized currency.
# The loop variable is deliberately not called `stock`, so the module-level
# `stock` parameter set at the top of the notebook is not overwritten.
for asset_ticker in general_stocks + tech_stocks + finance_stocks + decentralized_currencies:
    plot_stock_data(asset_ticker, download = True)