In [3]:
import yfinance as yf
import pandas as pd
import polars as pl
import os
import re
import urllib.parse
from datetime import datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math 
import pyarrow
import numpy as np

In [7]:
# Company identifiers in "NAME (TICKER) (CIK ...)" form, percent-encoded twice:
# "%2520" decodes to "%20" and then to a space, "%252C" to a comma, "%2526" to "&".
# NOTE(review): presumably these are entity names for a search URL - confirm against the
# code that consumes this list (it is not used in the cells below).
# NOTE(review): 'JOHNSON%252C%252C%2520JOHNSON' decodes to "JOHNSON,, JOHNSON" and
# 'GENERAL%252CELECTRIC' decodes to "GENERAL,ELECTRIC"; both look like typos
# (compare the "%2520%2526%2520" used for " & " in the JPMORGAN entry) - verify.
COMPANY_NAME_LIST = ['Apple%2520Inc.%2520(AAPL)%2520(CIK%25200000320193)', 'MICROSOFT%2520CORP%2520(MSFT)%2520(CIK%25200000789019)', 
                     'BigCommerce%2520Holdings%252C%2520Inc.%2520(BIGC)%2520(CIK%25200001626450)', 'ROKU%252C%2520INC%2520(ROKU)%2520(CIK%25200001428439)', 
                     'JPMORGAN%2520CHASE%2520%2526%2520CO%2520(JPM%252C%2520AMJ%252C%2520AMJB%252C%2520JPM)%2520(CIK%25200000019617)', 'VISA%2520INC.%2520(V)%2520(CIK%25200001403161)', 
                     'Block%252C%2520Inc.%2520(SQ%252C%2520BSQKZ)%2520(CIK%25200001512673)', 'Robinhood%2520Markets%252C%2520Inc.%2520(HOOD)%2520(CIK%25200001783879)', 
                     'JOHNSON%252C%252C%2520JOHNSON%2520(JNJ)%2520(CIK%25200000200406)', 'PFIZER%2520INC%2520(PFE)%2520(CIK%25200000078003)', 
                     'Moderna%252C%2520Inc.%2520(MRNA)%2520(CIK%25200001682852)', 'Teladoc%2520Health%252C%2520Inc.%2520(TDOC)%2520(CIK%25200001477449)', 
                     'EXXON%2520MOBIL%2520CORP%2520(XOM)%2520(CIK%25200000034088)', 'CHEVRON%2520CORP%2520(CVX)%2520(CIK%25200000093410)', 
                     'FIRST%2520SOLAR%252C%2520INC.%2520(FSLR)%2520(CIK%25200001274494)', 'PLUG%2520POWER%2520INC%2520(PLUG)%2520(CIK%25200001093691)', 
                     'GENERAL%252CELECTRIC%2520CO%2520(GE)%2520(CIK%25200000040545)', '3M%2520CO%2520(MMM)%2520(CIK%25200000066740)', 
                     'CATERPILLAR%2520INC%2520(CAT)%2520(CIK%25200000018230)', 'FASTENAL%2520CO%2520(FAST)%2520(CIK%25200000815556)']

Load the sentiment scores from disk

In [10]:
# Load every per-company sentiment-score table from disk into a dict keyed by ticker.
df_dict_scores = {}

# The ticker is whatever precedes the fixed suffix in the file NAME,
# e.g. "GE_full_doc_sen_score_df.parquet" -> "GE". Matching the file name rather than
# the full path keeps the pattern independent of the OS path separator.
regex = r"^(.+)_full_doc_sen_score_df\.parquet$"

# Built with os.path.join so the same cell works on Windows and POSIX.
# NOTE: the path is relative to the notebook's current working directory.
scores_dir = os.path.join('Parsim-sec', 'src', 'Analysis', 'data', '20_com_full_doc')

for root, dirs, files in os.walk(scores_dir):
    for file in files:

        match = re.search(regex, file)
        if match is None:
            # Not a score file (checkpoint, OS metadata, ...): skip instead of crashing
            continue

        ticker = match.group(1)
        file_path = os.path.join(root, file)

        df = pl.read_parquet(file_path)

        df_dict_scores[ticker] = df

if not df_dict_scores:
    # Fail here, with a useful message, rather than with a KeyError in a later cell
    raise FileNotFoundError(
        f"no '*_full_doc_sen_score_df.parquet' files found under {os.path.abspath(scores_dir)!r}; "
        "check the working directory"
    )

In [11]:
# Pick one company (GE) to develop and inspect the return computation on.
# NOTE(review): the saved output of this cell is a KeyError for 'GE', i.e. in that run
# df_dict_scores did not contain the ticker - re-run the loading cell above first.
df = df_dict_scores["GE"]
tic_sym = "GE"
df

KeyError: 'GE'

This is a helper function that finds the end prices and their dates. It takes two arguments: the starting index, which is the row number in the hist dataframe, and the hist dataframe itself. The hist dataframe contains daily OHLC data for a particular ticker (company).

It also checks whether the index of each end price is inside hist: if a particular report was released very recently (fewer than 8 trading days ago), it appends None for the end prices that fall outside the range of the hist df instead of raising an error.

It returns the list of end prices and the list of corresponding dates (None for prices and dates that are out of range of the hist df).

In [5]:
def find_end_price(start_index, hist):
    """Collect the 'Open' prices 2 to 7 rows after ``start_index``.

    Args:
        start_index: Row position in ``hist`` of the starting trading day.
        hist: Price history with an 'Open' column; its index supplies the dates.

    Returns:
        ``(prices, dates)``: two lists of six entries each, one per offset 2..7.
        Offsets that fall past the last row of ``hist`` yield ``None`` in both lists.
    """
    n_rows = len(hist)
    prices, dates = [], []

    for offset in range(2, 8):
        pos = start_index + offset
        in_range = pos < n_rows
        prices.append(hist.iloc[pos]['Open'] if in_range else None)
        dates.append(hist.index[pos] if in_range else None)

    return prices, dates

This is a helper function that calculates simple percentage returns. It takes the start price and end_price_list as arguments. If a value in the end price list is None, it appends None to the returns list instead of raising an error. This way we are sure that all returns lists have the same length, with None in place of the returns whose end prices were not found in the historical data.

In [6]:
def regular_returns(start_price, end_price_list):
    """Percentage return from ``start_price`` to each price in ``end_price_list``.

    Args:
        start_price: Price at the start of the period.
        end_price_list: End prices; entries may be ``None``.

    Returns:
        A list of the same length holding ``(end - start) / start * 100`` for each
        end price, with ``None`` wherever the end price is ``None``.
    """
    return [
        None if price is None else (price - start_price) / start_price * 100
        for price in end_price_list
    ]

This is a helper function that calculates S&P 500 returns. It takes the start date, end_price_date_list and the S&P 500 price history as arguments and calculates the index return over each of these time frames. It also checks whether a value in end_price_date_list is None (meaning that the date was not found in the historical data); if it is, the function appends None instead of raising an error.

This way we are sure that the list of S&P 500 returns has the same length for all report release dates and all time frames, with None for the returns that cannot be calculated because the end price does not exist.

In [7]:
def snp_500_return(start_date, end_price_date_list, snp_price):
    """Percentage return of the benchmark from ``start_date`` to each end date.

    Args:
        start_date: Index label in ``snp_price`` of the period start.
        end_price_date_list: Index labels of the period ends; entries may be ``None``.
        snp_price: Benchmark price history with an 'Open' column.

    Returns:
        A list of the same length as ``end_price_date_list`` holding
        ``(end_open - start_open) / start_open * 100``, with ``None`` wherever the
        end date is ``None``.

    Raises:
        KeyError: If ``start_date`` or a non-None end date is missing from
            ``snp_price.index`` (raised by ``Index.get_loc``).
    """

    def open_at(date):
        # Same lookup the function always used: label -> position -> 'Open'
        return snp_price.iloc[snp_price.index.get_loc(date)]['Open']

    snp_returns = []
    # The start price is the same for every end date, so look it up once.
    # It is looked up lazily so that an all-None list never touches start_date.
    start_price = None
    start_price_known = False

    for end_date in end_price_date_list:
        if end_date is None:
            snp_returns.append(None)
            continue

        if not start_price_known:
            start_price = open_at(start_date)
            start_price_known = True

        end_price = open_at(end_date)
        snp_returns.append((end_price - start_price) / start_price * 100)

    return snp_returns

This is a helper function that retrieves the end-of-quarter price. Its arguments are: hist - the historical price dataframe; date_str - the release date of the current report; df - the company dataframe, whose columns are named with the release dates of the corresponding reports.

It takes the release date of the current report and checks whether it is the last date in the series of reports. If it is, there is no next report yet, so the end-of-quarter price, date and quarter length are returned as None. If it is not, we take the next date in the company df (the release date of the next quarterly report) and read the open price on that date (or on the first trading day after it); this is precisely the end of the quarter.

In [8]:
def _first_session_on_or_after(hist, date_str):
    """Row position in ``hist`` of the first index entry on or after ``date_str``, or None.

    ``date_str`` is interpreted as midnight America/New_York. ``hist.index`` is
    assumed to be sorted ascending. A binary search is used instead of stepping one
    ``Timedelta(days=1)`` at a time: that step is a fixed 24 hours, so across a DST
    change it lands off midnight and never matches a midnight index entry again.
    """
    timestamp = pd.Timestamp(date_str, tz='America/New_York')
    position = int(hist.index.searchsorted(timestamp))
    return position if position < len(hist.index) else None


def find_end_quarter(hist, date_str, df):
    """Find the open price at the release of the report that follows ``date_str``.

    Args:
        hist: Price history with an 'Open' column and a sorted, tz-aware index.
        date_str: Release date of the current report; must be one of ``df.columns``.
        df: Company dataframe whose column names are report release dates in
            chronological order.

    Returns:
        ``(end_quarter_price, end_quarter_date, quarter_length)`` where the price and
        date belong to the first row of ``hist`` on or after the next report date and
        ``quarter_length`` is the number of rows between the first row on or after
        ``date_str`` and that row. All three are ``None`` when ``date_str`` is the
        last report or when either date lies beyond the end of ``hist``.

    Raises:
        ValueError: If ``date_str`` is not in ``df.columns``.
    """
    col_names = df.columns
    current_date_index = col_names.index(date_str)

    # Last report in the series: there is no following release to close the quarter
    if current_date_index >= len(col_names) - 1:
        return None, None, None

    next_date = col_names[current_date_index + 1]

    # Both ends are rolled forward to the next available trading day, the same way
    # the caller picks its start date, so a weekend/holiday release no longer
    # leaves the start position undefined.
    start_quarter_index = _first_session_on_or_after(hist, date_str)
    end_quarter_index = _first_session_on_or_after(hist, next_date)

    if start_quarter_index is None or end_quarter_index is None:
        # The history ends before one of the dates: nothing to measure
        return None, None, None

    end_quarter_price = hist.iloc[end_quarter_index]['Open']
    end_quarter_date = hist.index[end_quarter_index]
    quarter_length = end_quarter_index - start_quarter_index

    return end_quarter_price, end_quarter_date, quarter_length

This function takes a key-value pair from df_dict_scores (ticker and score dataframe) and calculates the 2-day, 3-day, 4-day, 5-day, 6-day, 7-day and full-quarter excess returns for it (the S&P 500 is the benchmark), each divided by the number of trading days in the period. It stores the result as a polars df with the report release dates as column names and the corresponding returns as column values (seven in each column: from the 2-day return down to the 7-day return, followed by the full-quarter return).

In [9]:
# Download the price history (period="6y") for the ticker selected above and show it.
# Requires network access; tic_sym must already be defined by the selection cell.
company_ticker = yf.Ticker(tic_sym)
hist = company_ticker.history(period="6y")
hist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-05-07 00:00:00-04:00,64.170638,64.580529,63.760747,64.079552,6858410,0.0,0.0
2018-05-08 00:00:00-04:00,64.580511,65.992357,64.170620,64.990402,12381195,0.0,0.0
2018-05-09 00:00:00-04:00,65.354768,66.857701,65.127049,66.584442,10614202,0.0,0.0
2018-05-10 00:00:00-04:00,67.222034,67.222034,66.037907,66.903229,7820843,0.0,0.0
2018-05-11 00:00:00-04:00,67.130935,67.267568,66.128980,66.493332,5970052,0.0,0.0
...,...,...,...,...,...,...,...
2024-04-30 00:00:00-04:00,163.559998,166.259995,161.580002,161.820007,6723600,0.0,0.0
2024-05-01 00:00:00-04:00,162.570007,163.300003,158.820007,159.699997,4775800,0.0,0.0
2024-05-02 00:00:00-04:00,161.389999,162.820007,159.419998,162.639999,5030400,0.0,0.0
2024-05-03 00:00:00-04:00,165.300003,165.300003,162.009995,164.110001,3968600,0.0,0.0


In [10]:
def computing_returns(tic_sym, df, snp_price=None):
    """Compute per-day excess returns over the S&P 500 after each report release.

    For every column name of ``df`` (a report release date string) the first trading
    day on or after that date is taken as the start. Returns are computed from the
    start 'Open' to the 'Open' 2..7 rows later and to the 'Open' at the next report
    release, the benchmark return over the same dates is subtracted, and the result
    is divided by the length of the period in trading days.

    Args:
        tic_sym: Ticker symbol passed to ``yf.Ticker``.
        df: Company dataframe whose column names are report release dates.
        snp_price: Optional, already downloaded S&P 500 history (with an 'Open'
            column). When omitted it is downloaded here, as before; passing it lets
            a caller that loops over many tickers download the benchmark once.

    Returns:
        A polars DataFrame with one Float64 column per release date and seven rows:
        2-, 3-, 4-, 5-, 6-, 7-day and full-quarter normalized excess returns.
        Values that cannot be computed are null.
    """
    company_ticker = yf.Ticker(tic_sym)
    hist = company_ticker.history(period="6y")

    if snp_price is None:
        snp500_ticker = yf.Ticker("^GSPC")
        snp_price = snp500_ticker.history(period="6y", auto_adjust=True)

    returns = {}

    for date_str in df.columns:

        report_date = pd.Timestamp(date_str, tz='America/New_York')

        # First trading day on or after the release date. A binary search replaces the
        # former `while ... += Timedelta(days=1)` loop, which never terminated when the
        # date was past the end of hist, or when the 24-hour step crossed a DST change
        # and stopped landing on midnight. Assumes hist.index is sorted ascending.
        start_index = int(hist.index.searchsorted(report_date))

        if start_index >= len(hist):
            # No price data on or after this release: keep the column, all null
            returns[date_str] = [None] * 7
            continue

        start_date = hist.index[start_index]
        start_price = hist.iloc[start_index]['Open']

        end_price_list, end_price_date_list = find_end_price(start_index, hist)

        # Reset for every report so a value from a previous iteration is never reused
        # (length_of_quarter used to be left unset when the branch below was skipped).
        end_quarter_price = None
        end_quarter_date = None
        length_of_quarter = None

        if None not in end_price_list:
            end_quarter_price, end_quarter_date, length_of_quarter = find_end_quarter(hist, date_str, df)

        end_price_list.append(end_quarter_price)
        end_price_date_list.append(end_quarter_date)

        reg_returns = regular_returns(start_price, end_price_list)
        snp_returns = snp_500_return(start_date, end_price_date_list, snp_price)

        excess_returns = [a - b if a is not None and b is not None else None for a, b in zip(reg_returns, snp_returns)]

        # normalization: divide returns for each time frame by the number of trading days
        # (2-day returns / 2, 3-day returns / 3, full quarter returns / length_of_quarter)
        timeframe_length = [2, 3, 4, 5, 6, 7, length_of_quarter]

        # A missing or zero-length period yields null instead of a division by zero
        normalized_excess_returns = [
            x / y if x is not None and y is not None and y != 0 else None
            for x, y in zip(excess_returns, timeframe_length)
        ]

        returns[date_str] = normalized_excess_returns

    # Force Float64 so an all-null column still stacks with the f64 score columns
    returns = pl.DataFrame(returns, schema_overrides={name: pl.Float64 for name in returns})

    return returns


In [11]:
# Smoke-test the return computation on the single company selected above.
computing_returns(tic_sym, df)

2019-04-30,2019-07-31,2019-10-30,2020-02-24,2020-04-29,2020-07-29,2020-10-28,2021-02-12,2021-04-27,2021-07-27,2021-10-26,2022-02-11,2022-04-26,2022-07-26,2022-10-25,2023-02-10,2023-04-25,2023-07-25,2023-10-24,2024-02-02,2024-04-23
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.135994,-2.286279,1.353142,0.438005,0.770958,-5.963037,-0.400486,1.796746,-1.398768,-0.380321,-0.947382,0.334166,-3.21175,2.572387,0.402784,0.106703,-1.798186,-0.009876,1.030062,0.483866,0.171553
0.318082,-1.699236,1.751244,-1.054996,-0.714661,-4.683877,0.41715,1.220329,-0.917904,-0.264637,-0.212925,0.733901,-2.737175,0.457406,0.673731,0.328117,-1.601782,0.222579,0.108764,0.398735,0.301743
0.264892,-1.154535,2.576425,-1.106406,-1.072307,-3.668826,0.512642,0.618446,-0.503753,-0.452481,-0.174344,0.614347,-2.333881,0.292274,0.384414,0.584638,-0.847757,-0.175717,-0.225727,0.19215,0.394966
0.379224,-1.408712,2.119088,0.918282,-1.195086,-2.904891,0.09653,1.290325,-0.19306,-1.100741,-0.041669,0.418949,-1.353679,0.831984,0.410568,0.6464,-0.445455,-0.24528,0.027005,0.17023,0.428098
0.082072,-1.273048,1.99026,0.197163,-1.317338,-2.069197,0.262134,1.970792,-0.375225,-0.753251,-0.126101,-0.432046,-0.948539,0.60097,0.452159,0.342176,0.005442,-0.207122,-0.276899,0.130259,0.499944
0.208395,-1.306744,1.922946,0.159189,-0.958,-1.930069,0.261249,1.87889,-0.231115,-0.463293,-0.247976,-0.179654,-0.91094,0.257513,0.605062,0.474203,0.087455,-0.14207,-0.388662,0.313372,0.264078
0.040864,-0.152569,0.160232,-0.703804,-0.091147,0.063911,0.480514,0.227109,-0.104953,-0.068808,-0.062514,-0.181003,-0.165446,0.182692,0.420243,0.497866,0.032636,0.063846,0.086811,0.799817,


In [12]:
# Switch the working company to ROKU and show its score table
# (one column per report release date, five score rows).
df = df_dict_scores["ROKU"]
tic_sym = "ROKU"
df

2019-05-10,2019-08-09,2019-11-08,2020-03-02,2020-05-11,2020-08-07,2020-11-06,2021-02-26,2021-05-07,2021-08-05,2021-11-04,2022-02-18,2022-04-29,2022-07-29,2022-11-03,2023-02-16,2023-04-27,2023-07-28,2023-11-02,2024-02-16,2024-04-26
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-27.295226,-26.769276,-27.194249,-28.559457,-33.156006,-34.871952,-34.746064,-32.90677,-36.729215,-36.477945,-36.595384,-30.815151,-34.560856,-37.275143,-37.896232,-33.891065,-38.898848,-38.186708,-38.393132,-38.267797,-39.034612
-164.0,-160.0,-162.0,-173.0,-192.0,-202.0,-200.0,-192.0,-201.0,-202.0,-200.0,-177.0,-184.0,-193.0,-196.0,-190.0,-200.0,-197.0,-198.0,-198.0,-200.0
-0.422897,-0.411494,-0.415051,-0.371429,-0.435664,-0.431111,-0.427043,-0.380845,-0.427751,-0.426284,-0.433662,-0.389239,-0.444867,-0.447124,-0.446593,-0.377617,-0.456204,-0.46217,-0.477837,-0.399584,-0.463652
0.466667,0.462373,0.469699,0.489944,0.45404,0.448459,0.447561,0.484229,0.448135,0.439471,0.425574,0.474099,0.413121,0.409879,0.409548,0.484827,0.407322,0.398859,0.387704,0.470352,0.404057
34909.0,36320.0,37040.0,57876.0,39824.0,42375.0,42709.0,57455.0,42257.0,44062.0,43365.0,57357.0,40446.0,41788.0,42524.0,58304.0,40998.0,41641.0,42433.0,57998.0,41937.0


This is the main loop: it takes each key-value pair from df_dict_scores, applies computing_returns() to it, vertically stacks the result under the original df and stores it in a new dict.

In [13]:
# Build, for every company, one table holding the 5 score rows followed by the
# 7 return rows, plus a "row_names" column that labels each row.
dict_for_anal = {}
# Labels in stacking order: rows 0-4 come from the score table, rows 5-11 from computing_returns()
row_names = pl.Series("row_names", ["weighted scores", "raw scores", "lm scores", "harvard scores", "doc length", "2_day_reterns","3_day_reterns", "4_day_reterns", "5_day_reterns", "6_day_reterns", "7_day_reterns", "full_quarter_returns"])

# NOTE: the loop variables overwrite the notebook-level tic_sym and df set in earlier cells
for tic_sym, df in df_dict_scores.items():
    # progress indicator: one line per ticker
    print(tic_sym)

    # downloads price data for this ticker (network access required)
    returns_df = computing_returns(tic_sym, df)
    
    # scores on top, returns below; the column names (release dates) must match
    stacked_df = df.vstack(returns_df)
    stacked_df_with_indx = stacked_df.hstack([row_names])
    
    dict_for_anal[tic_sym] = stacked_df_with_indx
    

AAPL
BIGC
CAT
CVX
FAST
FSLR
GE
HOOD
JNJ
JPM
MMM
MRNA
MSFT
PFE
PLUG
ROKU
SQ
TDOC
V
XOM


In [45]:
# Inspect the stacked scores-and-returns table of one company
dict_for_anal["XOM"]

2019-05-02,2019-08-07,2019-11-06,2020-02-26,2020-05-06,2020-08-05,2020-11-04,2021-02-24,2021-05-05,2021-08-04,2021-11-03,2022-02-23,2022-05-04,2022-08-03,2022-11-02,2023-02-22,2023-05-02,2023-08-01,2023-10-31,2024-02-28,row_names
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
-5.624371,-5.774065,-6.142186,-15.573369,-7.044378,-6.653674,-7.921205,-16.922346,-4.906546,-3.096871,-3.053303,-13.249128,-2.555981,-3.328652,-4.82591,-16.099691,-2.418704,-3.472974,-4.291548,-18.633463,"""weighted score…"
-34.0,-34.0,-38.0,-96.0,-51.0,-44.0,-53.0,-101.0,-31.0,-19.0,-17.0,-87.0,-16.0,-14.0,-28.0,-102.0,-18.0,-22.0,-23.0,-110.0,"""raw scores"""
-0.253456,-0.263158,-0.261084,-0.091067,-0.360324,-0.314741,-0.374233,-0.125402,-0.336134,-0.248869,-0.245763,-0.073016,-0.234043,-0.307692,-0.328063,-0.1,-0.278481,-0.25,-0.265823,-0.075515,"""lm scores"""
0.532777,0.5225,0.530864,0.553063,0.511018,0.506048,0.470389,0.542662,0.43934,0.436385,0.482379,0.5377,0.503467,0.447514,0.414905,0.529795,0.478261,0.438776,0.434084,0.536935,"""harvard scores…"
5714.0,5888.0,5988.0,38731.0,6780.0,7124.0,8288.0,39502.0,6145.0,6279.0,6826.0,38600.0,5218.0,5394.0,6553.0,39072.0,4704.0,5787.0,6988.0,40099.0,"""doc length"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
-0.191677,-0.09593,-0.669581,0.070892,0.273965,0.365251,2.508111,0.468193,0.937906,-0.011776,0.261807,0.534425,-0.205253,-1.310119,0.658515,0.325908,-0.591962,0.698443,-0.543068,-0.199369,"""4_day_reterns"""
-0.066573,-0.228677,-0.85038,-0.250894,-0.032326,0.308114,1.91393,0.495917,0.59553,0.230914,0.502829,1.093878,0.22806,-1.147909,0.490261,-0.019443,-0.791578,0.182757,-1.144647,0.250449,"""5_day_reterns"""
0.07565,-0.299875,-0.781364,-0.921066,-0.624288,-0.200656,0.966591,0.820248,0.432223,0.200674,0.160106,0.446788,0.607937,-0.782488,-0.237861,0.284125,-0.53287,0.555494,-1.143245,0.125426,"""6_day_reterns"""
0.079014,-0.283439,-0.779326,-0.650465,-0.285958,-0.450478,0.668577,1.496555,0.409297,0.017494,0.09804,0.800813,0.543628,-0.564654,-0.282865,-0.042,-0.824774,0.718785,-1.012154,0.103216,"""7_day_reterns"""


Let's start the analysis. The general question is: how well can the sentiment scores and the weighted scores explain returns?

Let's look at them on a graph:

In [14]:
#for company_name, scores in dict_for_anal.items():
#   
#    df_for_plt = scores.select(pl.exclude("row_names"))
#
#    dates = [datetime.strptime(date, "%Y-%m-%d") for date in df_for_plt.columns]
#   
#    plt.figure(figsize=(10, 6))
#    row_indices_to_plot = [0, 1, 4, -1]
#
#    for idx in row_indices_to_plot:
#        row = df_for_plt.row(idx, named=True)  
#        plt.plot(dates, list(row.values()), label=scores.row(idx)[-1])
#
#    plt.title(company_name)
#    plt.xlabel("Date")
#    plt.ylabel("Values")
#    plt.legend()
#    plt.xticks(rotation=45)
#    plt.tight_layout()
#    plt.show()

At first glance, some of the time series exhibit at least piecewise dependence.

Next step is to regress each return on weighted, raw scores and polarity scores from Loughran-McDonald and Harvard dictionaries.

We can also see that for 10-K reports, which are typically much longer, there is a strong pattern in the raw scores that is smoothed in the weighted scores, although the pattern can still be recognized. This can be interpreted as 10-K reports containing more information about future prices.

The functions below take a key-value pair (company name and dataframe with sentiment scores and returns) and regress each return (2_day, 3_day, etc.) on each of the four scores in turn (weighted, raw, Loughran-McDonald and Harvard), controlling for document length and quarter dummies. The slopes (beta coefficients) of the score in these regressions are collected in a separate polars series per score (in the loop they will be stacked into dataframes).

In [14]:
def add_quarter_dummy(company_df_pd):
    """Transpose a company table to one row per report and append quarter dummies.

    Args:
        company_df_pd: pandas DataFrame whose column names are report dates
            (parseable by ``pd.to_datetime``) and whose rows are scores/returns.

    Returns:
        A DataFrame with one row per report date (index formatted '%Y-%m-%d'), the
        original rows as numeric columns, and three trailing integer 0/1 columns
        'Q_1', 'Q_2', 'Q_3' marking the calendar quarter of the date. Quarter 4 is
        the omitted reference category.
    """
    dates = pd.to_datetime(company_df_pd.columns)

    # Declare all four quarters up front so get_dummies always emits Q_1..Q_4,
    # even when a company has no report in some quarter. Without this, dropping
    # "Q_4" or selecting Q_1..Q_3 raised KeyError, and callers that address the
    # dummies as "the last three columns" would have picked the wrong columns.
    quarters = pd.Categorical(dates.quarter, categories=[1, 2, 3, 4])
    date_series = pd.Series(quarters, index=dates)

    dummies = pd.get_dummies(date_series, prefix="Q")
    dummies.index = dummies.index.strftime('%Y-%m-%d')

    # Q_4 is the reference category (avoids perfect collinearity with the constant)
    dummies = dummies.drop(columns=["Q_4"])

    # Stack the dummies under the score/return rows, then flip to one row per report
    dummies = dummies.transpose()
    df_wit_dum = pd.concat([company_df_pd, dummies])

    df_wit_dum = df_wit_dum.transpose()

    # The concat/transpose leaves mixed-type columns; make every column numeric again
    for col in df_wit_dum.columns:
        df_wit_dum[col] = pd.to_numeric(df_wit_dum[col])

    boolean_cols = ['Q_1', 'Q_2', 'Q_3']
    df_wit_dum[boolean_cols] = df_wit_dum[boolean_cols].astype(int)

    return df_wit_dum

    

In [63]:
def fit_regression(y, x):
    """Fit an OLS regression of ``y`` on ``x`` plus a constant and return one slope.

    Rows with missing values are dropped (``missing='drop'``). The full model
    summary is printed as a side effect.

    Args:
        y: Dependent variable.
        x: Regressors; ``sm.add_constant`` is applied before fitting.

    Returns:
        The fitted parameter at position 1, i.e. the coefficient of the first
        column of ``x`` when the constant has been placed at position 0.
    """
    design = sm.add_constant(x)
    results = sm.OLS(y, design, missing='drop').fit()

    first_slope = results.params.iloc[1]
    print(results.summary())

    return first_slope

In [64]:
def regression_slopes(df_wit_dum, X_names, company_name="total_df"):
    """Regress every return column on every score and collect the score slopes.

    Column layout expected in ``df_wit_dum`` (by position): 0-3 are the four scores,
    4 is the document-length control, 5-11 are the seven return series and the last
    three columns are the quarter dummies.

    Args:
        df_wit_dum: One row per report, columns laid out as described above.
        X_names: Four one-element row tuples holding the score labels; the first
            word of each label names the resulting series.
        company_name: Name given to each returned polars Series.

    Returns:
        Dict mapping '<first word of label>_slopes' to a polars Series of seven
        slopes (one per return column, in column order).
    """
    # document length + the three quarter dummies accompany every score
    control_positions = [4, -3, -2, -1]
    regressor_frames = [
        df_wit_dum.iloc[:, [score_position, *control_positions]]
        for score_position in range(0, 4)
    ]
    target_frames = [df_wit_dum.iloc[:, [return_position]] for return_position in range(5, 12)]

    slopes_dict = {}

    for position, regressors in enumerate(regressor_frames):
        score_label = X_names[position][0].split()[0]
        fitted_slopes = [fit_regression(target, regressors) for target in target_frames]
        slopes_dict[f'{score_label}_slopes'] = pl.Series(company_name, fitted_slopes)

    return slopes_dict

This is the main loop: it takes dict_for_anal with scores and returns and, for each company in the dict, calculates four series of regression slopes - one each for the weighted, raw, Loughran-McDonald and Harvard scores. The series are stacked into separate dataframes (one dataframe per score).

In [52]:
def compute_all_slopes(dict_for_anal):
    """Run the per-company regressions and gather the slopes by score type.

    Args:
        dict_for_anal: Dict mapping company name to a polars DataFrame with one
            column per report date plus a "row_names" column; its first four rows
            are the score rows.

    Returns:
        Dict with the keys "weighted_slopes_df", "raw_slopes_df", "lm_slopes_df" and
        "harvard_slopes_df". Each value is a polars DataFrame with one column per
        company holding that company's slopes.
    """
    slopes_df_dict = {
        "weighted_slopes_df" : pl.DataFrame(),
        "raw_slopes_df" : pl.DataFrame(),
        "lm_slopes_df" : pl.DataFrame(),
        "harvard_slopes_df" : pl.DataFrame()
    }

    for company_name, company_df in dict_for_anal.items():

        # labels of the four score rows, each as a one-element row tuple
        label_column = company_df.select(pl.col("row_names"))
        X_names = [label_column.row(position) for position in range(0, 4)]

        # numeric part only, converted to pandas for the regression helpers
        numeric_part = company_df.select(pl.exclude("row_names")).to_pandas()
        df_wit_dum = add_quarter_dummy(numeric_part)

        slopes_dict = regression_slopes(df_wit_dum, X_names, company_name)

        # append this company's series as a new column of the matching frame
        for row_label in X_names:
            score_kind = row_label[0].split()[0]
            target_key = f"{score_kind}_slopes_df"
            company_series = slopes_dict[f'{score_kind}_slopes']
            slopes_df_dict[target_key] = slopes_df_dict[target_key].hstack([company_series])

    return slopes_df_dict
        

In [None]:
# Per-company regressions; prints one OLS summary per fitted model (a lot of output)
slopes_df_dict = compute_all_slopes(dict_for_anal)

The output of the previous functions is a set of dataframes with the regression slopes of the different returns on the raw, weighted, Loughran-McDonald and Harvard scores. Each column of these dfs contains the regression slopes for a particular company. The first row contains the slope for the 2_day return, the second row the slope for the 3_day return, etc. I have decided to check the proportion of positive regression slopes for each return time frame (2_day, 3_day, etc.). For that purpose I transpose each of these dfs, apply the condition > 0, then sum the boolean values vertically and obtain the proportion of positive slopes for each time frame.

In [66]:
def calc_efficiency_metrics(slopes_df, labels=None):
    """Print, for each return time frame, the share of companies with a positive slope.

    Args:
        slopes_df: polars DataFrame with one column per company and one row per
            return time frame.
        labels: Optional names of the rows, in order. Defaults to the seven time
            frames produced in this notebook (2- to 7-day and full quarter). Rows
            beyond the supplied labels are reported as ``row_<position>``.

    Returns:
        None; the proportions are printed.
    """
    if labels is None:
        # The seventh row is the full-quarter return; it used to be printed as "8_day"
        labels = ["2_day", "3_day", "4_day", "5_day", "6_day", "7_day", "full_quarter"]

    # After transposing, each column is one time frame and each row one company
    df_tranposed = slopes_df.transpose()

    # Count of positive slopes per time frame (a single-row frame)
    res = df_tranposed.select(pl.all() > 0).sum()
    total_num_of_slopes = slopes_df.shape[1]

    positive_slopes = res.row(0)

    for counter, x in enumerate(positive_slopes):
        label = labels[counter] if counter < len(labels) else f"row_{counter}"
        print(f'Prorortion of positive {label} returns: {x/total_num_of_slopes}')

This loop applies the calc_efficiency_metrics function to each item of slopes_df_dict, where each item is a dataframe with the regression slopes for one regressor.

In [67]:
# One report per score type: key names the frame, slopes_df holds one column per company
for key, slopes_df in slopes_df_dict.items():
    print("\n")
    print(f'Metrics for: {key}')
    calc_efficiency_metrics(slopes_df)
    



Metrics for: weighted_slopes_df
Prorortion of positive 2_day returns: 0.45
Prorortion of positive 3_day returns: 0.45
Prorortion of positive 4_day returns: 0.55
Prorortion of positive 5_day returns: 0.45
Prorortion of positive 6_day returns: 0.4
Prorortion of positive 7_day returns: 0.45
Prorortion of positive 8_day returns: 0.7


Metrics for: raw_slopes_df
Prorortion of positive 2_day returns: 0.45
Prorortion of positive 3_day returns: 0.45
Prorortion of positive 4_day returns: 0.55
Prorortion of positive 5_day returns: 0.6
Prorortion of positive 6_day returns: 0.5
Prorortion of positive 7_day returns: 0.65
Prorortion of positive 8_day returns: 0.65


Metrics for: lm_slopes_df
Prorortion of positive 2_day returns: 0.5
Prorortion of positive 3_day returns: 0.4
Prorortion of positive 4_day returns: 0.45
Prorortion of positive 5_day returns: 0.5
Prorortion of positive 6_day returns: 0.4
Prorortion of positive 7_day returns: 0.45
Prorortion of positive 8_day returns: 0.5


Metrics for: 

A positive slope means that the sentiment score and the return move in the same direction (hence, the score works as a predictor of the stock price movement). As the result of this preliminary analysis we can say that the sentiment scores of quarterly reports are not very good predictors of stock movements. The proportions of positive slopes are close to 0.5, which means that on average in our sample there is a 50/50 chance that scores and returns move in the same direction.

Despite the visual conclusion that the weighted scores are "smoother", they barely outperform the raw scores, but at least they are not worse, meaning that at each time frame (except 5_days) the proportion of positive regression slopes of the weighted scores >= the proportion of positive regression slopes of the raw scores. (Note: verify this statement against the current output above - in the displayed run the raw scores have the higher proportion at the 5-, 6- and 7-day time frames.)

This function stacks the data of all companies into one df and calculates a single slope for each time frame.

In [68]:
def compute_signl_slope(dict_for_anal):
    """Pool all companies into one table and regress each return on each score.

    Args:
        dict_for_anal: Dict mapping company name to a polars DataFrame with one
            column per report date plus a "row_names" column; its first four rows
            are the score rows.

    Returns:
        The dict produced by ``regression_slopes`` for the pooled data: one polars
        Series of slopes per score, named "total_df".

    Raises:
        ValueError: If ``dict_for_anal`` is empty.
    """
    if not dict_for_anal:
        raise ValueError("dict_for_anal is empty: there is nothing to regress")

    df_list = []
    company_df = None

    for company_df in dict_for_anal.values():

        company_df_woi = company_df.select(pl.exclude("row_names"))
        company_df_pd = company_df_woi.to_pandas(use_pyarrow_extension_array=True)

        df_list.append(add_quarter_dummy(company_df_pd))

    # The score labels are only needed once. As before, they are read from the last
    # company in the dict (previously recomputed on every iteration and overwritten).
    X_names = [company_df.select(pl.col("row_names")).row(x) for x in range(0, 4)]

    # One row per (company, report); the original date index is discarded
    total_df = pd.concat(df_list, ignore_index=True)

    slopes_dict = regression_slopes(total_df, X_names)

    return slopes_dict

In [71]:
# Pooled regression over all companies; also prints one OLS summary per fitted model
slopes_dict = compute_signl_slope(dict_for_anal)
print(slopes_dict)

                            OLS Regression Results                            
Dep. Variable:                      5   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9166
Date:                Tue, 07 May 2024   Prob (F-statistic):              0.470
Time:                        13:01:30   Log-Likelihood:                -1013.0
No. Observations:                 392   AIC:                             2038.
Df Residuals:                     386   BIC:                             2062.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1518      0.405      0.374      0.7

This loop iterates over slopes_dict, where each item is now not a dataframe but a series of values. Each series contains the regression slopes for the whole data set (the data of the different companies stacked in one dataframe), one per return time frame: 2-day, 3-day, ..., 7-day and full-quarter returns.

In [70]:
# `slope` is the dict key (e.g. "weighted_slopes"); the value is the series of seven slopes
for slope in slopes_dict:
    print(f'{slope} for all data stacked displayed: \n {slopes_dict[slope]}')
    print("\n")

weighted_slopes for all data stacked displayed: 
 shape: (7,)
Series: 'total_df' [f64]
[
	-0.026322
	-0.000798
	0.008307
	0.005171
	0.009057
	0.008221
	0.004432
]


raw_slopes for all data stacked displayed: 
 shape: (7,)
Series: 'total_df' [f64]
[
	-0.008051
	-0.002117
	-0.000113
	-0.000422
	0.0004
	0.000383
	0.000756
]


lm_slopes for all data stacked displayed: 
 shape: (7,)
Series: 'total_df' [f64]
[
	-0.326308
	-0.003934
	-0.052607
	0.114031
	0.104608
	0.208513
	0.352519
]


harvard_slopes for all data stacked displayed: 
 shape: (7,)
Series: 'total_df' [f64]
[
	-2.25362
	-0.056819
	0.944824
	0.528853
	0.677927
	0.552354
	0.183957
]




jupyter nbconvert --to pdf Parsim-sec\src\Analysis\YFiin_excess_returns.ipynb

jupyter nbconvert --to html Parsim-sec\src\Analysis\YFiin_excess_returns.ipynb

