In [433]:
import sys
from pathlib import Path

sys.path.append(r"/home/maxim-shibanov/Projects_Py/Risk-and-return-prediction-with-LLM/src")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import copy
import matplotlib.lines as mlines
import re
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from textwrap import dedent
from linearmodels.panel import PanelOLS
from io import StringIO
from sklearn.preprocessing import StandardScaler

from data_analysis.data_fetcher.data_fetcher_class import DataFetcher
from data_collection.consts import  DB_PARAMS

In [434]:
fetcher = DataFetcher(
    db_params=DB_PARAMS,
    reports_table="reports",
    targets_table="targets_yf",
)

Available regressors:
 - avg_default_verbolizer
 - avg_shrink_verbolizer
 - doc_len
 - eps_surprise
 - f_size
 - full_list_default_verbolizer
 - full_list_shrink_verbolizer
 - hv_orig_score
 - lm_orig_score
 - max_abs_default
 - max_abs_shrink
 - max_default_verbolizer
 - max_shrink_verbolizer
 - md_hv1
 - md_hv2
 - md_hv3
 - md_lm1
 - md_lm2
 - md_lm3
 - min_default_verbolizer
 - min_shrink_verbolizer
 - stretch_default
 - stretch_shrink
Available sectors:
 - Technology (92)
 - Industrials (86)
 - Financial Services (85)
 - Healthcare (66)
 - Consumer Cyclical (58)
 - Consumer Defensive (40)
 - Real Estate (32)
 - Utilities (32)
 - Energy (30)
 - Basic Materials (23)
 - Communication Services (22)


  df = pd.read_sql_query(query, conn)
  df = pd.read_sql_query(query, conn)


In [435]:
df = fetcher.fetch_data(
      regressors=[
       'avg_default_verbolizer', 
       'avg_shrink_verbolizer',
       'max_abs_default',
       'max_abs_shrink',
        'max_default_verbolizer',
        'max_shrink_verbolizer',
        'min_default_verbolizer',
        'min_shrink_verbolizer',
        'eps_surprise',
        'f_size',
        'doc_len'
       ],
   prepare_fixed_effects=True
)

  df = pd.read_sql_query(query, conn)
  return pd.read_sql_query(query, conn)
  companies_df = pd.read_sql_query(query, conn)


In [436]:
df['f_size'] = df['f_size'] / 10**10

r_vol ~ (avg_default_verbolizer, avg_shrink_verbolizer, max_abs_default, max_default_verbolizer, max_shrink_verbolizer)

returns ~ max_abs_default, max_abs_shrink, min_default_verbolizer, min_shrink_verbolizer

e_returns ~ max_abs_shrink,

abn_returns ~ max_abs_shrink, max_default_verbolizer, max_shrink_verbolizer, min_default_verbolizer, min_shrink_verbolizer

In [437]:
class FEModeler:
    """
    A class to run panel fixed effects regressions using PanelOLS
    and extract key parameter statistics for a specified regressor.

    Attributes:
        df: The input panel data.
        var_names: Column names of the input DataFrame.
        params_dict: Dictionary storing regression parameters for each independent variable.
    """

    def __init__(self, df: pd.DataFrame, scale: bool = False):
        """
        Initialize the FEModeler with panel data.

        Args:
            df: Panel DataFrame with MultiIndex (entity, time).
            scale: Whether to standardize the regressors.
        """
        self.original_df = df.copy()
        self.df = df.copy()
        self.var_names = df.columns
        self.params_dict = {}
        self.scale = scale

    @staticmethod
    def extract_params(summary, regressor_name: str) -> pd.Series:
        """
        Return just the coefficient (beta) for `regressor_name`
        PLUS all R-squared metrics.
        """

        # --- coefficient table ---
        html = summary.tables[1].as_html()
        coef_df = pd.read_html(StringIO(html), header=0, index_col=0)[0]
        beta = coef_df.loc[regressor_name, "Parameter"]

        # --- grab R²s from text ---
        txt = summary.as_text()
        def grab(label):
            m = re.search(fr"{label}:\s*([-\d.]+)", txt)
            return float(m.group(1)) if m else np.nan

        r2          = grab("R-squared")
        r2_between  = grab(r"R-squared \(Between\)")
        r2_within   = grab(r"R-squared \(Within\)")
        r2_overall  = grab(r"R-squared \(Overall\)")

        params_dict = {
                "beta": beta,
                "r2": r2,
                "r2_between": r2_between,
                "r2_within": r2_within,
                "r2_overall": r2_overall,
            }
        return params_dict
    

    def fit(self, data: pd.DataFrame, formula: str):
        """
        Fit a fixed effects regression model using PanelOLS.

        Args:
            data: A subset of the full DataFrame containing required variables.
            formula: Regression formula in Patsy-style syntax.

        Returns:
            The regression summary object.
        """
        model = PanelOLS.from_formula(formula, data=data, check_rank=True)
        result = model.fit()
        return result.summary

    def compute(self) -> dict[str, list[pd.Series]]:
        """
        Run fixed effects regressions for all combinations of independent and dependent variables,
        and store the parameter summaries for each independent variable.

        Returns:
            A dictionary where keys are independent variable names,
            and values are lists of Series with parameter info for each dependent variable.        """
        
        ALLOWED = {
            "r_vol": [
                "avg_default_verbolizer", "avg_shrink_verbolizer",
                "max_abs_default", "max_default_verbolizer", "max_shrink_verbolizer",
            ],
            "returns": [
                "max_abs_default", "max_abs_shrink",
                "min_default_verbolizer", "min_shrink_verbolizer",
            ],
            "e_returns": ["max_abs_shrink"],
            "abn_returns": [
                "max_abs_shrink", "max_default_verbolizer",
                "max_shrink_verbolizer", "min_default_verbolizer",
                "min_shrink_verbolizer",
            ],
        }

        targets = ['returns', 'e_returns', 'abn_returns', 'r_vol']
        time_frames = ['2_day', '3_day', '4_day', '5_day', '6_day', '7_day', 'full_quarter' ]

        var_names = self.df.columns 

        for x in range(1, 9):
            # Targets starts from y index
            y = 12

            per_target_params = {}
            for target in targets:  
                if target not in per_target_params:
                    per_target_params[target] = {}

                for frame in time_frames:
                    reg_name = var_names[x]
                    if reg_name not in ALLOWED[target]:
                        y += 1  
                        continue  

                    data = self.df.iloc[:, [x, 9, 10, 11, y]].dropna().copy()

                    formula = f"{var_names[y]} ~ {var_names[x]} + eps_surprise + f_size + EntityEffects + TimeEffects"
                    try:
                        result_summary = self.fit(data, formula)
                        print(result_summary)
                        param_info = self.extract_params(result_summary, self.var_names[x])
                        per_target_params[target][var_names[y]] = param_info
                        
                    except Exception as e:
                        print(e)
                    y += 1

            self.params_dict[var_names[x]] = per_target_params

        return self.params_dict


In [None]:
class BootStraper:

    def __init__(self, df: pd.DataFrame, num_samples: int):
        self.df = df
        self.num_samples = num_samples
        self.resamples: list[pd.DataFrame] = []

    def create_resamples(self):
        
        def random_matrix(len_sample, num_samples):
            return np.random.randint(0, len_sample, size=(len_sample, num_samples)) 

        len_sample = len(df.index.levels[0])

        matrix = random_matrix(len_sample, self.num_samples)
        
        for sample in range(self.num_samples):
            col =  matrix[:, sample]

            level0_full = df.index.levels[0]
            selected_level0_values = level0_full[col]
            filtered_df = df.loc[selected_level0_values].copy()


            level0 = filtered_df.index.get_level_values(0).to_numpy()
            level1 = filtered_df.index.get_level_values(1).to_numpy()

            block_starts = np.r_[True, level0[1:] != level0[:-1]]
            block_ids = np.cumsum(block_starts) - 1 

            new_index = pd.MultiIndex.from_arrays(
                [block_ids, level1],
                names=filtered_df.index.names
            )

            filtered_df.index = new_index
            self.resamples.append(filtered_df)

    @staticmethod
    def fit_FE(df: pd.DataFrame):

        model = FEModeler(df=df, scale=False)
        return model.compute()
    
    @staticmethod
    def _deep_append(base: dict, new: dict):
        """
        Merge `new` into `base` in place.

        Rules
        -----
        • Internal nodes are plain dicts; recurse.
        • A terminal node is a dict whose *values are scalars* (not dicts).
          For terminals we build/extend **dicts of lists**, so after N
          draws every statistic key holds N values.
        """
        for key, val in new.items():

            # -------- leaf test: val is dict but none of its values is a dict
            is_leaf = isinstance(val, dict) and not any(
                isinstance(x, dict) for x in val.values()
            )

            # -------- branch doesn't exist yet
            if key not in base:
                if is_leaf:
                    # first encounter → start lists
                    base[key] = {k: [v] for k, v in val.items()}
                else:
                    base[key] = {}
                    BootStraper._deep_append(base[key], val)
                continue

            # -------- branch exists
            if is_leaf:
                # ensure existing branch is dict of lists
                for stat_k, stat_v in val.items():
                    if stat_k not in base[key]:
                        base[key][stat_k] = []
                    base[key][stat_k].append(stat_v)
            else:
                BootStraper._deep_append(base[key], val)

    @staticmethod
    def drop_nulls(d: dict):
        keys_to_del = {}

        for regressor in d:
            for target, value in d[regressor].items():
                if not d[regressor][target]:
                    if not value:                                  
                        keys_to_del.setdefault(regressor, []).append(target)

        for reg in keys_to_del:
            for target in keys_to_del[reg]:
                print(reg, keys_to_del[reg])
                del d[reg][target]

        return d

    def compute_obs(self):
        return self.fit_FE(self.df)

    def run(self, n_jobs: int = -1) -> dict:
        """
        * Creates all resamples,
        * fits FE regressions on each (in parallel),
        * returns a nested dict where every leaf is **a list of Series**
          (one Series per bootstrap draw).
        """
        #  1. resample list
        self.create_resamples()

        #  2. parallel estimation with progress bar
        computed = Parallel(n_jobs=n_jobs)(
            delayed(self.fit_FE)(res)
            for res in tqdm(self.resamples, desc="Bootstrapping")
        )

        #  3. combine
        params: dict = {}
        for res_dict in computed:
            self._deep_append(params, res_dict)
        
        params = self.drop_nulls(params)

        return params


In [439]:
BootStraper(df, 2).compute_obs()

                          PanelOLS Estimation Summary                           
Dep. Variable:          two_day_r_vol   R-squared:                        0.0013
Estimator:                   PanelOLS   R-squared (Between):             -0.1152
No. Observations:               12968   R-squared (Within):               0.0048
Date:                Fri, May 16 2025   R-squared (Overall):             -0.0896
Time:                        22:30:24   Log-likelihood                 1.328e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      5.5983
Entities:                         486   P-value                           0.0008
Avg Obs:                       26.683   Distribution:                 F(3,12452)
Min Obs:                       1.0000                                           
Max Obs:                       28.000   F-statistic (robust):             5.5983
                            

{'avg_default_verbolizer': {'returns': {},
  'e_returns': {},
  'abn_returns': {},
  'r_vol': {'two_day_r_vol': {'beta': -0.0908,
    'r2': 0.0013,
    'r2_between': -0.1152,
    'r2_within': 0.0048,
    'r2_overall': -0.0896},
   'three_day_r_vol': {'beta': -0.1038,
    'r2': 0.0016,
    'r2_between': -0.1428,
    'r2_within': 0.0056,
    'r2_overall': -0.1053},
   'four_day_r_vol': {'beta': -0.111,
    'r2': 0.0019,
    'r2_between': -0.1609,
    'r2_within': 0.0064,
    'r2_overall': -0.118},
   'five_day_r_vol': {'beta': -0.1162,
    'r2': 0.0019,
    'r2_between': -0.162,
    'r2_within': 0.006,
    'r2_overall': -0.1135},
   'six_day_r_vol': {'beta': -0.1199,
    'r2': 0.0024,
    'r2_between': -0.1741,
    'r2_within': 0.0071,
    'r2_overall': -0.1239},
   'seven_day_r_vol': {'beta': -0.1186,
    'r2': 0.0024,
    'r2_between': -0.1765,
    'r2_within': 0.0073,
    'r2_overall': -0.1251},
   'full_q_r_vol': {'beta': -0.1053,
    'r2': 0.0039,
    'r2_between': -0.213,
    'r2_w

In [440]:
d = BootStraper(df, 2).run()

Bootstrapping:   0%|          | 0/2 [00:00<?, ?it/s]

                          PanelOLS Estimation Summary                           
Dep. Variable:          two_day_r_vol   R-squared:                        0.0013
Estimator:                   PanelOLS   R-squared (Between):             -0.1021
No. Observations:               12969   R-squared (Within):               0.0052
Date:                Fri, May 16 2025   R-squared (Overall):             -0.0805
Time:                        22:30:31   Log-likelihood                 1.322e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      5.3657
Entities:                         487   P-value                           0.0011
Avg Obs:                       26.630   Distribution:                 F(3,12452)
Min Obs:                       1.0000                                           
Max Obs:                       56.000   F-statistic (robust):             5.3657
                            

In [441]:
d

{'avg_default_verbolizer': {'r_vol': {'two_day_r_vol': {'beta': [-0.0889,
     -0.053],
    'r2': [0.0013, 0.0006],
    'r2_between': [-0.1021, -0.0652],
    'r2_within': [0.0052, 0.0024],
    'r2_overall': [-0.0805, -0.0513]},
   'three_day_r_vol': {'beta': [-0.0978, -0.0693],
    'r2': [0.0014, 0.0012],
    'r2_between': [-0.1283, -0.0907],
    'r2_within': [0.0053, 0.0049],
    'r2_overall': [-0.0957, -0.0745]},
   'four_day_r_vol': {'beta': [-0.0975, -0.0755],
    'r2': [0.0015, 0.0017],
    'r2_between': [-0.1371, -0.1026],
    'r2_within': [0.0054, 0.0066],
    'r2_overall': [-0.1017, -0.0862]},
   'five_day_r_vol': {'beta': [-0.1152, -0.0836],
    'r2': [0.0017, 0.0017],
    'r2_between': [-0.1458, -0.1004],
    'r2_within': [0.0048, 0.0056],
    'r2_overall': [-0.0946, -0.0771]},
   'six_day_r_vol': {'beta': [-0.1278, -0.0827],
    'r2': [0.0022, 0.002],
    'r2_between': [-0.1719, -0.1022],
    'r2_within': [0.0061, 0.0063],
    'r2_overall': [-0.1137, -0.0794]},
   'seven_day

In [442]:
d['avg_default_verbolizer']['r_vol']['two_day_r_vol']

{'beta': [-0.0889, -0.053],
 'r2': [0.0013, 0.0006],
 'r2_between': [-0.1021, -0.0652],
 'r2_within': [0.0052, 0.0024],
 'r2_overall': [-0.0805, -0.0513]}

In [443]:
d

{'avg_default_verbolizer': {'r_vol': {'two_day_r_vol': {'beta': [-0.0889,
     -0.053],
    'r2': [0.0013, 0.0006],
    'r2_between': [-0.1021, -0.0652],
    'r2_within': [0.0052, 0.0024],
    'r2_overall': [-0.0805, -0.0513]},
   'three_day_r_vol': {'beta': [-0.0978, -0.0693],
    'r2': [0.0014, 0.0012],
    'r2_between': [-0.1283, -0.0907],
    'r2_within': [0.0053, 0.0049],
    'r2_overall': [-0.0957, -0.0745]},
   'four_day_r_vol': {'beta': [-0.0975, -0.0755],
    'r2': [0.0015, 0.0017],
    'r2_between': [-0.1371, -0.1026],
    'r2_within': [0.0054, 0.0066],
    'r2_overall': [-0.1017, -0.0862]},
   'five_day_r_vol': {'beta': [-0.1152, -0.0836],
    'r2': [0.0017, 0.0017],
    'r2_between': [-0.1458, -0.1004],
    'r2_within': [0.0048, 0.0056],
    'r2_overall': [-0.0946, -0.0771]},
   'six_day_r_vol': {'beta': [-0.1278, -0.0827],
    'r2': [0.0022, 0.002],
    'r2_between': [-0.1719, -0.1022],
    'r2_within': [0.0061, 0.0063],
    'r2_overall': [-0.1137, -0.0794]},
   'seven_day