In [62]:
import sys
from pathlib import Path

sys.path.append(r"/home/maxim-shibanov/Projects_Py/Risk-and-return-prediction-with-LLM/src")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from linearmodels.panel import PanelOLS
from io import StringIO
import matplotlib.lines as mlines
from sklearn.preprocessing import StandardScaler
import time
from sklearn.linear_model import LassoCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from data_analysis.data_fetcher.data_fetcher_class import DataFetcher
from data_collection.consts import  DB_PARAMS

In [63]:
fetcher = DataFetcher(
    db_params=DB_PARAMS,
    reports_table="reports_2",
    targets_table="targets_yf",
)

Available regressors:
 - avg_default_verbolizer
 - avg_shrink_verbolizer
 - doc_len
 - eps_surprise
 - f_size
 - full_list_default_verbolizer
 - full_list_shrink_verbolizer
 - hv_orig_score
 - lm_orig_score
 - max_abs_default
 - max_abs_shrink
 - max_default_verbolizer
 - max_shrink_verbolizer
 - md_hv1
 - md_hv2
 - md_hv3
 - md_lm1
 - md_lm2
 - md_lm3
 - min_default_verbolizer
 - min_shrink_verbolizer
 - stretch_default
 - stretch_shrink
Available sectors:
 - Technology (92)
 - Industrials (86)
 - Financial Services (85)
 - Healthcare (66)
 - Consumer Cyclical (58)
 - Consumer Defensive (40)
 - Real Estate (32)
 - Utilities (32)
 - Energy (30)
 - Basic Materials (23)
 - Communication Services (22)


  df = pd.read_sql_query(query, conn)
  df = pd.read_sql_query(query, conn)


In [64]:
df_q = fetcher.fetch_data(
    regressors=[
        'stretch_shrink',
        'eps_surprise',
        'f_size',
        'doc_len'
        ],
    report_filters={'report_type' : ['10-K']},
       prepare_fixed_effects=True,
    )

  df = pd.read_sql_query(query, conn)


Expanding list regressor 'stretch_shrink' into 107 columns...


  return pd.read_sql_query(query, conn)
  companies_df = pd.read_sql_query(query, conn)


In [65]:
df_q.isna().sum()

report_type         0
eps_surprise        0
f_size              0
doc_len             0
segment_1           0
                   ..
four_day_r_vol     49
five_day_r_vol     49
six_day_r_vol      49
seven_day_r_vol    49
full_q_r_vol       49
Length: 139, dtype: int64

In [66]:
class FEModeler:
    def __init__(self, df: pd.DataFrame,):
        self.original_df = df.copy()
        self.df = df.copy()
        self.var_names = df.columns
        self.params_dict = {}

    @staticmethod
    def extract_params(summary, regressor_name: str) -> pd.Series:
        coef_table = summary.tables[1].as_html()
        coef_table_io = StringIO(coef_table)
        df = pd.read_html(coef_table_io, header=0, index_col=0)[0]
        df.columns = ['Parameter', 'std_err', 'T-stat', 'P-value', 'Lower CI', 'Upper CI']
        return df.loc[regressor_name, ['Parameter', 'std_err', 'P-value', 'Lower CI', 'Upper CI']].copy()

    def fit(self, data: pd.DataFrame, formula: str):
        model = PanelOLS.from_formula(formula, data=data, check_rank=False)
        result = model.fit(cov_type='kernel')
        return result.summary
    
    @staticmethod
    def try_convert_numeric(col):
        try:
            return pd.to_numeric(col)
        except Exception:
            return col

    def compute_adaptive_lasso(self, additional_regs: list[str] = None):
        """
        Run adaptive Lasso for each (target, timeframe) and store non-zero coefficients.
        
        Args:
            additional_regs: List of additional regressor names to include (e.g., ['f_size', 'eps_surprise'])
        """
        self.lasso_coefs = {}
    
        targets = ['returns', 'e_returns', 'abn_returns', "r_vol"]
        time_frames = ['2_day', '3_day', '4_day', '5_day', '6_day', '7_day', 'full_quarter']
        var_names = self.df.columns
        base_regs = var_names[4:111]
        all_additional = additional_regs if additional_regs else []
    
        for target in targets:
            self.lasso_coefs[target] = {}
    
            for frame in time_frames:
                y_col = 111 + targets.index(target) * 7 + time_frames.index(frame)
                y_name = var_names[y_col]
    
                cols = list(base_regs) + all_additional + [y_name]
                data = self.df[cols].dropna().apply(self.try_convert_numeric)
    
                X = data[list(base_regs) + all_additional].values
                y = data[y_name].values
    
                # Adaptive weights based on OLS (ridge-stabilized in case of multicollinearity)
                ols_coef = np.linalg.pinv(X) @ y
                weights = 1 / (np.abs(ols_coef) + 1e-6)

                alpha_ridge = 1.0  # or any small positive value
                ridge_coef = np.linalg.inv(X.T @ X + alpha_ridge * np.eye(X.shape[1])) @ X.T @ y
                weights = 1 / (np.abs(ridge_coef) + 1e-6)
    
                X_weighted = X / weights
    
                model = LassoCV(cv=5, max_iter=10000).fit(X_weighted, y)
                coefs_raw = model.coef_ / weights  
    
                names = list(base_regs) + all_additional
                coefs_dict = {
                    name: coef for name, coef in zip(names, coefs_raw)
                    if abs(coef) > 1e-6
                }
    
                self.lasso_coefs[target][frame] = coefs_dict
        return self.lasso_coefs

    def plot_adaptive_lasso(self):
        """
        Plot bar charts of adaptive Lasso non-zero coefficients for each (target, timeframe) combination.
        """
        for target, frame_dict in self.lasso_coefs.items():
            for frame, coef_dict in frame_dict.items():
                if not coef_dict:
                    continue

                names, coefs = zip(*coef_dict.items())
                x = np.arange(len(names))

                plt.figure(figsize=(12, 5))
                plt.bar(x, coefs, color='tab:purple', width=0.6)
                plt.axhline(0, color='black', linewidth=1)
                plt.xticks(x, names, rotation=90, fontsize=8)
                plt.title(f'Adaptive Lasso Coefficients — {target}, {frame}')
                plt.xlabel('Regressor')
                plt.ylabel('Coefficient')
                plt.tight_layout()
                plt.show()


In [67]:
list(df_q.isna().sum())

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 49,
 49,
 49,
 49,
 49,
 49,
 49]

In [68]:
df_q

Unnamed: 0_level_0,Unnamed: 1_level_0,report_type,eps_surprise,f_size,doc_len,segment_1,segment_2,segment_3,segment_4,segment_5,segment_6,...,six_day_abn_r,seven_day_abn_r,full_q_abn_r,two_day_r_vol,three_day_r_vol,four_day_r_vol,five_day_r_vol,six_day_r_vol,seven_day_r_vol,full_q_r_vol
company,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2019.1,10-K,0.0119,2.514862e+10,445639,0.051025390625,0.051025390625,0.051025390625,0.051025390625,0.051025390625,0.051025390625,...,0.032633,0.051254,-0.008724,0.063094,0.054265,0.050310,0.044027,0.042001,0.041469,0.106869
A,2020.1,10-K,0.1994,3.577338e+10,455272,0.1025390625,0.1025390625,0.1025390625,0.1025390625,0.1025390625,0.1025390625,...,-0.155437,-0.126671,0.011061,0.132549,0.125151,0.094628,0.088559,0.085395,0.083886,0.097928
A,2021.1,10-K,0.1301,4.465222e+10,470051,0.082763671875,0.082763671875,0.082763671875,0.082763671875,0.082763671875,0.082763671875,...,0.122070,0.200022,-0.038172,0.084129,0.107832,0.088003,0.082414,0.081777,0.200506,0.146564
A,2022.1,10-K,-0.0146,4.364278e+10,449165,0.126708984375,0.126708984375,0.126708984375,0.126708984375,0.126708984375,0.126708984375,...,0.282631,0.161116,-0.183177,0.071271,0.065115,0.065481,0.072281,0.076827,0.073824,0.113785
A,2023.1,10-K,-0.0078,4.070080e+10,458557,0.1328125,0.1328125,0.1328125,0.1328125,0.1328125,0.1328125,...,0.050749,0.055188,-0.033847,0.078364,0.071586,0.067257,0.063576,0.055223,0.053064,0.110927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTS,2019.1,10-K,0.0243,4.120781e+10,517631,0.3974609375,0.3974609375,0.3974609375,0.3974609375,0.3974609375,0.08984375,...,0.860991,0.718838,0.181591,0.110623,0.103488,0.099940,0.088095,0.084078,0.081120,0.070130
ZTS,2020.1,10-K,0.0494,6.504257e+10,514626,0.298828125,0.298828125,0.298828125,0.298828125,0.298828125,0.178955078125,...,-0.141750,-0.070151,-0.069342,0.103579,0.097359,0.093361,0.085038,0.081097,0.086007,0.261479
ZTS,2021.1,10-K,0.0389,7.783754e+10,538127,0.29150390625,0.29150390625,0.29150390625,0.29150390625,0.29150390625,0.19970703125,...,-0.015467,-0.118259,0.015992,0.163465,0.165018,0.150816,0.142407,0.138728,0.134719,0.089121
ZTS,2022.1,10-K,0.1144,9.151194e+10,513988,0.343994140625,0.343994140625,0.343994140625,0.343994140625,0.343994140625,0.07666015625,...,-0.256089,-0.242227,-0.132685,0.208128,0.130215,0.120650,0.128777,0.124826,0.120913,0.112266


In [69]:
model = FEModeler(df=df_q,)
res = model.compute_adaptive_lasso(additional_regs=['f_size', 'eps_surprise'])

In [70]:
res

{'returns': {'2_day': {},
  '3_day': {},
  '4_day': {},
  '5_day': {},
  '6_day': {},
  '7_day': {},
  'full_quarter': {}},
 'e_returns': {'2_day': {},
  '3_day': {},
  '4_day': {},
  '5_day': {},
  '6_day': {},
  '7_day': {},
  'full_quarter': {}},
 'abn_returns': {'2_day': {},
  '3_day': {},
  '4_day': {},
  '5_day': {},
  '6_day': {},
  '7_day': {},
  'full_quarter': {}},
 'r_vol': {'2_day': {},
  '3_day': {},
  '4_day': {},
  '5_day': {},
  '6_day': {},
  '7_day': {},
  'full_quarter': {}}}

In [71]:
model.plot_adaptive_lasso()