In [1]:
import pandas as pd
import polars as pl
from data import *
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from missforest import MissForest
import pickle
from lightgbm import LGBMClassifier, LGBMRegressor
import pyarrow
import sklearn
from sklearn.svm import SVR, LinearSVR
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
import numpy as np
import shap
from scipy import stats
from typing import Any, cast, Final
from utils import *
import wrds
from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
full_data = pl.read_csv(get_data_file_path("1_4_complete_dataset.csv"), dtypes=ALL_DATA_WITH_DATES_ENTRY_SCHEMA).to_pandas()

In [5]:
import pandas as pd

def winsorize_columns(df: pd.DataFrame, quantile: float = 0.01) -> pd.DataFrame:
  """
  Winsorizes each column (from the 4th onwards) independently in a DataFrame.

  Args:
      df: The input DataFrame.
      quantile: The winsorization quantile (default: 0.01 for 1%).

  Returns:
      A new DataFrame with winsorized columns.
  """
  winsorized_df = df.copy()
  for col in df.columns[3:]:
    lower_bound = df[col].quantile(quantile)
    upper_bound = df[col].quantile(1 - quantile)
    winsorized_df[col] = winsorized_df[col].clip(lower=lower_bound, upper=upper_bound)
  return winsorized_df

# Assuming 'full_data' is your already sorted DataFrame
winsorized_data = winsorize_columns(full_data.copy(), quantile=0.01)  # Winsorize on a copy

# Now you can use 'winsorized_data' for further processing


In [6]:
winsorized_data.describe()

Unnamed: 0,permno,date,AM,AOP,AbnormalAccruals,Accruals,AccrualsBM,Activism1,Activism2,AdExp,...,std_turn,tang,zerotrade,zerotradeAlt1,zerotradeAlt12,STreversal,Price,Size,date_right,ret
count,5001719.0,5001719,2934379.0,1157850.0,2315298.0,2958277.0,213321.0,108731.0,30170.0,1005624.0,...,2064645.0,1398719.0,4288348.0,4453415.0,4035377.0,5001692.0,4889704.0,4889704.0,5001719,4918560.0
mean,52557.9,1994-01-21 01:42:15.297000,3.027881,-1.802403,-0.003428303,0.02875257,0.483201,14.891954,9.242051,0.0625775,...,-0.1125646,0.6912801,1.469744,1.554276,1.403133,-0.7159066,-2.49577,-11.56022,1994-03-01 01:15:24.042000,0.007377432
min,10000.0,1925-12-01 00:00:00,0.07494592,-40.93309,-0.3792033,-0.3033897,0.0,9.0,0.0,0.0001677615,...,-2.216114,0.2157037,2.167118e-09,1.435528e-09,2.456003e-09,-53.35041,-4.971894,-17.30189,1932-04-30 00:00:00,-0.382979
25%,20969.0,1982-01-01 00:00:00,0.6433154,-1.355855,-0.05248681,-0.0126095,0.0,13.0,0.0,0.004987868,...,-0.08703909,0.6002009,2.026757e-08,1.724399e-08,2.139086e-08,-5.9846,-3.35428,-13.07093,1982-01-29 00:00:00,-0.05782625
50%,54894.0,1996-10-01 00:00:00,1.384571,-0.3936195,-0.007283284,0.02834095,0.0,15.0,7.438817,0.01651571,...,-0.03480813,0.6968244,6.731287e-08,5.016651e-08,7.898551e-08,0.0,-2.682732,-11.42684,1996-10-31 00:00:00,0.0
75%,80717.0,2010-05-01 00:00:00,3.10883,-0.06500524,0.03915805,0.07099786,1.0,17.0,10.728353,0.05642348,...,-0.01556216,0.7957041,0.8590912,0.9545455,0.9048957,5.5556,-1.802122,-9.910483,2010-05-28 00:00:00,0.062112
max,93436.0,2023-12-01 00:00:00,32.43325,1.0,0.4429111,0.3799396,1.0,21.0,80.0,0.8886826,...,-0.002255419,1.108704,15.40347,17.18182,14.57526,38.1818,1.268511,-6.89429,2023-07-31 00:00:00,0.539683
std,29509.73,,4.889376,5.294965,0.1141838,0.09759911,0.499719,2.690423,12.468353,0.1316377,...,0.2854031,0.1709446,3.244883,3.626017,3.062318,13.61686,1.246024,2.252961,,0.1380761


In [7]:
winsorized_data

Unnamed: 0,permno,yyyymm,date,AM,AOP,AbnormalAccruals,Accruals,AccrualsBM,Activism1,Activism2,...,std_turn,tang,zerotrade,zerotradeAlt1,zerotradeAlt12,STreversal,Price,Size,date_right,ret
0,10000,198601,1986-01-01,,,,,,,,...,,,,,,0.0000,-1.475907,-9.686575,1986-01-31,0.539683
1,10000,198602,1986-02-01,,,,,,,,...,,,,4.785175e-08,,25.7143,-1.178655,-9.389323,1986-02-28,-0.257143
2,10000,198603,1986-03-01,,,,,,,,...,,,,1.023392e-07,,-36.5385,-1.490091,-9.700759,1986-03-31,0.365385
3,10000,198604,1986-04-01,,,,,,,,...,,,,7.467463e-08,,9.8592,-1.386294,-9.627207,1986-04-30,-0.098592
4,10000,198605,1986-05-01,,,,,,,,...,,,,7.649551e-08,,22.2656,-1.134423,-9.375336,1986-05-30,-0.222656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5001714,93436,202308,2023-08-01,,,-0.003877,0.016862,,,,...,,0.698178,,,,3.4962,-4.971894,-17.301890,2023-07-31,-0.034962
5001715,93436,202309,2023-09-01,,,-0.003877,0.016862,,,,...,,0.698178,,,,3.0456,-4.971894,-17.301890,2023-07-31,-0.030456
5001716,93436,202310,2023-10-01,,,-0.003877,0.016862,,,,...,,0.698178,,,,19.7346,-4.971894,-17.301890,2023-07-31,-0.197346
5001717,93436,202311,2023-11-01,,,-0.003877,0.016862,,,,...,,0.698178,,,,-19.5379,-4.971894,-17.301890,2023-07-31,0.195379
