In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

## Ch1. Preliminaries

### Winsorization and Truncation

Empirical asset pricing researchers usually take a more ad hoc approach to dealing with the effect of outliers.   

Two techniques are commonly used in empirical asset pricing research to deal with the effect of outliers: winsorizing and truncating.    

Winsorizing is a technique that replaces extreme values with the nearest non-extreme value.   
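Concretely, every value below a chosen lower quantile is set to that quantile, and every value above a chosen upper quantile is set to that quantile, so the number of observations is unchanged.   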
For example, if the 1% quantile is 0.5 and the 99% quantile is 100, then all values less than 0.5 are replaced with 0.5 and all values greater than 100 are replaced with 100.   


In [None]:
def _winsorize_series(series, label, lower, upper):
    """Return `series` clipped to its own [lower, upper] quantiles.

    Prints the extrema before and after clipping, labelled with `label`.
    """
    print(f"Original maximum of {label}: {series.max()} and Original minimum of {label}: {series.min()}\n")

    # Positional access keeps this working even when lower == upper, where a
    # label lookup on the duplicated quantile index would return a Series.
    bounds = series.quantile([lower, upper])
    clipped = series.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

    print(f"New maximum of {label}: {clipped.max()} and New minimum of {label}: {clipped.min()}\n")
    return clipped


def winsorize(data, column=False, lower=0.01, upper=0.99, copy=True):
    """
    Winsorize data by replacing extreme values with the quantile bounds.

    Values below the `lower` quantile are set to that quantile and values above
    the `upper` quantile are set to that quantile. Missing values are left as
    they are. The extrema before and after are printed for each processed column.

    Args:
        data (pd.Series or pd.DataFrame): The data to be winsorized.
        column (str, list or bool): For a DataFrame, the column (or list of
            columns) to winsorize; a falsy value winsorizes every column.
            Ignored when `data` is a Series. Defaults to False.
        lower (float): The lower quantile threshold. Defaults to 0.01.
        upper (float): The upper quantile threshold. Defaults to 0.99.
        copy (bool): If True, the input is left untouched and a new object is
            returned; if False, the passed object is modified and returned.
            Defaults to True.

    Returns:
        pd.Series or pd.DataFrame: The winsorized data, same type as `data`.

    Raises:
        ValueError: If the thresholds do not satisfy 0 <= lower <= upper <= 1.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError(
            f"Quantile thresholds must satisfy 0 <= lower <= upper <= 1, got lower={lower}, upper={upper}"
        )

    # A Series has no columns, so it is handled on its own.
    if isinstance(data, pd.Series):
        label = data.name if data.name is not None else "data"
        clipped = _winsorize_series(data, label, lower, upper)
        if copy:
            return clipped
        # Write the clipped values back into the caller's object.
        data[:] = clipped.to_numpy()
        return data

    if copy:
        data = data.copy()

    if not column:
        targets = list(data.columns)
    elif isinstance(column, list):
        targets = column
    else:
        targets = [column]

    for col in targets:
        # Assigning the whole column avoids writing through a slice of the frame.
        data[col] = _winsorize_series(data[col], col, lower, upper)

    return data


In [None]:
# Build a copy of crsp_m whose 'ret' column holds the winsorized returns
winsor_data = crsp_m.assign(ret=winsorize(crsp_m['ret']))

Data winsorized between 0.01 and 0.99

Original maximum: 24.0 and Original minimum: -0.98828125

New maximum: 0.5966796875 and New minimum: -0.41015625



#### Truncating

Truncating is a technique that removes extreme values.   
Unlike winsorizing, observations outside the chosen quantile range are dropped from the sample rather than replaced, so the sample becomes smaller.   
For example, if the 1% quantile is 0.5 and the 99% quantile is 100, then all values less than 0.5 and all values greater than 100 are removed.   

In [None]:
def truncate(data, column=False, lower=0.01, upper=0.99, copy=True):
    """
    Truncate data by discarding values outside the [lower, upper] quantiles.

    For a Series (or a single column selected by name) the out-of-range
    observations are removed, so the result is shorter than the input.
    For a DataFrame selection the shape is preserved and out-of-range cells
    become NaN, because boolean-frame indexing masks rather than drops.
    Missing values never satisfy the range test and are treated as out of range.

    Args:
        data (pd.Series or pd.DataFrame): The data to be truncated.
        column (str, list or bool): Column selection applied to `data` before
            truncating; a falsy value uses `data` as given. Defaults to False.
        lower (float): The lower quantile threshold. Defaults to 0.01.
        upper (float): The upper quantile threshold. Defaults to 0.99.
        copy (bool): Kept for backward compatibility. Filtering always builds a
            new object and never modifies the input, whatever this flag is.

    Returns:
        pd.Series or pd.DataFrame: The truncated selection.

    Raises:
        ValueError: If the thresholds do not satisfy 0 <= lower <= upper <= 1.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError(
            f"Quantile thresholds must satisfy 0 <= lower <= upper <= 1, got lower={lower}, upper={upper}"
        )

    print(f"Data truncated between {lower} and {upper}\n")

    if column:
        data = data[column]

    print(f"Original maximum: {data.max()} and Original minimum: {data.min()}\n")

    # Positional access keeps this working even when lower == upper, where a
    # label lookup on the duplicated quantile index would not return one bound.
    bounds = data.quantile([lower, upper])
    low, high = bounds.iloc[0], bounds.iloc[1]

    # Both bounds come from the untouched data, so one combined mask is
    # equivalent to filtering twice in sequence.
    data = data[(data >= low) & (data <= high)]

    print(f"New maximum: {data.max()} and New minimum: {data.min()}\n")
    return data

In [None]:
# NOTE(review): 'ret' is passed as a list, so truncate() operates on a
# one-column DataFrame rather than a Series — confirm this is intended.
truncate_data = crsp_m.copy()
truncate_data['ret'] = truncate(crsp_m, ['ret']) # Truncate returns

Data winsorized between 0.01 and 0.99

Original maximum: ret    24.0
dtype: float16 and Original minimum: ret   -0.988281
dtype: float16

New maximum: ret    0.59668
dtype: float16 and New minimum: ret   -0.410156
dtype: float16



### NEWEY AND WEST (1987) ADJUSTMENT

Newey-West standard errors are a robust method for estimating the standard errors of the coefficients in a regression model.   
The selection of the lag length is important because it determines the number of periods over which the autocorrelation of the residuals is calculated.   
Usually, in empirical asset pricing research, the lag length is set either to the integer part of the cube root of the number of observations or to a fixed value such as 6 or 12.   
The Newey-West standard errors are calculated as follows:
- 1. Estimate the regression model.
- 2. Calculate the residuals.
- 3. Calculate the autocovariance of the residuals.
- 4. Calculate the Newey-West standard errors.
   
The Newey-West standard errors are robust to autocorrelation and heteroskedasticity in the residuals.


In [None]:
def _integer_cube_root(n):
    """Return the largest integer r such that r**3 <= n, for n >= 0."""
    # n ** (1/3) is inexact for perfect cubes (64 ** (1/3) is just below 4),
    # so start from the rounded value and correct with integer arithmetic.
    root = int(round(n ** (1 / 3)))
    while root ** 3 > n:
        root -= 1
    while (root + 1) ** 3 <= n:
        root += 1
    return root


def newey_west_regression(y, X, lags=None):
    """
    Performs an OLS regression and reports Newey-West (HAC) standard errors, t-statistics, and p-values.

    Non-numeric entries are coerced to NaN and every row with a missing value
    in `y` or any column of `X` is dropped before estimation. An intercept is
    added to the regressors.

    Args:
        y (pd.Series or np.array): Dependent variable.
        X (pd.DataFrame, pd.Series or np.array): Independent variables. Array
            input gets columns named x1, x2, ... and is matched to `y`
            positionally.
        lags (int): The lag length for Newey-West standard errors. If None, it
            is set to the integer part of the cube root of the number of
            observations that remain after incomplete rows are dropped.

    Returns:
        pd.DataFrame: One row per regressor (including the constant) with the
        columns 'Coefficient', 'Newey-West SE', 't-Statistic' and 'p-Value'.

    Raises:
        ValueError: If no complete observation is left after cleaning.
    """
    # Bring both inputs into pandas containers so the cleaning steps below
    # work for NumPy input as well. Array input borrows the other argument's
    # index so the validity mask lines up row by row.
    if isinstance(X, pd.Series):
        X = X.to_frame()
    elif not isinstance(X, pd.DataFrame):
        values = np.asarray(X)
        if values.ndim == 1:
            values = values.reshape(-1, 1)
        X = pd.DataFrame(
            values,
            index=y.index if isinstance(y, pd.Series) else None,
            columns=[f"x{i}" for i in range(1, values.shape[1] + 1)],
        )
    if not isinstance(y, pd.Series):
        y = pd.Series(np.asarray(y).ravel(), index=X.index)

    # Ensure the dependent and independent variables are numeric
    y = pd.to_numeric(y, errors='coerce')
    X = X.apply(pd.to_numeric, errors='coerce')

    # Drop rows with any NaN values in y or X
    valid_idx = y.notna() & X.notna().all(axis=1)
    y = y[valid_idx]
    X = X[valid_idx]

    if y.empty:
        raise ValueError("No complete observations remain after dropping rows with missing or non-numeric values.")

    # The default lag depends on the sample actually used in the regression,
    # so it is derived only after the incomplete rows are gone.
    if lags is None:
        lags = _integer_cube_root(len(y))

    # Add a constant term to the independent variables matrix
    X = sm.add_constant(X)

    # Estimate the regression model
    model = sm.OLS(y, X).fit()

    # Re-compute the covariance with the HAC (Newey-West) estimator
    robust = model.get_robustcov_results(cov_type='HAC', maxlags=lags)

    # The robust statistics are aligned explicitly to the coefficient labels.
    return pd.DataFrame(
        {
            'Coefficient': np.asarray(model.params),
            'Newey-West SE': np.asarray(robust.bse),
            't-Statistic': np.asarray(robust.tvalues),
            'p-Value': np.asarray(robust.pvalues),
        },
        index=model.params.index,
    )

In [None]:
# Regress 'ret' on every remaining column except 'date' and 'permno', with 6 lags
newey_west_regression(
    y=crsp_m['ret'],
    X=crsp_m.drop(columns=['ret', 'date', 'permno']),
    lags=6,
)

### Additional Problems

Not in the book.
However, there are several additional problems concerning the preprocessing of financial data.