In [8]:
import pandas as pd

df = pd.read_parquet('./data/pulled/OptionMetrics.parquet')

df.shape[0]


3410580

In [3]:
df.head()

Unnamed: 0,date,secid,exdate,cp_flag,strike_price,forward_price,impl_volatility,volume,contract_size,best_bid,best_offer,open,close,tb_m3
0,1996-01-05,108105.0,1996-03-16,C,525000.0,,,0.0,100.0,92.875,93.875,617.7,616.71,5.03
1,1996-01-05,108105.0,1996-03-16,C,550000.0,,,0.0,100.0,68.75,69.75,617.7,616.71,5.03
2,1996-01-05,108105.0,1996-02-17,C,570000.0,,0.11611,0.0,100.0,48.25,49.25,617.7,616.71,5.03
3,1996-01-05,108105.0,1996-03-16,P,550000.0,,0.179154,623.0,100.0,1.0,1.375,617.7,616.71,5.03
4,1996-01-05,108105.0,1996-01-20,P,525000.0,,0.353404,100.0,100.0,0.0625,0.125,617.7,616.71,5.03


## level 1 filter

**“Identical” Filter:** The OptionMetrics data set contain duplicate observations, defined as two or more quotes with identical option type, strike, expiration date, and price. In each such case, we eliminate all but one of the quotes.

In [5]:
def remove_duplicate_quotes(df):
    """
    Removes duplicate observations from the OptionMetrics dataset.
    Duplicates are defined as quotes with the same option type, strike price,
    expiration date, and price.
    
    Parameters:
    df : pandas.DataFrame
        DataFrame containing the OptionMetrics dataset.

    Returns:
    pandas.DataFrame
        DataFrame with duplicate observations removed.
    """
    # Criteria for defining duplicates: option type, strike price, expiration date, and price
    cols_to_check = ['secid', 'cp_flag', 'strike_price','date','exdate', 'best_offer']
    
    # Remove duplicates, keeping the first occurrence
    df_unique = df.drop_duplicates(subset=cols_to_check, keep='first')
    
    return df_unique

In [6]:
df_1 = remove_duplicate_quotes(df)
print(df.shape[0]-df_1.shape[0])

0


**“Identical Except Price” Filter:** There are a few sets of quotes with identical
 terms (type, strike, and maturity) but different prices. When this occurs, we
 keep the quote whose T-bill-based implied volatility is closest to that of its
 moneyness neighbors, and delete the others.

In [20]:
def clean_options_data(df):
    """
    Cleans a DataFrame of options data by removing duplicates based on certain
    criteria (type, strike, maturity, date), while keeping the entry whose
    implied volatility is closest to the TBill based implied volatility of its
    moneyness neighbors.

    Parameters:
    - df: DataFrame containing options data with columns for 'secid', 'date',
          'cp_flag', 'strike_price', 'exdate', 'close', and 'impl_volatility'.

    Returns:
    - DataFrame with duplicates removed based on the specified logic.
    """
    # Calculate moneyness for each option
    df['moneyness'] = (df['strike_price'] / 1000) / df['close']

    # Identify all duplicates based on the given subset of columns
    duplicates_mask = df.duplicated(subset=['secid', 'date', 'cp_flag', 'strike_price', 'exdate'], keep=False)

    # Separate duplicates for further analysis
    df_duplicates = df[duplicates_mask]
    df_unique = df[~duplicates_mask]

    # Find moneyness neighbors and the implied volatility closest to TBill for each duplicate
    df_duplicates = df_duplicates.sort_values(by=['secid', 'cp_flag', 'date', 'exdate'])
    grouped = df_duplicates.groupby(['secid', 'cp_flag', 'date', 'exdate'])

    closest_to_tbill = grouped.apply(lambda x: x.loc[(x['moneyness'] - 1).abs().idxmin()])


    # Drop duplicates in the original duplicates DataFrame and keep only the closest entries
    df_cleaned_duplicates = closest_to_tbill.drop_duplicates(subset=['secid', 'date', 'cp_flag', 'strike_price', 'exdate'])

    # Combine the non-duplicate entries with the cleaned duplicates and sort
    df_final = pd.concat([df_unique, df_cleaned_duplicates]).sort_values(by=['secid', 'cp_flag', 'date', 'exdate']).reset_index(drop=True)

    # Cleanup: remove temporary columns if needed
    df_final.drop(['moneyness'], axis=1, inplace=True)

    return df_final


In [21]:
df_2 = clean_options_data(df_1)
print(df_1.shape[0]-df_2.shape[0])

  closest_to_tbill = grouped.apply(lambda x: x.loc[(x['moneyness'] - 1).abs().idxmin()])


10


**“Bid = 0” Filter:** We remove quotes of zero for bids, thereby avoiding low
valued options. Also, a zero bid may indicate censoring as negative bids
 cannot be recorded

In [22]:
def delete_zero_bid_filter(df):
    """
    Filters out rows from the DataFrame where the 'best_bid' value is zero.
    Rows with a 'best_bid' of zero might be considered as having no active bids,
    which could be irrelevant for certain analyses focusing on active market participation.

    Parameters:
    df : pandas.DataFrame
        The DataFrame containing options data with a 'best_bid' column.

    Returns:
    pandas.DataFrame
        A filtered DataFrame with rows having non-zero 'best_bid' values.
    """
    filtered_df = df.query("best_bid != 0.0")
    return filtered_df

In [24]:
df_3 = delete_zero_bid_filter(df_2)
print(df_2.shape[0]-df_3.shape[0])

272078


## level 2 filter

**“Days to Maturity <7 or >180 ” Filter:** We remove all options with fewer than seven or more than 180 calendar days to expiration. The short maturity op- tions tend to move erratically close to expiration and the long maturity op- tions lack volume and open interest.

In [1]:
def DaystoMaturity_filter(df):
    df['exdate'] = pd.to_datetime(df['exdate'])
    df['date'] = pd.to_datetime(df['date'])
    df['T-t'] = (df['exdate'] - df['date']).dt.days
    df = df[df['T-t'] > 7]
    df = df[df['T-t'] < 180]
    return df

**“IV<5% or >100%” Filter:**  We remove all option quotes with implied vola- tilities lower than 5% or higher than 100%, computed using T-bill interest rates. Such extreme values likely indicate quotation problems or simply low value.

In [2]:
def ExtremeIV_filter(df):
    df = df[df['impl_volatility'] > 0.05]
    df = df[df['impl_volatility'] < 1.0]
    return df

**“Moneyness <0.8 or >1.2” Filter:** We remove all option quotes with money- ness, the ratio of strike price to index price, below 0.8 or above 1.2. These options have little value beyond their intrinsic value and are also very thinly traded.

In [3]:
def moneyness_filter(df):
    df['strike_price']=df['strike_price']/1000
    df['ratio'] = df['close'] / df['strike_price']

    df_filtered = df[(df['ratio'] >= 0.8) & (df['ratio'] <= 1.2)]
    
    return df_filtered


**“Implied Interest Rate <0” Filter**: We need to check whether the implied interest rate in the pairs of options is abnormal. Abnormal implied interest rate shows an error in data or mispricing. To construct this rate, we take all put-call pairs of a given maturity and impose put-call parity using the bid-ask midpoint as the price, and allowing the interest rate to adjust. 

In [4]:
import numpy as np
def implied_interest_rate_filter(df):
    #df['avg_price']=df(['best_bid']+df['best_offer'])/2
    # Step 1: find pairs of options with same exdate & trike_price
    
    call_options = df[df['cp_flag'] == 'C']
    put_options = df[df['cp_flag'] == 'P']
    
    
    df['date']=pd.to_datetime(df['date'])
    
    df['exdate']=pd.to_datetime(df['exdate'])
    
  

   #print(merged_options)
    call_options['date']=pd.to_datetime(call_options['date'])
    
    call_options['exdate']=pd.to_datetime(call_options['exdate'])
    
    put_options['date']=pd.to_datetime(put_options['date'])
    
    put_options['exdate']=pd.to_datetime(put_options['exdate'])
    
    # Step 2: calculate average price of each pair of options
    call_options['option_price'] = (call_options['best_bid'] + call_options['best_offer']) / 2
    put_options['option_price'] = (put_options['best_bid'] + put_options['best_offer']) / 2

    # Step 3: calculate implied interest rate
    
    call_options['time_to_maturity'] = (call_options['exdate'] - call_options['date']).dt.days / 365.25
    
    put_options['time_to_maturity'] = (put_options['exdate'] - put_options['date']).dt.days / 365.25
    
    call_options['implied_rate'] = -np.log((call_options['option_price'] - 
                                            put_options['option_price'] + 
                                            call_options['close']) / 
                                            call_options['strike_price']) /call_options['time_to_maturity']
    
    put_options['implied_rate'] = -np.log((call_options['option_price'] - 
                                            put_options['option_price'] + 
                                            put_options['close']) / 
                                            put_options['strike_price']) /put_options['time_to_maturity']
    result_df = pd.concat([call_options,put_options],axis=0)
    result_df.sort_values('date')
    result_df.drop(result_df[result_df['implied_rate']<=0], axis=1, inplace=True)

    return(result_df)


**“IV” Filter**:We remove implied volatility outliers and those pairs who are not able to calcultate IV to reduce the prevalence of apparent butterfly arbitrage.

For each date and maturity, we fit a quadratic curve (separately to puts and calls) through the observed log implied volatilities. 

Then we compute a typical (one standard deviation) relative distance in percent from the level of the fitted curve.

Finally we check for each option’s IV, how many standard deviations it is apart from the fitted IV curve. These distances are tight in and around the money (about 2%) and wide in the out of the money range (around 3.5%). So we remove the samples whose IV is out of the 2 sigma range.

In [5]:
def IV_filter(df):
    
    from scipy.optimize import curve_fit
    from scipy.stats import norm

    def quadratic(x, a, b, c):
        return a * x**2 + b * x + c

    
    grouped = df.groupby(['date', 'exdate', 'cp_flag'])

    # save parameter and stdevs
    
    fit_params = {}
    std_devs = {}
   
    for name, group in grouped:
        params=[1,2,3]
        valid_data = group.dropna(subset=['impl_volatility'])
        if valid_data.empty:
            continue

        #generate IV
        
        log_iv = np.log(valid_data['impl_volatility'].dropna())
        #params,_ = curve_fit(quadratic, valid_data['strike_price'], log_iv)
        fit_params[name] = params

        #calculate IV and calculate residuals
        fitted_ivs = quadratic(valid_data['strike_price'], *params)
        residuals = log_iv - fitted_ivs

        # calculate stdevs of the residual
        std_dev = np.std(residuals)
        std_devs[name] = std_dev

    # filter the outliers that are above or below +-2 stdev
    filtered_df = pd.DataFrame()
    
    for name, group in grouped:
        if name not in fit_params or name not in std_devs:
            
            continue
        
        
        log_iv = np.log(group['impl_volatility'].dropna())
        fitted_ivs = quadratic(group['strike_price'], *fit_params[name])
        residuals = log_iv - fitted_ivs
        
    # find the samples within 95% interval
        within_confidence_interval = (residuals > -2 * std_devs[name]) & (residuals < 2 * std_devs[name])
        
        z_score = norm.ppf(0.975)  

        filtered_group = group.loc[within_confidence_interval]
        filtered_df = pd.concat([filtered_df, filtered_group])
    
    
    return filtered_df



**“Put-call Parity” Filter**: For every put-call pair with the same date, maturity, and moneyness, we insure that put-call parity holds and that violations are eliminated. Thus, for each put-call pair, we find the bid-ask midpoint put-call parity-implied interest rate. Next, we trim outliers in a similar way as with the IV filter. 

In [7]:
def parity_filter(df):
    
    df['parity']=np.zeros(len(df))
    
    call_options = df[df['cp_flag'] == 'C']
    put_options = df[df['cp_flag'] == 'P']
    
    
    df['date']=pd.to_datetime(df['date'])
    
    df['exdate']=pd.to_datetime(df['exdate'])
    
  

   #print(merged_options)
    call_options['date']=pd.to_datetime(call_options['date'])
    
    call_options['exdate']=pd.to_datetime(call_options['exdate'])
    
    put_options['date']=pd.to_datetime(put_options['date'])
    
    put_options['exdate']=pd.to_datetime(put_options['exdate'])
    
    # Step 2: calculate average price of each pair of options
    call_options['option_price'] = (call_options['best_bid'] + call_options['best_offer']) / 2
    put_options['option_price'] = (put_options['best_bid'] + put_options['best_offer']) / 2

    # Step 3: calculate implied interest rate
   
    call_options['time_to_maturity'] = (call_options['exdate'] - call_options['date']).dt.days / 365.25
    
    put_options['time_to_maturity'] = (put_options['exdate'] - put_options['date']).dt.days / 365.25
    
    call_options['implied_rate'] = -np.log((call_options['option_price'] - 
                                            put_options['option_price'] + 
                                            call_options['close']) / 
                                            call_options['strike_price']) /call_options['time_to_maturity']
    
    put_options['implied_rate'] = -np.log((call_options['option_price'] - 
                                            put_options['option_price'] + 
                                            put_options['close']) / 
                                            put_options['strike_price']) /put_options['time_to_maturity']
    
    # Step 4: calculate put call parity difference
    
    for idx, row in call_options.iterrows():
        
        put_option_row = put_options.loc[1]
        
        
        parity = (row['option_price'] - put_option_row['option_price'] -
                row['close'] + row['strike'] / (1 + row['implied_rate']))
        
        
        call_options.loc[idx, 'parity'] = parity

    # for those does not satisfy put-call parity the result will not be zero
    result_df=df.drop(result_df[(result_df['parity']<=-0.1)and(result_df['parity']>=0.1)], axis=1, inplace=True)
    
    
    return result_df