# 4. Constructing AR-GARCH-X model

This template estimates the AR-GARCH-X model and plots the results, making use of the ArGarchX class. This class provides several methods to compute, by means of the Maximum Likelihood method, the estimators that maximize the Quasi Log Likelihood. With this template several models are constructed and tested in order to assess the effect of public sentiment on volatility, and to check whether adding these variables increases predictive accuracy.

## 4.1. Load packages and data

Here, the packages, the data and the colors used for the main analysis and for the construction of the plots are loaded, and the export locations are defined.

### 4.1.1. Load packages

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import os, sys
import itertools
from datetime import datetime
from scipy.stats import chi2
import scipy.optimize as opt
from scipy.stats import norm

# Add the folder with the project's GARCH model code to the import path
sys.path.insert(0, os.path.abspath(r'C:\Users\Jonas\PycharmProjects\TwitterSentimentGARCH2021\Code\GARCH model'))
from garch_models import ArmaApARCHX, ArmaXapARCH, ArmaApArchXGarch, ExogGarch
from mean_model import MeanModel
from qml_estimation import QuasiMaximumLikelihoodEstimator
from bic_find_lags import LagsSelection

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

### 4.1.2. Colors for plots

In [None]:
# Named matplotlib colors available to the plots in this notebook
colors = [
    'seagreen',
    'mediumaquamarine',
    'steelblue',
    'cornflowerblue',
    'navy',
    'black',
]

### 4.1.3. Load data

Load company name data and DataFrame per company containing all the sentiment, return and control variable data

In [None]:
# Folder that holds the company list, and the name of the workbook inside it.
# The file name keeps its leading backslash because the path is built by
# plain string concatenation below.
company_loc = r'C:\Users\Jonas\Documents\Data'
# Raw string: '\c' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on recent Python versions. The resulting
# string value is unchanged.
file_name_comp = r'\company_ticker_list_all.xlsx'

# Read the company names into a DataFrame
df_comp_names = pd.read_excel(company_loc + file_name_comp)

Specify location where all company specific data is stored

In [None]:
# Folder that contains the per-company data files read by the analysis cells
data_loc = r"C:\Users\Jonas\Documents\Data\Total_data"

Specify location where to store results

In [None]:
# Folder in which the results (e.g. the generated plots) are stored
store_loc = r"C:\Users\Jonas\Documents\Data\Results"

## 4.2. Analysis

This section performs the main analysis and will calculate the results for each company in the selection.

### 4.2.1. Create models

First, create the different columns in the total dataset that need to be evaluated and serve as input into the models

In [None]:
# Columns that can enter the models as exogenous variables:
# control variables and sentiment-related variables
control_cols = ['VIX', 'TEDRATE']
sent_cols = ['sentiment', 'n_tweets', 'n_interactions']

# The exogenous set is the sentiment list itself (the same list object, not a copy)
exog_cols = sent_cols

# Column names for the series on which a GARCH structure is imposed:
# one 'sigma2_<column>' name per sentiment column
x_garch_cols = ['sigma2_' + name for name in sent_cols]

# Identifiers of the model variants that are estimated
model_types = ['mean-x', 'asym', 'x-garch']

### 4.2.2. Calculate exogenous GARCH processes

Now, calculate the parameters for the exogenous GARCH processes

In [None]:
# For every company: fit a GARCH process to each sentiment column, add the
# fitted series to the company's data set and store the parameter estimates.
for company in df_comp_names['Company']:
    # File name of this company's data set (the leading backslash is needed
    # because paths are built by string concatenation)
    data_name = f'\\total data {company}.csv'

    # Get df_total
    df_total = pd.read_csv(data_loc + data_name)

    # Replace missing values with 0 (rows are kept, not dropped)
    df_total = df_total.fillna(0)

    # Parameter tables of all sentiment columns, concatenated side by side.
    # NOTE(review): every df_params presumably carries the same column names,
    # so the stored table would repeat them per sentiment column - confirm
    # that downstream readers expect this layout.
    df_xgarch_params = pd.DataFrame()

    for col in sent_cols:
        qmle = QuasiMaximumLikelihoodEstimator(df_total, [col], model_type='garch')

        # Optimize the exogenous GARCH process; only the estimates and the
        # parameter table are used further on
        _minimization_result, psi_hat, _likelihood, df_params = qmle.optimize_likelihood()

        # Calculate the exogenous GARCH process at the estimated parameters
        # and store both returned series as new columns
        sigma2_x, et_x = ExogGarch(df_total, [col], psi_hat).conditional_volatility()
        df_total[f'sigma2_{col}'] = sigma2_x
        df_total[f'et_{col}'] = et_x

        # Standard errors from the diagonal of the variance-covariance
        # matrix, computed once and reused for the t-statistics
        std_errors = np.sqrt(np.diag(qmle.vcov(psi_hat)))
        df_params['errors'] = std_errors
        df_params['t-stat'] = psi_hat / std_errors

        # Add parameter estimates to df_xgarch_params
        df_xgarch_params = pd.concat([df_xgarch_params, df_params], axis=1)

    # Store the parameter estimates of this company
    df_xgarch_params.to_csv(data_loc + f'\\exogenous garch data\\xgarch_params {company}.csv')

    # Overwrite the company's data file with the filled data plus the new
    # 'sigma2_<col>' and 'et_<col>' columns
    df_total.to_csv(data_loc + data_name, index=False)

### 4.2.3 Find lags of mean and GARCH model

First, the lags of the ARMA(P,Q) conditional mean model and the GARCH-X(L_k) conditional volatility model are found. This is done with the help of the `LagsSelection` class, which conditionally finds the ARMA lags that minimize the BIC, and can find the optimal lags of the conditional volatility specification both dependently and independently.

In [None]:
# Row labels of the specification table: the ARMA lags first, followed by
# one row with the exogenous lags for every model type
index = ['ARMA lags']
index.extend(f'X-lags for model: {model}' for model in model_types)

# Table that receives one column of lag tuples per company
df_model_specification = pd.DataFrame(index=index)

# Determine the lag structure company by company
for company in df_comp_names['Company']:
    # Collects the lag tuples of this company, in the order of `index`
    lags_per_model = []

    # Load the company's data set and replace missing values with 0
    data_name = f'\\total data {company}.csv'
    df_total = pd.read_csv(data_loc + data_name).fillna(0)

    # ARMA lags are selected first, without any exogenous variables
    arma_selector = LagsSelection(df=df_total, returns_col='returns')
    lags_arma, bic_arma = arma_selector.find_arma_lags()
    lags_per_model.append(tuple(lags_arma))

    # Given the ARMA lags, select the exogenous lags for every model type
    for model in model_types:
        vol_selector = LagsSelection(df=df_total, returns_col='returns', arma_lags=lags_arma,
                                     exog_cols=exog_cols, model_type=model,
                                     x_garch_cols=x_garch_cols)
        lags_vol = vol_selector.find_exog_lags()
        lags_per_model.append(tuple(lags_vol))

    df_model_specification[company] = lags_per_model

# Store the table with the model characteristics as a .csv file.
# NOTE(review): the later cells read '\\lags\\model_params.csv', not this
# file - confirm which file name is intended.
df_model_specification.to_csv(data_loc + '\\lags\\lags_1.csv')

### 4.2.4. Calculate optimal parameters and standard errors

In this section, the optimal parameters and the standard errors of these parameters are calculated for each model.

In [None]:
def _estimate_with_errors(estimator):
    """Optimize ``estimator`` and return its parameter table with inference columns.

    Calls ``optimize_likelihood()`` and ``vcov()`` on the estimator, then adds
    an 'errors' column (square root of the diagonal of the variance-covariance
    matrix) and a 't-stat' column (estimate divided by that error) to the
    returned parameter table.
    """
    _, psi_hat, _, table = estimator.optimize_likelihood()
    std_errors = np.sqrt(np.diag(estimator.vcov(psi_hat)))
    table['errors'] = std_errors
    table['t-stat'] = psi_hat / std_errors
    return table


# Lag specification of every model, one column per company
df_model_specification = pd.read_csv(data_loc + '\\lags\\model_params.csv')

# Estimate the benchmark and the three sentiment models for every company
for company in df_comp_names['Company']:
    # Load the company's data set and replace missing values with 0
    data_name = f'\\total data {company}.csv'
    df_total = pd.read_csv(data_loc + data_name).fillna(0)

    # The lags are stored as text; turn them back into tuples.
    # Entry 0 holds the ARMA lags, entries 1-3 the lags of the three models.
    lags_per_model = [literal_eval(x) for x in df_model_specification[company]]
    lags_arma = list(lags_per_model[0])

    # - Benchmark model (no exogenous variables)
    df_params = _estimate_with_errors(
        QuasiMaximumLikelihoodEstimator(df_total, 'returns', lags_arma, model_type='asym',
                                        exog_cols=None, lags_exog=None))

    # - Model ARMA-X-apARCH
    df_params1 = _estimate_with_errors(
        QuasiMaximumLikelihoodEstimator(df_total, 'returns', lags_arma,
                                        exog_cols, list(lags_per_model[1]), params=None,
                                        model_type='mean-x'))

    # - Model ARMA-apARCH-X
    df_params2 = _estimate_with_errors(
        QuasiMaximumLikelihoodEstimator(df_total, 'returns', lags_arma,
                                        exog_cols, list(lags_per_model[2]), params=None,
                                        model_type='asym'))

    # - Model ARMA-apARCH-(X-ARMA-GARCH): the stored lags start with one
    #   entry per exogenous column; the remaining entries are the X-GARCH lags
    n_exog = len(exog_cols)
    combined_lags = list(lags_per_model[3])
    exog_lags, x_garch_lags = combined_lags[:n_exog], combined_lags[n_exog:]
    df_params3 = _estimate_with_errors(
        QuasiMaximumLikelihoodEstimator(df_total, 'returns', lags_arma,
                                        exog_cols, exog_lags, params=None,
                                        model_type='x-garch', x_garch_cols=x_garch_cols,
                                        x_garch_lags=x_garch_lags))

    # Write the tables only after all four models have been estimated
    outputs = [
        (df_params, f'\\parameters\\benchmark\\benchmark {company}.csv'),
        (df_params1, f'\\parameters\\model1\\params {company}.csv'),
        (df_params2, f'\\parameters\\model2\\params {company}.csv'),
        (df_params3, f'\\parameters\\model3\\params {company}.csv'),
    ]
    for table, relative_path in outputs:
        table.to_csv(data_loc + relative_path, index=False)

Define a Likelihood calculation functionality that can be used to quickly calculate the likelihood given residuals and conditional volatility

## 4.3. Summary of results

In [None]:
def quasi_log_likelihood(sigma2, et):
    """Return the negated sum of ``log(sigma2) + et**2 / sigma2``.

    This is the quasi-likelihood criterion attributed to Francq and Thieu in
    the original comment.

    Parameters
    ----------
    sigma2 : array-like
        Conditional variances.
    et : array-like
        Residuals, element-wise aligned with ``sigma2``.

    Returns
    -------
    The negative of the summed per-observation terms. Terms are passed through
    ``np.nan_to_num`` before summing, so a NaN term contributes 0.
    """
    per_observation = np.log(sigma2) + ((et ** 2) / sigma2)

    # Replace NaN / infinite terms by finite numbers before adding them up
    finite_terms = np.nan_to_num(per_observation)

    return -finite_terms.sum()

Here, calculate and plot the constructed values of the conditional volatility model, conditional mean model and the distribution of the innovations $z_t$.

In [None]:
# Display names of the three sentiment models, in plotting order
model_names = ['ARMA-X-apARCH', 'ARMA-apXapARCH', 'ARMA-apGARCHX']

# Lag specification of every model, one column per company
df_model_specification = pd.read_csv(data_loc + '\\lags\\model_params.csv')

# Rebuild every model from its stored parameters and plot the result per company
for company in df_comp_names['Company']:
    # Load the company's data set and replace missing values with 0
    data_name = f'\\total data {company}.csv'
    df_total = pd.read_csv(data_loc + data_name).fillna(0)

    # The lags are stored as text; entry 0 holds the ARMA lags
    lags_per_model = [literal_eval(x) for x in df_model_specification[company]]
    lags_arma = list(lags_per_model[0])

    # Stored parameter estimates of the benchmark and the three models
    df_params = pd.read_csv(data_loc + f'\\parameters\\benchmark\\benchmark {company}.csv')
    df_params1 = pd.read_csv(data_loc + f'\\parameters\\model1\\params {company}.csv')
    df_params2 = pd.read_csv(data_loc + f'\\parameters\\model2\\params {company}.csv')
    df_params3 = pd.read_csv(data_loc + f'\\parameters\\model3\\params {company}.csv')

    # Benchmark: no exogenous variables.
    # NOTE(review): the returns column is passed as a list here but as a
    # plain string for the other models - confirm both forms are accepted.
    benchmark = ArmaApARCHX(df_total, ['returns'], lags_arma,
                            params=df_params['psi_hat'].tolist())

    model1 = ArmaXapARCH(df_total, 'returns', lags_arma, exog_cols, list(lags_per_model[1]),
                         params=df_params1['psi_hat'].tolist())

    model2 = ArmaApARCHX(df_total, 'returns', lags_arma, exog_cols, list(lags_per_model[2]),
                         params=df_params2['psi_hat'].tolist())

    # The stored lags of the third model start with one entry per exogenous
    # column; the remaining entries are the X-GARCH lags
    n_exog = len(exog_cols)
    combined_lags = list(lags_per_model[3])
    exog_lags, x_garch_lags = combined_lags[:n_exog], combined_lags[n_exog:]

    model3 = ArmaApArchXGarch(df_total, 'returns', lags_arma, exog_cols, exog_lags,
                              params=df_params3['psi_hat'].tolist(),
                              xgarch_cols=x_garch_cols, lag_exog_sigma=x_garch_lags)

    # Conditional variance and residual series of every model
    sigma2, et = benchmark.conditional_volatility()
    sigma2_1, et_1 = model1.conditional_volatility()
    sigma2_2, et_2 = model2.conditional_volatility()
    sigma2_3, et_3 = model3.conditional_volatility()

    vars_ = [sigma2_1, sigma2_2, sigma2_3]
    ets = [et_1, et_2, et_3]

    # Likelihood of the benchmark and of the three models
    ll_benchmark = quasi_log_likelihood(sigma2, et)
    likelihoods = [quasi_log_likelihood(variance, resid) for variance, resid in zip(vars_, ets)]
    ll_1, ll_2, ll_3 = likelihoods

    # One panel per sentiment model
    fig, axs = plt.subplots(figsize=(25, 5), nrows=1, ncols=3)

    # First and last date of the sample (not used further in this cell)
    first_date = df_total.date.iloc[0]
    last_date = df_total.date.iloc[-1]
    n = 250  # keep every n-th of the current tick positions

    for ax, resid, variance, model_name in zip(axs, ets, vars_, model_names):
        # sqrt(e^2), i.e. the absolute residual, labelled as realized volatility
        ax.plot(df_total.date, np.sqrt(resid ** 2), c='black', linestyle='-.', alpha=0.7,
                label='Realized volatility')
        ax.plot(df_total.date, np.sqrt(sigma2), c='yellow', linestyle='--',
                label='Benchmark model')
        ax.plot(df_total.date, np.sqrt(variance), c=colors[0],
                label=f'Single regime {model_name} model')

        # Title and legend
        ax.set_title(f'Conditional volatility of {company}')
        ax.legend()

        # Thin out the x ticks and rotate their labels
        ax.set_xticks(ax.get_xticks()[::n])
        ax.tick_params(axis='x', labelrotation=45)

        # Check behaviour of the innovations
        #ax.hist(resid / np.sqrt(variance), bins=50)
        #print(np.mean(resid / np.sqrt(variance)), np.std(resid / np.sqrt(variance)))

    plt.tight_layout()

    # Store the figure (no extension given, so matplotlib's default format is used)
    fig.savefig(store_loc + f'\\plots\\plot conditional vol {company}')

## 4.5 Export parameter DataFrames

This section creates and exports the DataFrames with parameter estimates of each model included in this research for all companies in this research.

In [None]:
# Significance levels of the stars (*, **, ***) and the matching critical
# values. The test on |t| is two-sided, so the critical value for level alpha
# is the (1 - alpha / 2) quantile of the standard normal distribution.
_SIGNIFICANCE_LEVELS = (0.1, 0.05, 0.01)
_CRITICAL_VALUES = tuple(norm.ppf(1 - alpha / 2) for alpha in _SIGNIFICANCE_LEVELS)


def _significance_stars(t_stat):
    """Return one '*' for every significance level at which |t_stat| is significant."""
    return ''.join('*' for critical in _CRITICAL_VALUES if abs(t_stat) >= critical)


def _format_estimates(df_raw, company):
    """Build the presentation table of one model for one company.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Parameter table with the columns 'param names', 'psi_hat', 'errors'
        and 't-stat'.
    company : str
        Company name; used in the name of the value column.

    Returns
    -------
    pandas.DataFrame
        Two rows per parameter: the estimate rounded to three decimals
        followed by its significance stars, and below it an 'error_<name>'
        row with the standard error in parentheses.
    """
    estimates = df_raw['psi_hat'].round(3).astype(str) + df_raw['t-stat'].apply(_significance_stars)
    errors = df_raw['errors'].apply(lambda x: f' ({x:.1e})')

    rows = []
    for name, estimate, error in zip(df_raw['param names'], estimates, errors):
        rows.append([name, estimate])
        rows.append(['error_' + name, error])

    return pd.DataFrame(rows, columns=['param names', f'psi_hat_{company}'])


# Sub-folder / file label of every model whose parameters are exported
_model_labels = ['model1', 'model2', 'model3']

# Combined table per model: one column of formatted estimates per company
_combined = {}

for company in df_comp_names['Company']:
    # Read and format the parameter tables of all models for this company
    formatted = {
        label: _format_estimates(
            pd.read_csv(data_loc + f'\\parameters\\{label}\\params {company}.csv'), company)
        for label in _model_labels
    }

    # The first company starts the table; later companies are joined on the
    # parameter name (outer join, so parameters missing for a company stay)
    for label, table in formatted.items():
        if label in _combined:
            _combined[label] = _combined[label].merge(table, on='param names', how='outer')
        else:
            _combined[label] = table

    # Save DataFrames to .csv; rewritten after every company, so the files
    # always contain all companies processed so far
    for label in _model_labels:
        _combined[label].to_csv(data_loc + f'\\parameters\\params {label}.csv')

# Keep the combined tables available under their original names
df_psi_hat_1 = _combined.get('model1', pd.DataFrame())
df_psi_hat_2 = _combined.get('model2', pd.DataFrame())
df_psi_hat_3 = _combined.get('model3', pd.DataFrame())

---------
---------