In [1]:
import pandas as pd
import numpy as np

In [2]:
PRICE_FILES = ["data/Stocks/AlternativeEnergy_Price.csv", "data/Stocks/Automobile_Price.csv", "data/Stocks/Bank_Price.csv",
                        "data/Stocks/Beverage_Price.csv", "data/Stocks/BioTech_Price.csv", "data/Stocks/Chemical_Price.csv",
                        "data/Stocks/Construction_Price.csv", "data/Stocks/Electricity_Price.csv", "data/Stocks/Electronic_Price.csv",
                        "data/Stocks/Finance_Price.csv", "data/Stocks/Fix_Price.csv", "data/Stocks/Food_Price.csv", 
                        "data/Stocks/FoodProducer_Price.csv", "data/Stocks/Gas_Price.csv", "data/Stocks/GeneralIndustrial_Price.csv",
                        "data/Stocks/GeneralRetail_Price.csv", "data/Stocks/Hardware_Price.csv", "data/Stocks/Health_Price.csv",
                        "data/Stocks/Household_Price.csv", "data/Stocks/IndustrialEngineer_Price.csv", "data/Stocks/IndustrialMetal_Price.csv", 
                        "data/Stocks/IndustrialTransport_Price.csv", "data/Stocks/Insurance_Price.csv", "data/Stocks/Leisure_Price.csv",
                        "data/Stocks/Media_Price.csv", "data/Stocks/Mining_Price.csv", "data/Stocks/NonLifeInsure_Price.csv",
                        "data/Stocks/OilProducer_Price.csv", "data/Stocks/Paper_Price.csv", "data/Stocks/PersonalGoods_Price.csv",
                        "data/Stocks/RealEstate_Price.csv", "data/Stocks/Software_Price.csv", "data/Stocks/Support_Price.csv",
                        "data/Stocks/Travel_Price.csv", "data/Stocks/Unclassified_Price.csv"]

VOLUME_FILES = ["data/Stocks/AlternativeEnergy_Volume.csv", "data/Stocks/Automobile_Volume.csv", "data/Stocks/Automobile_Volume.csv",
                           "data/Stocks/Beverage_Volume.csv", "data/Stocks/BioTech_Volume.csv", "data/Stocks/Chemical_Volume.csv",
                           "data/Stocks/Construction_Volume.csv", "data/Stocks/Electricity_Volume.csv", "data/Stocks/Electronic_Volume.csv",
                           "data/Stocks/Finance_Volume.csv", "data/Stocks/Fix_Volume.csv", "data/Stocks/Food_Volume.csv", 
                           "data/Stocks/FoodProducer_Volume.csv", "data/Stocks/Gas_Volume.csv", "data/Stocks/GeneralIndustrial_Volume.csv",
                           "data/Stocks/GeneralRetail_Volume.csv", "data/Stocks/Hardware_Volume.csv", "data/Stocks/Health_Volume.csv", 
                           "data/Stocks/Household_Volume.csv", "data/Stocks/IndustrialEngineer_Volume.csv", "data/Stocks/IndustrialMetal_Volume.csv", 
                           "data/Stocks/IndustrialTransport_Volume.csv", "data/Stocks/Insurance_Volume.csv", "data/Stocks/Leisure_Volume.csv",
                           "data/Stocks/Media_Volume.csv", "data/Stocks/Mining_Volume.csv", "data/Stocks/NonLifeInsure_Volume.csv",
                           "data/Stocks/OilProducer_Volume.csv", "data/Stocks/Paper_Volume.csv", "data/Stocks/PersonalGoods_Volume.csv",
                           "data/Stocks/RealEstate_Volume.csv", "data/Stocks/Software_Volume.csv", "data/Stocks/Support_Volume.csv",
                           "data/Stocks/Travel_Volume.csv", "data/Stocks/Unclassified_Volume.csv"]

In [3]:
def RenameHeader(col_name, col_type):
    if col_name == "Code":
        return "Code"
    else:
        if col_type == "Price":
            return col_name[2:-3]
        else:
            return col_name[2:-4]

In [4]:
def Clean(price, volume):
    
    """
    Argument: price dataframe and volume dataframe
    Return: return the tuples of price dataframes (2014-2019) and volume dataframes (2014-2019)
                where each one of them is filtered out missing col, missing price from prev year, missing vol from prev year,
                        no volume traded occurs from prev year
    Note: this function requires RenameHeader (Defined above) and pandas
    """
    
    # Read two dataframes and filter to have files from 2014 to 2019
    price_df = pd.read_csv(price).rename(columns = lambda x: RenameHeader(x, "Price"))
    price_df['Code'] = pd.to_datetime(price_df['Code'])
    price_df = price_df[(price_df['Code'] > '2014-01-01') & (price_df['Code'] < '2020-01-01')]
    
    vol_df = pd.read_csv(volume).rename(columns = lambda x: RenameHeader(x, "Vol"))
    vol_df['Code'] = pd.to_datetime(vol_df['Code'])
    vol_df = vol_df[(vol_df['Code'] > '2014-01-01') & (vol_df['Code'] < '2020-01-01')]
    
    # Use only common cols in two dataframes
    common_cols = price_df.columns.intersection(vol_df.columns).tolist()
    price_df = price_df[common_cols]
    vol_df = vol_df[common_cols]
    
    # Temporarily include SET_VOL to filter holidays
    SET_IDX_VOL = pd.read_csv('data/SET/SET_VO.csv', parse_dates = True)
    SET_IDX_VOL = SET_IDX_VOL.rename(columns = {'Code': 'Code', 'BNGKSET(VO)': 'Volume'})
    SET_IDX_VOL['Code'] = pd.to_datetime(SET_IDX_VOL['Code'])
    
    # Filter holiday on price dataframes
    price_df = pd.merge(price_df, SET_IDX_VOL, how = 'inner', on = 'Code')
    price_df = price_df[price_df['Volume'].notna()]
    price_df.drop(['Volume'], axis = 1, inplace = True)
    
    # Filter holiday on price dataframes
    vol_df = pd.merge(vol_df, SET_IDX_VOL, how = 'inner', on = 'Code')
    vol_df = vol_df[vol_df['Volume'].notna()]
    vol_df.drop(['Volume'], axis = 1, inplace = True)
    
    # loop from 2014 to 2019 to create small dataframes (useful for filtering later on)
    price_df_s = []
    vol_df_s = []
    for i in range(6):
        x = i + 2014
        start_date = str(x) + '-01-01'
        end_date = str(x+1) + '-01-01'
        price_df_s.append(price_df[(price_df['Code'] > start_date) & (price_df['Code'] < end_date)])
        vol_df_s.append(vol_df[(vol_df['Code'] > start_date) & (vol_df['Code'] < end_date)])
        
    # Filter the columns from 2015 to 2019 by two criterias
    # 1) Any missing price variables from previous year
    # 2) Any missing + zero volume from previous year
    for i in range(5):
        
        price_filter_df = price_df_s[i]
        vol_filter_df = vol_df_s[i]
        
        price_null = price_filter_df.columns[price_filter_df.isna().any()].tolist()
        vol_null = vol_filter_df.columns[vol_filter_df.isna().any()].tolist()
        # Create another copy (a bit inefficient, but it works)
        vol_temp = vol_filter_df.drop(['Code'], axis = 1, inplace = False)
        vol_gt_zero = vol_temp.columns[(vol_temp <= 0).any()].tolist()
        
        filtered_out_col = list(set().union(price_null,vol_null,vol_gt_zero))
        
        price_df_s[i+1].drop(filtered_out_col, axis = 1, inplace = True)
        vol_df_s[i+1].drop(filtered_out_col, axis = 1, inplace = True)
        
        
    return (price_df_s, vol_df_s)

In [5]:
def GenerateFeatures(price_list, vol_list):
    """
    Input: price dataframes list, vol dataframes list
    Return: list of list of dict
    (for example, [automobile, ...] -> [2014, 2015, 2016, ...] -> {price: price_df, vol: vol_df, feature1: ...})
    """
    return_list = []
    for i in range(len(price_list)):
        compile_list = []
        price_df_s, vol_df_s = Clean(price_list[i], vol_list[i])
        
        for j in range(len(price_df_s)):
            sub_dict = {}
            price_df = price_df_s[j]
            vol_df = vol_df_s[j]
            
            ## Define new dataframes: return dataframe
            return3_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            return3_df['Code'] = price_df['Code']
            
            return5_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            return5_df['Code'] = price_df['Code']    
            
            return10_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            return10_df['Code'] = price_df['Code']
            
            return14_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            return14_df['Code'] = price_df['Code']
            
            return20_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            return20_df['Code'] = price_df['Code']
            
            for col in list(price_df.columns):
                if col != 'Code':
                    return3_df[col] = 100*price_df.loc[:, col].pct_change(periods = 3)
                    return5_df[col] = 100*price_df.loc[:, col].pct_change(periods = 5)
                    return10_df[col] = 100*price_df.loc[:, col].pct_change(periods = 10)
                    return14_df[col] = 100*price_df.loc[:, col].pct_change(periods = 14)
                    return20_df[col] = 100*price_df.loc[:, col].pct_change(periods = 20)
                    
            
            sub_dict['price'] = price_df
            sub_dict['vol'] = vol_df
            sub_dict['ret_3'] = return3_df
            sub_dict['ret_5'] = return5_df     
            sub_dict['ret_10'] = return10_df
            sub_dict['ret_14'] = return14_df
            sub_dict['ret_20'] = return20_df

            compile_list.append(sub_dict)
            
        return_list.append(compile_list)
        
    return return_list

In [11]:
def Rearrange(sector_list):
    """
    Input: sector_list (return list from GenerateFeatures function above)
    Return: list of list of dict as follows
        [2014, 2015, 2016, ...] -> {key: PTT, data: dataframes}
        (similar how we usually render in React)
    """
    # Each sublist is for each year
    return_list = [[], [], [], [], [], []]
    
    # i runs from 0 to # of sectors - 1
    for i in range(len(sector_list)):
        sector_stocks = sector_list[i]
        # j runs from 0 to 5 (0 -> 2014, 5 -> 2019)
        for j in range(6):
            sector_year_dict = sector_stocks[j]
            col_names = sector_year_dict['price'].columns.tolist()
            # k runs for each column except "Code"
            for k in range(1, len(col_names)):
                stock_name = col_names[k]
                # Step 1: create blank dataframes
                new_data = pd.DataFrame(index=sector_year_dict['price'].index, columns= ['Code', 'vol', 'price', 'ret_3', 'ret_5', 'ret_10', 'ret_14', 'ret_20'])
                new_data['Code'] = sector_year_dict['price']['Code']
                # Step 2: Retrieve data from the key and insert it into new_data
                new_data['vol'] = sector_year_dict['vol'].loc[:, stock_name]
                new_data['price'] = sector_year_dict['price'].loc[:, stock_name]
                new_data['ret_3'] = sector_year_dict['ret_3'].loc[:, stock_name]
                new_data['ret_5'] = sector_year_dict['ret_5'].loc[:, stock_name]
                new_data['ret_10'] = sector_year_dict['ret_10'].loc[:, stock_name]
                new_data['ret_14'] = sector_year_dict['ret_14'].loc[:, stock_name]
                new_data['ret_20'] = sector_year_dict['ret_20'].loc[:, stock_name]
                
                # Step 3: Drop rows with at least one missing value
                new_data.dropna(axis = 0, how = 'any', inplace = True)
                # Step 4: Insert the new_dataframe into appropriate place (notice we use dict for convenience later)
                new_dict = {'key': stock_name, 'data': new_data}
                return_list[j].append(new_dict)
                
    return return_list

In [7]:
def GetVolumeTraded(RenderList):
    """
    input: RenderList, using the same data structure corresponding to Rearrange function (defined above)
    Process: Add key of volume_traded in each stock-year (calculated from last year)
    Return: nothing
    """
    # For each year from 2015 to 2019
    for i in range(1, 6):
        
        # For each stock that we want to add the new key: volume traded
        for stock in RenderList[i]:
            stock_name = stock['key']
            
            prev_data = RenderList[i-1]
            
            # This is inefficient, but it's okay, because otherwise, we need to change the whole data structure
            for prev_stock in prev_data:
                if prev_stock['key'] == stock_name:
                    # Calculate the latest volume in last year
                    stock['volume_traded'] = prev_stock['data'].loc[:, 'vol'].iloc[-1]

In [8]:
def RemoveNoKey(RenderList, key):
    remove_keys = []
    
    for entry in RenderList:
        if key not in entry:
            remove_keys.append(entry["key"]) # Append stock abbreviations we want to remove
            
    if len(remove_keys) == 0:
        return RenderList
    else:
        return [x for x in RenderList if x["key"] not in remove_keys]

In [13]:
from arch import arch_model

def GenerateTable_GARCH(RenderList):
    """
    Input: RenderList, using the same data structure corresponding to Rearrange function and has the key volume_traded
    This will output the result of average coefficient across five years into csv file
    Return: nothing
    """
    # This is the return dataframe with three columns corresponding to each coefficient
    return3_df = pd.DataFrame(index=range(10) , columns= ['alpha', 'gamma', 'beta'])
    return3_df = return3_df.fillna(0)
    
    return5_df = pd.DataFrame(index=range(10) , columns= ['alpha', 'gamma', 'beta'])
    return5_df = return5_df.fillna(0)
    
    return10_df = pd.DataFrame(index=range(10) , columns= ['alpha', 'gamma', 'beta'])
    return10_df = return10_df.fillna(0)
    
    return14_df = pd.DataFrame(index=range(10) , columns= ['alpha', 'gamma', 'beta'])
    return14_df = return14_df.fillna(0)
    
    return20_df = pd.DataFrame(index=range(10) , columns= ['alpha', 'gamma', 'beta'])
    return20_df = return20_df.fillna(0)
    
    for i in range(1, 6):
        ## For debug purpose
        print('We are at year ' + str(i))
        # Sort by volume traded and split into ten sub-lists
        stock_list = RemoveNoKey(RenderList[i], 'volume_traded')
        stock_list = sorted(stock_list, key = lambda x: x['volume_traded'])
        ten_splits = np.array_split(stock_list, 10)
        for j in range(10):
            ## For debug purpose
            print('---> We are at decile of ' + str(j) + ' consisting of '+ str(len(ten_splits[j])) + ' firms!')
            
            # Accumulator
            ret3_alpha, ret3_gamma, ret3_beta = 0, 0, 0
            ret5_alpha, ret5_gamma, ret5_beta = 0, 0, 0
            ret10_alpha, ret10_gamma, ret10_beta = 0, 0, 0
            ret14_alpha, ret14_gamma, ret14_beta = 0, 0, 0
            ret20_alpha, ret20_gamma, ret20_beta = 0, 0, 0
            
            for Render in ten_splits[j]:
                combined_df = Render['data']
                
                # Run first model: Ret on 3
                am_3 = arch_model(combined_df['ret_3'], p=1, o=1, q=1)
                res_3 = am_3.fit(update_freq=5, disp='off')
                alpha_3, gamma_3, beta_3 = res_3.params['alpha[1]'], res_3.params['gamma[1]'], res_3.params['beta[1]']
                ret3_alpha += alpha_3
                ret3_gamma += gamma_3
                ret3_beta += beta_3
                
                am_5 = arch_model(combined_df['ret_5'], p=1, o=1, q=1)
                res_5 = am_5.fit(update_freq=5, disp='off')
                alpha_5, gamma_5, beta_5 = res_5.params['alpha[1]'], res_5.params['gamma[1]'], res_5.params['beta[1]']
                ret5_alpha += alpha_5
                ret5_gamma += gamma_5
                ret5_beta += beta_5
                
                am_10 = arch_model(combined_df['ret_10'], p=1, o=1, q=1)
                res_10 = am_10.fit(update_freq=5, disp='off')   
                alpha_10, gamma_10, beta_10 = res_10.params['alpha[1]'], res_10.params['gamma[1]'], res_10.params['beta[1]']
                ret10_alpha += alpha_10
                ret10_gamma += gamma_10
                ret10_beta += beta_10
                
                am_14 = arch_model(combined_df['ret_14'], p=1, o=1, q=1)
                res_14 = am_5.fit(update_freq=5, disp='off')   
                alpha_14, gamma_14, beta_14 = res_14.params['alpha[1]'], res_14.params['gamma[1]'], res_14.params['beta[1]']
                ret14_alpha += alpha_14
                ret14_gamma += gamma_14
                ret14_beta += beta_14
                
                am_20 = arch_model(combined_df['ret_20'], p=1, o=1, q=1)
                res_20 = am_20.fit(update_freq=5, disp='off')   
                alpha_20, gamma_20, beta_20 = res_20.params['alpha[1]'], res_20.params['gamma[1]'], res_20.params['beta[1]']
                ret20_alpha += alpha_20
                ret20_gamma += gamma_20
                ret20_beta += beta_20
                
            return3_df.loc[j, 'alpha'] += (ret3_alpha/len(ten_splits[j]))
            return3_df.loc[j, 'gamma'] += (ret3_gamma/len(ten_splits[j]))
            return3_df.loc[j, 'beta'] += (ret3_beta/len(ten_splits[j]))
            
            return5_df.loc[j, 'alpha'] += (ret5_alpha/len(ten_splits[j]))
            return5_df.loc[j, 'gamma'] += (ret5_gamma/len(ten_splits[j]))
            return5_df.loc[j, 'beta'] += (ret5_beta/len(ten_splits[j]))
            
            return10_df.loc[j, 'alpha'] += (ret10_alpha/len(ten_splits[j]))
            return10_df.loc[j, 'gamma'] += (ret10_gamma/len(ten_splits[j]))
            return10_df.loc[j, 'beta'] += (ret10_beta/len(ten_splits[j]))
            
            return14_df.loc[j, 'alpha'] += (ret14_alpha/len(ten_splits[j]))
            return14_df.loc[j, 'gamma'] += (ret14_gamma/len(ten_splits[j]))
            return14_df.loc[j, 'beta'] += (ret14_beta/len(ten_splits[j]))
            
            return20_df.loc[j, 'alpha'] += (ret20_alpha/len(ten_splits[j]))
            return20_df.loc[j, 'gamma'] += (ret20_gamma/len(ten_splits[j]))
            return20_df.loc[j, 'beta'] += (ret20_beta/len(ten_splits[j]))
                
            
    # Get average value
    return3_df['alpha'] /= 5
    return3_df['beta'] /= 5
    return3_df['gamma'] /= 5
    return3_df.to_csv('GARCH_3.csv')
    
    return5_df['alpha'] /= 5
    return5_df['beta'] /= 5
    return5_df['gamma'] /= 5
    return5_df.to_csv('GARCH_5.csv')    
    
    return10_df['alpha'] /= 5
    return10_df['beta'] /= 5
    return10_df['gamma'] /= 5
    return10_df.to_csv('GARCH_10.csv')
    
    return14_df['alpha'] /= 5
    return14_df['beta'] /= 5
    return14_df['gamma'] /= 5
    return14_df.to_csv('GARCH_14.csv')
    
    return20_df['alpha'] /= 5
    return20_df['beta'] /= 5
    return20_df['gamma'] /= 5
    return20_df.to_csv('GARCH_20.csv')
    

In [14]:
cleaned_dfs = GenerateFeatures(PRICE_FILES, VOLUME_FILES)
RenderList = Rearrange(cleaned_dfs)
GetVolumeTraded(RenderList)
GenerateTable_GARCH(RenderList)

We are at year 1
---> We are at decile of 0 consisting of 27 firms!
---> We are at decile of 1 consisting of 27 firms!
---> We are at decile of 2 consisting of 27 firms!
---> We are at decile of 3 consisting of 27 firms!
---> We are at decile of 4 consisting of 27 firms!
---> We are at decile of 5 consisting of 26 firms!
---> We are at decile of 6 consisting of 26 firms!
---> We are at decile of 7 consisting of 26 firms!
---> We are at decile of 8 consisting of 26 firms!
---> We are at decile of 9 consisting of 26 firms!
We are at year 2
---> We are at decile of 0 consisting of 25 firms!
---> We are at decile of 1 consisting of 25 firms!
---> We are at decile of 2 consisting of 25 firms!
---> We are at decile of 3 consisting of 25 firms!
---> We are at decile of 4 consisting of 25 firms!
---> We are at decile of 5 consisting of 25 firms!
---> We are at decile of 6 consisting of 24 firms!
---> We are at decile of 7 consisting of 24 firms!
---> We are at decile of 8 consisting of 24 firm