In [9]:
import pandas as pd
import numpy as np

In [10]:
PRICE_FILES = ["data/Stocks/AlternativeEnergy_Price.csv", "data/Stocks/Automobile_Price.csv", "data/Stocks/Bank_Price.csv",
                        "data/Stocks/Beverage_Price.csv", "data/Stocks/BioTech_Price.csv", "data/Stocks/Chemical_Price.csv",
                        "data/Stocks/Construction_Price.csv", "data/Stocks/Electricity_Price.csv", "data/Stocks/Electronic_Price.csv",
                        "data/Stocks/Finance_Price.csv", "data/Stocks/Fix_Price.csv", "data/Stocks/Food_Price.csv", 
                        "data/Stocks/FoodProducer_Price.csv", "data/Stocks/Gas_Price.csv", "data/Stocks/GeneralIndustrial_Price.csv",
                        "data/Stocks/GeneralRetail_Price.csv", "data/Stocks/Hardware_Price.csv", "data/Stocks/Health_Price.csv",
                        "data/Stocks/Household_Price.csv", "data/Stocks/IndustrialEngineer_Price.csv", "data/Stocks/IndustrialMetal_Price.csv", 
                        "data/Stocks/IndustrialTransport_Price.csv", "data/Stocks/Insurance_Price.csv", "data/Stocks/Leisure_Price.csv",
                        "data/Stocks/Media_Price.csv", "data/Stocks/Mining_Price.csv", "data/Stocks/NonLifeInsure_Price.csv",
                        "data/Stocks/OilProducer_Price.csv", "data/Stocks/Paper_Price.csv", "data/Stocks/PersonalGoods_Price.csv",
                        "data/Stocks/RealEstate_Price.csv", "data/Stocks/Software_Price.csv", "data/Stocks/Support_Price.csv",
                        "data/Stocks/Travel_Price.csv", "data/Stocks/Unclassified_Price.csv"]

VOLUME_FILES = ["data/Stocks/AlternativeEnergy_Volume.csv", "data/Stocks/Automobile_Volume.csv", "data/Stocks/Automobile_Volume.csv",
                           "data/Stocks/Beverage_Volume.csv", "data/Stocks/BioTech_Volume.csv", "data/Stocks/Chemical_Volume.csv",
                           "data/Stocks/Construction_Volume.csv", "data/Stocks/Electricity_Volume.csv", "data/Stocks/Electronic_Volume.csv",
                           "data/Stocks/Finance_Volume.csv", "data/Stocks/Fix_Volume.csv", "data/Stocks/Food_Volume.csv", 
                           "data/Stocks/FoodProducer_Volume.csv", "data/Stocks/Gas_Volume.csv", "data/Stocks/GeneralIndustrial_Volume.csv",
                           "data/Stocks/GeneralRetail_Volume.csv", "data/Stocks/Hardware_Volume.csv", "data/Stocks/Health_Volume.csv", 
                           "data/Stocks/Household_Volume.csv", "data/Stocks/IndustrialEngineer_Volume.csv", "data/Stocks/IndustrialMetal_Volume.csv", 
                           "data/Stocks/IndustrialTransport_Volume.csv", "data/Stocks/Insurance_Volume.csv", "data/Stocks/Leisure_Volume.csv",
                           "data/Stocks/Media_Volume.csv", "data/Stocks/Mining_Volume.csv", "data/Stocks/NonLifeInsure_Volume.csv",
                           "data/Stocks/OilProducer_Volume.csv", "data/Stocks/Paper_Volume.csv", "data/Stocks/PersonalGoods_Volume.csv",
                           "data/Stocks/RealEstate_Volume.csv", "data/Stocks/Software_Volume.csv", "data/Stocks/Support_Volume.csv",
                           "data/Stocks/Travel_Volume.csv", "data/Stocks/Unclassified_Volume.csv"]

In [11]:
def RenameHeader(col_name, col_type):
    if col_name == "Code":
        return "Code"
    else:
        if col_type == "Price":
            return col_name[2:-3]
        else:
            return col_name[2:-4]

In [12]:
def generatePrediction(close, prediction_interval):
    ''' Arguments: close (Pandas series containing the close price)
        Returns: Pandas series containing true Y
    '''
    series = np.where(close.shift(-prediction_interval) > 1.005*close, 1, 
                                    np.where(close.shift(-prediction_interval) < 0.995*close, -1, 0))
    return series

In [13]:
def RemoveNoKey(RenderList, key):
    remove_keys = []
    
    for entry in RenderList:
        if key not in entry:
            remove_keys.append(entry["key"]) # Append stock abbreviations we want to remove
            
    if len(remove_keys) == 0:
        return RenderList
    else:
        return [x for x in RenderList if x["key"] not in remove_keys]

In [66]:
def Clean(price, volume):
    
    """
    Argument: price dataframe and volume dataframe
    Return: return the tuples of price dataframes (2014-2020) and volume dataframes (2014-2020)
                where each one of them is filtered out missing col, missing price from prev year, missing vol from prev year,
                        no volume traded occurs from prev year
    Note: this function requires RenameHeader (Defined above) and pandas
    """
    
    # Read two dataframes and filter to have files from 2014 to 2020
    price_df = pd.read_csv(price).rename(columns = lambda x: RenameHeader(x, "Price"))
    price_df['Code'] = pd.to_datetime(price_df['Code'])
    price_df = price_df[(price_df['Code'] > '2014-01-01')]
    
    vol_df = pd.read_csv(volume).rename(columns = lambda x: RenameHeader(x, "Vol"))
    vol_df['Code'] = pd.to_datetime(vol_df['Code'])
    vol_df = vol_df[(vol_df['Code'] > '2014-01-01')]
    
    # Use only common cols in two dataframes
    common_cols = price_df.columns.intersection(vol_df.columns).tolist()
    price_df = price_df[common_cols]
    vol_df = vol_df[common_cols]
    
    # Temporarily include SET_VOL to filter holidays
    SET_IDX_VOL = pd.read_csv('data/SET/SET_VO.csv', parse_dates = True)
    SET_IDX_VOL = SET_IDX_VOL.rename(columns = {'Code': 'Code', 'BNGKSET(VO)': 'Volume'})
    SET_IDX_VOL['Code'] = pd.to_datetime(SET_IDX_VOL['Code'])
    
    # Filter holiday on price dataframes
    price_df = pd.merge(price_df, SET_IDX_VOL, how = 'inner', on = 'Code')
    price_df = price_df[price_df['Volume'].notna()]
    price_df.drop(['Volume'], axis = 1, inplace = True)
    
    # Filter holiday on price dataframes
    vol_df = pd.merge(vol_df, SET_IDX_VOL, how = 'inner', on = 'Code')
    vol_df = vol_df[vol_df['Volume'].notna()]
    vol_df.drop(['Volume'], axis = 1, inplace = True)
    
    # loop from 2014 to 2020 to create small dataframes (useful for filtering later on)
    price_df_s = []
    vol_df_s = []
    for i in range(7):
        x = i + 2014
        start_date = str(x) + '-01-01'
        end_date = str(x+1) + '-01-01'
        price_df_s.append(price_df[(price_df['Code'] > start_date) & (price_df['Code'] < end_date)])
        vol_df_s.append(vol_df[(vol_df['Code'] > start_date) & (vol_df['Code'] < end_date)])
        
    # Filter the columns from 2015 to 2020 by two criterias
    # 1) Any missing price variables from previous year
    # 2) Any missing + zero volume from previous year
    for i in range(6):
        
        price_filter_df = price_df_s[i]
        vol_filter_df = vol_df_s[i]
        
        price_null = price_filter_df.columns[price_filter_df.isna().any()].tolist()
        vol_null = vol_filter_df.columns[vol_filter_df.isna().any()].tolist()
        # Create another copy (a bit inefficient, but it works)
        vol_temp = vol_filter_df.drop(['Code'], axis = 1, inplace = False)
        vol_gt_zero = vol_temp.columns[(vol_temp <= 0).any()].tolist()
        
        filtered_out_col = list(set().union(price_null,vol_null,vol_gt_zero))
        
        price_df_s[i+1].drop(filtered_out_col, axis = 1, inplace = True)
        vol_df_s[i+1].drop(filtered_out_col, axis = 1, inplace = True)
        
        
    return (price_df_s, vol_df_s)
        

In [67]:
def GenerateFeatures(price_list, vol_list):
    """
    Input: price dataframes list, vol dataframes list
    Return: list of list of dict
    (for example, [automobile, ...] -> [2014, 2015, 2016, ...] -> {price: price_df, vol: vol_df, feature1: ...})
    """
    return_list = []
    for i in range(len(price_list)):
        compile_list = []
        price_df_s, vol_df_s = Clean(price_list[i], vol_list[i])
        
        for j in range(len(price_df_s)):
            sub_dict = {}
            # Initial dataframes we got from raw data
            price_df = price_df_s[j]
            vol_df = vol_df_s[j]
            
            ## Define new dataframes: this records MACD
            trend_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            trend_df['Code'] = price_df['Code']
            
            ## Define new dataframes: this records momentum
            mom3_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            mom3_df['Code'] = price_df['Code']
            mom5_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            mom5_df['Code'] = price_df['Code']         
            mom10_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            mom10_df['Code'] = price_df['Code']
            mom14_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            mom14_df['Code'] = price_df['Code']
            mom20_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            mom20_df['Code'] = price_df['Code']
            
            ## Define new dataframes: this records volatility (14-day STD)
            std3_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            std3_df['Code'] = price_df['Code']      
            std5_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            std5_df['Code'] = price_df['Code']         
            std10_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            std10_df['Code'] = price_df['Code'] 
            std14_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            std14_df['Code'] = price_df['Code'] 
            std20_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            std20_df['Code'] = price_df['Code'] 
            
            ## Define new dataframes: volume
            vol_ind_df =  pd.DataFrame(index=price_df.index, columns=price_df.columns)
            vol_ind_df['Code'] = price_df['Code']
            
            ## Define prediction in each time horizon
            y3_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            y3_df['Code'] = price_df['Code']
            y5_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            y5_df['Code'] = price_df['Code']
            y10_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            y10_df['Code'] = price_df['Code']
            y14_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            y14_df['Code'] = price_df['Code']
            y20_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            y20_df['Code'] = price_df['Code']
            yn1_df = pd.DataFrame(index=price_df.index, columns=price_df.columns)
            yn1_df['Code'] = price_df['Code']
            
            
            for col in list(price_df.columns):
                if col != 'Code':
                    
                    # Record MACD
                    trend_df[col] = price_df.loc[:, col].ewm(span=12, adjust=False).mean() - price_df.loc[:, col].ewm(span=26, adjust=False).mean() 
                    
                    # Record pct_change as momentum
                    mom3_df[col] = price_df.loc[:, col].pct_change(periods = 3)
                    mom5_df[col] = price_df.loc[:, col].pct_change(periods = 5)
                    mom10_df[col] = price_df.loc[:, col].pct_change(periods = 10)
                    mom14_df[col] = price_df.loc[:, col].pct_change(periods = 14)
                    mom20_df[col] = price_df.loc[:, col].pct_change(periods = 20)
                    
                    # Record rolling std
                    std3_df[col] = price_df.loc[:, col].rolling(3).std()
                    std5_df[col] = price_df.loc[:, col].rolling(5).std()
                    std10_df[col] = price_df.loc[:, col].rolling(10).std()
                    std14_df[col] = price_df.loc[:, col].rolling(14).std()
                    std20_df[col] = price_df.loc[:, col].rolling(20).std()
                    
                    # Record volume indicator
                    vol_ind_df[col] = (vol_df.loc[:, col].rolling(window=2).mean() - vol_df.loc[:, col].rolling(window=14).mean()).apply(lambda x: 1 if x > 0 else 0)
                    
                    # Record rolling prediction
                    y3_df[col] = generatePrediction(price_df.loc[:, col], 3)
                    y5_df[col] = generatePrediction(price_df.loc[:, col], 5)
                    y10_df[col] = generatePrediction(price_df.loc[:, col], 10)
                    y14_df[col] = generatePrediction(price_df.loc[:, col], 14)
                    y20_df[col] = generatePrediction(price_df.loc[:, col], 20)
                    yn1_df[col] = generatePrediction(price_df.loc[:, col], -1)
            
            sub_dict['price'] = price_df
            sub_dict['vol'] = vol_df
            
            sub_dict['trend'] = trend_df
            
            sub_dict['vol_sig'] =  vol_ind_df
            
            sub_dict['mom3'] = mom3_df
            sub_dict['mom5'] = mom5_df
            sub_dict['mom10'] = mom10_df
            sub_dict['mom14'] = mom14_df
            sub_dict['mom20'] = mom20_df
                        
            sub_dict['std3'] = std3_df
            sub_dict['std5'] = std5_df     
            sub_dict['std10'] = std10_df
            sub_dict['std14'] = std14_df     
            sub_dict['std20'] = std20_df
            
            sub_dict['y3'] = y3_df
            sub_dict['y5'] = y5_df
            sub_dict['y10'] = y10_df
            sub_dict['y14'] = y14_df
            sub_dict['y20'] = y20_df
            sub_dict['yn1'] = yn1_df
            
            compile_list.append(sub_dict)
            
        return_list.append(compile_list)
        
    return return_list

In [68]:
def Rearrange(sector_list):
    """
    Input: sector_list (return list from GenerateFeatures function above)
    Return: list of list of dict as follows
        [2014, 2015, 2016, ...] -> {key: PTT, data: dataframes}
        (similar how we usually render in React)
    """
    # Each sublist is for each year
    return_list = [[], [], [], [], [], [], []]
    
    # i runs from 0 to # of sectors - 1
    for i in range(len(sector_list)):
        sector_stocks = sector_list[i]
        # j runs from 0 to 6 (0 -> 2014, 6 -> 2020)
        for j in range(7):
            sector_year_dict = sector_stocks[j]
            col_names = sector_year_dict['price'].columns.tolist()
            # k runs for each column except "Code"
            for k in range(1, len(col_names)):
                stock_name = col_names[k]
                # Step 1: create blank dataframes
                COLS = ['Code', 'vol', 'price', 'trend', 'mom3', 'mom5', 'mom10', 'mom14', 'mom20', 'std3', 'std5', 'std10', 'std14', 'std20', 'vol_sig', 'y3', 'y5', 'y10', 'y14', 'y20', 'yn1']
                new_data = pd.DataFrame(index=sector_year_dict['price'].index, columns= COLS)
                new_data['Code'] = sector_year_dict['price']['Code']
                
                # Step 2: Retrieve data from the key and insert it into new_data
                
                new_data['vol'] = sector_year_dict['vol'].loc[:, stock_name]
                new_data['price'] = sector_year_dict['price'].loc[:, stock_name]
                new_data['trend'] = sector_year_dict['trend'].loc[:, stock_name]
                new_data['mom3'] = sector_year_dict['mom3'].loc[:, stock_name]
                new_data['mom5'] = sector_year_dict['mom5'].loc[:, stock_name]
                new_data['mom10'] = sector_year_dict['mom10'].loc[:, stock_name]
                new_data['mom14'] = sector_year_dict['mom14'].loc[:, stock_name]
                new_data['mom20'] = sector_year_dict['mom20'].loc[:, stock_name]
                new_data['std3'] = sector_year_dict['std3'].loc[:, stock_name]
                new_data['std5'] = sector_year_dict['std5'].loc[:, stock_name]
                new_data['std10'] = sector_year_dict['std10'].loc[:, stock_name]
                new_data['std14'] = sector_year_dict['std14'].loc[:, stock_name]
                new_data['std20'] = sector_year_dict['std20'].loc[:, stock_name]
                new_data['vol_sig'] = sector_year_dict['vol_sig'].loc[:, stock_name]
                new_data['y3'] = sector_year_dict['y3'].loc[:, stock_name]
                new_data['y5'] = sector_year_dict['y5'].loc[:, stock_name]
                new_data['y10'] = sector_year_dict['y10'].loc[:, stock_name]
                new_data['y14'] = sector_year_dict['y14'].loc[:, stock_name]
                new_data['y20'] = sector_year_dict['y20'].loc[:, stock_name]
                new_data['yn1'] = sector_year_dict['yn1'].loc[:, stock_name]
                
                # Step 3: Drop rows with at least one missing value
                new_data.dropna(axis = 0, how = 'any', inplace = True)
                
                # Step 4: Insert the new_dataframe into appropriate place (notice we use dict for convenience later)
                new_dict = {'key': stock_name, 'data': new_data}
                return_list[j].append(new_dict)
                
    return return_list

In [69]:
def GetVolumeTraded(RenderList):
    """
    input: RenderList, using the same data structure corresponding to Rearrange function (defined above)
    Process: Add key of volume_traded in each stock-year (calculated from last year)
    Return: nothing
    """
    # For each year from 2015 to 2020
    for i in range(1, 7):
        
        # For each stock that we want to add the new key: volume traded
        for stock in RenderList[i]:
            stock_name = stock['key']
            
            prev_data = RenderList[i-1]
            
            # This is inefficient, but it's okay, because otherwise, we need to change the whole data structure
            for prev_stock in prev_data:
                if prev_stock['key'] == stock_name:
                    # Calculate the latest volume in last year
                    stock['volume_traded'] = prev_stock['data'].loc[:, 'vol'].iloc[-1]
                    # Calculate # of ups in dfs as proportions
                    stock['up3'] = (prev_stock['data'].loc[:, 'y3'] == 1).mean()
                    stock['up5'] = (prev_stock['data'].loc[:, 'y5'] == 1).mean()
                    stock['up10'] = (prev_stock['data'].loc[:, 'y10'] == 1).mean()
                    stock['up14'] = (prev_stock['data'].loc[:, 'y14'] == 1).mean()
                    stock['up20'] = (prev_stock['data'].loc[:, 'y20'] == 1).mean()
                    # Calculate # of downs in dfs as proportions
                    stock['down3'] = (prev_stock['data'].loc[:, 'y3'] == -1).mean()
                    stock['down5'] = (prev_stock['data'].loc[:, 'y5'] == -1).mean()
                    stock['down10'] = (prev_stock['data'].loc[:, 'y10'] == -1).mean()
                    stock['down14'] = (prev_stock['data'].loc[:, 'y14'] == -1).mean()
                    stock['down20'] = (prev_stock['data'].loc[:, 'y20'] == -1).mean()
                    # Calculate average MACD of that stock
                    stock['MACD'] = prev_stock['data'].loc[:, 'trend'].mean()
                    # Calculate average mom of that stock
                    stock['mom3'] = prev_stock['data'].loc[:, 'mom3'].mean()
                    stock['mom5'] = prev_stock['data'].loc[:, 'mom5'].mean()
                    stock['mom10'] = prev_stock['data'].loc[:, 'mom10'].mean()
                    stock['mom14'] = prev_stock['data'].loc[:, 'mom14'].mean()
                    stock['mom20'] = prev_stock['data'].loc[:, 'mom20'].mean()                    
                    # Calculate average STD of that stock
                    stock['std3'] = prev_stock['data'].loc[:, 'std3'].mean()
                    stock['std5'] = prev_stock['data'].loc[:, 'std5'].mean()
                    stock['std10'] = prev_stock['data'].loc[:, 'std10'].mean()
                    stock['std14'] = prev_stock['data'].loc[:, 'std14'].mean()
                    stock['std20'] = prev_stock['data'].loc[:, 'std20'].mean()
                    # Calculate proportion of strong volume signal of that stock
                    stock['vol_sig'] = prev_stock['data'].loc[:, 'vol_sig'].mean()
                    


In [70]:
def GenerateTable(RenderList, columns):
    """
    Input: RenderList, using the same data structure corresponding to Rearrange function and has the key volume_traded
    This will output the result of average coefficient across five years into csv file
    Return: nothing
    """
    
    for i in range(1, 7):
        
        YEAR = str(i+2014)
        
        # Returned dataframe
        return_df = pd.DataFrame(index=range(10) , columns= columns)
        return_df = return_df.fillna(0)
        
        ## For debug purpose
        print('We are at year ' + str(i))
        
        # Sort by volume traded and split into ten sub-lists
        stock_list = RemoveNoKey(RenderList[i], 'volume_traded')
        stock_list = sorted(stock_list, key = lambda x: x['volume_traded'])
        ten_splits = np.array_split(stock_list, 10)
        
        for j in range(10):
            ## For debug purpose
            print('---> We are at decile of ' + str(j) + ' consisting of '+ str(len(ten_splits[j])) + ' firms!')
            
            X = [0 for x in range(len(columns))]
            
            for Render in ten_splits[j]:                
                for col_idx in range(len(columns)):
                    X[col_idx] += Render[columns[col_idx]]
                    
            for col_idx in range(len(columns)):
                X[col_idx] = X[col_idx]/len(ten_splits[j])
                return_df.iloc[j, col_idx] = X[col_idx]
        
        return_df.to_csv('Model_V2_' + YEAR + '.csv')          
                    

In [71]:
cleaned_dfs = GenerateFeatures(PRICE_FILES, VOLUME_FILES)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [72]:
RenderList = Rearrange(cleaned_dfs)

In [73]:
GetVolumeTraded(RenderList)

In [74]:
len(RenderList[-1])

538

In [75]:
COLS = ['up3', 'up5', 'up10', 'up14', 'up20', 'down3', 'down5', 'down10', 'down14', 'down20', 'MACD', 'mom3', 'mom5', 'mom10', 'mom14', 'mom20', 'std3', 'std5', 'std10', 'std14', 'std20', 'vol_sig']
GenerateTable(RenderList, COLS)

We are at year 1
---> We are at decile of 0 consisting of 27 firms!
---> We are at decile of 1 consisting of 27 firms!
---> We are at decile of 2 consisting of 27 firms!
---> We are at decile of 3 consisting of 27 firms!
---> We are at decile of 4 consisting of 27 firms!
---> We are at decile of 5 consisting of 26 firms!
---> We are at decile of 6 consisting of 26 firms!
---> We are at decile of 7 consisting of 26 firms!
---> We are at decile of 8 consisting of 26 firms!
---> We are at decile of 9 consisting of 26 firms!
We are at year 2
---> We are at decile of 0 consisting of 25 firms!
---> We are at decile of 1 consisting of 25 firms!
---> We are at decile of 2 consisting of 25 firms!
---> We are at decile of 3 consisting of 25 firms!
---> We are at decile of 4 consisting of 25 firms!
---> We are at decile of 5 consisting of 25 firms!
---> We are at decile of 6 consisting of 24 firms!
---> We are at decile of 7 consisting of 24 firms!
---> We are at decile of 8 consisting of 24 firm