In [1]:
import pandas as pd
import glob
import talib
import datetime
import numpy as np
# import pandas_ta as ta
from scipy import stats
import matplotlib.pyplot as plt
import six
from itertools import combinations
import multiprocessing


import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

# Render DataFrames without truncating rows, columns or cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 5000)
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [2]:
class Indicator:
    """Namespace of static helpers that derive indicator / condition columns from OHLC bar data."""
    
    # Class-level placeholder frame; not referenced by any method visible in this section.
    df_expiry = pd.DataFrame()
    
    # Human-readable timeframe name -> pandas offset alias.
    frequency = {'5min':'5Min', '60min':'H', 'daily':'D', 'weekly':'W', 'monthly':'M', 'quarterly':'Q', 'yearly':'A', 'weekly-thursday': 'W-THU'}
    # Column -> aggregation name. Note: resample_df builds its own local dict and does not read this one.
    resampling_dict = {'open':'first', 'high':'max', 'low':'min', 'close':'last', 'volume':'sum', 'time':'last', 'date':'last'}
    # Identity column-rename mapping.
    renaming_dict = {'open':'open', 'high':'high', 'low':'low', 'close':'close', 'volume':'volume', 'time':'time', 'date':'date'}
    
    @staticmethod
    def resample_df(df, period, expiry=False, opex=False):
        
        df.set_index('datetime', inplace=True)
        
        if expiry:
            resampling_dict = {'open': 'first', 'high': 'max', 'low': 'min',
                           'close': 'last', 'date': 'first', 'time': 'first', 'expiry_date':'first'}
        elif opex:
            resampling_dict = {'open': 'first', 'high': 'max', 'low': 'min',
                           'close': 'last', 'date': 'first', 'time': 'first', 'opex':'first'}
        else:
            resampling_dict = {'open': 'first', 'high': 'max', 'low': 'min',
                           'close': 'last', 'date': 'first', 'time': 'first'}

        df = df.resample(period).apply(resampling_dict)
        df = df.dropna()
        df = df.reset_index()
        
        return df
    
    @staticmethod
    def get_unique_time_list(df, period):
        
        if period == '1H':
            times_list = [datetime.time(9, 15),datetime.time(10, 0),datetime.time(11, 0),datetime.time(12, 0),datetime.time(13, 0),datetime.time(14, 0),datetime.time(15, 0)]
        elif period == '5min':
            times_list = [i for i in df['time'].unique() if i.minute%5 == 0 and i<datetime.time(15,25)]
        
        return times_list
    
    @staticmethod
    def get_A_level(df, params):
        """Attach opening-range based ``aup`` / ``adown`` levels for one timeframe.

        Args:
            df: bar frame; it is modified in place before a trimmed copy is returned.
            params: ``[timeframe, atr_lookback, premium_multiplier, open_range]``
                where ``open_range`` is a string such as ``'5min'`` whose leading
                integer is used as the number of opening-range bars.

        Returns:
            The frame with ``{timeframe}_atr``, ``{timeframe}_prem``,
            ``{timeframe}_or_high``, ``{timeframe}_or_low``, ``{timeframe}_aup`` and
            ``{timeframe}_adown`` columns added.

        Note:
            Relies on ``get_prem``, ``get_bar_position`` and ``get_or_range``, which
            are not defined in this section of the notebook.
        """
        timeframe = params[0]
        atr_lookback = params[1]
        premium_multiplier = params[2]
        open_range = params[3]

        atr_col = f'{timeframe}_atr'
        prem_col = f'{timeframe}_prem'
        or_high_col = f'{timeframe}_or_high'
        or_low_col = f'{timeframe}_or_low'
        aup_col = f'{timeframe}_aup'
        adown_col = f'{timeframe}_adown'

        df_with_prem = get_prem(df=df, timeframe=timeframe, atr_lookback=atr_lookback, premium_multiplier=premium_multiplier)

        # Use the previous bar's ATR and carry it forward over gaps.
        # Assigning the result back (instead of ffill(inplace=True) on a column
        # selection) keeps this working under pandas copy-on-write.
        df[atr_col] = df_with_prem[atr_col]
        df[atr_col] = df[atr_col].shift().ffill()

        open_range_bars = int(open_range.split('min')[0])
        df[prem_col] = df_with_prem[prem_col]
        df[prem_col] = df[prem_col].shift(open_range_bars)

        df['bar_position'] = get_bar_position(df=df, timeframe=timeframe)
        df[or_high_col] = get_or_range(df=df, column='high', or_bars=open_range_bars)
        df[or_low_col] = get_or_range(df=df, column='low', or_bars=open_range_bars)

        df[aup_col] = (df[or_high_col] + df[prem_col]).ffill()
        df[adown_col] = (df[or_low_col] - df[prem_col]).ffill()

        # The levels are blanked while the opening range is still forming.
        in_opening_range = df['bar_position'] <= open_range_bars
        df[aup_col] = np.where(in_opening_range, np.nan, df[aup_col])
        df[adown_col] = np.where(in_opening_range, np.nan, df[adown_col])

        df = df.drop(['bar_position'], axis = 1)
        # NOTE(review): 'week' / 'month_count' are presumably added by one of the helpers above; confirm.
        if timeframe == 'weekly':
            df = df.drop('week', axis = 1)
        if timeframe == 'monthly':
            df = df.drop('month_count', axis = 1)
        return df
    
    
    @staticmethod
    def rsi(df, params):
        
        df_rsi = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        lookback = params[1]
        lower_bound = params[2]
        upper_bound = params[3]
        
        df_rsi = Indicator.resample_df(df_rsi, resample_period)
    
        df_rsi[f'RSI_{resample_period}_{lookback}'] = talib.RSI(df_rsi['close'], timeperiod=lookback)

        df_rsi[f'RSI_{resample_period}_{lookback}'] = df_rsi[f'RSI_{resample_period}_{lookback}'].shift()

        if resample_period == 'D':
            df = df.merge(
                df_rsi[['date', f'RSI_{resample_period}_{lookback}']], how='left', on='date')
            df = df.ffill()
        else:
            df = df.merge(
                df_rsi[['datetime', f'RSI_{resample_period}_{lookback}']], how='left', on='datetime')
            df = df.ffill()

        condition_list = [(df[f'RSI_{resample_period}_{lookback}'] >= upper_bound), 
                          ((df[f'RSI_{resample_period}_{lookback}'] < upper_bound) & (df[f'RSI_{resample_period}_{lookback}'] > lower_bound)), 
                          (df[f'RSI_{resample_period}_{lookback}'] <= lower_bound)]

        choice_list = [f'RSI_{resample_period}_{lookback}>{upper_bound}', 
                       f'{lower_bound}<RSI_{resample_period}_{lookback}<{upper_bound}', 
                       f'RSI_{resample_period}_{lookback}<{lower_bound}']

        df[f'RSI_{resample_period}_{lookback}_{lower_bound}_{upper_bound}_conditions'] = np.select(condition_list, choice_list)

        return df
    
    @staticmethod
    def get_gap(df, params):
        
        gap_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        print(params)
        resample_period = params[0]
        
        gap_df = Indicator.resample_df(gap_df, resample_period)
        
        condition_list = [(gap_df['open'] >= gap_df['close'].shift()) & (gap_df['open'] <= gap_df['high'].shift()), 
                          (gap_df['open'] < gap_df['close'].shift()) & (gap_df['open'] >= gap_df['low'].shift()), 
                          (gap_df['open'] > gap_df['high'].shift()), 
                          (gap_df['open'] < gap_df['low'].shift())]

        choice_list = [f'gap-up', f'gap-down', f'full-gap-up', f'full-gap-down']
        
        gap_df['gap_conditions'] = np.select(condition_list, choice_list)
        
        df = df.merge(gap_df[['date', 'gap_conditions']], how='left', on='date')
        df = df.ffill()
        
        return df
    
    @staticmethod
    def log_volatility(df, params, factor):
        
        resample_period = params[0]
        lookback = params[1]
        annualize = factor[resample_period]*252
        
        vol_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        vol_df = Indicator.resample_df(vol_df, resample_period)
        
        vol_df[f'log_vol_{resample_period}_{lookback}'] = np.log(vol_df['close']/vol_df['close'].shift()).rolling(window = lookback).std() * math.sqrt(annualize) * 100
        
        vol_df[f'log_vol_{resample_period}_{lookback}'] = vol_df[f'log_vol_{resample_period}_{lookback}'].shift()
        
        if resample_period == 'D':
            df = df.merge(
                df_rsi[['date', f'log_vol_{resample_period}_{lookback}']], how='left', on='date')
            df = df.ffill()
        else:
            df = df.merge(
                df_rsi[['datetime', f'log_vol_{resample_period}_{lookback}']], how='left', on='datetime')
            df = df.ffill()
            
        return df
    
    # vol_compare expects params = {'fast': spec, 'slow': spec}; each spec is {'func_name': Indicator.log_volatility, 'params': [resample_period, lookback], 'annualizing_factor': {'5min': no_of_bars, '1H': no_of_bars, ...}}
    @staticmethod
    def vol_compare(df, params):
        
        vol_conditions1 = params['fast']
        df = vol_conditions1['func_name'](df, vol_conditions1['params'], vol_conditions1['annualizing_factor'])
        
        vol_conditions2 = params['slow']
        df = vol_conditions2['func_name'](df, vol_conditions2['params'], vol_conditions1['annualizing_factor'])
        
        resample_period1 = vol_conditions1['params'][0]
        lookback1 = vol_conditions1['params'][1]       
        resample_period2 = vol_conditions2['params'][0]
        lookback2 = vol_conditions2['params'][1]
        
        conditions = [df[f'log_vol_{resample_period1}_{lookback1}'] >= df[f'log_vol_{resample_period2}_{lookback2}'],
                      df[f'log_vol_{resample_period1}_{lookback1}'] < df[f'log_vol_{resample_period2}_{lookback2}']]
        
        choices = [f'log_vol_{resample_period1}_{lookback1}>=log_vol_{resample_period2}_{lookback2}',
                   f'log_vol_{resample_period1}_{lookback1}<log_vol_{resample_period2}_{lookback2}']
        
        
        df['vol_condition'] = np.select(conditions, choices)
        
        # number of period fast vol below slow vol and number of days fast above slow
        df['days_vol_fast_below_slow'] = np.where(df['vol_condition'] == f'log_vol_{resample_period1}_{lookback1}<log_vol_{resample_period2}_{lookback2}', 1, 0)
        
        for _ in range(1, len(df)):
            df['days_vol_fast_below_slow'].iloc[_] += df['days_vol_fast_below_slow'].iloc[_-1]
            
            if df['days_vol_fast_below_slow'].iloc[_] == df['days_vol_fast_below_slow'].iloc[_-1]:
                df['days_vol_fast_below_slow'].iloc[_] = 0
        
        # number of period fast vol below slow vol and number of days fast above slow
        df['days_vol_fast_above_slow'] = np.where(df['vol_condition'] == f'log_vol_{resample_period1}_{lookback1}>=log_vol_{resample_period2}_{lookback2}', 1, 0)
        
        for _ in range(1, len(df)):
            df['days_vol_fast_above_slow'].iloc[_] += df['days_vol_fast_above_slow'].iloc[_-1]
            
            if df['days_vol_fast_above_slow'].iloc[_] == df['days_vol_fast_above_slow'].iloc[_-1]:
                df['days_vol_fast_above_slow'].iloc[_] = 0
        
        return df
    
    @staticmethod
    def up_down(df, params):
        
        resample_period = params[0]
        lookback = params[1]
        
        up_down_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        up_down_df = Indicator.resample_df(up_down_df, resample_period)
        
        up_down_df['0_prevpattern'] = np.where(up_down_df['close'] >= up_down_df['close'].shift(), 'U', 'D')
        column_list = []
        for _ in range(0, lookback):
            up_down_df[f'{lookback - _}_prevpattern'] = up_down_df['0_prevpattern'].shift(lookback - _)
            column_list.append(f'{lookback - _}_prevpattern')
        
        up_down_df[f'{lookback}_day_pattern'] = up_down_df[column_list].apply(lambda row: ''.join(row.values.astype(str)), axis = 1)
        up_down_df = up_down_df.drop(column_list, axis = 1)
        
        df = df.merge(
                up_down_df[['date', f'{lookback}_day_pattern']], how='left', on='date')
        return df
    
    @staticmethod
    def day_of_week(df):
        
        df['day_of_week'] = df['date'].apply(lambda x: x.strftime("%A"))
        
        return df
    
    @staticmethod
    def max_min_move(df, params):
        
        max_min_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        lookback = params[1]
        
        max_min_df = Indicator.resample_df(max_min_df, resample_period)
        
        max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'] = np.where(abs(np.log(max_min_df['close']/max_min_df['close'].shift())).rolling(window = lookback).max() == abs(np.log(max_min_df['close']/max_min_df['close'].shift())), 'max', '-')
        
        max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'] = np.where((max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'] == 'max') &
                                                        (np.log(max_min_df['close']/max_min_df['close'].shift()) < 0),
                                                        f'min-move_{resample_period}_{lookback}',
                                                        np.where((max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'] == 'max') & (np.log(max_min_df['close']/max_min_df['close'].shift()) > 0),
                                                        f'max-move_{resample_period}_{lookback}', '-'))
        
        max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'] = max_min_df[f'max_min_move_{resample_period}_{lookback}_periods_conditions'].shift()
        
        if resample_period == 'D':
            df = df.merge(
                max_min_df[['date', f'max_min_move_{resample_period}_{lookback}_periods_conditions']], how='left', on='date')
            df = df.ffill()
        else:
            df = df.merge(
                max_min_df[['datetime', f'max_min_move_{resample_period}_{lookback}_periods_conditions']], how='left', on='datetime')
            df = df.ffill()
        
        return df
    
    @staticmethod
    def max_min_move_no_shift(df, params):
        
        max_min_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        lookback = params[1]
        
        max_min_df = Indicator.resample_df(max_min_df, resample_period)
        
        max_min_df[f'max_min_move_{resample_period}_{period}_periods'] = np.where(abs(np.log(max_min_df['close']/max_min_df['close'].shift())).rolling(window = lookback).max() == abs(np.log(max_min_df['close']/max_min_df['close'].shift())), 'max', '-')
        
        max_min_df[f'max_min_move_{resample_period}_{period}_periods'] = np.where((max_min_df[f'max_min_move_{period}_days'] == 'max') &
                                                        (np.log(max_min_df['close']/max_min_df['close'].shift()) < 0),
                                                        f'min-move_{resample_period}_{lookback}',
                                                        np.where((max_min_df[f'max_min_move_{period}_days'] == 'max') & (np.log(max_min_df['close']/max_min_df['close'].shift()) > 0),
                                                        f'max-move_{resample_period}_{lookback}', '-'))

        
        if resample_period == 'D':
            df = df.merge(
                max_min_df[['date', f'max_min_move_{resample_period}_{period}_periods']], how='left', on='date')
            df = df.ffill()
        else:
            df = df.merge(
                max_min_df[['datetime', f'max_min_move_{resample_period}_{period}_periods']], how='left', on='datetime')
            df = df.ffill()
        
        return df
    
    @staticmethod
    def max_min_move_crosssection(df, params):
        
        max_min_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        lookback = params[1]
        
        max_min_df = Indicator.resample_df(max_min_df, resample_period)
        
        if resample_period != 'D':
            
            times_list = Indicator.get_time_list(max_min_df, resample_period)

            for time in times_list:
                temp = max_min_df[max_min_df['time']==time].copy()

                temp[f'max_min_move_{resample_period}_{lookback}_periods_{str(time)}'] = np.where(abs(np.log(temp['close']/temp['open'])).rolling(window = lookback).max() == abs(np.log(temp['close']/temp['open'])), 'max', '')
                temp[f'max_min_move_{resample_period}_{lookback}_periods_{str(time)}'] = np.where((temp[f'max_min_move_{resample_period}_{lookback}_periods_{str(time)}'] == 'max') & (np.log(temp['close']/temp['open']) < 0), f'min-move_{resample_period}_{lookback}', np.where((temp[f'max_min_move_{resample_period}_{lookback}_periods_{str(time)}'] == 'max') & (np.log(temp['close']/temp['open']) > 0), f'max-move_{resample_period}_{lookback}', ''))

                max_min_df = max_min_df.merge(temp[['datetime', f'max_min_move_{resample_period}_{lookback}_periods_{str(time)}']], on='datetime', how='left')
                max_min_df.fillna('', inplace=True)

            column_list = [i for i in max_min_df.columns if i.split(resample_period)[0] == 'max_min_move_']
            max_min_df[f'max_min_{resample_period}_{lookback}'] = max_min_df[column_list].apply(lambda row: ''.join(row.values.astype(str)), axis = 1)
            max_min_df = max_min_df.drop(column_list, axis = 1)
            max_min_df[f'max_min_{resample_period}_{lookback}'] = np.where(max_min_df[f'max_min_{resample_period}_{lookback}'] == '', '-', max_min_df[f'max_min_{resample_period}_{lookback}'])
            
            max_min_df[f'max_min_{resample_period}_{lookback}'] = max_min_df[f'max_min_{resample_period}_{lookback}'].shift()
            
            df = df.merge(
                max_min_df[['datetime', f'max_min_{resample_period}_{lookback}']], how='left', on='datetime')
            df = df.ffill()
            
        else:
            max_min_df = Indicator.day_of_week(max_min_df)
            days_list = max_min_df['day_of_week'].unique()
            days_list = [i for i in max_min_df['day_of_week'].unique() if i not in ['Saturday', 'Sunday']]
            
            max_min_df['day_move'] = np.log(max_min_df['close']/max_min_df['close'].shift())
            
            for day in days_list:
                temp = max_min_df[max_min_df['day_of_week']==day].copy()
                
                temp[f'max_min_move_{resample_period}_{lookback}_periods_{day}'] = np.where(abs(temp['day_move']) == abs(temp['day_move']).rolling(window=lookback).max(), 'max', '')
                temp[f'max_min_move_{resample_period}_{lookback}_periods_{day}'] = np.where((temp[f'max_min_move_{resample_period}_{lookback}_periods']=='max') & (temp['day_move'] > 0), f'max-move_{resample_period}_{lookback}_{day}',
                                                                                     np.where((temp[f'max_min_move_{resample_period}_{lookback}_periods']=='max') & (temp['day_move'] < 0), f'min-move_{resample_period}_{lookback}_{day}', ''))
                
                max_min_df = max_min_df.merge(temp[['date', f'max_min_move_{resample_period}_{lookback}_periods_{day}']], on='date', how='left')
            
            column_list = [i for i in max_min_df.columns if i.split(resample_period)[0] == 'max_min_move_']
            max_min_df[f'max_min_{resample_period}_{lookback}'] = max_min_df[column_list].apply(lambda row: ''.join(row.values.astype(str)), axis = 1)
            max_min_df = max_min_df.drop(column_list, axis = 1)
            max_min_df[f'max_min_{resample_period}_{lookback}'] = np.where(max_min_df[f'max_min_{resample_period}_{lookback}'] == '', '-', max_min_df[f'max_min_{resample_period}_{lookback}'])
            
            max_min_df[f'max_min_{resample_period}_{lookback}'] = max_min_df[f'max_min_{resample_period}_{lookback}'].shift()
            
            df = df.merge(
                max_min_df[['date', f'max_min_{resample_period}_{lookback}']], how='left', on='date')
        
        return df
    
    @staticmethod
    def get_opex_column(df):
        
        df_expiry = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close', 'expiry_date']].copy()
        df_expiry = Indicator.resample_df(df_expiry, 'D', expiry='True')
        
        conditions = [df_expiry['date'] == df_expiry['expiry_date'],
         df_expiry['date'].shift(-1) == df_expiry['expiry_date'],
         (df_expiry['date'].shift(-2) == df_expiry['expiry_date']),
         (df_expiry['date'].shift(-3) == df_expiry['expiry_date']),
         (df_expiry['date'].shift(-4) == df_expiry['expiry_date'])]

        choices = ['opex', 'opex1', 'opex2', 'opex3', 'opex4']

        df_expiry['opex'] = np.select(conditions, choices)
        
        df = df.merge(df_expiry[['date','opex']], on='date', how='left')
        df = df.ffill()
        return df
    
    @staticmethod
    def first_max_min_move_expiry(df, params):
        
        df = Indicator.get_opex_column(df)
        
        resample_period = params[0]
        lookback = params[1]
        
        max_min_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close', 'opex']].copy()     
        max_min_df = Indicator.resample_df(max_min_df, resample_period, opex=True)
        
        max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] = np.where(abs(np.log(max_min_df['close']/max_min_df['close'].shift())).rolling(window = lookback).max() == abs(np.log(max_min_df['close']/max_min_df['close'].shift())), 'max', '-')
        
        max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] = np.where((max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == 'max') &
                                                        (np.log(max_min_df['close']/max_min_df['close'].shift()) < 0),
                                                        f'min-move_{resample_period}_{lookback}',
                                                        np.where((max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == 'max') & (np.log(max_min_df['close']/max_min_df['close'].shift()) > 0),
                                                        f'max-move_{resample_period}_{lookback}', '-'))
        
        if resample_period != 'D':
            max_min_df['first_min_move'] = np.where((max_min_df['opex'].shift()=='opex')&(max_min_df['opex']!='opex'), 1, None)
            max_min_df['first_min_move'] = np.where(max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == f'min-move_{resample_period}_{lookback}', f'min-move_{resample_period}_{lookback}', max_min_df['first_min_move'])
            max_min_df.loc[:,'first_min_move'] = max_min_df.loc[:,'first_min_move'].ffill()
            max_min_df['first_min_move'] = np.where((max_min_df['first_min_move'].shift()==1)&(max_min_df['first_min_move']==f'min-move_{resample_period}_{lookback}'),f'min-move_{resample_period}_{lookback}','')
            
            max_min_df['first_max_move'] = np.where((max_min_df['opex'].shift()=='opex')&(max_min_df['opex']!='opex'), 1, None)
            max_min_df['first_max_move'] = np.where(max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == f'max-move_{resample_period}_{lookback}', f'max-move_{resample_period}_{lookback}', max_min_df['first_max_move'])
            max_min_df.loc[:,'first_max_move'] = max_min_df.loc[:,'first_max_move'].ffill()
            max_min_df['first_max_move'] = np.where((max_min_df['first_max_move'].shift()==1)&(max_min_df['first_max_move']==f'max-move_{resample_period}_{lookback}'),f'max-move_{resample_period}_{lookback}','')
            
            max_min_df[f'first_max_min_move_{resample_period}_{lookback}_periods_conditions'] = max_min_df[['first_min_move', 'first_max_move']].apply(lambda row: ''.join(row.values.astype(str)), axis = 1)
        else:
            max_min_df['first_min_move'] = np.where((max_min_df['opex'].shift()=='opex')&(max_min_df['opex']!='opex'), 1, None)
            max_min_df['first_min_move'] = np.where(max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == f'min-move_{resample_period}_{lookback}', f'min-move_{resample_period}_{lookback}', max_min_df['first_min_move'])
            max_min_df.loc[:,'first_min_move'] = max_min_df.loc[:,'first_min_move'].ffill()
            conditions = [(max_min_df['first_min_move'] == f'min-move_{resample_period}_{lookback}') & (max_min_df['opex'].shift()=='opex'),
                          (max_min_df['first_min_move'] == f'min-move_{resample_period}_{lookback}') & (max_min_df['first_min_move'].shift()==1)]
            choices = [f'min-move_{resample_period}_{lookback}', f'min-move_{resample_period}_{lookback}']
            max_min_df['first_min_move'] = np.select(conditions, choices)
            max_min_df['first_min_move'] = np.where(max_min_df['first_min_move'] == '0', '', max_min_df['first_min_move'])
            
            
            max_min_df['first_max_move'] = np.where((max_min_df['opex'].shift()=='opex')&(max_min_df['opex']!='opex'), 1, None)
            max_min_df['first_max_move'] = np.where(max_min_df[f'max_min_move_{resample_period}_{lookback}_periods'] == f'max-move_{resample_period}_{lookback}', f'max-move_{resample_period}_{lookback}', max_min_df['first_max_move'])
            max_min_df.loc[:,'first_max_move'] = max_min_df.loc[:,'first_max_move'].ffill()
            conditions = [(max_min_df['first_max_move'] == f'max-move_{resample_period}_{lookback}') & (max_min_df['opex'].shift()=='opex'),
                          (max_min_df['first_max_move'] == f'max-move_{resample_period}_{lookback}') & (max_min_df['first_max_move'].shift()==1)]
            choices = [f'max-move_{resample_period}_{lookback}', f'max-move_{resample_period}_{lookback}']
            max_min_df['first_max_move'] = np.select(conditions, choices)
            max_min_df['first_max_move'] = np.where(max_min_df['first_max_move'] == '0', '', max_min_df['first_max_move'])
            
            max_min_df[f'first_max_min_move_{resample_period}_{lookback}_periods_conditions'] = max_min_df[['first_min_move', 'first_max_move']].apply(lambda row: ''.join(row.values.astype(str)), axis = 1)
        
        
        max_min_df[f'first_max_min_move_{resample_period}_{lookback}_periods_conditions'] = max_min_df[f'first_max_min_move_{resample_period}_{lookback}_periods_conditions'].shift()
        
        if resample_period == 'D':
            df = df.merge(
                max_min_df[['date', f'first_max_min_move_{resample_period}_{lookback}_periods_conditions']], how='left', on='date')
            df = df.ffill()
        else:
            df = df.merge(
                max_min_df[['datetime', f'first_max_min_move_{resample_period}_{lookback}_periods_conditions']], how='left', on='datetime')
            df = df.ffill()
        
        return df
    
    @staticmethod
    def pdh_pdl(df, params):
        
        pdh_pdl_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        lookback = params[1]
        
        pdh_pdl_df = Indicator.resample_df(pdh_pdl_df, resample_period)
        
        pdh_pdl_df[f'rolling_high_{lookback}'] = pdh_pdl_df['high'].rolling(window=lookback).max().shift()
        pdh_pdl_df[f'rolling_low_{lookback}'] = pdh_pdl_df['low'].rolling(window=lookback).min().shift()
        
        df = df.merge(pdh_pdl_df[['date', f'rolling_high_{lookback}', f'rolling_low_{lookback}']], on='date', how='left')
        
        conditions = [df['close']>df[f'rolling_high_{lookback}'], df['close']<df[f'rolling_low_{lookback}'],
                      (df['close']<=df[f'rolling_high_{lookback}']) & (df['close']>=df[f'rolling_low_{lookback}'])]
        choices = [f'above_high_{lookback}', f'below_low_{lookback}', f'between_high_low_{lookback}']
        
        df[f'above_below_high_low_{lookback}_conditions'] = np.select(conditions, choices)
        
        print('Done with pdh, pdl')
        return df
    
    @staticmethod
    def get_orh_orl(df, resample_period):
        
        df = Indicator.resample_df(df, resample_period)
        
        df[f'orh_{resample_period}'] = np.where(df['time']==datetime.time(9,15), df['high'], np.nan)
        df[f'orl_{resample_period}'] = np.where(df['time']==datetime.time(9,15), df['low'], np.nan)
        
        df = df.ffill()
        df['datetime'] = np.where(df['time'] == datetime.time(9, 15), df[['date', 'time']].apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis = 1), df['datetime'])
        
        return df
        
        
    @staticmethod
    def above_orh_below_orl(df, params):
        
        orh_orl_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        
        orh_orl_df = Indicator.get_orh_orl(orh_orl_df, resample_period)
        
        df = df.merge(orh_orl_df[['datetime', f'orh_{resample_period}', f'orl_{resample_period}']], on='datetime', how='left')        
        df = df.ffill()

        conditions = [df['close']>df[f'orh_{resample_period}'],
                      df['close']<df[f'orl_{resample_period}'],
                      (df['close']<=df[f'orh_{resample_period}'])&(df['close']>=df[f'orl_{resample_period}'])]
        choices = [f'above_orh_{resample_period}',
                   f'below_orl_{resample_period}',
                   f'between_orh_orl_{resample_period}']
        
        df[f'orh_orl_{resample_period}_conditions'] = np.select(conditions, choices)
        
        print('Done with orh, orl')
        return df       
        
    @staticmethod
    def number_above_orh_below_orl(df, params):
        
        orh_orl_df = df[['date', 'datetime', 'time', 'open', 'high', 'low', 'close']].copy()
        
        resample_period = params[0]
        high_greater_threshold = params[1][0]
        high_lower_threshold = params[1][1]
        low_greater_threshold = params[2][0]
        low_lower_threshold = params[2][1]
        
        orh_orl_df = Indicator.get_orh_orl(orh_orl_df, resample_period)
        
        df = df.merge(orh_orl_df[['datetime', f'orh_{resample_period}', f'orl_{resample_period}']], on='datetime', how='left')
        df = df.ffill()
        
        df['above_orh'] = np.where(df['close']>df[f'orh_{resample_period}'], 1, 0)
        df['below_orl'] = np.where(df['close']<df[f'orl_{resample_period}'], 1, 0)
        
        df['above_orh'] = df.groupby('date')['above_orh'].apply(lambda x: x.cumsum())
        df['below_orl'] = df.groupby('date')['below_orl'].apply(lambda x: x.cumsum())
        
        df.drop([f'orh_{resample_period}', f'orl_{resample_period}'], axis = 1, inplace = True)
        df[f'above_orh_greater_number_{resample_period}_conditions'] = np.where(df['above_orh'] >= high_greater_threshold, f'close>orh_above_{high_greater_threshold}_bars', '-')
        df[f'above_orh_lesser_number_{resample_period}_conditions'] = np.where(df['above_orh'] <= high_lower_threshold, f'close>orh_below_{high_lower_threshold}_bars', '-')
        
        df[f'below_orl_greater_number_{resample_period}_conditions'] = np.where(df['below_orl'] >= low_greater_threshold, f'close<orl_above_{low_greater_threshold}_bars', '-')
        df[f'below_orl_lesser_number_{resample_period}_conditions'] = np.where(df['below_orl'] <= low_lower_threshold, f'close<orl_below_{low_lower_threshold}_bars', '-')
        
        print('Done with number of bars above orh and below orl')
        return df
        

In [3]:
class DataMining:
    """Class-level (static) pipeline from spot bars to a per-condition PnL table.

    All state lives on the class itself. Typical call order:
    ``load_data`` -> ``feature_handler`` -> ``set_entry_exit`` ->
    ``entry_exit_pnl`` -> ``datamining_metrics``; the result is left in
    ``DataMining.datamining_table``.
    """

    # [holding_period, entry_time, exit_time], stored by set_entry_exit().
    holding_entry_exit_list = []
    # Stored by set_time_based_entry(); not read anywhere inside this class.
    timebasedentry = None
    # Bars frame; the methods below read its 'date', 'datetime', 'close' and
    # '*_conditions' columns.
    spot_df = None
    # File-type key ('csv' / 'pkl' / 'parquet') -> reader function.
    read_path = {}

    # 0 -> intraday builder, 1 -> positional builder.
    pnl_df = {}

    # One row per trade; written by entry_exit_pnl().
    df_pnl = pd.DataFrame()

    dates_list = []

    # One row per strategy combination; written by datamining_metrics().
    datamining_table = pd.DataFrame()

    @staticmethod
    def get_read_path_dict():
        """Register the reader function for each supported file type."""
        DataMining.read_path['csv'] = DataMining.read_csv
        DataMining.read_path['pkl'] = DataMining.read_pickle
        DataMining.read_path['parquet'] = DataMining.read_parquet

    @staticmethod
    def get_pnl_df_dict():
        """Register the trade-frame builders under keys 0 (intraday) and 1 (positional)."""
        DataMining.pnl_df[0] = DataMining.get_intraday_pnl_df
        DataMining.pnl_df[1] = DataMining.get_positional_pnl_df

    @staticmethod
    def read_csv(path):
        """Read ``path`` with ``pandas.read_csv``."""
        return pd.read_csv(path)

    @staticmethod
    def read_pickle(path):
        """Read ``path`` with ``pandas.read_pickle``."""
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        return pd.read_pickle(path)

    @staticmethod
    def read_parquet(path):
        """Read ``path`` with ``pandas.read_parquet``."""
        return pd.read_parquet(path)

    @staticmethod
    def _reader_for(path_type):
        """Return the reader for ``path_type``, registering the readers on first use.

        Raises KeyError for an unknown ``path_type``.
        """
        if path_type not in DataMining.read_path:
            DataMining.get_read_path_dict()
        return DataMining.read_path[path_type]

    @staticmethod
    def set_entry_exit(holding_entry_exit):
        """Store ``[holding_period, entry_time, exit_time]`` for entry_exit_pnl()."""
        DataMining.holding_entry_exit_list = holding_entry_exit

    @staticmethod
    def set_time_based_entry(timebasedentry):
        """Store the ``timebasedentry`` setting on the class."""
        DataMining.timebasedentry = timebasedentry

    @staticmethod
    def load_data(path, path_type):
        """Load the file at ``path`` into ``DataMining.spot_df``."""
        DataMining.spot_df = DataMining._reader_for(path_type)(path)

    @staticmethod
    def get_current_expiry_dates(path, path_type):
        """Left-merge the frame at ``path`` onto ``spot_df`` by 'date', then drop rows with any NaN."""
        df = DataMining._reader_for(path_type)(path)
        merged = DataMining.spot_df.merge(df, on='date', how='left')
        DataMining.spot_df = merged.dropna()

    @staticmethod
    def get_dates_list():
        """Refresh ``dates_list`` with the unique 'date' values of ``spot_df``, in order of appearance."""
        DataMining.dates_list = list(DataMining.spot_df['date'].unique())

    @staticmethod
    def feature_handler(indicator_dict):
        """Apply each indicator to ``spot_df`` in dict order.

        Every value must provide a 'function' taking ``(df, params)`` and the
        'params' to pass; the returned frame replaces ``spot_df``.
        """
        for indicator in indicator_dict.values():
            DataMining.spot_df = indicator['function'](DataMining.spot_df, indicator['params'])

    @staticmethod
    def get_entry_exit_df(holding_period=0):
        """Pair every entry date with the date ``holding_period`` rows later.

        Returns a frame with 'entry_date' and 'exit_date'. Entry dates that
        have no exit date inside the data are dropped.
        """
        DataMining.get_dates_list()

        entry_dates_list = DataMining.dates_list
        if holding_period == 0:
            exit_dates_list = entry_dates_list
        else:
            # Slicing copies, so dates_list itself is never mutated. Pad with
            # NaN so both columns have equal length; dropna removes the tail.
            exit_dates_list = entry_dates_list[holding_period:] + [np.nan] * holding_period

        df = pd.DataFrame({'entry_date': entry_dates_list, 'exit_date': exit_dates_list})
        return df.dropna()

    @staticmethod
    def _build_pnl_df(holding_period, entry_time, exit_time):
        """Build the trade frame shared by the intraday and positional builders."""
        df_pnl = DataMining.get_entry_exit_df(holding_period)
        df_pnl['entry_time'] = entry_time
        df_pnl['exit_time'] = exit_time

        for side in ('entry', 'exit'):
            date_col, time_col = f'{side}_date', f'{side}_time'
            df_pnl[f'{side}_datetime'] = df_pnl[[date_col, time_col]].apply(
                lambda row: datetime.datetime.combine(row[date_col], row[time_col]), axis=1)

        return df_pnl

    @staticmethod
    def get_intraday_pnl_df(holding_period, entry_time, exit_time):
        """Return entry/exit dates, times and combined datetimes for intraday trades."""
        return DataMining._build_pnl_df(holding_period, entry_time, exit_time)

    @staticmethod
    def get_positional_pnl_df(holding_period, entry_time, exit_time):
        """Return entry/exit dates, times and combined datetimes for positional trades."""
        return DataMining._build_pnl_df(holding_period, entry_time, exit_time)

    @staticmethod
    def merge_spot_df(df):
        """Attach entry close, exit close and entry-time condition columns to ``df``.

        ``df`` must have 'entry_datetime' and 'exit_datetime'. All three
        lookups are left merges against ``spot_df['datetime']``, so unmatched
        rows come back with NaN.
        """
        spot = DataMining.spot_df

        entry_close = spot[['datetime', 'close']].rename(
            columns={'datetime': 'entry_datetime', 'close': 'entry_close'})
        df = df.merge(entry_close, on='entry_datetime', how='left')

        exit_close = spot[['datetime', 'close']].rename(
            columns={'datetime': 'exit_datetime', 'close': 'exit_close'})
        df = df.merge(exit_close, on='exit_datetime', how='left')

        # Only columns whose last '_'-separated token is exactly 'conditions'.
        condition_columns = [c for c in spot.columns if c.split('_')[-1] == 'conditions']
        conditions = spot[condition_columns + ['datetime']].rename(columns={'datetime': 'entry_datetime'})
        df = df.merge(conditions, on='entry_datetime', how='left')

        return df

    @staticmethod
    def entry_exit_pnl():
        """Build ``DataMining.df_pnl``: one row per trade with its pnl and conditions.

        Reads ``holding_entry_exit_list``. ``pnl`` is exit close minus entry
        close; rows with any missing value after the merges are dropped.
        """
        holding_period = DataMining.holding_entry_exit_list[0]
        entry_time = DataMining.holding_entry_exit_list[1]
        exit_time = DataMining.holding_entry_exit_list[2]

        # Register the builders on first use so call order cannot raise KeyError.
        if not DataMining.pnl_df:
            DataMining.get_pnl_df_dict()

        # Key 0 selects the intraday builder, key 1 the positional one.
        df = DataMining.pnl_df[int(holding_period > 0)](holding_period, entry_time, exit_time)

        # After getting the df with entry and exit datetime, we have to get the
        # corresponding close and the condition at the entry time.
        df = DataMining.merge_spot_df(df)
        df = df.dropna()

        df['pnl'] = df['exit_close'] - df['entry_close']

        df['entry_exit_conditions'] = str(entry_time.strftime("%H:%M")) + '-' + str(exit_time.strftime("%H:%M")) + '-' + str(holding_period)

        DataMining.df_pnl = df

    @staticmethod
    def datamining_metrics():
        """Summarise ``df_pnl`` per strategy combination into ``datamining_table``.

        A strategy combination is the '-'-joined values of every column whose
        last '_'-separated token contains 'conditions'. That key is also
        written to ``df_pnl['strategy_combination']``. A combination whose
        rounded mean pnl is negative is labelled 'short' and its pnl is
        negated before the statistics are computed; otherwise it is 'long'.
        'p-value' is the two-sided one-sample t-test p-value halved.
        """
        pnl_frame = DataMining.df_pnl
        condition_column_list = [c for c in pnl_frame.columns if 'conditions' in c.split('_')[-1]]

        pnl_frame['strategy_combination'] = pnl_frame[condition_column_list].apply(
            lambda row: '-'.join(row.values.astype(str)), axis=1)

        unique_strategies = pnl_frame['strategy_combination'].unique()

        columns = ['action', 'number_of_trades', 'win_percentage', 'average', 'median', 'max', 'min', 'average_profit', 'std_dev',
                   't-test', 'net_profit', 'p-value', 'gross_profit', 'gross_loss', 'profit_factor', 'outlier_adjusted_profit_factor']

        records = []
        for strategy in unique_strategies:
            # Select this strategy's trades once instead of re-filtering per metric.
            pnl = pnl_frame.loc[pnl_frame['strategy_combination'] == strategy, 'pnl']

            if round(pnl.mean(), 2) < 0:
                action = 'short'
                pnl = -1 * pnl
            else:
                action = 'long'

            wins = pnl[pnl > 0]
            losses = pnl[pnl < 0]
            ttest = stats.ttest_1samp(pnl, 0)

            records.append({
                'action': action,
                'number_of_trades': len(pnl),
                'win_percentage': round(len(wins) * 100 / len(pnl), 2),
                'average': round(pnl.mean(), 2),
                'median': round(pnl.median(), 2),
                'max': round(pnl.max(), 2),
                'min': round(pnl.min(), 2),
                'average_profit': round(wins.mean(), 2),
                'std_dev': round(pnl.std(), 2),
                't-test': round(ttest[0], 2),
                'net_profit': round(pnl.sum(), 2),
                'p-value': round(ttest[1] / 2, 2),
                'gross_profit': round(wins.sum(), 2),
                'gross_loss': round(losses.sum(), 2),
            })

        # Build the table in one step. The old per-cell chained assignment
        # (table[col].loc[row] = v) writes to a temporary under copy-on-write.
        table = pd.DataFrame(records, index=unique_strategies, columns=columns)

        # A zero gross_loss yields inf / NaN here rather than raising.
        table['profit_factor'] = (table['gross_profit'] / table['gross_loss']).abs().round(2)

        # Same ratio with the single largest trade removed from gross profit.
        table['outlier_adjusted_profit_factor'] = (
            (table['gross_profit'] - table['max']) / table['gross_loss']).abs().round(2)

        DataMining.datamining_table = table

In [11]:
# Load the pickle at `path` into DataMining.spot_df.
# path = 'banknifty_spot_data.pkl'
path = 'BANKNIFTY.pkl'
path_type = 'pkl'
# Register the file readers so load_data() can look up the one for 'pkl'.
DataMining.get_read_path_dict()
DataMining.load_data(path, path_type)
# Disabled: merge an expiry-date frame onto spot_df by 'date'.
# expiry_path = 'banknifty_front_expiry.pkl'
# DataMining.get_current_expiry_dates(expiry_path, 'pkl')

In [None]:
# Gap indicator: function plus the params list handed to it by feature_handler.
gap_indicator_params = {'function': Indicator.get_gap, 'params': ['D']}
indicator_dict = {'gap': gap_indicator_params}

In [None]:
# Empty the indicator registry in place before configuring a different set.
indicator_dict.clear()

In [None]:
# Max/min move indicator: function plus the params list handed to it by feature_handler.
max_min_indicator_params = {'function': Indicator.max_min_move, 'params': ['30Min', 5]}
indicator_dict = {'max_min': max_min_indicator_params}

In [12]:
# Each entry pairs an Indicator function with the params list it receives.
orh_orl_number_indicator_params = {
    'function': Indicator.number_above_orh_below_orl,
    'params': ['1H', [30, 5], [30, 5]],
}

orh_orl_indicator_params = {
    'function': Indicator.above_orh_below_orl,
    'params': ['1H'],
}

pdh_pdl_indicator_params = {
    'function': Indicator.pdh_pdl,
    'params': ['D', 1],
}

# Insertion order is the order in which feature_handler applies the indicators.
indicator_dict = {
    'orh_orl_number': orh_orl_number_indicator_params,
    'orh_orl': orh_orl_indicator_params,
    'pdh_pdl': pdh_pdl_indicator_params,
}

In [13]:
# Run every configured indicator; each call replaces DataMining.spot_df with the
# frame the indicator returns.
# NOTE(review): re-running this cell applies the indicators again to the
# already-augmented spot_df - reload the data first if a clean run is needed.
DataMining.feature_handler(indicator_dict)

Done with number of bars above orh and below orl
Done with orh, orl
Done with pdh, pdl


In [14]:
# [holding_period, entry_time, exit_time]: enter at 15:15 and exit at 09:20 on
# the date one row later in the list of unique dates.
holding_entry_exit = [1, datetime.time(15, 15), datetime.time(9, 20)]
DataMining.set_entry_exit(holding_entry_exit)
# Register the trade-frame builders that entry_exit_pnl() selects from.
DataMining.get_pnl_df_dict()
# Builds DataMining.df_pnl (one row per trade, pnl = exit close - entry close).
DataMining.entry_exit_pnl()

In [15]:
# Summarise df_pnl per strategy combination, then display the resulting table.
DataMining.datamining_metrics()
DataMining.datamining_table

Unnamed: 0,action,number_of_trades,win_percentage,average,median,max,min,average_profit,std_dev,t-test,net_profit,p-value,gross_profit,gross_loss,profit_factor,outlier_adjusted_profit_factor
--close>orh_below_5_bars-close<orl_above_30_bars---below_orl_1H-0-15:15-09:20-1,short,1,100.0,87.9,87.9,87.9,87.9,87.9,,,87.9,,87.9,0.0,inf,
close>orh_above_30_bars-----close<orl_below_5_bars-above_orh_1H-between_high_low_1-15:15-09:20-1,long,127,66.14,62.61,71.25,874.3,-1521.85,161.91,232.0,3.04,7950.85,0.0,13600.05,-5649.2,2.41,2.25
----close<orl_above_30_bars---below_orl_1H-between_high_low_1-15:15-09:20-1,short,18,50.0,5.41,-1.15,459.7,-473.7,183.28,248.95,0.09,97.4,0.46,1649.5,-1552.1,1.06,0.77
--close>orh_below_5_bars-close<orl_above_30_bars---below_orl_1H-below_low_1-15:15-09:20-1,short,283,50.18,19.02,0.05,2313.75,-1036.85,184.71,275.7,1.16,5383.55,0.12,26229.1,-20845.55,1.26,1.15
close>orh_above_30_bars-----close<orl_below_5_bars-between_orh_orl_1H-between_high_low_1-15:15-09:20-1,short,86,43.02,5.5,-13.07,1404.4,-768.1,181.62,255.65,0.2,472.65,0.42,6719.95,-6247.3,1.08,0.85
close>orh_above_30_bars-----close<orl_below_5_bars-above_orh_1H-above_high_1-15:15-09:20-1,long,300,66.0,58.68,62.57,1290.35,-1502.2,167.19,246.98,4.11,17602.65,0.0,33102.8,-15500.15,2.14,2.05
--close>orh_below_5_bars---close<orl_below_5_bars-between_orh_orl_1H-above_high_1-15:15-09:20-1,long,28,64.29,47.25,41.88,343.45,-548.1,122.99,165.23,1.51,1323.1,0.07,2213.85,-890.75,2.49,2.1
--close>orh_below_5_bars-close<orl_above_30_bars---below_orl_1H-above_high_1-15:15-09:20-1,short,8,62.5,116.12,13.3,1018.9,-103.3,230.55,369.82,0.89,928.95,0.2,1152.75,-223.8,5.15,0.6
close>orh_above_30_bars-------between_orh_orl_1H-between_high_low_1-15:15-09:20-1,short,16,43.75,31.96,-14.1,548.8,-197.0,181.62,182.3,0.7,511.3,0.25,1271.35,-760.05,1.67,0.95
------close<orl_below_5_bars-above_orh_1H-between_high_low_1-15:15-09:20-1,long,11,54.55,88.46,83.05,517.15,-243.5,224.09,199.79,1.47,973.05,0.09,1344.55,-371.5,3.62,2.23
