# Notebook Instructions
<i>You can run the notebook document sequentially (one cell at a time) by pressing <b> shift + enter</b>. While a cell is running, a [*] will display on the left. When it has been run, a number will display indicating the order in which it was run in the notebook [8].</i>

<i>Enter edit mode by pressing <b>`Enter`</b> or using the mouse to click on a cell's editor area. Edit mode is indicated by a green cell border and a prompt showing in the editor area.</i>

# Hyperparameter tuning

Hyperparameters cannot be learned by the model; they must be specified by the user before training. In this notebook, we will find the best hyperparameters for the random forest model created in the previous section, using random search and grid search cross-validation techniques.

Let's start with the steps below, which you already know!
1. Import the data
2. Define predictor variables and a target variable
3. Split the data into train and test dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV ,GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
import pickle
import talib as ta

# Load the raw price data from a CSV file located in the working directory.
data_total = pd.read_csv('ICICIBANK19JANFUT.csv')

In [None]:
# Discard the first 1000 rows, then plot the close column.
# NOTE: this reassigns data_total from itself, so re-running the cell drops
# another 1000 rows each time.
data_total=data_total[1000:]
data_total.close.plot(figsize=(15,7))

In [None]:
# Compute the 30-period Average True Range from the high/low/close columns
# and plot it.
atr=ta.ATR(data_total.high,data_total.low,data_total.close,timeperiod=30)
#ret=data_total.close.pct_change().shift(-1)
#atr[ret.abs()>0.005]=0
atr.plot(figsize=(15,7))

In [None]:
# Preview the first rows of the trimmed dataset.
data_total.head()

In [None]:
# List the names of the function groups that TA-Lib exposes.
ta.get_function_groups().keys()

### Function to create the features and train-test datasets

In [None]:
def create_features(data):
    """Build the predictor matrix and the three-class target from price bars.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``open``, ``high``, ``low``, ``close`` and ``volume``
        columns. The frame is copied; the caller's object is not modified.

    Returns
    -------
    X : pandas.DataFrame
        The engineered feature columns only (columns already present in
        ``data`` are excluded), with every row containing a NaN removed.
        Columns are in alphabetical order so the layout is the same in every
        Python session; the previous set-based ordering could change between
        sessions and silently mismatch a model reloaded from a pickle.
    y : pandas.Series
        The ``retFut1`` label for the rows of ``X``: -1 when the close is
        above both the trailing and the upcoming ``k``-bar mean close, 1 when
        it is below both, and 0 otherwise.
    """
    data = data.copy()
    original_columns = set(data.columns)
    windows = (5, 10, 20, 40)

    # Returns of close, low and high over several look-back lengths
    for lag in (1, 2, 5, 20, 30):
        data['ret{}'.format(lag)] = data.close.pct_change(lag)
    for lag in (1, 2, 5):
        data['retl{}'.format(lag)] = data.low.pct_change(lag)
        data['reth{}'.format(lag)] = data.high.pct_change(lag)

    # Position of the close relative to the bar's high and low
    data['vel1'] = 2 * data.close - data.high - data.low

    for window in windows:
        # Rolling sum and standard deviation of one-bar returns
        data['retr{}'.format(window)] = data.ret1.rolling(window).sum()
        data['std{}'.format(window)] = data.ret1.rolling(window).std()

        # Rolling sum and standard deviation of vel1
        data['vel{}'.format(window)] = data.vel1.rolling(window).sum()
        data['stdv{}'.format(window)] = data.vel1.rolling(window).std()

        # Rolling standard deviation and sum of volume. The 5-bar volume std
        # used to be stored under 'stdv5', overwriting the vel1 feature of
        # the same name; it now gets its own 'stdvv5' column like its
        # siblings.
        data['stdvv{}'.format(window)] = data.volume.rolling(window).std()
        data['vols{}'.format(window)] = data.volume.rolling(window).sum()

    # ADDED volume profile and acc, this reduced low vol performance but
    # increased the high vol performance
    for lag in (1,) + windows:
        data['vol{}'.format(lag)] = data.volume.diff(lag)
        data['acc{}'.format(lag)] = data.vel1.diff(lag)

    # Candlestick Patterns
    data['HAMMER'] = ta.CDLHAMMER(data.open, data.high, data.low, data.close)
    data['DOJI'] = ta.CDLDOJI(data.open, data.high, data.low, data.close)
    data['SHOOTINGSTAR'] = ta.CDLSHOOTINGSTAR(data.open, data.high, data.low, data.close)

    # Technical Indicators (30-period look-back)
    data['AROONOSC'] = ta.AROONOSC(data.high, data.low, timeperiod=30)
    data['RSI'] = ta.RSI(data.close, timeperiod=30)
    data['ADXR'] = ta.ADXR(data.high, data.low, data.close, timeperiod=30)
    # Only the 5-period ATR ever reached the model: a 30-period ATR used to be
    # written to this same column and was overwritten on the very next line.
    data['ATR'] = ta.ATR(data.high, data.low, data.close, timeperiod=5)

    # Target: compare the close with the mean close of the previous k bars
    # and of the next k bars.
    k = 30
    trailing_mean = data.close.rolling(k).mean()
    upcoming_mean = data.close.shift(-k).rolling(k).mean()
    above_both = (data.close > trailing_mean) & (data.close > upcoming_mean)
    below_both = (data.close < trailing_mean) & (data.close < upcoming_mean)
    # NOTE(review): for the last k rows the upcoming mean is NaN, so both
    # comparisons are False and the label is 0. Those rows survive dropna()
    # (their features are what gets predicted on), but their labels carry no
    # information when the same frame is used for training.
    data['retFut1'] = np.where(above_both, -1, np.where(below_both, 1, 0))

    engineered_columns = set(data.columns) - original_columns

    # Define predictor variables (X) and a target variable (y)
    data = data.dropna()
    predictor_list = sorted(engineered_columns - {'retFut1'})
    X = data[predictor_list]
    y = data['retFut1']
    return X, y

In [None]:
# Build features and labels on the full dataset.
X,y=create_features(data_total)

In [None]:
# Show the (rows, columns) shape of the feature matrix.
X.shape

In [None]:
# Number of rows create_features discarded (raw row count minus rows in X).
min_len=len(data_total)-len(X)
# Size of the execution (hold-out) window, in rows.
len_exec=375*1
# Extra rows added to the hold-out slice; currently none.
perf=0
# Display the three values.
min_len, len_exec, perf

In [None]:
# Hold out the tail of the dataset: len_exec + perf rows, preceded by min_len
# additional rows so that features can still be computed for the hold-out
# window after create_features drops its NaN rows.
data_exe=data_total[-(min_len+perf+len_exec):].copy()
# Everything before the hold-out tail is used for training.
data=data_total[:-(min_len+perf+len_exec)].copy()

# Rebuild features and labels on the training portion only.
X,y=create_features(data)

In [None]:
# Show the shape of the training feature matrix.
X.shape

In [None]:
# Preview the last rows of the hold-out slice.
data_exe.tail()

The key hyperparameters of the random forest method are
- n_estimators,
- max_features, 
- max_depth, 
- min_samples_leaf, 
- and bootstrap.   

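Below, we define a range of values for `max_features` and `min_samples_leaf`. In this notebook `n_estimators` and `bootstrap` are held fixed (100 and True respectively) and `max_depth` is left at its default.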

### Create a hyperparameter grid

In [None]:
def create_dt_hp_grid():
    """Return the candidate hyperparameter values for the random search.

    Returns
    -------
    dict
        ``'max_features'``: 100 evenly spaced fractions from 0.1 to 1.0,
        each rounded to two decimals.
        ``'min_samples_leaf'``: 50 evenly spaced points from 1 to 51,
        truncated to integers.
    """
    # Fraction of features to consider at every split
    feature_fractions = np.linspace(start=0.1, stop=1.0, num=100)

    # Minimum number of samples required at each leaf node
    leaf_sizes = np.linspace(start=1, stop=51, num=50)

    # Collect the candidates in a dictionary keyed by parameter name
    return {
        'max_features': [round(fraction, 2) for fraction in feature_fractions],
        'min_samples_leaf': [int(size) for size in leaf_sizes],
    }

## Random Search
The `RandomizedSearchCV` class from the `sklearn.model_selection` module is used to find the best hyperparameter values.

In [None]:
# Candidate hyperparameter values that the random search samples from.
param_grid=create_dt_hp_grid()
# Create the base model to tune
def perform_RSCV(param_grid,X_train, y_train):
    """Run a randomized hyperparameter search for a random forest classifier.

    Parameters
    ----------
    param_grid : dict
        Maps parameter names to the list of values to sample from.
    X_train, y_train
        Training features and labels, passed straight to ``fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, i.e. the best sampled
        combination.
    """
    # Base model to tune: 100 bootstrapped trees with a fixed seed.
    random_forest = RandomForestClassifier(n_estimators=100,
                                           random_state=42,
                                           bootstrap=True)
    # The former ``iid=False`` argument is omitted: scikit-learn deprecated
    # it in 0.22 and removed it in 0.24, where passing it raises TypeError.
    # Current releases always average the score over folds, which is what
    # ``iid=False`` requested.
    rf_random = RandomizedSearchCV(estimator=random_forest,
                                   param_distributions=param_grid,
                                   n_iter=60,
                                   random_state=42,
                                   cv=10,
                                   verbose=1)
    rf_random.fit(X_train, y_train)
    return rf_random.best_params_

RandomizedSearchCV takes the following parameters as input:

1. estimator: The base estimator model for which best hyperparameter values are found.
2. param_distributions: Dictionary of parameter names and list of values to try.
3. n_iter: Number of parameter combinations that are sampled and tried.
4. random_state: The random seed value.

The best hyperparameter values for the random forest model are found below.

In [None]:
# Run the randomized search on the training features and labels.
best_params=perform_RSCV(param_grid,X, y)

In [None]:
# Display the best parameter combination found by the random search.
best_params

### Perform GridSearch in the vicinity of the best parameters 

In [None]:
def perform_GSCV(best_params,X_train, y_train):
    """Grid-search a small neighbourhood around ``best_params``.

    Parameters
    ----------
    best_params : dict
        Must contain ``'max_features'`` (a fraction) and
        ``'min_samples_leaf'`` (an integer), e.g. the result of
        ``perform_RSCV``.
    X_train, y_train
        Training features and labels, passed straight to ``fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object; the tuned model is its ``best_estimator_``.
    """
    random_forest = RandomForestClassifier(n_estimators=100,
                                           random_state=42,
                                           bootstrap=True)

    # Four max_features candidates within +/-0.05 of the best value. The
    # bounds are clamped to [0.01, 1.0] so the grid never contains a
    # fraction of 0 or a value above 1.
    best_max_features = best_params['max_features']
    lowest_fraction = max(0.01, best_max_features - 0.05)
    highest_fraction = min(1.0, best_max_features + 0.05)
    max_f = [float(v) for v in np.linspace(lowest_fraction, highest_fraction, 4)]

    # Four min_samples_leaf candidates around the best value. The lower bound
    # is clamped to 1: previously a best value of 2 produced a grid starting
    # at 0, which is not a valid leaf size.
    best_leaf = best_params['min_samples_leaf']
    if best_leaf < 2:
        leaf_bounds = (1, best_leaf + 4)
    else:
        leaf_bounds = (max(1, best_leaf - 2), best_leaf + 2)
    # Truncation to int can produce repeats; keep each candidate once.
    min_s = sorted({int(v) for v in np.linspace(leaf_bounds[0], leaf_bounds[1], 4)})

    param_grid = {
        'max_features': max_f,
        'min_samples_leaf': min_s,
    }

    # The former ``iid=False`` argument is omitted: scikit-learn deprecated
    # it in 0.22 and removed it in 0.24, where passing it raises TypeError.
    rf_grid = GridSearchCV(estimator=random_forest,
                           param_grid=param_grid,
                           cv=10)
    rf_grid.fit(X_train, y_train)

    return rf_grid

# Run the fine grid search right after defining it.
# NOTE: the following cell issues this same call again, so the search runs
# twice when the notebook is executed top to bottom.
model=perform_GSCV(best_params,X, y)

In [None]:
# Fit the grid search around the random-search optimum; `model` holds the
# fitted search object returned by perform_GSCV.
model=perform_GSCV(best_params,X, y)

In this step, we train the model created using the best hyperparameter values.

# Grid search

Similarly, we can find the best model using the grid search cross-validation technique. Because this method tries out every possible combination and is therefore time-consuming, we have defined fewer hyperparameter values, for illustration purposes only. You may specify more values for each hyperparameter.

The code below displays the best estimator found by the grid search.

In [None]:
# Display the estimator refit with the best grid-search parameters.
model.best_estimator_

### Save the model for future use

In [None]:
# File that the fitted search object is pickled to and reloaded from.
model_pickle_path = 'model_pickle_Minute_Best_reverse_signal.pkl'

In [None]:
def save_model(model,model_pickle_path):
    """Pickle ``model`` to the file at ``model_pickle_path``.

    Any existing file at that path is overwritten. The file is opened in a
    ``with`` block so the handle is closed even when ``pickle.dump`` raises.
    """
    with open(model_pickle_path, 'wb') as model_pickle:
        pickle.dump(model, model_pickle)
    
# Persist the fitted grid-search object for later reuse.
save_model(model,model_pickle_path)

### Retrain the model

In [None]:
def re_train(data,model_pickle_path):
    """Re-run the full tuning pipeline on ``data`` and overwrite the pickle.

    Features and labels are rebuilt from ``data``, a randomized search picks
    a starting point, a grid search refines it, and the fitted grid-search
    object is saved to ``model_pickle_path``. Nothing is returned.
    """
    features, labels = create_features(data)
    coarse_best = perform_RSCV(create_dt_hp_grid(), features, labels)
    refined_search = perform_GSCV(coarse_best, features, labels)
    save_model(refined_search, model_pickle_path)

In [None]:
X,y=create_features(data_exe)
with open(model_pickle_path, 'rb') as model_unpickle:
    model1 = pickle.load(model_unpickle)
cls1=model1.best_estimator_
predictions= cls1.predict(X)
#cluster=knn.fit_predict(X)
#predictions=pd.Series(predictions*cluster)
predictions=pd.Series(predictions)
predictions=predictions.replace(0,method='ffill')
import matplotlib.pyplot as plt
%matplotlib inline  

ret=data_exe.close.pct_change().shift(-1).iloc[-len_exec:]
ret[ret.abs()>0.005]=0
strategy_perf1=pd.Series(predictions[-len_exec:]*ret.values)
plt.plot(np.nancumprod(strategy_perf1+1))
plt.plot(np.nancumprod(ret+1))
plt.legend(['Strat'])

#plt.plot(np.nancumprod(data_exe.close.pct_change().shift(-1).iloc[-len_exec:]+1))

### Execute the model on test data

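#### We will retrain the model whenever the past 20 days (1 month) performance is negative.

Note: the loop below decides when to retrain from the row counter `i` (compared against 374), not from a measured performance figure.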

In [None]:
# Walk forward through data_total one row at a time and record the model's
# most recent signal at each step.

# Load the tuned model once instead of re-reading the pickle on every row.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(model_pickle_path, 'rb') as model_unpickle:
    model = pickle.load(model_unpickle)
cls = model.best_estimator_

strategy_perf = []
for i in range(len(data_total)):
    enough_history = i > 373
    if enough_history:
        # Features from everything strictly before row i.
        X, y = create_features(data_total.iloc[:i])
    # NOTE(review): for the first 374 rows X is not rebuilt, so the signal
    # comes from whatever X an earlier cell left behind. This is unchanged
    # from the previous version; confirm it is the intended warm-up.
    predictions = cls.predict(X)
    strategy_perf.append(predictions[-1])

    # Retrain every 374 rows. The previous test, `i // 374 == 0`, can never
    # be true once i > 373, so retraining was unreachable.
    if enough_history and i % 374 == 0:
        re_train(data_total.iloc[:i], model_pickle_path)
        # Pick up the freshly saved model for the following rows.
        with open(model_pickle_path, 'rb') as model_unpickle:
            model = pickle.load(model_unpickle)
        cls = model.best_estimator_

In [None]:
# Display the last row of the dataset.
data_total.iloc[-1]

In [None]:
# Display the last row of the most recently built feature matrix.
X.iloc[-1]

In [None]:
# Number of signals recorded by the walk-forward loop.
len(strategy_perf)

In [None]:
# Hold the previous non-zero signal whenever the model outputs 0; zeros
# before the first non-zero signal stay 0. Written with where/ffill because
# the `method` keyword of Series.replace is deprecated in pandas.
raw_signal = pd.Series(strategy_perf)
strategy_perf = (raw_signal.where(raw_signal != 0)
                 .ffill()
                 .fillna(0)
                 .astype(raw_signal.dtype))
strategy_perf

### Check the strategy performance on the test data

In [None]:
# Rows in data_exe beyond the 375-row execution window and the min_len
# warm-up rows.
len(data_exe)-375-min_len

In [None]:
# Plot the cumulative strategy return: each recorded signal is multiplied by
# the next-bar close-to-close return of the matching row, and the products
# are compounded with NaNs treated as 1.
import matplotlib.pyplot as plt
%matplotlib inline  

strategy_perf1=pd.Series(strategy_perf*data_total.close.pct_change().shift(-1).iloc[-len(strategy_perf):].values)
plt.plot(np.nancumprod(strategy_perf1+1))

## Practice

Try for yourself how the random forest model tuned with RandomizedSearchCV and GridSearchCV performs on the test dataset.