# Chapter 10: Features and target engineering


### 10.1 Motivation and intuition

###### 10.1.1 Features engineering

###### 10.1.2 Target engineering

###### 10.1.3 Why it is so important?


### 10.2 Trading application

###### 10.2.1 Create trading indicators and useful trading features

###### 10.2.2 Target labelling


In [22]:
from scipy.optimize import minimize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use("seaborn")
import warnings

warnings.filterwarnings("ignore")
import yfinance as yf
import talib as ta

In [23]:
df = yf.download("GOOG")

df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276
...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300


### 10.2 Trading application

###### 10.2.1 Create trading indicators and useful trading features

In [24]:
""" QUANTITATIVE FEATURES """

# N previous days variation
n = 10
df[f"var_{n}"] = df["Adj Close"].pct_change(n)

m = 200
df[f"var_{m}"] = df["Adj Close"].pct_change(m)

# Moving correlation
col_1 = "var_200"
col_2 = "var_10"
df["moving_correlation"] = df[col_1].rolling(50).corr(df[col_2])


df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,
...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054


In [25]:
""" PRICE ACTION PATTERNS """

# DOJI
df["DOJI"] = ta.CDLDOJI(df["Open"], df["High"], df["Low"], df["Close"]) / 100


# ENGULFING
df["candle_way"] = -1
df.loc[(df["Open"] - df["Close"]) < 0, "candle_way"] = 1


# Amplitude
df["amplitude_abs"] = np.abs(df["Close"] - df["Open"])
df["Engulfing"] = 0

df.loc[
    # Yersteday red candlestick and Today increase
    (df["candle_way"].shift(1) == -1) & (df["candle_way"] == 1) &
    # Close of the last decreasing candlestick = Open of today increase candlestick
    (df["Close"].shift(1) < df["Open"] * (1 + 0.5 / 100))
    & (df["Close"].shift(1) > df["Open"] * (1 - 0.5 / 100))
    &
    # Last decreaing candlestick is less strong than the Today increasing candlestick
    (df["amplitude_abs"].shift(1) * 1.5 < df["amplitude_abs"]),
    "Engulfing",
] = 1

df.loc[
    # Yersteday green candlestick and Today decrease
    (df["candle_way"].shift(1) == 1) & (df["candle_way"] == -1) &
    # Close of the last decreasing candlestick = Open of today decrease candlestick
    (df["Close"].shift(1) < df["Open"] * (1 + 0.5 / 100))
    & (df["Close"].shift(1) > df["Open"] * (1 - 0.5 / 100))
    &
    # Last decreaing candlestick is less strong than the Today  candlestick
    (df["amplitude_abs"].shift(1) * 1.5 < df["amplitude_abs"]),
    "Engulfing",
] = -1
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation,DOJI,candle_way,amplitude_abs,Engulfing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,,0.0,1,0.169365,0
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,,0.0,1,3.636368,0
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,,0.0,-1,0.672482,0
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,,0.0,-1,3.173103,0
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,,0.0,1,0.518059,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652,0.0,-1,42.550049,0
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090,0.0,-1,39.780029,0
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935,0.0,-1,9.040039,0
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054,0.0,1,12.280029,0


In [26]:
""" TECHNICAL ANALYSIS """

# Max value in the last n days
n = 150
df["resistance"] = df["Close"].rolling(n).max()


# RSI
n = 15
df["RSI"] = ta.RSI(df["Close"], timeperiod=n)


df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation,DOJI,candle_way,amplitude_abs,Engulfing,resistance,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,,0.0,1,0.169365,0,,
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,,0.0,1,3.636368,0,,
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,,0.0,-1,0.672482,0,,
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,,0.0,-1,3.173103,0,,
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,,0.0,1,0.518059,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652,0.0,-1,42.550049,0,2974.409912,53.279409
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090,0.0,-1,39.780029,0,2974.409912,50.812601
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935,0.0,-1,9.040039,0,2974.409912,47.094640
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054,0.0,1,12.280029,0,2974.409912,46.081082


###### 10.2.2 Target labelling

In [27]:
""" Next N days variations """

n = 1
df[f"target_var_{n}"] = df["Close"].pct_change(n).shift(n)

df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation,DOJI,candle_way,amplitude_abs,Engulfing,resistance,RSI,target_var_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,,0.0,1,0.169365,0,,,
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,,0.0,1,3.636368,0,,,
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,,0.0,-1,0.672482,0,,,0.079430
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,,0.0,-1,3.173103,0,,,0.010064
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,,0.0,1,0.518059,0,,,-0.041408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652,0.0,-1,42.550049,0,2974.409912,53.279409,0.007229
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090,0.0,-1,39.780029,0,2974.409912,50.812601,-0.030341
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935,0.0,-1,9.040039,0,2974.409912,47.094640,-0.014358
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054,0.0,1,12.280029,0,2974.409912,46.081082,-0.023183


In [28]:
""" Next N days variations (dummy)"""

n = 1
df[f"target_var_{n}"] = df["Close"].pct_change(n).shift(n)

df["target_dummy"] = 1
df.loc[df[f"target_var_{n}"] < 0, "target_dummy"] = -1


df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation,DOJI,candle_way,amplitude_abs,Engulfing,resistance,RSI,target_var_1,target_dummy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,,0.0,1,0.169365,0,,,,1
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,,0.0,1,3.636368,0,,,,1
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,,0.0,-1,0.672482,0,,,0.079430,1
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,,0.0,-1,3.173103,0,,,0.010064,1
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,,0.0,1,0.518059,0,,,-0.041408,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652,0.0,-1,42.550049,0,2974.409912,53.279409,0.007229,1
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090,0.0,-1,39.780029,0,2974.409912,50.812601,-0.030341,-1
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935,0.0,-1,9.040039,0,2974.409912,47.094640,-0.014358,-1
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054,0.0,1,12.280029,0,2974.409912,46.081082,-0.023183,-1


In [45]:
""" Classify the variations"""

# find the variations
n = 1
df[f"target_var_{n}"] = df["Close"].pct_change(n).shift(n)


# Find the centile 33 and 67 on the train set
split = int(0.80 * len(df))
centile_33 = np.percentile(df[[f"target_var_{n}"]].iloc[:split].dropna(), 33)
centile_67 = np.percentile(df[f"target_var_{n}"].iloc[:split].dropna().values, 67)

# Dummy variable
df["target_dummy"] = 0
df.loc[df[f"target_var_{n}"] > centile_67, "target_dummy"] = 1
df.loc[df[f"target_var_{n}"] < centile_33, "target_dummy"] = -1

df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,var_10,var_200,moving_correlation,DOJI,candle_way,amplitude_abs,Engulfing,resistance,RSI,target_var_1,target_dummy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2004-08-19,49.813290,51.835709,47.800831,49.982655,49.982655,44871361,,,,0.0,1,0.169365,0,,,,0
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942874,,,,0.0,1,3.636368,0,,,,0
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342897,,,,0.0,-1,0.672482,0,,,0.079430,1
2004-08-24,55.412300,55.591629,51.591621,52.239197,52.239197,15319808,,,,0.0,-1,3.173103,0,,,0.010064,1
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232276,,,,0.0,1,0.518059,0,,,-0.041408,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-11,2373.000000,2375.889893,2324.689941,2330.449951,2330.449951,1335900,-0.017003,-0.173239,0.014652,0.0,-1,42.550049,0,2974.409912,53.279409,0.007229,1
2022-07-12,2336.770020,2356.989990,2292.300049,2296.989990,2296.989990,1248500,-0.015203,-0.190211,0.066090,0.0,-1,39.780029,0,2974.409912,50.812601,-0.030341,-1
2022-07-13,2252.780029,2303.139893,2236.459961,2243.739990,2243.739990,1947900,-0.003416,-0.213457,0.110935,0.0,-1,9.040039,0,2974.409912,47.094640,-0.014358,-1
2022-07-14,2216.520020,2239.750000,2186.510010,2228.800049,2228.800049,1618300,-0.007273,-0.212444,0.166054,0.0,1,12.280029,0,2974.409912,46.081082,-0.023183,-1
