## Feature Selection/Engineering

In [1]:
%%capture
# Run the data_exploration.ipynb notebook inside this kernel; %%capture hides
# whatever output that run produces.
# The later cells use `dataset` without defining it, so it is presumably
# created by this run — TODO confirm against data_exploration.ipynb.
%run data_exploration.ipynb

##### Create new variables (features)

In [2]:
# Relative Strength Index (RSI) feature.
# RSI = 100 - 100 / (1 + RS), where RS is the ratio of the rolling mean of
# upward closing-price moves to the rolling mean of downward moves.
n = 14  # look-back window, in rows (trading days)

# Day-over-day change of the close; the first row has no predecessor, so it is NaN.
price_change = dataset["Close"].diff()
dataset["Price Change"] = price_change

# Split the moves by direction. Rows that fail the comparison (including the
# NaN first row) become 0, and losses are flipped to positive magnitudes.
is_up = price_change > 0
is_down = price_change < 0
gains = price_change.where(is_up, 0)
losses = -price_change.where(is_down, 0)

# Rolling means over up to n rows; min_periods=1 means the first rows are
# averaged over however many observations exist so far instead of being NaN.
average_gains, average_losses = (
    side.rolling(window=n, min_periods=1).mean() for side in (gains, losses)
)

# Relative strength and the RSI itself. When both averages are 0 (as on the
# first row) the ratio is 0/0, so RSI is NaN there.
rs = average_gains / average_losses
rsi = 100 - (100 / (1 + rs))

# Store the indicator alongside the price data.
dataset["RSI"] = rsi

In [3]:
# Trading-volume ratio: each day's volume relative to its recent average.
n = 5  # look-back window, in rows (trading days)

# Rolling mean of volume over up to n rows (min_periods=1: the first rows use
# however many observations exist so far).
rolling_avg_volume = dataset["Volume"].rolling(window=n, min_periods=1).mean()

# NOTE: despite its name, the "5 day Avg Volume" column holds the RATIO
# volume / rolling average, not the average itself. The name is kept because
# later cells select the column by this exact label.
dataset["5 day Avg Volume"] = dataset["Volume"] / rolling_avg_volume

In [4]:
# List every column currently in the frame, including the ones just added.
dataset.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'MA_50',
       'MA_200', '50 day MA Volume', '200 day MA Volume', '20 day SMA',
       '20 day StdDev', 'Upper Bollinger', 'Lower Bollinger', 'Price Change',
       'RSI', '5 day Avg Volume'],
      dtype='object')

In [5]:
# Keep only the raw price/volume columns plus the two engineered indicators
# (RSI and the volume ratio stored under "5 day Avg Volume").
# .copy() makes the result an independent frame: without it pandas tracks the
# subset as a copy of the original, and adding columns to it afterwards emits
# SettingWithCopyWarning.
dataset = dataset[
    ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume", "RSI", "5 day Avg Volume"]
].copy()

In [6]:
# Move Date into the index and derive calendar features from it.
# set_index is used without inplace=True and the new columns are added with
# assign, so a fresh frame is built instead of mutating a column subset in
# place (which is what triggers SettingWithCopyWarning).
# The .year/.month/... accessors need a datetime-like index; this assumes Date
# was already parsed to datetimes upstream — TODO confirm.
dataset = dataset.set_index("Date").assign(
    Year=lambda frame: frame.index.year,
    Month=lambda frame: frame.index.month,
    Quarter=lambda frame: frame.index.quarter,
    DayOfWeek=lambda frame: frame.index.dayofweek,  # Monday=0 ... Sunday=6
    DayOfMonth=lambda frame: frame.index.day,
)

In [7]:
# Reorder the columns: calendar features first, then prices/volume, then the
# engineered indicators. No column is added or dropped here.
calendar_columns = ["Year", "Month", "Quarter", "DayOfWeek", "DayOfMonth"]
market_columns = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
indicator_columns = ["RSI", "5 day Avg Volume"]

dataset = dataset[calendar_columns + market_columns + indicator_columns]

In [8]:
# Preview the first rows to check the new column order and the derived values.
dataset.head()

Unnamed: 0_level_0,Year,Month,Quarter,DayOfWeek,DayOfMonth,Open,High,Low,Close,Adj Close,Volume,RSI,5 day Avg Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-06-29,2010,6,2,1,29,19.0,25.0,17.540001,23.889999,23.889999,18766300,,1.0
2010-06-30,2010,6,2,2,30,25.790001,30.42,23.299999,23.83,23.83,17187100,0.0,0.956076
2010-07-01,2010,7,3,3,1,25.0,25.92,20.27,21.959999,21.959999,8218800,0.0,0.558188
2010-07-02,2010,7,3,4,2,23.0,23.1,18.709999,19.200001,19.200001,5139800,0.0,0.416921
2010-07-06,2010,7,3,1,6,20.0,20.0,15.83,16.110001,16.110001,6866900,0.0,0.611164


In [9]:
# Replace every remaining NaN in the frame with 0.
# NOTE(review): in the preview above the only NaN is the first-row RSI (0/0);
# after this step it reads as RSI = 0, the same value the formula gives for a
# window with no gains at all. Confirm that is acceptable for the model, or
# drop that row instead.
dataset = dataset.fillna(0)

##### Feature Selection

In [10]:
# One-time environment setup, left commented out so the notebook does not
# reinstall the package on every run.
# `step_reg`, used in the selection cell below, presumably comes from this
# package — TODO confirm where it is imported.
#!pip install stepwise_regression

In [11]:
# Candidate predictors and the regression target for feature selection.
# NOTE(review): the candidates include same-day Open/High/Low and Adj Close
# while the target is the same day's Close — confirm this is intended, since
# these columns are very close to the target itself.
target_column = "Close"
feature_columns = [
    # calendar features
    "Year", "Month", "Quarter", "DayOfWeek", "DayOfMonth",
    # price / volume
    "Open", "High", "Low", "Adj Close", "Volume",
    # engineered indicators
    "RSI", "5 day Avg Volume",
]

X = dataset[feature_columns]
y = dataset[target_column]

In [12]:
# Stepwise (forward) feature selection of X against y.
# The printed log reports a p-value for each added feature, so the third
# argument is presumably the p-value threshold for admitting a feature —
# TODO confirm against the step_reg documentation.
# NOTE(review): `step_reg` is not imported in any cell shown here; it must
# already be in the kernel namespace for this cell to run.
selection_threshold = 0.05
selected = step_reg.forward_regression(X, y, selection_threshold, verbose=True)

Add  Open                           with p-value 0.0
Add  Low                            with p-value 0.0
Add  High                           with p-value 0.0
Add  Adj Close                      with p-value 0.0
Add  Volume                         with p-value 4.57868e-95


In [13]:
# Display the feature names returned by the forward selection.
selected

['Open', 'Low', 'High', 'Adj Close', 'Volume']