In [14]:
# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and deprecated the bare 'seaborn' name, which newer releases no longer
# accept. Prefer the new name when it is available and fall back to the
# legacy one so the cell runs on both old and new Matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set(rc={'figure.figsize': (20, 8)})

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler,
                                   MinMaxScaler)
from sklearn.pipeline import Pipeline

# Regressor
from sklearn.linear_model import (LinearRegression,
                                  Lasso,
                                  Ridge,
                                  ElasticNet)

# Metrics
from sklearn.metrics import mean_squared_error

# Display configuration: floats rendered with four decimals,
# and up to 100 columns shown before pandas truncates a frame
pd.set_option('display.float_format', "{:.4f}".format)
pd.options.display.max_columns = 100

# Silence every warning for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

In [15]:
# Load nifty index data.
# The first column holds dates written as dd/mm/yyyy. `dayfirst` is only
# honoured when dates are actually parsed, so `parse_dates=True` is needed
# to turn the index into a DatetimeIndex instead of leaving it as strings.
df = pd.read_csv('https://raw.githubusercontent.com/kannansingaravelu/datasets/main/niftyindex.csv', 
                 index_col=0,
                 parse_dates=True,
                 dayfirst=True)

# Descriptive statistics
df.describe()


Unnamed: 0,Open,High,Low,Close,Volume
count,2719.0,2719.0,2719.0,2719.0,2719.0
mean,10124.898,10175.2921,10057.4698,10117.571,289222065.6433
std,3709.6071,3725.527,3685.3052,3706.9455,195276778.8519
min,4640.2,4645.95,4588.05,4636.75,2826000.0
25%,7637.0,7681.0,7581.65,7623.8,159178008.0
50%,9376.95,9431.9,9301.35,9351.85,219496064.0
75%,11700.85,11755.125,11631.325,11684.0,340908832.0
max,18871.95,18887.6,18778.2,18812.5,1810971008.0


In [16]:
# Count the missing entries in each column
df.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [17]:
# Create features
def create_features(frame, windows=(7, 14, 28), multiplier=2):
    """Build a feature matrix and next-bar label from an OHLCV frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'Open', 'High', 'Low', 'Close' and 'Volume'.
        The frame is copied; the caller's object is never modified.
    windows : iterable of int, default (7, 14, 28)
        Look-back lengths (in rows) for which the rolling features are built.
    multiplier : float, default 2
        Number of rolling standard deviations added to / subtracted from the
        rolling mean of 'Close' to form the 'UB<w>' / 'LB<w>' band columns.

    Returns
    -------
    pandas.DataFrame
        Per-row features ('OC', 'HC', 'GAP', 'RET'), eleven rolling features
        per window (suffixed with the window length), and 'Label', which is
        the 'Close' of the following row. The raw OHLCV columns are removed
        and every row containing a NaN (the warm-up rows of the longest
        window and the final row, which has no next close) is dropped.

    Notes
    -----
    'HC' is the high/low range (High / Low - 1). 'UB<w>' and 'LB<w>' are
    absolute price levels, whereas the other features are ratios or returns.
    NOTE(review): a zero 'Volume' would make the volume ratios infinite, and
    `dropna` does not remove infinities - confirm the input has none.
    """
    df = frame.copy()

    # Per-row features
    df['OC'] = df['Close'] / df['Open'] - 1
    df['HC'] = df['High'] / df['Low'] - 1
    df['GAP'] = df['Open'] / df['Close'].shift(1) - 1
    df['RET'] = np.log(df['Close'] / df['Close'].shift(1))

    # Rolling features, one set per look-back window
    for i in windows:
        suffix = str(i)

        # Compute the shared rolling statistics once per window
        close_roll = df['Close'].rolling(i)
        close_mean = close_roll.mean()
        band_width = close_roll.std() * multiplier
        ret_roll = df['RET'].rolling(i)

        df['PCHG' + suffix] = df['Close'].pct_change(i)
        df['VCHG' + suffix] = df['Volume'].pct_change(i)
        df['RET' + suffix] = ret_roll.sum()
        df['MA' + suffix] = df['Close'] / close_mean
        df['VMA' + suffix] = df['Volume'] / df['Volume'].rolling(i).mean()
        df['OC' + suffix] = df['OC'].rolling(i).mean()
        df['HC' + suffix] = df['HC'].rolling(i).mean()
        df['GAP' + suffix] = df['GAP'].rolling(i).mean()
        df['STD' + suffix] = ret_roll.std()
        df['UB' + suffix] = close_mean + band_width
        df['LB' + suffix] = close_mean - band_width

    # Label: the next row's close
    df['Label'] = df['Close'].shift(-1)

    # Remove the raw inputs, then drop rows left incomplete by the rolling
    # warm-up and by the forward shift of the label
    return df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume']).dropna()

In [18]:
# Build the feature matrix from the raw index data and report its size
df1 = create_features(frame=df)
display(df1.shape)

# Peek at the first two rows as a sanity check
df1.head(n=2)

(2690, 38)

Unnamed: 0,OC,HC,GAP,RET,PCHG7,VCHG7,RET7,MA7,VMA7,OC7,HC7,GAP7,STD7,UB7,LB7,PCHG14,VCHG14,RET14,MA14,VMA14,OC14,HC14,GAP14,STD14,UB14,LB14,PCHG28,VCHG28,RET28,MA28,VMA28,OC28,HC28,GAP28,STD28,UB28,LB28,Label
10/02/2012,-0.0034,0.0162,-0.0023,-0.0057,0.0279,-0.0406,0.0275,1.0058,0.9455,0.0008,0.0145,0.0032,0.0065,5442.199,5259.1296,0.066,0.0195,0.0639,1.0249,1.0192,0.002,0.0142,0.0026,0.0107,5486.145,5015.7836,0.1606,1.0777,0.149,1.0646,1.0967,0.0029,0.0138,0.0024,0.0104,5509.5275,4600.9761,5390.2
13/02/2012,0.0015,0.013,0.0001,0.0016,0.0228,-0.4628,0.0226,1.0042,0.8406,0.0011,0.0146,0.0022,0.0064,5428.6264,5307.0736,0.0682,0.1478,0.0659,1.0217,0.815,0.0018,0.0146,0.003,0.0106,5489.4954,5061.5689,0.1311,0.2383,0.1232,1.0616,0.8783,0.0023,0.0135,0.0022,0.0094,5534.1504,4620.9889,5416.05
