# Import Libraries

In [1]:
!pip install nsepythonserver

Collecting nsepythonserver
  Downloading nsepythonserver-2.93-py3-none-any.whl.metadata (7.4 kB)
Downloading nsepythonserver-2.93-py3-none-any.whl (25 kB)
Installing collected packages: nsepythonserver
Successfully installed nsepythonserver-2.93


In [2]:
from nsepythonserver import equity_history

In [3]:
# Core data / plotting stack
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Default matplotlib figure size (width, height) for every plot in this session
plt.rcParams['figure.figsize'] = [15,8]

# NOTE(review): `sns`, `normalize` and `scale` are not referenced by any cell shown
# in this notebook — confirm whether a later stage still needs them.
from sklearn.preprocessing import StandardScaler, normalize
# Unfitted scaler instance shared at module level
scale = StandardScaler()

In [4]:
from datetime import date, datetime, timedelta

In [5]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter hides all warning categories, including
# deprecation and performance warnings — consider restricting it to specific ones.
warnings.filterwarnings('ignore')

# Import Data

In [6]:
# Mount Google Drive at /content/drive so the CSV outputs can be written under it.
# This only works inside a Google Colab runtime (the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Root folder on the mounted Drive under which the generated CSV files are written
data_path = "/content/drive/MyDrive/MyColabProject/Data"

In [8]:
# Symbols grouped by the clipping bound (max_range) that the generation cells
# apply to their day-over-day changes: 43, 100 and 281 respectively.
symbol_range_43 = [
  'BPCL', 'POWERGRID', 'NTPC', 'SUNPHARMA', 'TATACONSUM', 'ONGC',
  'HINDALCO', 'ICICIBANK', 'SBIN', 'BHARTIARTL', 'WIPRO', 'ITC',
  'AXISBANK', 'JSWSTEEL', 'COALINDIA', 'HDFCLIFE', 'TATAMOTORS',
]

symbol_range_100 = [
  'SBILIFE', 'KOTAKBANK', 'CIPLA', 'TECHM', 'HCLTECH', 'INFY',
  'GRASIM', 'HDFCBANK', 'INDUSINDBK', 'TATASTEEL', 'ADANIPORTS',
]

# NOTE(review): SRTRANSFIN and SHRIRAMFIN look like they could be an old and a new
# ticker for the same company — confirm that listing both is intended.
symbol_range_281 = [
  'TCS', 'SRTRANSFIN', 'HINDUNILVR', 'ASIANPAINT', 'RELIANCE',
  'LT', 'TITAN', 'SHRIRAMFIN', 'HEROMOTOCO',
]

In [9]:
# Market series and download window (day-month-year strings).
# NOTE: in the cells shown, `series` is not passed to DataPreprocessing.
series, start_date = "EQ", "01-04-2020"

# Fixed end date. To roll it to yesterday instead, use:
#   yesterday = datetime.now() - timedelta(1)
#   end_date = datetime.strftime(yesterday, '%d-%m-%Y')
end_date = "23-11-2024"

In [10]:
# Price fields for which a windowed dataset is generated
ohlc = ['open', 'high', 'low', 'close']

# Number of lagged values that precede the current value in every window
MAX_TIME_STEPS = 512

# Clipping bound for each symbol group, keyed by the name of its symbol list
MAX_RANGE = {
  'symbol_range_43': 43,
  'symbol_range_100': 100,
  'symbol_range_281': 281,
}

In [11]:
# Quick check: display the clipping bound stored for the 'symbol_range_43' group
MAX_RANGE['symbol_range_43']

43

# Data Preprocessing

In [12]:
class DataPreprocessing():
  """Turn one symbol's OHLC price history into windows of clipped day-over-day changes.

  Typical use is three calls in order: `data_preprocessing()` to download and tidy
  the prices, `create_change_values()` to difference them, and `create_timeseries()`
  to build one lagged window per row for a chosen price field.

  All constructor arguments are keyword-only.

  Args:
    start_date: First date of the history request, forwarded to `equity_history`.
    end_date: Last date of the history request, forwarded to `equity_history`.
    symbol: Ticker symbol to download.
    max_time_steps: Number of lagged values that precede the current value in a window.
    max_range: Symmetric clipping bound; window values are limited to
      [-max_range, +max_range].
    series: Market series forwarded to `equity_history` (defaults to 'EQ').
  """

  def __init__(self, *, start_date, end_date, symbol, max_time_steps, max_range, series='EQ'):
    self.start_date = start_date
    self.end_date = end_date
    self.symbol = symbol
    self.series = series
    self.max_time_steps = max_time_steps
    self.max_range = max_range

  def data_preprocessing(self):
    """Download the symbol's history and return a tidy OHLC frame.

    Returns:
      DataFrame indexed by the parsed 'mTIMESTAMP' column with the columns
      'open', 'high', 'low' and 'close', ordered by the raw 'TIMESTAMP' column.

    Raises:
      KeyError: if the downloaded data lacks one of the expected columns.
      ValueError: if two different rows share the same 'mTIMESTAMP' value.
    """
    # Source column -> output column.
    # NOTE(review): 'close' is taken from CH_LAST_TRADED_PRICE — confirm that the last
    # traded price (rather than a dedicated closing-price field) is what is wanted.
    price_cols = {'CH_OPENING_PRICE': 'open', 'CH_TRADE_HIGH_PRICE': 'high',
                  'CH_TRADE_LOW_PRICE': 'low', 'CH_LAST_TRADED_PRICE': 'close'}

    # Load the dataset
    raw_df = pd.DataFrame(equity_history(self.symbol, self.series, self.start_date, self.end_date))

    data_df = (
      raw_df[['TIMESTAMP', 'mTIMESTAMP', *price_cols]]
      .sort_values(['TIMESTAMP'], ascending=[True])
      # Remove exact duplicate rows
      .drop_duplicates(ignore_index=True)
      .assign(mTIMESTAMP=lambda df: pd.to_datetime(df['mTIMESTAMP']))
      # verify_integrity makes any remaining duplicate timestamp fail loudly
      .set_index('mTIMESTAMP', verify_integrity=True)
      .drop(columns=['TIMESTAMP'])
      .rename(columns=price_cols)
    )
    return data_df

  def create_change_values(self, data_df):
    """Replace every column by its row-over-row difference.

    Args:
      data_df: Frame of numeric columns with string column names.

    Returns:
      New frame with the same index and one '<col>_chg' column per input column,
      holding value minus previous value. The first row, which has no predecessor,
      is filled with 0.0. The input frame is not modified.
    """
    # Works for whatever columns are present instead of assuming open/high/low/close.
    return data_df.diff().add_suffix('_chg').fillna(0.00)

  def create_timeseries(self, data_df, ohlc_val):
    """Build clipped windows of consecutive changes for one price field.

    Args:
      data_df: Frame containing the column '<ohlc_val>_chg'.
      ohlc_val: Field name, e.g. 'open'; selects the '<ohlc_val>_chg' column.

    Returns:
      DataFrame with `max_time_steps + 1` columns labelled 0..max_time_steps.
      Each row is the window ending at that row's index label: column 0 holds the
      oldest value (max_time_steps rows back) and the last column the current value.
      Values are clipped to [-max_range, +max_range]. Rows without a full history,
      or whose window contains NaN, are omitted. The index keeps the labels and name
      of `data_df`'s index. When the input has fewer than `max_time_steps + 1` rows
      the result has no rows.
    """
    changes = data_df[str(ohlc_val+'_chg')]
    steps = int(self.max_time_steps)
    width = steps + 1
    values = changes.to_numpy(dtype=float)

    if len(values) < width:
      # Not enough history for even one window
      windows = np.empty((0, width), dtype=float)
      row_index = changes.index[:0]
    else:
      # Row k of the view is values[k : k + width], i.e. the window ending at k + steps
      windows = np.lib.stride_tricks.sliding_window_view(values, width)
      row_index = changes.index[steps:]
      # Discard windows containing missing values
      complete = ~np.isnan(windows).any(axis=1)
      windows = windows[complete]
      row_index = row_index[complete]

    # Remove outliers by limiting every value to the symmetric bound
    clipped = np.clip(windows, -self.max_range, self.max_range)

    # Return the result
    return pd.DataFrame(clipped, index=row_index)

  #def derive_inputs(self, data_df, scaler):
  #  # drop 0th column to get the sequence of 32 inputs
  #    # input features: cols 1 to 32
  #    # expected output: to predict 33rd val (unknown future value)
  #  final_data_df = data_df.copy()
  #  final_data_df.drop(columns=0, axis=1, inplace=True)

  #  # Scaling
  #  X_data_sc_matrix = scaler.transform(final_data_df)
  #  X_data_sc_df = pd.DataFrame(pd.concat([pd.Series(final_data_df.index), pd.DataFrame(X_data_sc_matrix)], axis=1, verify_integrity=True)).set_index('mTIMESTAMP')

  #  return X_data_sc_df.tail(1)

# Time Series Generation - Range 43

In [None]:
# Generate the windowed change dataset for the 'symbol_range_43' group.
# Per symbol: download prices, difference them, build one window set per price field
# and write a per-symbol CSV. Afterwards write one CSV per price field and one merged CSV.
# NOTE(review): the file names mention 'fourierTransform', but no Fourier transform is
# applied in the cells shown here — confirm the naming is intentional.

# Declare containers
symbol_objs = {}
frames_by_field = {val: [] for val in ohlc}  # per price field: one frame per symbol
frames_by_symbol = []                        # per symbol: open/high/low/close stacked

# Create processed data
for symbol in symbol_range_43:
  print(symbol)
  # Defining objects (clipping bound comes from the shared MAX_RANGE table)
  symbol_objs[symbol] = DataPreprocessing(start_date=start_date, end_date=end_date, symbol=symbol,
                                          max_time_steps=MAX_TIME_STEPS,
                                          max_range=MAX_RANGE['symbol_range_43'])

  # Data Loading and Preprocessing
  processed_df = symbol_objs[symbol].data_preprocessing()

  # Creating Change Dataset
  change_df = symbol_objs[symbol].create_change_values(processed_df)

  # Creating TimeSeries
  timeseries_set = {}
  for val in ohlc:
    timeseries_set[val] = symbol_objs[symbol].create_timeseries(change_df, val)
    frames_by_field[val].append(pd.DataFrame(timeseries_set[val].values))

  # Stack the four price fields of this symbol and save them
  data_df_complete = pd.concat([frames_by_field[val][-1] for val in ohlc], ignore_index=True, axis=0)
  data_df_complete.to_csv(data_path+'/Individual/'+str(symbol)+'_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)
  frames_by_symbol.append(data_df_complete)

# Concatenate once after the loop instead of re-copying the growing frame on every symbol
data_df_combined = {
  val: pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()
  for val, frames in frames_by_field.items()
}
data_df_merged = pd.concat(frames_by_symbol, ignore_index=True, axis=0) if frames_by_symbol else pd.DataFrame()

# Update the data to csv
for val in ohlc:
  data_df_combined[val].to_csv(data_path+'/'+str(val)+'_range43_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

data_df_merged.to_csv(data_path+'/range43_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

BPCL
POWERGRID
NTPC
SUNPHARMA
TATACONSUM
ONGC
HINDALCO
ICICIBANK
SBIN
BHARTIARTL
WIPRO
ITC
AXISBANK
JSWSTEEL
COALINDIA
HDFCLIFE
TATAMOTORS


# Time Series Generation - Range 100

In [13]:
# Generate the windowed change dataset for the 'symbol_range_100' group.
# Per symbol: download prices, difference them, build one window set per price field
# and write a per-symbol CSV. Afterwards write one CSV per price field and one merged CSV.
# NOTE(review): the file names mention 'fourierTransform', but no Fourier transform is
# applied in the cells shown here — confirm the naming is intentional.

# Declare containers
symbol_objs = {}
frames_by_field = {val: [] for val in ohlc}  # per price field: one frame per symbol
frames_by_symbol = []                        # per symbol: open/high/low/close stacked

# Create processed data
for symbol in symbol_range_100:
  print(symbol)
  # Defining objects (clipping bound comes from the shared MAX_RANGE table)
  symbol_objs[symbol] = DataPreprocessing(start_date=start_date, end_date=end_date, symbol=symbol,
                                          max_time_steps=MAX_TIME_STEPS,
                                          max_range=MAX_RANGE['symbol_range_100'])

  # Data Loading and Preprocessing
  processed_df = symbol_objs[symbol].data_preprocessing()

  # Creating Change Dataset
  change_df = symbol_objs[symbol].create_change_values(processed_df)

  # Creating TimeSeries
  timeseries_set = {}
  for val in ohlc:
    timeseries_set[val] = symbol_objs[symbol].create_timeseries(change_df, val)
    frames_by_field[val].append(pd.DataFrame(timeseries_set[val].values))

  # Stack the four price fields of this symbol and save them
  data_df_complete = pd.concat([frames_by_field[val][-1] for val in ohlc], ignore_index=True, axis=0)
  data_df_complete.to_csv(data_path+'/Individual/'+str(symbol)+'_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)
  frames_by_symbol.append(data_df_complete)

# Concatenate once after the loop instead of re-copying the growing frame on every symbol
data_df_combined = {
  val: pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()
  for val, frames in frames_by_field.items()
}
data_df_merged = pd.concat(frames_by_symbol, ignore_index=True, axis=0) if frames_by_symbol else pd.DataFrame()

# Update the data to csv
for val in ohlc:
  data_df_combined[val].to_csv(data_path+'/'+str(val)+'_range100_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

data_df_merged.to_csv(data_path+'/range100_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

SBILIFE
KOTAKBANK
CIPLA
TECHM
HCLTECH
INFY
GRASIM
HDFCBANK
INDUSINDBK
TATASTEEL
ADANIPORTS


# Time Series Generation - Range 281

In [14]:
# Generate the windowed change dataset for the 'symbol_range_281' group.
# Per symbol: download prices, difference them, build one window set per price field
# and write a per-symbol CSV. Afterwards write one CSV per price field and one merged CSV.
# NOTE(review): the file names mention 'fourierTransform', but no Fourier transform is
# applied in the cells shown here — confirm the naming is intentional.

# Declare containers
symbol_objs = {}
frames_by_field = {val: [] for val in ohlc}  # per price field: one frame per symbol
frames_by_symbol = []                        # per symbol: open/high/low/close stacked

# Create processed data
for symbol in symbol_range_281:
  print(symbol)
  # Defining objects (clipping bound comes from the shared MAX_RANGE table)
  symbol_objs[symbol] = DataPreprocessing(start_date=start_date, end_date=end_date, symbol=symbol,
                                          max_time_steps=MAX_TIME_STEPS,
                                          max_range=MAX_RANGE['symbol_range_281'])

  # Data Loading and Preprocessing
  processed_df = symbol_objs[symbol].data_preprocessing()

  # Creating Change Dataset
  change_df = symbol_objs[symbol].create_change_values(processed_df)

  # Creating TimeSeries
  timeseries_set = {}
  for val in ohlc:
    timeseries_set[val] = symbol_objs[symbol].create_timeseries(change_df, val)
    frames_by_field[val].append(pd.DataFrame(timeseries_set[val].values))

  # Stack the four price fields of this symbol and save them
  data_df_complete = pd.concat([frames_by_field[val][-1] for val in ohlc], ignore_index=True, axis=0)
  data_df_complete.to_csv(data_path+'/Individual/'+str(symbol)+'_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)
  frames_by_symbol.append(data_df_complete)

# Concatenate once after the loop instead of re-copying the growing frame on every symbol
data_df_combined = {
  val: pd.concat(frames, ignore_index=True, axis=0) if frames else pd.DataFrame()
  for val, frames in frames_by_field.items()
}
data_df_merged = pd.concat(frames_by_symbol, ignore_index=True, axis=0) if frames_by_symbol else pd.DataFrame()

# Update the data to csv.
# The per-field files were previously written as '<field>_range_281_...'; they now use
# 'range281' to match the merged file below. Update any reader of the old names.
for val in ohlc:
  data_df_combined[val].to_csv(data_path+'/'+str(val)+'_range281_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

data_df_merged.to_csv(data_path+'/range281_raw_nsepy_inp512_differencedVal_fourierTransform.csv', index=False)

TCS
SRTRANSFIN
HINDUNILVR
ASIANPAINT
RELIANCE
LT
TITAN
SHRIRAMFIN
HEROMOTOCO
