In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Load the information of stocks (i.e., name & category of sector)
NIFTY50_name = pd.read_csv("D:\\capstone_project\\capstone_project\\NIFTY50_category.csv")

In [4]:
# Load the original information of stock price
NIFTY50_stock = {}
for target in NIFTY50_name["company"]:
    da = {}
    da['category'] = NIFTY50_name[NIFTY50_name.company == target]['category'].iloc[0]
    try:
        da['stock_price'] = pd.read_csv(f"D:\\capstone_project\\capstone_project\\NIFTY50\\SP500_dataset\\{target}.csv")
    except FileNotFoundError:
        print(f"File not found for {target}. Skipping...")
        continue

    if 'Date ' not in da['stock_price'].columns:
        print(f"'Date' column not found in {target}.csv. Skipping...")
        continue

    NIFTY50_stock[target] = da

In [5]:
NIFTY50_stock['ONGC']['stock_price']

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,vwap,52W H,52W L,VOLUME,VALUE,No of trades
0,26-Jun-2024,EQ,266.90,269.15,264.10,267.00,267.8,267.75,266.57,292.95,155.85,10650910,2839200588.35,105797
1,25-Jun-2024,EQ,270.10,270.80,265.00,269.90,267.1,267.00,267.16,292.95,155.85,14425434,3853915651.60,119470
2,24-Jun-2024,EQ,269.65,271.25,266.60,269.65,270.1,269.90,269.22,292.95,155.85,7282347,1960531156.75,74981
3,21-Jun-2024,EQ,271.45,274.75,268.90,271.85,270.0,269.65,271.26,292.95,155.40,17406416,4721650927.75,105432
4,20-Jun-2024,EQ,271.80,274.00,269.10,271.55,272.2,271.85,271.96,292.95,155.40,12826509,3488316117.15,145663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,01-Jul-2022,EQ,148.95,151.15,130.00,151.55,131.4,131.05,136.30,194.95,108.50,125788305,17144326289.40,509039
493,30-Jun-2022,EQ,153.00,154.45,149.25,154.15,150.9,151.55,151.62,194.95,108.50,38725546,5871630059.15,185317
494,29-Jun-2022,EQ,149.35,157.40,146.20,149.35,153.6,154.15,152.82,194.95,108.50,165860322,25346803135.95,272601
495,28-Jun-2022,EQ,142.80,151.75,142.15,141.50,148.8,149.35,146.92,194.95,108.50,54387675,7990599371.65,215624


In [6]:
# Check if all stocks have the same dates
need_day = np.array(NIFTY50_stock["ONGC"]["stock_price"]['Date '])
for target in NIFTY50_stock.keys():
    if 'Date ' in NIFTY50_stock[target]["stock_price"].columns:
        NIFTY50_stock[target]["stock_price"] = NIFTY50_stock[target]["stock_price"][
            NIFTY50_stock[target]["stock_price"]['Date '].isin(need_day)].reset_index(drop=True)
        NIFTY50_stock[target]["stock_price"].index = NIFTY50_stock[target]["stock_price"]['Date ']
    else:
        print(f"'Date' column missing for {target}. Skipping...")

In [8]:
# Normalize stock price
normalize_scalar = {}
for target in NIFTY50_stock.keys():
    scaler = StandardScaler()
    # Check if the 'close ' column is of type string, if not convert it to string
    if NIFTY50_stock[target]["stock_price"]["close "].dtype != 'object':
        NIFTY50_stock[target]["stock_price"]["close "] = NIFTY50_stock[target]["stock_price"]["close "].astype(str)
    # Remove commas and convert to float
    NIFTY50_stock[target]["stock_price"]["close "] = NIFTY50_stock[target]["stock_price"]["close "].str.replace(',', '').astype(float)
    nor_data = scaler.fit_transform(np.array(NIFTY50_stock[target]["stock_price"]["close "]).reshape(-1, 1)).ravel()
    NIFTY50_stock[target]["stock_price"]["nor_close"] = nor_data
    normalize_scalar[target] = scaler

In [15]:
# Calculate return ratio
for target in NIFTY50_stock.keys():
    return_tratio = []
    data = np.array(NIFTY50_stock[target]["stock_price"]["close "])
    for i in range(len(data)):
        if i == 0:
            return_tratio.append(0)
        else:
            return_tratio.append((data[i] - data[i - 1]) / data[i - 1])
    NIFTY50_stock[target]["stock_price"]["return_ratio"] = return_tratio

In [16]:
# Calculate the percentage change
for target in NIFTY50_stock.keys():
    function = lambda x, y: (x / y) - 1
    data = NIFTY50_stock[target]["stock_price"]

# Ensure columns are converted to float
    if data["OPEN "].dtype == 'object':
        data["OPEN "] = data["OPEN "].str.replace(',', '').astype(float)
    if data["HIGH "].dtype == 'object':
        data["HIGH "] = data["HIGH "].str.replace(',', '').astype(float)
    if data["LOW "].dtype == 'object':
        data["LOW "] = data["LOW "].str.replace(',', '').astype(float)

    data["c_open"] = list(map(function, data["OPEN "], data["close "]))
    data["c_high"] = list(map(function, data["HIGH "], data["close "]))
    data["c_low"] = list(map(function, data["LOW "], data["close "]))


In [17]:
# 5 / 10 / 15 / 20 / 25 / 30 days moving average
for target in NIFTY50_stock.keys():
    data = NIFTY50_stock[target]["stock_price"]["close "]
    for i in [5, 10, 15, 20, 25, 30]:
        q = []
        for day in range(len(data)):
            if day >= i - 1:
                q.append((np.mean(data.iloc[day - i + 1:day + 1]) / data.iloc[day]) - 1)
            if day < i - 1:
                q.append(0)
        NIFTY50_stock[target]["stock_price"][f"{i}-days"] = q

In [18]:
# Category of sector (one hot encoding)
label = LabelEncoder()
label.fit(NIFTY50_name["category"].unique())

for target in NIFTY50_stock.keys():
    for lbl in NIFTY50_name["category"].unique():
        cate = NIFTY50_stock[target]['category']
        if lbl != cate:
            NIFTY50_stock[target]["stock_price"][f"label_{lbl}"] = 0
        if lbl == cate:
            NIFTY50_stock[target]["stock_price"][f"label_{lbl}"] = 1

In [19]:
# Total feature
features = {}
for target in NIFTY50_stock.keys():
    features[target] = NIFTY50_stock[target]["stock_price"].iloc[30:, 7:].reset_index(drop=True)

In [20]:
# Movement of stock
Y_buy_or_not = {}
for target in NIFTY50_stock.keys():
    Y_buy_or_not[target] = (features[target]['return_ratio'] >= 0) * 1

In [21]:
# Training & Testing
train_size = 0.2
test_size = 0.8
days = len(features["ONGC"])

In [22]:
train_day = int(days * train_size)

# Data of training set and testing set
train_data = {}
test_data = {}
train_Y_buy_or_not = {}
test_Y_buy_or_not = {}

In [23]:
for i in NIFTY50_stock.keys():
    train_data[i] = features[i].iloc[:train_day, :]
    train_Y_buy_or_not[i] = Y_buy_or_not[i][:train_day]
    test_data[i] = features[i].iloc[train_day:, :]
    test_Y_buy_or_not[i] = Y_buy_or_not[i][train_day:]

In [89]:
import pickle
import numpy as np
import pandas as pd

def str_to_float(value):
    if isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            return value
    return value

In [274]:
# Load the pickle file
with open('D:\\capstone_project\\capstone_project\\converted_pickle_file.pkl', 'rb') as f:
    dt = pickle.load(f)

In [277]:
dt['train']['y_up_or_down']

array([[1.0, 1.0, 1.0, ..., 1.0, 1.0, 0.0],
       [0.0, 1.0, 0.0, ..., 0.0, 0.0, 1.0],
       [1.0, 1.0, 1.0, ..., 0.0, 1.0, 1.0],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 1.0, 0.0, 0.0]], dtype=object)

In [226]:
# Check the type of data
print(f"Type of loaded data: {type(data)}")

Type of loaded data: <class 'dict'>


In [12]:
# If it's a list or numpy array
if isinstance(data, (list, np.ndarray)):
    # Convert string numbers to floats
    converted_data = [str_to_float(item) for item in data]
# If it's a dictionary
elif isinstance(data, dict):
    # Convert string numbers to floats in dictionary values
    converted_data = {k: str_to_float(v) for k, v in data.items()}
else:
    print("Unsupported data type. Please provide more information about your data structure.")
    converted_data = data

In [227]:
def convert_to_int(s):
    # Remove comma
    s = s.replace(',', '')
    
    # Round to nearest integer and convert to int
    return np.float64(s)

In [228]:
print(converted_data['train'].keys())

dict_keys(['x1', 'x2', 'x3', 'x4', 'y_return_ratio', 'y_up_or_down'])


In [249]:
df = data
df

{'train': {'x1': array([[[[3037.55, '2,995.85', '3,350.00', ..., 0, 0, 0],
           [2879.6, '2,839.27', '3,350.00', ..., 0, 0, 0],
           [2797.25, '2,791.48', '3,350.00', ..., 0, 0, 0],
           ...,
           [2860.55, '2,860.65', '3,350.00', ..., 0, 0, 0],
           [2835.95, '2,857.91', '3,350.00', ..., 0, 0, 0],
           [2874.65, '2,890.39', '3,350.00', ..., 0, 0, 0]],
  
          [[2362.55, '2,352.04', '2,769.65', ..., 0, 0, 0],
           [2325.65, '2,332.54', '2,769.65', ..., 0, 0, 0],
           [2341.05, '2,340.63', '2,769.65', ..., 0, 0, 0],
           ...,
           [2256.2, '2,239.72', '2,769.65', ..., 0, 0, 0],
           [2217.05, '2,214.90', '2,769.65', ..., 0, 0, 0],
           [2226.3, '2,232.05', '2,769.65', ..., 0, 0, 0]],
  
          [[2533.2, '2,534.73', '27,693.00', ..., 0, 0, 0],
           [2513.45, '2,514.86', '27,693.00', ..., 0, 0, 0],
           [2539.0, '2,522.54', '27,693.00', ..., 0, 0, 0],
           ...,
           [2457.4, '2,459.83',

In [231]:
list(df.keys())

['train', 'test']

In [232]:
list(df['train'].keys())

['x1', 'x2', 'x3', 'x4', 'y_return_ratio', 'y_up_or_down']

In [250]:
A = ['train', 'test']
B = ['x1', 'x2', 'x3', 'x4']

for a in A:
    for b in B:
        x,y,z,w = df[a][b].shape
        for i in range(x):
            for j in range(y):
                for k in range(z):
                    for l in range(w):
                        if(type(df[a][b][i][j][k][l])==str):
                            df[a][b][i][j][k][l] = convert_to_int(df[a][b][i][j][k][l])
                        else:
                            df[a][b][i][j][k][l] = np.float64(df[a][b][i][j][k][l])

In [256]:
A = ['train', 'test']
B = ['y_return_ratio', 'y_up_or_down']


for a in A:
    for b in B:
        x,y = df[a][b].shape
        for i in range(x):
            for j in range(y):
                        if(type(df[a][b][i][j])!=np.float64):
                              df[a][b][i][j] = np.float64(df[a][b][i][j])

In [262]:
type(df['test']['y_up_or_down'][0][0])

numpy.float64

In [264]:
lis = []

In [272]:
lis

[]

In [271]:
a = 'test'
b = 'y_return_ratio'
x,y = df[a][b].shape
for i in range(x):
        for j in range(y):
            if(type(df[a][b][i][j])!=np.float64):
                tem = []
                tem.append(i)
                tem.append(j)
                lis.append(tem)

In [217]:
x,y = len(lis), len(lis[0])

In [219]:
y

2

In [224]:
a = 'test'
b = 'y_up_or_down'

for q in lis:
    df[a][b][q[0]][q[1]] = np.float64(df[a][b][q[0]][q[1]])



In [214]:
len(lis[0])

2

In [95]:
for a in list(df.keys()):
    for b in list(df[a].keys()):
        x,y,z,w = df[a][b].shape
        for i in range(x):
            for j in range(y):
                for k in range(z):
                    for l in range(w):
                        if(type(df[a][b][i][j][k][l])==str):
                            df[a][b][i][j][k][l] = convert_to_int(df[a][b][i][j][k][l])
                        elif(type(df[a][b][i][j][k][l])==int):
                            df[a][b][i][j][k][l] = float(df[a][b][i][j][k][l])
        print(a, b)

train x1
train x2
train x3
train x4


ValueError: not enough values to unpack (expected 4, got 2)

In [56]:
for i in range(df.shape[0]):
    for j in range(df.shape[1]):
        for k in range(df.shape[2]):
            for l in range(df.shape[3]):
                if(type(df[i][j][k][l])==str):
                    df[i][j][k][l] = convert_to_int(df[i][j][k][l])
                elif(type(df[i][j][k][l])==int):
                    df[i][j][k][l] = float(df[i][j][k][l])

In [273]:
with open('converted_pickle_file.pkl', 'wb') as f:
    pickle.dump(df, f)

print("Conversion complete. New file saved as 'converted_pickle_file.pkl'")

Conversion complete. New file saved as 'converted_pickle_file.pkl'


In [None]:


# Save the converted data back to a pickle file
with open('converted_pickle_file.pkl', 'wb') as f:
    pickle.dump(converted_data, f)

print("Conversion complete. New file saved as 'converted_pickle_file.pkl'")

In [27]:
# Week represents the number of our inputs
def before_day(week):
    # Train
    train = {}
    for w in range(week):
        train_x = []
        for tr_ind in range(len(train_data["ONGC"]) - 7 - (week - 2) - 1):
            tr = []
            for target in NIFTY50_stock.keys():
                data = train_data[target]
                if tr_ind + w + 7 <= len(data):
                    tr.append(data.iloc[tr_ind + w:tr_ind + w + 7, :].values)
            if len(tr) == len(NIFTY50_stock):  # Ensure all targets are included
                train_x.append(tr)
        train[f"x{w + 1}"] = np.array(train_x, dtype=object)

    train_y1, train_y2 = [], []
    for tr_ind in range(len(train_data["ONGC"]) - 7 - (week - 2) - 1):
        tr_y1, tr_y2 = [], []
        for target in NIFTY50_stock.keys():
            data = train_data[target]
            if tr_ind + (week - 1) + 7 < len(data):
                tr_y1.append(data["return_ratio"].iloc[tr_ind + (week - 1) + 7])
                tr_y2.append(train_Y_buy_or_not[target].iloc[tr_ind + (week - 1) + 7])
        if len(tr_y1) == len(NIFTY50_stock) and len(tr_y2) == len(NIFTY50_stock):
            train_y1.append(tr_y1)
            train_y2.append(tr_y2)
    train['y_return_ratio'] = np.array(train_y1, dtype=object)
    train["y_up_or_down"] = np.array(train_y2, dtype=object)
    
    # Test
    test = {}
    for w in range(week):
        test_x = []
        for te_ind in range(len(test_data["ONGC"]) - 7 - (week - 2) - 1):
            te = []
            for target in NIFTY50_stock.keys():
                data = test_data[target]
                if te_ind + w + 7 <= len(data):
                    te.append(data.iloc[te_ind + w:te_ind + w + 7, :].values)
            if len(te) == len(NIFTY50_stock):  # Ensure all targets are included
                test_x.append(te)
        test[f'x{w + 1}'] = np.array(test_x, dtype=object)

    test_y1, test_y2 = [], []
    for te_ind in range(len(test_data["ONGC"]) - 7 - (week - 2) - 1):
        te_y1, te_y2 = [], []
        for target in NIFTY50_stock.keys():
            data = test_data[target]
            if te_ind + (week - 1) + 7 < len(data):
                te_y1.append(data["return_ratio"].iloc[te_ind + (week - 1) + 7])
                te_y2.append(test_Y_buy_or_not[target].iloc[te_ind + (week - 1) + 7])
        if len(te_y1) == len(NIFTY50_stock) and len(te_y2) == len(NIFTY50_stock):
            test_y1.append(te_y1)
            test_y2.append(te_y2)
    test['y_return_ratio'] = np.array(test_y1, dtype=object)
    test["y_up_or_down"] = np.array(test_y2, dtype=object)

    data = {"train": train, "test": test}

    return data

In [28]:
# Example usage
data1 = data

In [29]:
data1 = before_day(4)

In [30]:
data1

{'train': {'x1': array([[[[3037.55, '2,995.85', '3,350.00', ..., 0, 0, 0],
           [2879.6, '2,839.27', '3,350.00', ..., 0, 0, 0],
           [2797.25, '2,791.48', '3,350.00', ..., 0, 0, 0],
           ...,
           [2860.55, '2,860.65', '3,350.00', ..., 0, 0, 0],
           [2835.95, '2,857.91', '3,350.00', ..., 0, 0, 0],
           [2874.65, '2,890.39', '3,350.00', ..., 0, 0, 0]],
  
          [[2362.55, '2,352.04', '2,769.65', ..., 0, 0, 0],
           [2325.65, '2,332.54', '2,769.65', ..., 0, 0, 0],
           [2341.05, '2,340.63', '2,769.65', ..., 0, 0, 0],
           ...,
           [2256.2, '2,239.72', '2,769.65', ..., 0, 0, 0],
           [2217.05, '2,214.90', '2,769.65', ..., 0, 0, 0],
           [2226.3, '2,232.05', '2,769.65', ..., 0, 0, 0]],
  
          [[2533.2, '2,534.73', '27,693.00', ..., 0, 0, 0],
           [2513.45, '2,514.86', '27,693.00', ..., 0, 0, 0],
           [2539.0, '2,522.54', '27,693.00', ..., 0, 0, 0],
           ...,
           [2457.4, '2,459.83',

In [3]:
# Example usage
data = before_day(4)

# Save the processed data to a pickle file
with open('NIFTY50_data.pkl', 'wb') as f:
    pickle.dump(data, f)

print("code is succesfully collected..")




code is succesfully collected..
