In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_hub as hub
import tensorflow_text as text

# PREPROCESSING THE DATA

#### DATASETS:
* itc_final_posts (excel) - money control posts divided into spam and legit
* ITC.NS (excel) - ITC stock market data collected from yahoo finance

In [2]:
df_itc = pd.read_csv(r"itc/ITC.NS.csv",
                     parse_dates = ["Date"],
                     index_col = ["Date"])
df_itc.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-31,203.766663,206.266663,202.066666,205.100006,162.141357,13790790.0
2013-02-01,205.166672,206.633331,203.03334,206.266663,163.063675,9906601.0
2013-02-04,206.266663,207.266663,204.166672,204.833328,161.930557,11610360.0
2013-02-05,203.333328,204.46666,201.333328,201.633331,159.400803,7980306.0
2013-02-06,202.233337,203.933334,200.733337,203.233337,160.665695,12331005.0


In [3]:
df_itc_posts = pd.read_excel("itc/itc_final_posts.xlsx")
df_itc_posts.head()

Unnamed: 0.1,Unnamed: 0,Messages,Time Stamp,Spam
0,0,NO ISSUE IN MANAGEMENT THEY ARE FOCUSED ON BUS...,2022-02-10 19:23:04,1.0
1,1,Adani Wilmar up 72% in just 3 sessions from li...,2022-02-10 16:12:41,1.0
2,2,"Sure, one day this company will be taken over ...",2022-02-10 17:53:08,1.0
3,3,Timeless financial quotes give investors a bet...,2022-02-10 19:43:23,1.0
4,4,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39,0.0


#### STEPS TO PERFORM:
* Removing unnecessary columns
* Removing spam posts
* Getting the correct dates
* Sliding a window of 7 days and preparing the data

#### Removing unnecessary columns

In [4]:
df_itc_posts.drop(["Unnamed: 0"], axis=1,inplace=True)
df_itc_posts.head()

Unnamed: 0,Messages,Time Stamp,Spam
0,NO ISSUE IN MANAGEMENT THEY ARE FOCUSED ON BUS...,2022-02-10 19:23:04,1.0
1,Adani Wilmar up 72% in just 3 sessions from li...,2022-02-10 16:12:41,1.0
2,"Sure, one day this company will be taken over ...",2022-02-10 17:53:08,1.0
3,Timeless financial quotes give investors a bet...,2022-02-10 19:43:23,1.0
4,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39,0.0


#### Removing spam posts

In [5]:
df_itc_posts = df_itc_posts[df_itc_posts.Spam==0.0]
df_itc_posts.drop(["Spam"],axis=1,inplace=True)
df_itc_posts.head()

Unnamed: 0,Messages,Time Stamp
4,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39
7,Strong Long Term Fundamental Strength with an ...,2022-02-10 15:59:02
8,after results today some kind up sides movemen...,2022-02-10 15:43:25
9,do not misguide people in this common forum ev...,2022-02-10 15:42:33
14,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39


#### Getting the correct dates

* ITC.NS has stock market data from 31-01-2013 to 11-02-2022 and itc_final_posts has data from 12-11-2021 to 10-02-2022.
* 12-11-2021 has only one post. Hence, the price on 14-11-2021 can be predicted using posts from 13-11-2021. But 14-11-2021 is   a sunday. Hence, the price from 15-11-2021 can be predicted

#### Sliding a window of 7 days and preparing the data

In [7]:
from stock_helper import prepare_data
x,y = prepare_data(df_itc)
x

Unnamed: 0_level_0,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0,MACDh_12_26_9,RSI_14,AROONOSC_14,Close + 1,BBL_20_2.0 + 1,...,AROONOSC_14 + 6,Close + 7,BBL_20_2.0 + 7,BBM_20_2.0 + 7,BBU_20_2.0 + 7,BBB_20_2.0 + 7,BBP_20_2.0 + 7,MACDh_12_26_9 + 7,RSI_14 + 7,AROONOSC_14 + 7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-04-01,191.946030,200.301666,208.657303,8.343053,0.769180,0.826599,59.372993,92.857140,206.300003,191.646591,...,85.714287,204.199997,190.924728,197.585007,204.245285,6.741683,0.996601,1.614461,64.485970,78.571426
2013-04-02,192.759949,200.831665,208.903381,8.038295,0.743752,0.702402,59.263271,78.571426,204.800003,191.946030,...,85.714287,204.800003,190.519272,197.824997,205.130737,7.386054,0.977365,1.593910,65.764305,85.714287
2013-04-03,193.529572,201.335007,209.140427,7.753670,0.739043,0.601454,59.979939,78.571426,204.766663,192.759949,...,85.714287,203.333328,190.415604,198.104996,205.794388,7.762946,0.839970,1.414442,60.072098,85.714287
2013-04-04,194.260071,201.566666,208.873260,7.249799,0.372261,0.157366,44.797306,71.428574,205.066666,193.529572,...,85.714287,203.733337,190.666824,198.558334,206.449844,7.948807,0.827884,1.260681,61.061985,85.714287
2013-04-05,195.124146,201.708328,208.292526,6.528425,-0.070179,-0.487384,35.015064,-35.714287,199.699997,194.260071,...,92.857140,201.600006,191.053146,198.883331,206.713516,7.874147,0.673474,0.966548,53.451057,85.714287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-02-07,210.069550,222.669998,235.270447,11.317599,0.798799,1.845161,60.010979,50.000000,234.300003,209.845398,...,-35.714287,214.600006,212.705643,219.389999,226.074356,6.093582,0.141701,-0.658273,40.944489,-35.714287
2022-02-08,209.922806,223.067505,236.212204,11.785402,0.811247,1.664053,61.201462,50.000000,230.199997,210.069550,...,-35.714287,217.600006,212.912766,219.470001,226.027237,5.975522,0.357410,-0.431872,46.709850,-35.714287
2022-02-09,209.990723,223.477493,236.964279,12.069926,0.747372,1.382966,59.212685,50.000000,231.250000,209.922806,...,28.571428,220.199997,213.046463,219.577499,226.108536,5.948732,0.547657,-0.090360,51.160023,-35.714287
2022-02-10,209.942627,223.952499,237.962372,12.511462,0.796131,1.255015,61.766991,50.000000,230.149994,209.990723,...,35.714287,227.750000,212.579086,220.009995,227.440918,6.755075,1.020797,0.621503,61.273457,28.571428


In [8]:
y

Date
2013-04-01    204.800003
2013-04-02    204.766663
2013-04-03    205.066666
2013-04-04    199.699997
2013-04-05    194.199997
                 ...    
2022-02-07    230.199997
2022-02-08    231.250000
2022-02-09    230.149994
2022-02-10    232.250000
2022-02-11    232.449997
Name: Close, Length: 2149, dtype: float32

In [9]:
final_x = x[np.datetime64("2021-11-13"):]
final_y = y[np.datetime64("2021-11-13"):]

In [10]:
df_itc_posts = df_itc_posts[::-1]
df_itc_posts

Unnamed: 0,Messages,Time Stamp
2045,ITC safe investment bet lowest PE in FMCG sect...,2021-11-12 21:31:26
2042,"Aravachan, The expected price Target for ITC t...",2021-11-13 18:02:15
2040,This is only buy for investment purpose for 25...,2021-11-13 18:03:55
2037,Tips for Monday....buy ITC at 232 sl 230 and t...,2021-11-13 18:48:36
2035,"ITC Infotech eyes Rs 3,000 crore revenue mark ...",2021-11-13 19:00:12
...,...,...
14,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39
9,do not misguide people in this common forum ev...,2022-02-10 15:42:33
8,after results today some kind up sides movemen...,2022-02-10 15:43:25
7,Strong Long Term Fundamental Strength with an ...,2022-02-10 15:59:02


In [11]:
final_posts = df_itc_posts[7:]
final_posts

Unnamed: 0,Messages,Time Stamp
2030,ITC safe investment bet lowest PE in FMCG sect...,2021-11-14 07:31:06
2028,ITC safe investment bet lowest PE in FMCG sect...,2021-11-14 09:20:37
2025,ITC Infotech is open for inorganic growth. Don...,2021-11-13 23:20:56
2023,The bonus may be announced by ITC in 2022. So ...,2021-11-14 10:35:23
2019,Technical and financial analysis shows a stron...,2021-11-14 20:39:57
...,...,...
14,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39
9,do not misguide people in this common forum ev...,2022-02-10 15:42:33
8,after results today some kind up sides movemen...,2022-02-10 15:43:25
7,Strong Long Term Fundamental Strength with an ...,2022-02-10 15:59:02


In [12]:
len(final_x),len(final_posts)

(63, 614)

# CALCULATING SENTIMENTS SCORE

In [13]:
sent_model = tf.keras.models.load_model("final_bert")
sent_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 preprocessor (KerasLayer)      {'input_word_ids':   0           ['inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

#### Removing duplicates

In [14]:
final_posts.drop_duplicates(subset=['Messages'])

Unnamed: 0,Messages,Time Stamp
2030,ITC safe investment bet lowest PE in FMCG sect...,2021-11-14 07:31:06
2025,ITC Infotech is open for inorganic growth. Don...,2021-11-13 23:20:56
2023,The bonus may be announced by ITC in 2022. So ...,2021-11-14 10:35:23
2019,Technical and financial analysis shows a stron...,2021-11-14 20:39:57
2018,Itc is biggest beneficiary for boom in sales t...,2021-11-14 20:41:18
...,...,...
38,after results today some kind up sides movemen...,2022-02-10 15:43:25
37,Strong Long Term Fundamental Strength with an ...,2022-02-10 15:59:02
36,M Ambani Or G Adani should buy controlling sta...,2022-02-10 17:16:29
34,"Positive Rally in Nifty, No Rally in ITC and N...",2022-02-10 19:28:39


In [15]:
sentiments = []
indices = []
prev = np.datetime64("2015-11-12 21:31:26")
for i in final_y.index:
    total=0
    cnt=0
    for j in final_posts.itertuples():
        k,msg,time = j
        indices.append(k)
        if np.datetime64(time)<np.datetime64(i) and np.datetime64(time)>prev:
            total += tf.squeeze(sent_model.predict([msg])).numpy()
            print(f'Score: {tf.squeeze(sent_model.predict([msg])).numpy()}')
            cnt+=1
    prev = np.datetime64(i)
    if(cnt==0):
        sentiments.append(0)
    else:
        sentiments.append(total/cnt)

Score: 0.9444242715835571
Score: 0.9444242715835571
Score: 0.9895212650299072
Score: 0.9904130101203918
Score: 0.9859876036643982
Score: 0.9878939986228943
Score: 0.9913119077682495
Score: 0.56297367811203
Score: 0.9855405688285828
Score: 0.9444242715835571
Score: 0.9900887608528137
Score: 0.9864593744277954
Score: 0.8831420540809631
Score: 0.9906072616577148
Score: 0.9881131052970886
Score: 0.9378788471221924
Score: 0.9728671312332153
Score: 0.9444242715835571
Score: 0.9893563389778137
Score: 0.7976952791213989
Score: 0.9805727005004883
Score: 0.8743585348129272
Score: 0.9097331762313843
Score: 0.12076428532600403
Score: 0.9446114897727966
Score: 0.9894975423812866
Score: 0.938491940498352
Score: 0.9903286099433899
Score: 0.9900509715080261
Score: 0.9887735843658447
Score: 0.9857649207115173
Score: 0.8578930497169495
Score: 0.9444242715835571
Score: 0.9726449847221375
Score: 0.9897757768630981
Score: 0.9899868369102478
Score: 0.9444242715835571
Score: 0.9891798496246338
Score: 0.09316

Score: 0.9847087860107422
Score: 0.06511615216732025
Score: 0.9884053468704224
Score: 0.8030979633331299
Score: 0.15169541537761688
Score: 0.9802668690681458
Score: 0.9874778985977173
Score: 0.9471572637557983
Score: 0.9907183051109314
Score: 0.08679325133562088
Score: 0.04434254765510559
Score: 0.9205825924873352
Score: 0.07107432931661606
Score: 0.9907665252685547
Score: 0.7942423224449158
Score: 0.18081220984458923
Score: 0.9892515540122986
Score: 0.9900057911872864
Score: 0.9842122197151184
Score: 0.9724715948104858
Score: 0.9386971592903137
Score: 0.035825181752443314
Score: 0.6403425335884094
Score: 0.08066266775131226
Score: 0.9788405895233154
Score: 0.021515879780054092
Score: 0.9305002093315125
Score: 0.9889113903045654
Score: 0.704558789730072
Score: 0.9879283308982849
Score: 0.9676231741905212
Score: 0.9781097769737244
Score: 0.9715830087661743
Score: 0.9212403297424316
Score: 0.9858099818229675
Score: 0.7468068599700928
Score: 0.7792054414749146
Score: 0.9387190341949463
Sc

# SOLVING THE 0 SENTIMENT SCORE PROBLEM

#### Steps to solve the problem:
* Getting the indices where sentiment score is 0
* Removing those values from the sentiments array and final_x
* Removing the added index column
* Using N-beats model to make predictions on the prepared final_x datset
* Make the final dataset to train final model

#### Getting the indices where sentiment score is 0

In [33]:
zero_index = []
for i,j in enumerate(sentiments):
    if(j==0):
        zero_index.append(i)
zero_index 

[4, 5, 6, 7, 8, 9, 10, 11, 12, 57]

#### Removing those values from the sentiments array and final_x

In [47]:
final_x_zeros = final_x.copy()
final_y_zeros = final_y.copy()
final_y_zeros = final_y_zeros.to_frame()
final_x_zeros['removal_assist'] = np.arange(0,len(final_x),1)
final_y_zeros['removal_assist'] = np.arange(0,len(final_x),1)

In [48]:
final_y_zeros = final_y_zeros[final_y_zeros.removal_assist.isin(zero_index)==False]
final_x_zeros = final_x_zeros[final_x_zeros.removal_assist.isin(zero_index)==False]

In [49]:
len(final_y_zeros),len(final_x_zeros)

(53, 53)

In [50]:
sentiments = np.delete(sentiments,zero_index)
sentiments

array([0.93790942, 0.94711164, 0.90710024, 0.8002191 , 0.98364121,
       0.34421938, 0.81026217, 0.93727617, 0.85286945, 0.81385339,
       0.91488078, 0.84711482, 0.78174605, 0.78755366, 0.79193413,
       0.88688747, 0.81942778, 0.86818131, 0.57996973, 0.94231537,
       0.81385345, 0.71057868, 0.79632066, 0.71666523, 0.83461632,
       0.75409228, 0.76133595, 0.64567702, 0.74668035, 0.75681433,
       0.90270813, 0.90020501, 0.66226926, 0.91934999, 0.89887066,
       0.89318242, 0.74312352, 0.75146003, 0.76075468, 0.77344375,
       0.86227876, 0.9537311 , 0.79507104, 0.93639518, 0.85952354,
       0.62649698, 0.78937954, 0.98839545, 0.87384896, 0.90128844,
       0.65563229, 0.61601235, 0.81838229])

#### Removing the added index column

In [53]:
final_x_zeros.drop(["removal_assist"], axis=1,inplace=True)
final_y_zeros.drop(["removal_assist"], axis=1, inplace=True)

#### Using N-beats model to make predictions on the prepared final_x datset

In [51]:
model_nbeats = tf.keras.models.load_model("nbeats_itc")
model_nbeats.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 71)]         0           []                               
                                                                                                  
 blocks_1 (blocks)              ((None, 71),         861768      ['input_1[0][0]']                
                                 (None, 1))                                                       
                                                                                                  
 subtract (Subtract)            (None, 71)           0           ['input_1[0][0]',                
                                                                  'blocks_1[0][0]']               
                                                                                              

In [59]:
preds = tf.squeeze(model_nbeats.predict(final_x_zeros)).numpy()

#### Make the final dataset to train final model

In [61]:
final_df = pd.DataFrame({"Predictions": preds,
                        "Sentiments":sentiments})

In [62]:
len(final_df),len(final_y_zeros)

(53, 53)

# BUILDING AND TRAINING THE FINAL MODEL

In [63]:
tf.random.set_seed(42)
inputs = tf.keras.Input(shape=(2))
x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(inputs)
x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(x)
x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.models.Model(inputs=inputs,outputs=outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2)]               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 24        
                                                                 
 dense_3 (Dense)             (None, 8)                 72        
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 177
Trainable params: 177
Non-trainable params: 0
_________________________________________________________________


In [64]:
xtrain,ytrain,xtest,ytest = final_df[:42],final_y_zeros[:42],final_df[42:],final_y_zeros[42:]

In [65]:
model.compile(loss = "mae", optimizer = tf.keras.optimizers.Adam(), metrics=["mae","mse"])
history = model.fit(xtrain,ytrain,
                    epochs = 1000,
                    validation_data=(xtest,ytest), 
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", 
                                                            patience=200, 
                                                            restore_best_weights=True),
                          tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", 
                                                               patience=100, 
                                                               verbose=1)])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000


Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000


Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000


Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 00168: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000


Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000


Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/1000
Epoch 231/1000
Epoch 232/1000
Epoch 233/1000
Epoch 234/1000
Epoch 235/1000
Epoch 236/1000
Epoch 237/1000
Epoch 238/1000
Epoch 239/1000
Epoch 240/1000
Epoch 241/1000
Epoch 242/1000
Epoch 243/1000
Epoch 244/1000
Epoch 245/1000
Epoch 246/1000
Epoch 247/1000
Epoch 248/1000
Epoch 249/1000
Epoch 250/1000
Epoch 251/1000
Epoch 252/1000
Epoch 253/1000
Epoch 254/1000
Epoch 255/1000
Epoch 256/1000
Epoch 257/1000
Epoch 258/1000
Epoch 259/1000
Epoch 260/1000
Epoch 261/1000
Epoch 262/1000
Epoch 263/1000
Epoch 264/1000
Epoch 265/1000
Epoch 266/1000
Epoch 267/1000
Epoch 268/1000
Epoch 00268: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.


In [66]:
model.evaluate(xtest,ytest)



[1.2398127317428589, 1.2398127317428589, 2.7874674797058105]

# Training a conv1d model on the same data

In [72]:
tf.random.set_seed(42)
inputs = tf.keras.Input(shape=(71,1), name="inputs")
x = tf.keras.layers.Conv1D(64, 3, activation="relu", padding="causal")(inputs)
x = tf.keras.layers.MaxPooling1D()(x)
x = tf.keras.layers.Conv1D(64, 3, activation="relu", padding="causal")(x)
x = tf.keras.layers.MaxPooling1D()(x)
x = tf.keras.layers.Conv1D(64, 3, activation="relu", padding="causal")(x)
x = tf.keras.layers.MaxPooling1D()(x)
x = tf.keras.layers.Conv1D(64, 3, activation="relu", padding="causal")(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.models.Model(inputs, outputs)
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 71, 1)]           0         
                                                                 
 conv1d_8 (Conv1D)           (None, 71, 64)            256       
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 35, 64)           0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 35, 64)            12352     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 17, 64)           0         
 1D)                                                             
                                                                 
 conv1d_10 (Conv1D)          (None, 17, 64)            1235

In [73]:
model.compile(loss = "mae", optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=["mae","mse"])
history = model.fit(final_x_zeros[:42],final_y_zeros[:42],
                    epochs = 1000,
                    validation_data=(final_x_zeros[42:], final_y_zeros[42:]), 
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", 
                                                            patience=200, 
                                                            restore_best_weights=True),
                          tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", 
                                                               patience=100, 
                                                               verbose=1)])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000


Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000


Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 00125: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000


Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000


Epoch 178/1000
Epoch 179/1000
Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000


Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 00225: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.


In [101]:
model.evaluate(final_x_zeros[42:],final_y_zeros[42:])
model.save("conv1d_itc")

INFO:tensorflow:Assets written to: conv1d_itc\assets


# BINGO!!! NOW LETS MAKE THE FINAL FUNCTION

In [100]:
def model_scores_comparator(add_time,add_posts,bert,stock,l):
    '''
    Args:
    add_time: address of the file that has time series data of the stock
    add_posts: address of the  file that has all the posts of the stock, labelled as spam or not spam
    bert: address of the bert sentiment analysis model
    stock: name of the stock
    l: list of all models that the target model's results have to be compared to

    Returns:
    A pandas dataframe containing the MAEs and MSEs of all the models
    '''
    import pandas as pd
    import tensorflow as tf
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import tensorflow_hub as hub
    import tensorflow_text as text
    df_stock = pd.read_csv(add_time,
                     parse_dates = ["Date"],
                     index_col = ["Date"])
    df_posts = pd.read_excel(add_posts)
    # removing the unnecessary columns
    df_posts.drop(["Unnamed: 0"], axis=1,inplace=True)
    # removing spam posts
    df_posts = df_posts[df_itc_posts.Spam==0.0]
    df_posts.drop(["Spam"],axis=1,inplace=True)
    # sliding a window of 7 days and adding all the TIs
    from stock_helper import prepare_data
    x,y = prepare_data(df_stock)
    # slicing the data
    final_x = x[np.datetime64("2021-11-13"):]
    final_y = y[np.datetime64("2021-11-13"):]
    # reversing the posts data
    df_posts = df_posts[::-1]
    final_posts = df_posts[7:]
    # loading the sentiment analysis model
    sent_model = tf.keras.models.load_model(bert)
    # removing duplicates from the data
    final_posts.drop_duplicates(subset=['Messages'])
    # calculating the sentiments score
    sentiments = []
    prev = np.datetime64("2015-11-12 21:31:26")
    for i in final_y.index:
        total=0
        cnt=0
        for j in final_posts.itertuples():
            _,msg,time = j
            if np.datetime64(time)<np.datetime64(i) and np.datetime64(time)>prev:
                total += tf.squeeze(sent_model.predict([msg])).numpy()
                cnt+=1
        prev = np.datetime64(i)
        if(cnt==0):
            sentiments.append(0)
        else:
            sentiments.append(total/cnt)
    # getting indices where sentiments score is 0
    zero_index = []
    for i,j in enumerate(sentiments):
        if(j==0):
            zero_index.append(i)
    # removing all the zero values indices
    sentiments = np.delete(sentiments,zero_index)
    final_x_zeros = final_x.copy()
    final_y_zeros = final_y.copy()
    final_y_zeros = final_y_zeros.to_frame()
    final_x_zeros['removal_assist'] = np.arange(0,len(final_x),1)
    final_y_zeros['removal_assist'] = np.arange(0,len(final_x),1)
    final_y_zeros = final_y_zeros[final_y_zeros.removal_assist.isin(zero_index)==False]
    final_x_zeros = final_x_zeros[final_x_zeros.removal_assist.isin(zero_index)==False]
    # removing the added helper column
    final_x_zeros.drop(["removal_assist"], axis=1,inplace=True)
    final_y_zeros.drop(["removal_assist"], axis=1, inplace=True)
    # loading the nbeats model for the given stock
    model_nbeats = tf.keras.models.load_model("nbeats_" + stock)
    # making predictions and building the final dataframe 
    preds = tf.squeeze(model_nbeats.predict(final_x_zeros)).numpy()
    final_df = pd.DataFrame({"Predictions": preds,
                        "Sentiments":sentiments})
    # building and training the target model
    tf.random.set_seed(42)
    inputs = tf.keras.Input(shape=(2))
    x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(inputs)
    x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(x)
    x = tf.keras.layers.Dense(8, activation="selu", kernel_initializer="lecun_normal")(x)
    outputs = tf.keras.layers.Dense(1)(x)
    model = tf.keras.models.Model(inputs=inputs,outputs=outputs)
    
    xtrain,ytrain,xtest,ytest = final_df[:42],final_y_zeros[:42],final_df[42:],final_y_zeros[42:]
    
    model.compile(loss = "mae", optimizer = tf.keras.optimizers.Adam(), metrics=["mae","mse"])
    history = model.fit(xtrain,ytrain,
                        epochs = 1000,
                        validation_data=(xtest,ytest), 
                        callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_loss", 
                                                                patience=200, 
                                                                restore_best_weights=True),
                              tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", 
                                                                   patience=100, 
                                                                   verbose=1)])
    target_result = model.evaluate(xtest,ytest)
    dict_res = {
        "Target_model":target_result[1]
    }
    df = pd.DataFrame(dict_res)
    frames = []
    frames.append(df)
    # getting the results of all the other models
    for i in l:
        name = i + "_" + stock
        model = tf.keras.models.load_model(name)
        result = model.evaluate(final_x_zeros[42:], final_y_zeros[42:])
        dict_res = {
         i:result[1]
        }
        df = pd.DataFrame(dict_res)
        frames.append(df)
    return pd.concat(frames, axis=1)

In [103]:
df = model_scores_comparator("itc/ITC.NS.csv", "itc/itc_final_posts.xlsx", 
                             "final_bert", "itc", ["conv1d"])

AttributeError: 'DataFrame' object has no attribute 'Spam'

In [112]:
z = res[0]
dict1 = {
    "f":z
}
g = pd.DataFrame.from_dict(dict1)
g

ValueError: If using all scalar values, you must pass an index