In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,Conv1D,GlobalMaxPooling1D,MaxPooling1D,Dense,AdditiveAttention,MultiHeadAttention,Bidirectional,LSTM,Flatten,Cropping3D,UpSampling3D,Attention,Dropout

In [2]:
# Load the labelled dataset; later cells read its 'Text' and 'Sentiment' columns.
# The path is relative, so the CSV must sit in the notebook's working directory.
data=pd.read_csv('stock_data.csv')

In [3]:
# Count missing values in the two columns used downstream (both are expected to be 0),
# then display the frame itself as the cell output.
print(data['Text'].isnull().sum())
print(data['Sentiment'].isnull().sum())
data

0
0


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


In [4]:
# True if any cell in any column of the frame is null.
data.isnull().values.any()

False

In [5]:
# Show one raw text (index label 1) to see what the cleaning step has to deal with.
data['Text'][1]

'user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  '

In [6]:
# One HTML-style tag: '<', then one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    '''Return `text` with every HTML-style tag deleted.

    Each match of TAG_RE is replaced by the empty string (not by a space), so the
    text on either side of a tag ends up directly adjacent: 'a<br>b' -> 'ab'.
    '''
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

In [7]:
# Fetch the NLTK stop-word corpus used by the text-cleaning function.
# nltk is already imported in the first cell; the repeated import is harmless.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
_STOPWORD_RE = None  # compiled on first use by _stopword_pattern(), then reused


def _stopword_pattern():
    '''Return the compiled English stop-word regex, building it only once.'''
    global _STOPWORD_RE
    if _STOPWORD_RE is None:
        # Reading the corpus and joining the alternatives per sentence is wasted work.
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        _STOPWORD_RE = re.compile(r'\b(' + alternatives + r')\b\s*')
    return _STOPWORD_RE


def preprocess_text(sen):
    '''Clean one raw text for tokenisation.

    Steps, in order: lowercase, strip HTML tags, turn every non-letter into a space,
    drop single-letter words, collapse whitespace, drop English stop words, and trim
    the ends. The result contains only lowercase a-z words of two or more characters
    that are not stop words, separated by single spaces.

    Parameters
    ----------
    sen : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    '''
    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuation and numbers: anything that is not a letter becomes a space
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal, e.g. the "s" left behind by "mark's" -> "mark s".
    # \b matches without consuming the neighbouring whitespace, so consecutive single
    # letters ("a b c") and letters at the very start or end of the text are removed
    # too; the previous \s+[a-zA-Z]\s+ pattern skipped every second one of a run.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Collapse the runs of spaces created by the substitutions above
    sentence = re.sub(r'\s+', ' ', sentence)

    # Remove stop words together with the whitespace that follows them
    sentence = _stopword_pattern().sub('', sentence)

    # Trim the leading/trailing space that the replacements can leave behind
    return sentence.strip()

In [9]:
# Keep the raw texts in `sentences` and store their cleaned counterparts,
# in the same order, in `X`.
sentences = list(data['Text'])
X = [preprocess_text(raw_text) for raw_text in sentences]

In [10]:
# Sanity check: cleaned text first, then the raw text it was derived from.
print(X[0])
print(data['Text'][0])

kickers watchlist xide tit soq pnk cpw bpz aj trade method method see prev posts
Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method 1 or method 2, see prev posts


In [11]:
# Recode the negative class from -1 to 0 so the labels are {0, 1}, as the sigmoid
# output / binary cross-entropy setup expects.
# replace() leaves values that are already 0 or 1 untouched, so re-running this cell
# is safe; the previous .map({-1:0,1:1}) turned every 0 into NaN on a second run.
data['Sentiment']=data['Sentiment'].replace({-1: 0})

In [12]:
# Labels as a NumPy array; features stay a plain Python list of cleaned strings,
# which is the form the tokenizer cell consumes.
y=np.array(data['Sentiment'])
x=X

In [13]:
# Hold out a test set (scikit-learn's default 25%).
# random_state pins the shuffle so the split, and every metric computed from it, is
# the same after Restart & Run All; stratify keeps the class ratio equal in both parts.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,stratify=y)
y_train.shape,y_test.shape

((4343,), (1448,))

In [14]:
# Build the vocabulary from the training texts only, so nothing from the test set
# leaks into the word index.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(x_train)

# Replace each text with its list of integer word ids.
# NOTE(review): no oov_token is configured, so words not seen during fitting are
# presumably dropped from the test sequences — confirm this is intended.
# NOTE: x_train/x_test are overwritten here, so this cell must not be re-run without
# re-running the split cell first (it would then be fed integer lists, not strings).
x_train = word_tokenizer.texts_to_sequences(x_train)
x_test = word_tokenizer.texts_to_sequences(x_test)

In [15]:
# Number of rows needed by the embedding layer: one per known word, plus one because
# Keras word ids start at 1 and id 0 is left for padding.
vocab_length = len(word_tokenizer.word_index) +1

vocab_length


7775

In [16]:
# Fixed sequence length fed to the network.
maxlen = 25

# Bring every sequence to exactly `maxlen` ids; padding='post' appends the fill value
# at the end of shorter sequences. Truncation of longer ones uses the Keras default.
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [17]:
# Features should be (n_samples, maxlen) and labels (n_samples,) with matching n_samples.
x_train.shape,y_train.shape

((4343, 25), (4343,))

In [18]:
from numpy import asarray
from numpy import zeros

# Parse the GloVe text file into {word: float32 vector}.
# Each non-empty line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_dictionary = dict()

# The context manager closes the file even if a malformed line raises mid-parse.
with open('a2_glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # A blank line has no word to index; skip it instead of raising IndexError.
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [19]:
# Row i holds the pretrained vector of the word whose tokenizer id is i.
# Words missing from embeddings_dictionary (and the unused row 0) stay all-zero.
embedding_matrix = zeros((vocab_length, 100))
for token, row in word_tokenizer.word_index.items():
    pretrained = embeddings_dictionary.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [20]:
# Expect (vocab_length, 100): one 100-dimensional row per vocabulary id.
embedding_matrix.shape


(7775, 100)

In [21]:
# Empty sequential container; the layers are added in the next cell.
model=Sequential()

In [22]:
# Start from a fresh model every time this cell runs. model.add() appends to whatever
# `model` currently holds, so re-running the cell on an already-built model would
# stack a second copy of every layer on top of the first.
model=Sequential()

# Embedding initialised from embedding_matrix: (batch, maxlen) ids -> (batch, maxlen, 100).
model.add(Embedding(vocab_length,100,weights=[embedding_matrix], input_length=maxlen))

# Two conv blocks; padding="same" keeps the sequence length through each convolution,
# and each default MaxPooling1D halves it.
model.add(Conv1D(128,5,padding ="same",activation="relu"))
model.add(MaxPooling1D())
model.add(Dropout(0.2))
model.add(Conv1D(256,5,padding="same",activation="relu"))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

# Bidirectional LSTM reduces the remaining sequence to a single vector per sample.
model.add(Bidirectional(LSTM(40)))

# Single sigmoid unit: the output is the predicted probability of class 1.
model.add(Dense(units=1,activation="sigmoid"))

In [23]:
# Print layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 100)           777500    
                                                                 
 conv1d (Conv1D)             (None, 25, 128)           64128     
                                                                 
 max_pooling1d (MaxPooling1  (None, 12, 128)           0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 12, 128)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 12, 256)           164096    
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 6, 256)            0         
 g1D)                                                   

In [24]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked
# as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [25]:
# Train for 10 epochs in mini-batches of 32.
# NOTE(review): the held-out test split is also passed as validation_data, so the
# per-epoch validation numbers are test-set numbers; no separate validation set exists.
model.fit(x_train,y_train,epochs=10,batch_size=32,validation_data=(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x25fc414c850>

In [26]:
# Predicted probabilities for the test set (one sigmoid output per sample); they are
# converted to hard labels in the next cell.
y_test_pred=model.predict(x_test)
# Returns [loss, accuracy] on the test set, matching the loss/metrics given to compile().
model.evaluate(x_test,y_test)





[1.171769142150879, 0.7734806537628174]

In [27]:
# Turn the predicted probabilities into hard 0/1 labels: anything at or above the
# threshold counts as class 1.
threshold = 0.5
binary_data = (y_test_pred >= threshold).astype(int)

# Show the hard labels next to the probabilities they came from.
print(binary_data)
print(y_test_pred)

# From here on y_test_pred holds the 0/1 labels rather than the probabilities.
y_test_pred = binary_data

[[0]
 [1]
 [1]
 ...
 [0]
 [1]
 [1]]
[[3.1331790e-04]
 [9.6300381e-01]
 [9.2777377e-01]
 ...
 [2.6509830e-01]
 [9.9988484e-01]
 [9.9957901e-01]]


In [28]:
# Per-class precision, recall and F1 of the thresholded predictions on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_test_pred))


              precision    recall  f1-score   support

           0       0.67      0.70      0.69       515
           1       0.83      0.81      0.82       933

    accuracy                           0.77      1448
   macro avg       0.75      0.76      0.76      1448
weighted avg       0.78      0.77      0.77      1448

