In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim import downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf

In [None]:
# Mount Google Drive inside the Colab VM; the dataset is read from this mount below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Location of the labelled posts on the mounted Drive.
STOCK_DATA_CSV = '/content/drive/MyDrive/stock_data.csv'
stock_data = pd.read_csv(STOCK_DATA_CSV)

In [None]:
# Work on an explicit copy: `df['Text']` is overwritten in place further down,
# and re-wrapping with `pd.DataFrame(stock_data)` does not copy the underlying
# data, so the raw frame would not be reliably preserved.
df = stock_data.copy()
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [None]:
# Count missing entries per column (isna is the canonical spelling of isnull).
missing_values = df.isna().sum()
print(missing_values)

Text         0
Sentiment    0
dtype: int64


In [None]:
# Look at one raw post (row label 1) before any cleaning is applied.
df.loc[1, 'Text']

'user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  '

In [None]:
# Inventory of every distinct character in the corpus, in order of first
# appearance, to see which symbols the cleaning step has to deal with.
all_text = ''.join(df['Text'])
unique_characters = pd.Series(list(all_text)).drop_duplicates().to_numpy()
print(unique_characters)

['K' 'i' 'c' 'k' 'e' 'r' 's' ' ' 'o' 'n' 'm' 'y' 'w' 'a' 't' 'h' 'l' 'X'
 'I' 'D' 'E' 'T' 'S' 'O' 'Q' 'P' 'N' 'C' 'W' 'B' 'Z' 'A' 'J' 'd' '1' '2'
 ',' 'p' 'v' 'u' ':' 'M' 'V' '.' '5' '%' 'f' 'F' '/' 'G' 'j' "'" 'b' '-'
 'g' '0' '3' '7' '4' 'Y' 'H' '&' 'x' '!' '6' '+' '(' ')' '9' 'q' '~' '8'
 '#' '=' '>' '?' '_' 'z' '[' ']' '*' ';' '^' '<' '|' 'U' 'â' '€' '¦' '™'
 'R' '@' 'L' '$' 'œ' '\x9d' '”' '\n' '"' '˜' 'Â' '\xa0' '\x81' '©' '£' '…'
 '’']


In [None]:
# Fetch the WordNet corpus for the WordNetLemmatizer instantiated below.
nltk.download('wordnet')
# Open Multilingual WordNet data — presumably needed alongside WordNet on
# recent NLTK releases; confirm against the installed version.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Single shared lemmatizer instance; preprocess_text() reads it as a global.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalise one raw post into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop HTML-like tags and URLs;
      3. collapse every standalone number, including decimals and
         thousands-separated values such as ``12.00`` or ``1,200``, into the
         placeholder ``NUM``;
      4. delete apostrophes so a contraction stays a single token;
      5. replace every remaining character that is not a letter, digit,
         whitespace, ``$``, ``%``, ``#`` or ``@`` with a space, so punctuation
         separates words instead of fusing them (``fea/geed`` becomes
         ``fea geed`` rather than ``feageed``);
      6. lemmatise each whitespace-separated token with the module-level
         ``lemmatizer``.

    Args:
        text: Raw post text (``str``).

    Returns:
        The cleaned text, tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Numbers go first, while '.' and ',' are still present, so that a decimal
    # maps to exactly one placeholder.
    text = re.sub(r'\b\d+(?:[.,]\d+)*\b', 'NUM', text)

    # Apostrophes disappear without splitting the word. The 'â€™' alternative
    # covers what looks like a mis-decoded typographic apostrophe in the data
    # (those characters show up in the character inventory) — confirm.
    text = re.sub(r"â€™|['’]", '', text)

    # Any other disallowed character becomes a word boundary, not a deletion.
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', ' ', text)

    # split() also squeezes the runs of spaces introduced above.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text.split()]

    return ' '.join(lemmatized_words)

In [None]:
# Clean every post element-wise; the raw text in `df['Text']` is overwritten.
df['Text'] = df['Text'].map(preprocess_text)

In [None]:
# Same post as inspected earlier (row label 1), now after cleaning.
df.loc[1, 'Text']

'user aap movie NUM% return for the feageed indicator just NUM trade for the year awesome'

In [None]:
# Fit the encoder on the sentiment column, then replace the column with the
# encoded integer labels.
le = LabelEncoder().fit(df['Sentiment'])
df['Sentiment'] = le.transform(df['Sentiment'])

In [None]:
# Preview the first five rows after cleaning and label encoding.
df.head(n=5)

Unnamed: 0,Text,Sentiment
0,kicker on my watchlist xide tit soq pnk cpw bp...,1
1,user aap movie NUM% return for the feageed ind...,1
2,user id be afraid to short amzn they are looki...,1
3,mnta over NUM,1
4,oi over NUM,1


In [None]:
# Features are the cleaned posts; the target is the encoded sentiment.
X, y = df['Text'], df['Sentiment']

In [None]:
# Hold out 20% of the posts for testing. The two classes are imbalanced, so
# stratify on the label to give both splits the same class ratio; the fixed
# seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Build the vocabulary from the training posts only, then turn both splits
# into sequences of integer word ids using that vocabulary.
# NOTE(review): Tokenizer() runs with library defaults; if its default filter
# set strips '$', '%', '#', '@', the symbols preprocess_text deliberately keeps
# are discarded here — confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Pad both splits (zeros appended at the end) to the longest training sequence.
max_len = max(map(len, X_train_seq))
pad_kwargs = dict(maxlen=max_len, padding='post')
X_train_pad = pad_sequences(X_train_seq, **pad_kwargs)
X_test_pad = pad_sequences(X_test_seq, **pad_kwargs)

In [None]:
# Pre-trained word vectors fetched through the gensim downloader.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-300'
glove_gensim = api.load(GLOVE_MODEL_NAME)



In [None]:
# One row per tokenizer id (plus row 0, which no word_index entry maps to and
# which doubles as the padding id). Words missing from the pre-trained vectors
# keep an all-zero row.
embedding_dim = 300
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    if token not in glove_gensim:
        continue
    embedding_matrix[row] = glove_gensim[token]

In [None]:
# Frozen pre-trained embeddings -> two stacked bidirectional LSTMs (each
# followed by dropout) -> small dense head with a single sigmoid output.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Baseline configuration: binary cross-entropy with Adam, tracking accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop once validation loss has not improved for 3 epochs and roll the model
# back to the best-scoring epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Early stopping (with restore_best_weights) picks the final weights using the
# validation data, so that data must not be the test split — otherwise the test
# metrics reported afterwards are optimistically biased. Hold out the last 10%
# of the training data for validation and leave the test split untouched.
# np.asarray hands Keras a plain array that validation_split can slice.
history = model.fit(X_train_pad, np.asarray(y_train), epochs=10, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.6656 - loss: 0.6208 - val_accuracy: 0.7179 - val_loss: 0.5500
Epoch 2/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7684 - loss: 0.4906 - val_accuracy: 0.7394 - val_loss: 0.5179
Epoch 3/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7988 - loss: 0.4377 - val_accuracy: 0.7532 - val_loss: 0.5144
Epoch 4/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8333 - loss: 0.3820 - val_accuracy: 0.7636 - val_loss: 0.5297
Epoch 5/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8794 - loss: 0.2907 - val_accuracy: 0.7627 - val_loss: 0.5541
Epoch 6/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9036 - loss: 0.2312 - val_accuracy: 0.7739 - val_loss: 0.6251


In [None]:
# Predicted probabilities for the test split, converted to hard 0/1 labels:
# anything strictly above 0.5 counts as class 1.
y_pred_prob = model.predict(X_test_pad)
y_pred = np.greater(y_pred_prob, 0.5).astype(int)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# Score the baseline model on the test split: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.7532355478861087
              precision    recall  f1-score   support

           0       0.71      0.56      0.63       427
           1       0.77      0.87      0.82       732

    accuracy                           0.75      1159
   macro avg       0.74      0.71      0.72      1159
weighted avg       0.75      0.75      0.75      1159



In [None]:
def balanced_binary_focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):
    """Alpha-balanced binary focal loss, averaged over all elements.

    With ``p`` the predicted probability clipped to ``[1e-7, 1 - 1e-7]`` and
    ``y`` the label, each element contributes

        -alpha       * (1 - p)**gamma * y       * log(p)
        -(1 - alpha) * p**gamma       * (1 - y) * log(1 - p)

    so confidently-correct predictions are down-weighted by the ``gamma``
    exponent, and ``alpha`` / ``1 - alpha`` weight the positive / negative
    class respectively.

    Args:
        y_true: Ground-truth labels (0 or 1). Any numeric dtype; must hold the
            same number of elements as ``y_pred`` (e.g. shape ``(batch,)``
            against predictions of shape ``(batch, 1)`` is accepted).
        y_pred: Predicted probabilities in ``[0, 1]``.
        gamma: Focusing exponent.
        alpha: Weight of the positive-class term; the negative-class term gets
            ``1 - alpha``.

    Returns:
        Scalar tensor: the mean focal loss.
    """
    y_pred = tf.convert_to_tensor(y_pred)

    # Align the labels with the predictions. Integer labels would otherwise
    # fail the float multiplications below, and (batch,) labels would silently
    # broadcast against (batch, 1) predictions into a (batch, batch) matrix.
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

    # Keep log() finite at the extremes.
    y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)

    positive_term = -alpha * tf.math.pow(1.0 - y_pred, gamma) * y_true * tf.math.log(y_pred)
    negative_term = -(1.0 - alpha) * tf.math.pow(y_pred, gamma) * (1.0 - y_true) * tf.math.log(1.0 - y_pred)

    return tf.reduce_mean(positive_term + negative_term)

In [None]:
# compile() does not reset weights: re-compiling the existing `model` would
# start the focal-loss run from the network already trained with binary
# cross-entropy above, so the two losses would not be compared on equal terms.
# Build a fresh network with the same architecture before compiling.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss=balanced_binary_focal_loss, optimizer='adam', metrics=['accuracy'])

In [None]:
# Fresh callback for the focal-loss run: stop after 3 epochs without a
# validation-loss improvement and restore the best epoch's weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Early stopping (with restore_best_weights) picks the final weights using the
# validation data, so that data must not be the test split — otherwise the test
# metrics reported afterwards are optimistically biased. Hold out the last 10%
# of the training data for validation and leave the test split untouched.
# np.asarray hands Keras a plain array that validation_split can slice.
history = model.fit(X_train_pad, np.asarray(y_train), epochs=10, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.7666 - loss: 0.0468 - val_accuracy: 0.7256 - val_loss: 0.0590
Epoch 2/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8501 - loss: 0.0337 - val_accuracy: 0.7386 - val_loss: 0.0658
Epoch 3/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8612 - loss: 0.0287 - val_accuracy: 0.7291 - val_loss: 0.0703
Epoch 4/10
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9080 - loss: 0.0212 - val_accuracy: 0.7213 - val_loss: 0.0824


In [None]:
# Predicted probabilities for the test split, converted to hard 0/1 labels:
# anything strictly above 0.5 counts as class 1.
y_pred_prob = model.predict(X_test_pad)
y_pred = np.greater(y_pred_prob, 0.5).astype(int)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# Score the focal-loss model on the test split, labelling the two encoded
# classes by name in the per-class report.
target_names = ['negative', 'positive']

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.725625539257981
              precision    recall  f1-score   support

    negative       0.61      0.72      0.66       427
    positive       0.82      0.73      0.77       732

    accuracy                           0.73      1159
   macro avg       0.71      0.73      0.72      1159
weighted avg       0.74      0.73      0.73      1159

