In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim import downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf



In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
finance_data = pd.read_csv('/content/drive/MyDrive/finance_data.csv')

In [None]:
df = pd.DataFrame(finance_data)
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
missing_values = df.isnull().sum()
print(missing_values)

Sentence     0
Sentiment    0
dtype: int64


In [None]:
df['Sentence'][0]

"The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model ."

In [None]:
unique_characters = pd.Series(list(''.join(df['Sentence']))).unique()
print(unique_characters)

['T' 'h' 'e' ' ' 'G' 'o' 'S' 'l' 'u' 't' 'i' 'n' 's' 'c' 'g' 'y' 'w' 'v'
 'r' 'a' 'B' 'f' "'" 'P' 'b' 'p' 'd' 'L' ',' 'C' 'm' '.' '$' 'E' 'I' '1'
 '5' '0' '2' 'K' 'F' 'q' 'U' 'R' '3' '7' '6' 'z' '-' 'x' 'A' 'j' '4' 'k'
 'Y' 'D' 'M' 'H' 'O' 'N' 'X' '8' ':' '%' 'Q' '#' '?' '/' '9' 'V' '+' 'ñ'
 'J' '`' 'W' '@' '&' '(' ')' 'Z' 'ú' '!' '>' 'ó' 'Â' '£' '"' ';' 'à' '®'
 '¦' '=' 'ä' 'â' '€' '“' 'Ã' '¶' 'Ñ' '_' '📈' 'é' '…' '”' '^' 'á' '«' '|'
 '[' ']' '~' '{' '}' '¼' '¬' 'í' '<' 'Á' '’']


In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', '', text)
    text = re.sub(r'\b\d+\b', 'NUM', text)

    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

In [None]:
df['Sentence'] = df['Sentence'].apply(preprocess_text)

In [None]:
df['Sentence'][0]

'the geosolutions technology will leverage benefon s gps solution by providing location based search technology a community platform location relevant multimedia content and a new and powerful commercial model'

In [None]:
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [None]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,2
1,$esi on low down $NUM to $NUM bk a real possib...,0
2,for the last quarter of NUM componenta s net s...,2
3,according to the finnishrussian chamber of com...,1
4,the swedish buyout firm ha sold it remaining N...,1


In [None]:
X = df['Sentence']
y = to_categorical(df['Sentiment'], num_classes=3)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
max_len = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [None]:
glove_gensim = api.load('glove-wiki-gigaword-300')



In [None]:
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_gensim:
        embedding_matrix[i] = glove_gensim[word]

In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32,
                    validation_data=(X_test_pad, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.5812 - loss: 0.8982 - val_accuracy: 0.6801 - val_loss: 0.7249
Epoch 2/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7129 - loss: 0.6709 - val_accuracy: 0.7143 - val_loss: 0.6209
Epoch 3/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7671 - loss: 0.5441 - val_accuracy: 0.6681 - val_loss: 0.6716
Epoch 4/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7998 - loss: 0.4552 - val_accuracy: 0.7365 - val_loss: 0.5993
Epoch 5/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8229 - loss: 0.3951 - val_accuracy: 0.7322 - val_loss: 0.6275
Epoch 6/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8374 - loss: 0.3458 - val_accuracy: 0.7494 - val_loss: 0.5965
Epoch 7/10
[1m147/147

In [None]:
y_pred_prob = model.predict(X_test_pad)

y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [None]:
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_true, y_pred, target_names=le.classes_)
print(report)

Accuracy: 0.7493584260051326
              precision    recall  f1-score   support

    negative       0.47      0.36      0.41       175
     neutral       0.77      0.87      0.82       622
    positive       0.82      0.73      0.77       372

    accuracy                           0.75      1169
   macro avg       0.69      0.65      0.67      1169
weighted avg       0.74      0.75      0.74      1169



In [None]:
def categorical_focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):

    y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)

    ce_loss = -y_true * tf.math.log(y_pred)

    focal_loss = alpha * tf.math.pow(1 - y_pred, gamma) * ce_loss

    return tf.reduce_mean(tf.reduce_sum(focal_loss, axis=-1))

In [None]:
model.compile(loss=categorical_focal_loss, optimizer='adam', metrics=['accuracy'])

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32,
                    validation_data=(X_test_pad, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.8754 - loss: 0.0196 - val_accuracy: 0.7151 - val_loss: 0.0768
Epoch 2/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8283 - loss: 0.0344 - val_accuracy: 0.7365 - val_loss: 0.0848
Epoch 3/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8871 - loss: 0.0142 - val_accuracy: 0.7382 - val_loss: 0.1021
Epoch 4/10
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8898 - loss: 0.0148 - val_accuracy: 0.7348 - val_loss: 0.0996


In [None]:
y_pred_prob = model.predict(X_test_pad)

y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [None]:
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_true, y_pred, target_names=le.classes_)
print(report)

Accuracy: 0.7151411462788708
              precision    recall  f1-score   support

    negative       0.45      0.80      0.57       175
     neutral       0.83      0.69      0.75       622
    positive       0.78      0.72      0.75       372

    accuracy                           0.72      1169
   macro avg       0.69      0.74      0.69      1169
weighted avg       0.76      0.72      0.73      1169

