In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim import downloader as api
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf



In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
finance_data = pd.read_csv('/content/drive/MyDrive/Fin_Cleaned.csv')

In [4]:
df = pd.DataFrame(finance_data)
df.head()

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive


In [5]:
df = df.drop(columns=['Date_published', 'Synopsis', 'Headline'])

In [6]:
missing_values = df.isnull().sum()
print(missing_values)

Full_text       0
Final Status    0
dtype: int64


In [7]:
df['Full_text'][1]

'AgenciesThe bank presently has 20 million active users on its mobile app, with plans to reach 30 million customers in a year\'s time.After overhauling its IT infrastructure to set up digital banking departments internally, public sector lender Bank of Baroda is now targeting at least 65% of retail originations and 35% of MSME loans (value-wise) to be done digitally by the end of the current fiscal year. The bank is also targeting â‚¹50,000 crore of digital lending in the current fiscal year.\n\n"We believe that this year we will disburse loans of over â‚¹50,000 crore through our digital bank this year alone," said Akhil Handa, chief digital officer, Bank of Baroda. "This will be a combination of retail (home, auto, personal) loans and small ticket MSME loans (Mudra loans and small ticket business loans). We have a substantial advantage over peer banks that are getting started with their digital journey."\n\nHanda added that at least â‚¹35,000 crore-â‚¹40,000 crore will come from lendi

In [8]:
unique_characters = pd.Series(list(''.join(df['Full_text']))).unique()
print(unique_characters)

['R' 'e' 'u' 't' 'r' 's' 'P' 'a' 'y' 'm' 'n' ' ' 'c' 'o' 'p' 'i' 'd' 'b'
 'k' 'l' 'g' 'h' 'v' 'f' '-' ',' 'w' '.' '\n' 'T' 'N' 'C' 'I' '(' ')' 'â'
 '‚' '¹' '7' '0' '1' '5' 'x' 'M' 'D' '"' 'E' 'O' 'W' 'L' 'S' "'" 'U' 'A'
 'q' 'B' 'V' '2' 'G' '6' '%' '3' '8' '9' '4' 'H' 'j' 'K' 'F' 'z' 'J' '&'
 '/' 'Y' '€' 'œ' '\x9d' '$' ';' ':' '™' '”' '?' '¦' '•' 'Â' '«' 'Q' '˜'
 'X' '!' '\t' '@' '*' '“' 'Z' '~' '¢' '’' '[' ']' '+' '\x90' 'Ã' '£'
 '\xa0' '|' '®']


In [9]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', '', text)
    text = re.sub(r'\b\d+\b', 'NUM', text)

    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

In [12]:
df['Full_text'] = df['Full_text'].apply(preprocess_text)

In [13]:
df['Full_text'][1]

'agenciesthe bank presently ha NUM million active user on it mobile app with plan to reach NUM million customer in a year timeafter overhauling it it infrastructure to set up digital banking department internally public sector lender bank of baroda is now targeting at least NUM% of retail origination and NUM% of msme loan valuewise to be done digitally by the end of the current fiscal year the bank is also targeting NUM crore of digital lending in the current fiscal year we believe that this year we will disburse loan of over NUM crore through our digital bank this year alone said akhil handa chief digital officer bank of baroda this will be a combination of retail home auto personal loan and small ticket msme loan mudra loan and small ticket business loan we have a substantial advantage over peer bank that are getting started with their digital journey handa added that at least NUM crore40000 crore will come from lending to the retail sector while the balance will be contributed by th

In [14]:
df['Final Status'] = df['Final Status'].str.strip().str.capitalize()

In [15]:
le = LabelEncoder()
df['Final Status'] = le.fit_transform(df['Final Status'])

In [16]:
df.head()

Unnamed: 0,Full_text,Final Status
0,reuterspayments company and bank are at logger...,0
1,agenciesthe bank presently ha NUM million acti...,1
2,ptikarnataka chief minister basavaraj bommaika...,1
3,getty imagesthe budget for fy22 had imposed in...,0
4,agenciesthere is a view in the insurance indus...,1


In [17]:
X = df['Full_text']
y = df['Final Status']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [20]:
max_len = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [21]:
glove_gensim = api.load('glove-wiki-gigaword-300')



In [22]:
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_gensim:
        embedding_matrix[i] = glove_gensim[word]

In [24]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [25]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [27]:
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=16,
                    validation_data=(X_test_pad, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 236ms/step - accuracy: 0.5565 - loss: 0.6919 - val_accuracy: 0.4750 - val_loss: 0.6966
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 206ms/step - accuracy: 0.6363 - loss: 0.6686 - val_accuracy: 0.5000 - val_loss: 0.7353
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.6621 - loss: 0.6065 - val_accuracy: 0.5625 - val_loss: 0.7976
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 209ms/step - accuracy: 0.7062 - loss: 0.5659 - val_accuracy: 0.4500 - val_loss: 0.8530


In [28]:
y_pred_prob = model.predict(X_test_pad)

y_pred = (y_pred_prob > 0.5).astype(int)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 248ms/step


In [29]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)

Accuracy: 0.475
              precision    recall  f1-score   support

    Negative       0.47      0.47      0.47        40
    Positive       0.47      0.47      0.47        40

    accuracy                           0.47        80
   macro avg       0.47      0.47      0.47        80
weighted avg       0.47      0.47      0.47        80

