In [1]:
from google.colab import drive
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
finance_data = '/content/drive/MyDrive/Fin_Cleaned.csv'

In [4]:
df = pd.read_csv(finance_data)
df.head()

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive


In [5]:
df = df.drop(columns=['Date_published', 'Synopsis', 'Headline'])

In [6]:
missing_values = df.isnull().sum()
print(missing_values)

Full_text       0
Final Status    0
dtype: int64


In [8]:
class_counts = df['Final Status'].value_counts()
print(class_counts)

Final Status
Positive     215
Negative     184
Positive       1
Name: count, dtype: int64


In [9]:
df['Final Status'] = df['Final Status'].str.strip().str.capitalize()

In [10]:
class_counts = df['Final Status'].value_counts()
print(class_counts)

Final Status
Positive    216
Negative    184
Name: count, dtype: int64


In [12]:
df['Full_text'][1]

'AgenciesThe bank presently has 20 million active users on its mobile app, with plans to reach 30 million customers in a year\'s time.After overhauling its IT infrastructure to set up digital banking departments internally, public sector lender Bank of Baroda is now targeting at least 65% of retail originations and 35% of MSME loans (value-wise) to be done digitally by the end of the current fiscal year. The bank is also targeting â‚¹50,000 crore of digital lending in the current fiscal year.\n\n"We believe that this year we will disburse loans of over â‚¹50,000 crore through our digital bank this year alone," said Akhil Handa, chief digital officer, Bank of Baroda. "This will be a combination of retail (home, auto, personal) loans and small ticket MSME loans (Mudra loans and small ticket business loans). We have a substantial advantage over peer banks that are getting started with their digital journey."\n\nHanda added that at least â‚¹35,000 crore-â‚¹40,000 crore will come from lendi

In [14]:
unique_characters = pd.Series(list(''.join(df['Full_text']))).unique()
print(unique_characters)

['R' 'e' 'u' 't' 'r' 's' 'P' 'a' 'y' 'm' 'n' ' ' 'c' 'o' 'p' 'i' 'd' 'b'
 'k' 'l' 'g' 'h' 'v' 'f' '-' ',' 'w' '.' '\n' 'T' 'N' 'C' 'I' '(' ')' 'â'
 '‚' '¹' '7' '0' '1' '5' 'x' 'M' 'D' '"' 'E' 'O' 'W' 'L' 'S' "'" 'U' 'A'
 'q' 'B' 'V' '2' 'G' '6' '%' '3' '8' '9' '4' 'H' 'j' 'K' 'F' 'z' 'J' '&'
 '/' 'Y' '€' 'œ' '\x9d' '$' ';' ':' '™' '”' '?' '¦' '•' 'Â' '«' 'Q' '˜'
 'X' '!' '\t' '@' '*' '“' 'Z' '~' '¢' '’' '[' ']' '+' '\x90' 'Ã' '£'
 '\xa0' '|' '®']


In [15]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s\$\%\#@]', '', text)
    text = re.sub(r'\b\d+\b', 'NUM', text)

    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

In [18]:
df['Full_text'] = df['Full_text'].apply(preprocess_text)

In [20]:
df['Full_text'][1]

'agenciesthe bank presently ha NUM million active user on it mobile app with plan to reach NUM million customer in a year timeafter overhauling it it infrastructure to set up digital banking department internally public sector lender bank of baroda is now targeting at least NUM% of retail origination and NUM% of msme loan valuewise to be done digitally by the end of the current fiscal year the bank is also targeting NUM crore of digital lending in the current fiscal year we believe that this year we will disburse loan of over NUM crore through our digital bank this year alone said akhil handa chief digital officer bank of baroda this will be a combination of retail home auto personal loan and small ticket msme loan mudra loan and small ticket business loan we have a substantial advantage over peer bank that are getting started with their digital journey handa added that at least NUM crore40000 crore will come from lending to the retail sector while the balance will be contributed by th

In [21]:
le = LabelEncoder()
df['Final Status'] = le.fit_transform(df['Final Status'])

In [22]:
df.head()

Unnamed: 0,Full_text,Final Status
0,reuterspayments company and bank are at logger...,0
1,agenciesthe bank presently ha NUM million acti...,1
2,ptikarnataka chief minister basavaraj bommaika...,1
3,getty imagesthe budget for fy22 had imposed in...,0
4,agenciesthere is a view in the insurance indus...,1


In [23]:
X = df['Full_text']
y = df['Final Status']

In [24]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf = tfidf.fit_transform(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [28]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

target_names = ["negative", "positive"]
report = classification_report(y_test, y_pred, target_names = target_names)

print(f"Accuracy: {accuracy}")

print(report)

Accuracy: 0.65
              precision    recall  f1-score   support

    negative       0.70      0.53      0.60        40
    positive       0.62      0.78      0.69        40

    accuracy                           0.65        80
   macro avg       0.66      0.65      0.64        80
weighted avg       0.66      0.65      0.64        80



In [29]:
xgb_model = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f"Accuracy: {accuracy}")

print(report)


Accuracy: 0.65
              precision    recall  f1-score   support

    Negative       0.70      0.53      0.60        40
    Positive       0.62      0.78      0.69        40

    accuracy                           0.65        80
   macro avg       0.66      0.65      0.64        80
weighted avg       0.66      0.65      0.64        80

