In [None]:
import os
import pandas as pd

def read_bank_speeches(folder_path):
    """Load every ``.txt`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A pandas DataFrame with one row per text file and two columns:
        ``File`` (the file name) and ``Content`` (the file's full text,
        decoded as UTF-8). The frame is empty when no ``.txt`` file exists.
    """
    records = []
    txt_names = (name for name in os.listdir(folder_path) if name.endswith(".txt"))
    for name in txt_names:
        with open(os.path.join(folder_path, name), "r", encoding="utf-8") as handle:
            records.append({"File": name, "Content": handle.read()})
    return pd.DataFrame(records)

# Location of the speech transcripts (a mounted Google Drive folder in Colab).
folder_path = r"/content/drive/MyDrive/reserve_bank_of_india"
# One row per .txt file, with the columns 'File' and 'Content'.
df = read_bank_speeches(folder_path)

if not df.empty:
    # At least one transcript was loaded: preview it.
    print(df.head())  # Print the first few rows of the DataFrame
else:
    # read_bank_speeches returns an empty frame when the folder holds no .txt files.
    print("No text files found in the specified folder.")


               File                                            Content
0  2007-05-22_c.txt  Y V Reddy: The Indian economy – review and pro...
1  2007-10-26_e.txt  V Leeladhar: Customer centricity and the Reser...
2  2002-07-25_d.txt  Bimal Jalan: Seminar on international financia...
3  2005-06-29_i.txt  V Leeladhar: Challenges in banking security\nI...
4  2007-04-04_d.txt  Y V Reddy: Role of monetary policy in attainin...


In [None]:
!pip install -U sec-edgar-downloader

Collecting sec-edgar-downloader
  Downloading sec_edgar_downloader-5.0.2-py3-none-any.whl (14 kB)
Collecting pyrate-limiter>=3.1.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.6.1-py3-none-any.whl (26 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.6.1 sec-edgar-downloader-5.0.2


In [None]:
from sec_edgar_downloader import Downloader

import os

def move_txt_files(source_dir):
  """Move every ``.txt`` entry directly inside ``source_dir`` into ``source_dir/text_files``.

  The ``text_files`` sub-folder is created when it does not exist yet. A
  confirmation message is printed once the move is finished.

  Args:
      source_dir: Directory whose top-level ``.txt`` entries are relocated.
  """
  destination = os.path.join(source_dir, "text_files")
  if not os.path.exists(destination):
    os.makedirs(destination)

  txt_entries = [entry for entry in os.listdir(source_dir) if entry.endswith(".txt")]
  for entry in txt_entries:
    os.rename(os.path.join(source_dir, entry), os.path.join(destination, entry))
  print("Moved all .txt files to a new folder named 'text_files'")



def download_10k_filings(ticker, output_dir, company_name="Company Name", email_address="your_email@example.com"):
  """
  Downloads the 10-K filings for a company ticker into ``output_dir``.

  After the download, every ``.txt`` file located directly inside
  ``output_dir`` is moved into ``output_dir/text_files``.

  No date bounds are passed to the downloader, so whatever default range
  sec-edgar-downloader applies is used.

  Args:
      ticker: The company ticker symbol.
      output_dir: The directory to save the downloaded filings. It is created
          when missing.
      company_name (optional): Forwarded to ``Downloader`` (default: "Company Name").
      email_address (optional): Forwarded to ``Downloader`` (default: "your_email@example.com").
  """
  # Honour the caller's output directory instead of a hard-coded Colab path.
  os.makedirs(output_dir, exist_ok=True)
  downloader = Downloader(company_name=company_name, email_address=email_address, download_folder=output_dir)
  downloader.get("10-K", ticker, download_details=False)

  # These two names must be bound locally; they are not defined anywhere else
  # at module level, so referencing them unbound raises NameError.
  source_dir = output_dir
  target_dir = os.path.join(source_dir, "text_files")
  os.makedirs(target_dir, exist_ok=True)

  # NOTE(review): sec-edgar-downloader appears to store filings in nested
  # sub-folders of the download folder; if so, this top-level scan finds
  # nothing and a recursive walk is needed — confirm the intended layout.
  for filename in os.listdir(source_dir):
    if filename.endswith(".txt"):
      filepath = os.path.join(source_dir, filename)
      new_filepath = os.path.join(target_dir, filename)
      os.rename(filepath, new_filepath)
  print(f"Moved all .txt files to {target_dir}")


# Example usage (replace with your chosen tickers)
companies = ["AAPL", "GOOG", "MSFT"]

# Fetch the 10-K filings of each ticker, passing "filings" as the output_dir argument.
for company in companies:
  download_10k_filings(company, "filings")

# Reached only when every download call returned without raising.
print("Download complete!")


Download complete!


In [None]:
from bs4 import BeautifulSoup

def extract_text_soup(html_content):
  """Return the text of an HTML document as one space-separated string.

  Text that sits inside ``<script>`` or ``<style>`` elements is skipped, and
  whitespace-only fragments are dropped so the result has no doubled spaces.

  Args:
      html_content: HTML markup to parse.

  Returns:
      The remaining text fragments, each stripped, joined by single spaces.
  """
  soup = BeautifulSoup(html_content, "html.parser")
  skipped_parents = {"script", "style"}
  pieces = []
  for node in soup.find_all(string=True):
    # A text node is not a tag itself; the enclosing tag (its parent) is what
    # tells us whether the text is script/style content.
    parent = node.parent
    if parent is not None and parent.name in skipped_parents:
      continue
    stripped = node.strip()
    if stripped:
      pieces.append(stripped)
  return " ".join(pieces)

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by this cell: 'punkt' for word_tokenize,
# 'stopwords' for the stop-word list and 'wordnet' for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize NLTK components once; preprocess_text reads both module-level objects.
stop_words = set(stopwords.words('english'))  # set for fast membership tests
lemmatizer = WordNetLemmatizer()

# Function for text preprocessing
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is tokenized with ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lowercase form is in ``stop_words``, are discarded.
    Every remaining token is lowercased and lemmatized.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept)

# `df` is expected to be the speeches DataFrame (columns 'File' and 'Content').

# One record per speech: the file name together with its preprocessed text.
preprocessed_content_list = [
    {'File': speech['File'], 'Preprocessed Content': preprocess_text(speech['Content'])}
    for speech in df.to_dict('records')
]

# Assemble the records into a DataFrame and keep an independent copy in T_df.
preprocessed_df = pd.DataFrame(preprocessed_content_list)
T_df = preprocessed_df.copy()
# Preview the result
print(preprocessed_df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TypeError: argument of type 'NoneType' is not iterable

TF-IDF VECTORIZATION AND FREQUENCY INTERPRETATION


In [None]:
T_df

Unnamed: 0,File,Preprocessed Content
0,2007-05-22_c.txt,v reddy indian economy review prospect speech ...
1,2007-10-26_e.txt,v leeladhar customer centricity reserve bank o...
2,2002-07-25_d.txt,bimal jalan seminar international financial ar...
3,2005-06-29_i.txt,v leeladhar challenge banking security inaugur...
4,2007-04-04_d.txt,v reddy role monetary policy attaining growth ...
...,...,...
816,2011-08-18_d.txt,deepak mohanty changing inflation dynamic indi...
817,2010-05-25_e.txt,k c chakrabarty bank credit micro small medium...
818,2013-08-05_e.txt,duvvuri subbarao responsible innovation regula...
819,2012-09-24_b.txt,b mahapatra underlying concept principle dynam...


In [None]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the preprocessed speeches (one document per row of T_df).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')  # Remove English stop words
tfidf_matrix = tfidf_vectorizer.fit_transform(T_df['Preprocessed Content'])

# Extract feature names (words/tokens) and their corresponding TF-IDF scores
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_weights = tfidf_matrix.toarray()

# Column layout: file name, then the 10 keywords, then their 10 weights.
top_n = 10
keyword_columns = [f'Top_{rank}_Keyword' for rank in range(1, top_n + 1)]
weight_columns = [f'Top_{rank}_Weight' for rank in range(1, top_n + 1)]

# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame one `.loc` assignment at a time.
top_keyword_rows = []

# File names are taken from T_df, the same frame whose rows produced
# tfidf_matrix, so row i of the matrix always belongs to the i-th name.
for i, file in enumerate(T_df['File']):
    # Get TF-IDF weights for the current file
    file_tfidf_weights = tfidf_weights[i]

    # Indices of the top 10 TF-IDF weights, highest first
    top_indices = file_tfidf_weights.argsort()[-top_n:][::-1]

    # Extract top 10 keywords and their corresponding weights
    top_keywords = [feature_names[idx] for idx in top_indices]
    top_weights = [file_tfidf_weights[idx] for idx in top_indices]

    top_keyword_rows.append([file] + top_keywords + top_weights)

top_keywords_df = pd.DataFrame(top_keyword_rows, columns=['File'] + keyword_columns + weight_columns)

top_keywords_df

Unnamed: 0,File,Top_1_Keyword,Top_2_Keyword,Top_3_Keyword,Top_4_Keyword,Top_5_Keyword,Top_6_Keyword,Top_7_Keyword,Top_8_Keyword,Top_9_Keyword,...,Top_1_Weight,Top_2_Weight,Top_3_Weight,Top_4_Weight,Top_5_Weight,Top_6_Weight,Top_7_Weight,Top_8_Weight,Top_9_Weight,Top_10_Weight
0,2007-05-22_c.txt,cent,growth,bangladesh,dhaka,saarcfinance,inflation,south,india,acu,...,0.234775,0.229320,0.210753,0.195487,0.186546,0.156227,0.154558,0.153474,0.150322,0.142033
1,2007-10-26_e.txt,customer,bank,service,banking,code,rbi,centricity,financial,ombudsman,...,0.534137,0.393804,0.266007,0.259796,0.157214,0.149065,0.136656,0.127268,0.101394,0.092843
2,2002-07-25_d.txt,architecture,exchange,capital,international,bretton,wood,crisis,rate,flow,...,0.247963,0.245410,0.216927,0.185625,0.184417,0.170021,0.167523,0.167473,0.162508,0.150866
3,2005-06-29_i.txt,security,password,authorised,software,message,sender,verify,information,computer,...,0.383203,0.215466,0.197312,0.184704,0.162914,0.150117,0.143304,0.133733,0.131541,0.125420
4,2007-04-04_d.txt,monetary,market,bank,policy,inflation,cent,rate,reserve,financial,...,0.285638,0.281475,0.225391,0.221913,0.170659,0.161018,0.155279,0.154999,0.149503,0.146359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,2011-08-18_d.txt,inflation,price,commodity,protein,cent,wpi,persistence,food,global,...,0.653262,0.233828,0.198736,0.193632,0.152644,0.145540,0.142392,0.141097,0.131411,0.127030
817,2010-05-25_e.txt,msmes,enterprise,mse,bank,credit,micro,msme,small,ised,...,0.335885,0.274910,0.246643,0.224231,0.213613,0.183239,0.157622,0.151528,0.150774,0.148087
818,2013-08-05_e.txt,financial,innovation,bank,responsible,regulation,idrbt,market,sector,reserve,...,0.361036,0.309368,0.276409,0.273956,0.268027,0.179950,0.162192,0.120997,0.116808,0.110312
819,2012-09-24_b.txt,provision,loss,provisioning,alpha,dynamic,loan,spanish,spain,specific,...,0.455766,0.403543,0.342232,0.276986,0.268711,0.192215,0.137963,0.137623,0.130678,0.126996


In [None]:
# Imported and defined here so this cell runs on a fresh kernel when the
# notebook is executed top to bottom.
import re
import plotly.graph_objects as go

# Work on a copy so the helper columns added below never leak into T_df.
sample_df = T_df.copy()

# Extract year from the 'File' column (the first run of four digits in the name)
sample_df['Date'] = sample_df['File'].str.extract(r'(\d{4})', expand=False)

# Convert 'Date' column to datetime; only the year is available, so parse it explicitly
sample_df['Date'] = pd.to_datetime(sample_df['Date'], format='%Y')

# Define input text and keywords related to the input
input_text = 'Monetary funds from central bank'
input_text_lower = input_text.lower()  # Convert input text to lowercase
input_txt = input_text_lower.replace(' ', '_')  # Replace spaces with underscores
input_keywords = input_text_lower.split()   # Split the input text into individual keywords

# Filter articles containing keywords related to the input
def contains_keywords(text, keywords):
    """Return True when at least one keyword occurs anywhere in ``text``.

    Matching is case-insensitive and substring based. Keywords are escaped so
    they are always treated as literal text, never as regex syntax.

    NOTE(review): a single matching keyword is enough, so a common word such
    as 'bank' can flag most speeches — confirm whether all keywords, or
    whole-word matches, were intended.
    """
    return any(re.search(re.escape(keyword), text, re.IGNORECASE) for keyword in keywords)

sample_df[input_txt] = sample_df['Preprocessed Content'].apply(lambda x: contains_keywords(x, input_keywords))

# Group by year and count the number of articles discussing the input for each year
matching_speeches = sample_df[sample_df[input_txt]]
Yearly_counts = matching_speeches.groupby(matching_speeches['Date'].dt.year).size()

# Create a Plotly figure for the bar graph
fig = go.Figure()

# Add bar trace
fig.add_trace(go.Bar(x=Yearly_counts.index, y=Yearly_counts.values, name='Number of Speeches'))

# Add line trace
fig.add_trace(go.Scatter(x=Yearly_counts.index, y=Yearly_counts.values,
                         mode='lines+markers', name='Line Plot',
                         line=dict(color='red')))

# Update layout
fig.update_layout(title=f'Number of Speeches Discussing {input_text}',
                  xaxis_title='Year', yaxis_title='Number of Speeches')

# Show the plot
fig.show()

In [None]:
# Define input text and keywords related to the input
# Other inputs that have been tried:
# ad hoc Treasury bills
# rakesh mohan
# Monetary funds from central bank
input_text = 'rakesh mohan'
input_text_lower = input_text.lower()  # Convert input text to lowercase
input_txt = input_text_lower.replace(' ', '_')  # Replace spaces with underscores
input_keywords = input_text_lower.split()   # Split the input text into individual keywords

# Flag speeches containing any of the keywords. contains_keywords is already
# defined in the earlier 'Monetary funds from central bank' cell, so it is
# reused here instead of being declared a second time. That cell also creates
# the sample_df['Date'] column used below.
sample_df[input_txt] = sample_df['Preprocessed Content'].apply(lambda x: contains_keywords(x, input_keywords))

# Group by year and count the number of articles discussing the input for each year
Yearly_counts = sample_df[sample_df[input_txt]].groupby(sample_df['Date'].dt.year).size()

# Create a Plotly figure for the bar graph
fig = go.Figure()

# Add bar trace
fig.add_trace(go.Bar(x=Yearly_counts.index, y=Yearly_counts.values, name='Number of Speeches'))

# Add line trace
fig.add_trace(go.Scatter(x=Yearly_counts.index, y=Yearly_counts.values,
                         mode='lines+markers', name='Line Plot',
                         line=dict(color='red')))

# Update layout
fig.update_layout(title=f'Number of Speeches Discussing {input_text}',
                  xaxis_title='Year', yaxis_title='Number of Speeches')

# Show the plot
fig.show()

In [None]:
# Define input text and keywords related to the input
# Other inputs that have been tried:
# ad hoc Treasury bills
# rakesh mohan
# Monetary funds from central bank
input_text = 'Market Crash'
input_text_lower = input_text.lower()  # Convert input text to lowercase
input_txt = input_text_lower.replace(' ', '_')  # Replace spaces with underscores
input_keywords = input_text_lower.split()   # Split the input text into individual keywords

# Flag speeches containing any of the keywords. contains_keywords is already
# defined in the earlier 'Monetary funds from central bank' cell, so it is
# reused here instead of being declared again. That cell also creates the
# sample_df['Date'] column used below.
sample_df[input_txt] = sample_df['Preprocessed Content'].apply(lambda x: contains_keywords(x, input_keywords))

# Group by year and count the number of articles discussing the input for each year
Yearly_counts = sample_df[sample_df[input_txt]].groupby(sample_df['Date'].dt.year).size()

# Create a Plotly figure for the bar graph
fig = go.Figure()

# Add bar trace
fig.add_trace(go.Bar(x=Yearly_counts.index, y=Yearly_counts.values, name='Number of Speeches'))

# Add line trace
fig.add_trace(go.Scatter(x=Yearly_counts.index, y=Yearly_counts.values,
                         mode='lines+markers', name='Line Plot',
                         line=dict(color='red')))

# Update layout
fig.update_layout(title=f'Number of Speeches Discussing {input_text}',
                  xaxis_title='Year', yaxis_title='Number of Speeches')

# Show the plot
fig.show()

In [None]:
import re
import plotly.graph_objects as go  # Import Plotly library

# Make a real copy of the DataFrame: plain assignment would only alias T_df,
# and every column later added to sample_df would then appear in T_df as well.
sample_df = T_df.copy()
sample_df

Unnamed: 0,File,Preprocessed Content,Date,monetary_funds_from_central_bank,rakesh_mohan,market_crash
0,2007-05-22_c.txt,v reddy indian economy review prospect speech ...,2007-01-01,True,False,True
1,2007-10-26_e.txt,v leeladhar customer centricity reserve bank o...,2007-01-01,True,False,True
2,2002-07-25_d.txt,bimal jalan seminar international financial ar...,2002-01-01,True,False,True
3,2005-06-29_i.txt,v leeladhar challenge banking security inaugur...,2005-01-01,True,False,False
4,2007-04-04_d.txt,v reddy role monetary policy attaining growth ...,2007-01-01,True,False,True
...,...,...,...,...,...,...
816,2011-08-18_d.txt,deepak mohanty changing inflation dynamic indi...,2011-01-01,True,True,True
817,2010-05-25_e.txt,k c chakrabarty bank credit micro small medium...,2010-01-01,True,False,True
818,2013-08-05_e.txt,duvvuri subbarao responsible innovation regula...,2013-01-01,True,False,True
819,2012-09-24_b.txt,b mahapatra underlying concept principle dynam...,2012-01-01,True,False,True


IMPROVED IMPLEMENTATION METHOD

In [None]:
#Tokenisation
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK resources if not already downloaded
nltk.download('punkt')

# Function for tokenization
def tokenize_text(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    return word_tokenize(text)

# Apply tokenization to the 'Preprocessed Content' column of the DataFrame
preprocessed_df['Tokens'] = preprocessed_df['Preprocessed Content'].apply(tokenize_text)

# Print the first few rows to verify tokenization
print(preprocessed_df[['File', 'Tokens']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


               File                                             Tokens
0  2007-05-22_c.txt  [v, reddy, indian, economy, review, prospect, ...
1  2007-10-26_e.txt  [v, leeladhar, customer, centricity, reserve, ...
2  2002-07-25_d.txt  [bimal, jalan, seminar, international, financi...
3  2005-06-29_i.txt  [v, leeladhar, challenge, banking, security, i...
4  2007-04-04_d.txt  [v, reddy, role, monetary, policy, attaining, ...


In [None]:
Cor1=preprocessed_df['Tokens']
print(Cor1[0])


['v', 'reddy', 'indian', 'economy', 'review', 'prospect', 'speech', 'dr', 'v', 'reddy', 'governor', 'reserve', 'bank', 'india', 'metropolitan', 'chamber', 'commerce', 'industry', 'dhaka', 'may', 'latifur', 'rahman', 'president', 'metropolitan', 'chamber', 'commerce', 'industry', 'mahbubur', 'rahman', 'president', 'icc', 'bangladesh', 'distinguished', 'member', 'trade', 'body', 'dear', 'friend', 'honoured', 'invited', 'deliver', 'address', 'today', 'metropolitan', 'chamber', 'commerce', 'industry', 'dhaka', 'reserve', 'bank', 'india', 'rbi', 'pleasure', 'receiving', 'recently', 'delegation', 'occasion', 'accepted', 'invitation', 'principle', 'address', 'gathering', 'glad', 'able', 'fulfil', 'assurance', 'honour', 'associated', 'three', 'decade', 'several', 'distinguished', 'civil', 'servant', 'bangladesh', 'notable', 'among', 'continue', 'close', 'friend', 'syeduzzaman', 'gholam', 'kibria', 'former', 'finance', 'secretary', 'government', 'bangladesh', 'privilege', 'working', 'closely', 

In [None]:
pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
import glob
import json
import warnings

import numpy as np

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim
    gensimvis = pyLDAvis.gensim

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
#LDA Corpora on Unigrams
# Map every distinct token of the tokenized speeches to an integer id.
id2word = corpora.Dictionary(Cor1)

# Bag-of-words form of each speech: a list of (token id, count) pairs.
corpus = [id2word.doc2bow(tokens) for tokens in Cor1]

print (corpus[0][0:20])

# Look up the token that was assigned id 0.
word = id2word[0]
print (word)

[(0, 1), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 2), (9, 2), (10, 1), (11, 2), (12, 9), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 5)]
ability


In [None]:
#Unigrams LDA Model
# Trains a 30-topic gensim LDA model on the unigram bag-of-words corpus.
# random_state is fixed so repeated runs use the same seed.
# NOTE(review): the print_topics() output recorded further down lists the same
# ten words with weight 0.000 for every topic, which suggests this unigram
# model did not train meaningfully — investigate before relying on it.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
pyLDAvis.enable_notebook()
# `gensimvis` is the pyLDAvis gensim adapter bound in the imports cell; using
# it keeps this cell working whichever name the installed release exposes.
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis





In [None]:
lda_model.print_topics()

[(9,
  '0.000*"reasonability" + 0.000*"reissuable" + 0.000*"multiinstitutional" + 0.000*"multilevel" + 0.000*"nonadherence" + 0.000*"karimnagar" + 0.000*"unawareness" + 0.000*"soiled" + 0.000*"instructed" + 0.000*"revolutionizing"'),
 (2,
  '0.000*"reasonability" + 0.000*"reissuable" + 0.000*"multiinstitutional" + 0.000*"multilevel" + 0.000*"nonadherence" + 0.000*"karimnagar" + 0.000*"unawareness" + 0.000*"soiled" + 0.000*"instructed" + 0.000*"revolutionizing"'),
 (17,
  '0.000*"reasonability" + 0.000*"reissuable" + 0.000*"multiinstitutional" + 0.000*"multilevel" + 0.000*"nonadherence" + 0.000*"karimnagar" + 0.000*"unawareness" + 0.000*"soiled" + 0.000*"instructed" + 0.000*"revolutionizing"'),
 (6,
  '0.000*"reasonability" + 0.000*"reissuable" + 0.000*"multiinstitutional" + 0.000*"multilevel" + 0.000*"nonadherence" + 0.000*"karimnagar" + 0.000*"unawareness" + 0.000*"soiled" + 0.000*"instructed" + 0.000*"revolutionizing"'),
 (8,
  '0.000*"reasonability" + 0.000*"reissuable" + 0.000*"mul

In [None]:
# Fit gensim Phrases models: first on the raw token lists, then on the
# bigram-transformed token lists to pick up three-word phrases.
bigrams_phrases = gensim.models.Phrases(Cor1, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigrams_phrases[Cor1], threshold=50)

# Wrap the fitted models in Phraser objects, which are used below to transform documents.
bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
  """Lazily apply the ``bigram`` phraser to each document in ``texts`` (returns a generator)."""
  return (bigram[doc] for doc in texts)

def make_trigram(texts):
  """Lazily apply ``bigram`` and then ``trigram`` to each document in ``texts`` (returns a generator)."""
  return (trigram[bigram[doc]] for doc in texts)

# NOTE(review): data_bigrams is already bigram-transformed, and make_trigram
# applies `bigram` again before `trigram`. Passing Cor1 straight to
# make_trigram looks equivalent and avoids the second pass — confirm.
data_bigrams = make_bigrams(Cor1)
data_bigrams_trigrams = make_trigram(data_bigrams)

# Both helpers return generators; list() runs the pipeline once and stores the result.
data_bigrams_trigrams_list = list(data_bigrams_trigrams)
print(data_bigrams_trigrams_list[0])

['v_reddy', 'indian', 'economy', 'review', 'prospect', 'speech_dr_v', 'reddy_governor', 'reserve', 'bank', 'india', 'metropolitan', 'chamber_commerce_industry', 'dhaka', 'may', 'latifur', 'rahman', 'president', 'metropolitan', 'chamber_commerce_industry', 'mahbubur', 'rahman', 'president', 'icc', 'bangladesh', 'distinguished', 'member', 'trade', 'body', 'dear_friend', 'honoured_invited', 'deliver', 'address', 'today', 'metropolitan', 'chamber_commerce_industry', 'dhaka', 'reserve', 'bank', 'india', 'rbi', 'pleasure', 'receiving', 'recently', 'delegation', 'occasion', 'accepted_invitation', 'principle', 'address', 'gathering', 'glad', 'able', 'fulfil', 'assurance', 'honour', 'associated', 'three_decade', 'several', 'distinguished', 'civil_servant', 'bangladesh', 'notable', 'among', 'continue', 'close', 'friend', 'syeduzzaman', 'gholam', 'kibria', 'former', 'finance_secretary', 'government', 'bangladesh', 'privilege', 'working', 'closely', 'among_others', 'respected', 'fakhruddin', 'ahme

In [None]:
#TF-IDF Removal
from gensim.models import TfidfModel

# Rebuild the dictionary and bag-of-words corpus on the phrase-merged documents.
id2word = corpora.Dictionary(data_bigrams_trigrams_list)

texts = data_bigrams_trigrams_list

corpus = [id2word.doc2bow(text) for text in texts]
print (corpus [0] [0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms scoring below this TF-IDF value in a document are removed from it.
low_value = 0.03
# Bookkeeping only: the text of every term dropped from any document.
words = []
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
  # Score the document once and reuse the result.
  scored = tfidf[bow]
  tfidf_ids = {term_id for term_id, _ in scored}

  low_value_words = [term_id for term_id, value in scored if value < low_value]
  # Terms present in the bag-of-words but absent from the TF-IDF output
  # (those whose TF-IDF score is 0) are dropped as well.
  words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

  # Record what is dropped for THIS document; both lists are computed before
  # being combined, so no value from the previous document is carried over.
  drops = low_value_words + words_missing_in_tfidf
  words.extend(id2word[term_id] for term_id in drops)

  # Set membership keeps the filter linear in the document length.
  drop_ids = set(drops)
  corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]

[(0, 1), (1, 3), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 2), (9, 2), (10, 1), (11, 2), (12, 6), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 5), (19, 1)]


DEMO MODEL

In [None]:
#Demo Model
# Trains a 30-topic LDA model on the full TF-IDF-filtered corpus built from
# the bigram/trigram documents. The hyper-parameters match the unigram model.
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
pyLDAvis.enable_notebook()
# `gensimvis` is the pyLDAvis gensim adapter bound in the imports cell; using
# it keeps this cell working whichever name the installed release exposes.
vis = gensimvis.prepare(lda_model2, corpus, id2word, mds="mmds", R=30)
vis





TRAINING AND TESTING


In [None]:
#Training
# Trains on every document except the last one; corpus[-1] is held out and
# used as the test document in the next cell.
# NOTE(review): this rebinds lda_model2, replacing the demo model that was
# trained on the full corpus — use a separate name if both are needed.
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
test_doc = corpus[-1]

vector = lda_model2[test_doc]
print(vector)

def Sort(sub_li):
  """Reorder ``sub_li`` in place by each element's second item, largest first.

  Elements whose second items are equal end up in the reverse of their
  original relative order. The same list object that was passed in is
  returned.

  Args:
      sub_li: A mutable list of indexable items, e.g. ``(topic_id, weight)`` pairs.

  Returns:
      ``sub_li`` itself, now sorted in descending order of ``item[1]``.
  """
  ascending = sorted(sub_li, key=lambda pair: pair[1])
  # Slice assignment rewrites the caller's list rather than binding a new one.
  sub_li[:] = ascending[::-1]
  return sub_li
new_vector=Sort(vector)
print(new_vector)

[(0, 0.04978274), (1, 0.041236818), (2, 0.020401033), (5, 0.031957038), (6, 0.011524156), (8, 0.085956134), (11, 0.22273976), (12, 0.017604126), (14, 0.020201337), (15, 0.18688625), (17, 0.013335114), (23, 0.031670503), (26, 0.18188955), (27, 0.02580517), (28, 0.011703107), (29, 0.021968583)]
[(11, 0.22273976), (15, 0.18688625), (26, 0.18188955), (8, 0.085956134), (0, 0.04978274), (1, 0.041236818), (5, 0.031957038), (23, 0.031670503), (27, 0.02580517), (29, 0.021968583), (2, 0.020401033), (14, 0.020201337), (12, 0.017604126), (17, 0.013335114), (28, 0.011703107), (6, 0.011524156)]


In [None]:
# print_topics() lists only 20 topics unless told otherwise, but the model has
# 30. Ask for all of them so any topic id picked later can be found in the table.
final_output_list=lda_model2.print_topics(num_topics=lda_model2.num_topics)

# Create a DataFrame from the list
# NOTE(review): `df` held the speeches DataFrame earlier in the notebook;
# rebinding it here means cells that read df['File'] / df['Content'] no longer
# work if re-run afterwards. A dedicated name (e.g. topics_df) would be safer,
# but later cells reference `df`, so the name is kept.
df = pd.DataFrame(final_output_list, columns=['Topic Number', 'Keywords'])

# Show the DataFrame to verify it looks correct
print(df)


    Topic Number                                           Keywords
0             10  0.082*"cost" + 0.016*"etf" + 0.011*"pacific" +...
1             18  0.109*"mfis" + 0.090*"microfinance" + 0.036*"c...
2              3  0.050*"see" + 0.034*"say" + 0.029*"lot" + 0.01...
3              4  0.400*"market" + 0.035*"security" + 0.029*"par...
4             12  0.102*"currency" + 0.066*"foreign_exchange" + ...
5              8  0.108*"public" + 0.096*"reform" + 0.095*"secto...
6             11  0.339*"credit" + 0.187*"loan" + 0.090*"borrowe...
7              0  0.070*"infrastructure" + 0.055*"indian" + 0.04...
8             17  0.073*"data" + 0.066*"survey" + 0.052*"statist...
9             25  0.325*"inflation" + 0.136*"price" + 0.025*"cha...
10            20  0.129*"inclusion" + 0.066*"state" + 0.042*"lit...
11             7  0.041*"economics" + 0.025*"economist" + 0.024*...
12            14  0.046*"global" + 0.032*"domestic" + 0.030*"dem...
13            24  0.154*"capital" + 0.031*"expec

In [None]:
import re

# Function to extract keywords and their weights
def extract_keywords_weights(text):
    """Split a topic description string into parallel weight and keyword lists.

    Every occurrence of the pattern ``<decimal>*"<keyword>"`` in ``text`` is
    extracted, in order of appearance.

    Args:
        text: A string such as ``'0.082*"cost" + 0.016*"etf"'``.

    Returns:
        A ``(weights, keywords)`` tuple: ``weights`` is a list of floats and
        ``keywords`` the list of matching keyword strings. Both lists are
        empty when nothing matches.
    """
    weights = []
    keywords = []
    for raw_weight, keyword in re.findall(r'(\d+\.\d+)\*"(.*?)"', text):
        weights.append(float(raw_weight))
        keywords.append(keyword)
    return weights, keywords

# Apply the function to the Keywords column
df['Weights'], df['Only Keywords'] = zip(*df['Keywords'].apply(extract_keywords_weights))

# Show the updated DataFrame
print(df[['Topic Number', 'Weights', 'Only Keywords']])

    Topic Number                                            Weights  \
0             10  [0.082, 0.016, 0.011, 0.007, 0.006, 0.005, 0.0...   
1             18  [0.109, 0.09, 0.036, 0.012, 0.009, 0.005, 0.00...   
2              3  [0.05, 0.034, 0.029, 0.016, 0.016, 0.014, 0.01...   
3              4  [0.4, 0.035, 0.029, 0.027, 0.026, 0.023, 0.019...   
4             12  [0.102, 0.066, 0.06, 0.048, 0.043, 0.03, 0.028...   
5              8  [0.108, 0.096, 0.095, 0.048, 0.026, 0.023, 0.0...   
6             11  [0.339, 0.187, 0.09, 0.046, 0.04, 0.038, 0.016...   
7              0  [0.07, 0.055, 0.047, 0.044, 0.035, 0.03, 0.03,...   
8             17  [0.073, 0.066, 0.052, 0.048, 0.039, 0.033, 0.0...   
9             25  [0.325, 0.136, 0.025, 0.021, 0.02, 0.02, 0.018...   
10            20  [0.129, 0.066, 0.042, 0.036, 0.028, 0.025, 0.0...   
11             7  [0.041, 0.025, 0.024, 0.02, 0.019, 0.018, 0.01...   
12            14  [0.046, 0.032, 0.03, 0.019, 0.019, 0.018, 0.01...   
13    

In [None]:
#Top Topic Extraction

# new_vector is the test document's (topic id, weight) list, sorted by weight
# with the largest first, so the first five entries are its strongest topics.
top_topics= new_vector[:5]
# Display the top 5 topics
print(top_topics)
print("\n")
# Extract only the topic numbers
topic_numbers = [topic[0] for topic in top_topics]

# Display the topic numbers
print(topic_numbers)
print("\n")

# Filter DataFrame to only include rows where 'Topic Number' is in the list of topic_numbers
filtered_df = df[df['Topic Number'].isin(topic_numbers)]

# Sort the DataFrame based on the order of topic_numbers
# NOTE(review): .loc with a list of labels presumably raises KeyError when a
# topic number is missing from df — verify that df lists every topic.
filtered_sorted_df = filtered_df.set_index('Topic Number').loc[topic_numbers].reset_index()

# Display the resulting DataFrame
print(filtered_sorted_df[['Topic Number', 'Only Keywords']])



[(11, 0.22273976), (15, 0.18688625), (26, 0.18188955), (8, 0.085956134), (0, 0.04978274)]


[11, 15, 26, 8, 0]


   Topic Number                                      Only Keywords
0            11  [credit, loan, borrower, lender, account, lend...
1            15  [model, dynamic_provisioning, restructuring, h...
2            26  [business, fraud, employee, people, total, cor...
3             8  [public, reform, sector, government, private, ...
4             0  [infrastructure, indian, investment, financing...


In [None]:
# Assuming 'filtered_sorted_df' is your DataFrame
keywords_list = [keyword for sublist in filtered_sorted_df['Only Keywords'] for keyword in sublist]

# Display the complete list of keywords
print(keywords_list)


['credit', 'loan', 'borrower', 'lender', 'account', 'lending', 'recommendation', 'working_group', 'advised', 'default', 'model', 'dynamic_provisioning', 'restructuring', 'housing', 'pricing', 'r', 'profit', 'digital', 'entrepreneur', 'licensing', 'business', 'fraud', 'employee', 'people', 'total', 'corporate', 'source', 'knowledge', 'quality', 'job', 'public', 'reform', 'sector', 'government', 'private', 'foreign', 'bi_review', 'priority', 'improvement', 'fiscal', 'infrastructure', 'indian', 'investment', 'financing', 'company', 'smes', 'sme', 'etc', 'enterprise', 'unit']


In [None]:
!pip install -q -U google-generativeai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/142.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.1/142.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/663.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m512.0/663.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m663.6/663.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Render ``text`` as a Markdown block quote for notebook display.

  Bullet characters ('•') are replaced by an indented '*' list marker, and
  every line, blank ones included, is prefixed with '> '.

  Args:
      text: The text to display.

  Returns:
      An ``IPython.display.Markdown`` object wrapping the quoted text.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes textwrap.indent prefix empty lines too.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Configure the Gemini client for the rest of the notebook.
# `userdata` is Colab's secret store, used so the API key is not hard-coded here.
from google.colab import userdata

# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
# The secret must be saved under the name 'GOOGLE_API_KEY'.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
# Register the key with the client library, then create the model handle
# (`model`) that later cells call generate_content() on.
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-pro')

In [None]:
# Build the LLM prompt from the extracted keywords.
# Reuse keywords_list (flattened from filtered_sorted_df in an earlier cell)
# rather than a pasted copy of its printed output, so the prompt cannot drift
# out of sync with the topic-model results when the notebook is re-run.
prompt_text = list(keywords_list)

# Convert the list to a single comma-separated string
prompt_text_str = ','.join(prompt_text)

# Define the prompt text: fixed instructions followed by the keyword string
prompt = "We are building a system to understand text from central bank speech transcripts. The keywords are extracted using LDA and those keywords are present as a list. Dont print the keywords list. Generate an interpretation from the keywords and summarize the changes from the text regarding any market changes from the keyword and text generated and provide a concise interpretation of the market changes : "
prompt += prompt_text_str

In [None]:
# Send the keyword prompt to the configured model; `response` holds the reply.
response = model.generate_content(prompt)
# Last expression of the cell: display the reply text as a quoted Markdown block.
to_markdown(response.text)

> **Market Changes Interpretation:**
> 
> The transcript reveals several changes affecting the financial sector and market landscape.
> 
> **Credit and Lending:**
> 
> * Emphasis on responsible lending and borrower protection
> * Revisions to credit models and dynamic provisioning norms
> * Recommendations by a working group for restructuring and defaults
> 
> **Financial Institutions:**
> 
> * Focus on digital transformation and risk mitigation
> * Revamping of licensing and employee training regulations
> 
> **Entrepreneurial Support:**
> 
> * Initiatives to enhance credit access for entrepreneurs and SMEs
> * Financial support and guidance for start-ups
> * Reforms to improve the business environment
> 
> **Public Sector Involvement:**
> 
> * Government prioritizing infrastructure investment and financing
> * Reforms and amendments to fiscal policies
> * Collaborations for public-private partnerships
> 
> **General Economic Trends:**
> 
> * Focus on improving financial inclusion and reducing financial fraud
> * Emphasis on knowledge sharing, quality control, and job creation
> * Initiatives to boost corporate governance and stakeholder engagement
> 
> **Concise Interpretation:**
> 
> The transcript highlights regulatory changes, technological advancements, and policy initiatives aimed at promoting responsible lending, strengthening financial institutions, fostering entrepreneurial growth, and enhancing public sector involvement in economic development. These changes suggest a shift towards a more inclusive, transparent, and dynamic financial market environment.

In [None]:
#CONNECTION OF RESULT TO RECOMMENDATION

# generate_content() is a single-turn call: the model does not remember the
# earlier reply, so "the Previous response" must be supplied in the prompt.
# Run this cell directly after the interpretation cell, while `response`
# still holds that reply (this cell rebinds `response` below).
interpretation_text = response.text

# Define the prompt text: the original instruction plus the text it refers to
prompt = (
    "From the Previous response, Provide me 5 topic names to research for the same on Web"
    "\n\nPrevious response:\n"
    + interpretation_text
)

response = model.generate_content(prompt)
to_markdown(response.text)


SyntaxError: incomplete input (<ipython-input-13-74fe1eae8e80>, line 7)

In [None]:
pip install google



In [None]:
from googlesearch import search

# Function to search for news articles based on keywords
def search_news_articles(keyword, num_results=10):
    """Return up to ``num_results`` search-result URLs for ``keyword``.

    Args:
        keyword: Topic to look up; the query sent is ``"<keyword> news"``.
        num_results: Maximum number of results to collect. Values of zero or
            less yield an empty list without issuing any search.

    Returns:
        A list of the results yielded by ``search`` (imported from
        ``googlesearch`` at the top of this cell), in the order received.
    """
    from itertools import islice

    # Guard first: the previous loop appended a hit before checking the limit,
    # so num_results <= 0 still returned one result.
    if num_results <= 0:
        return []

    # Construct the search query
    query = f'{keyword} news'

    # islice stops pulling from the result iterator once the limit is reached,
    # so no results beyond num_results are requested from it.
    return list(islice(search(query, pause=2), num_results))

# Candidate topics to look up
keywords = ['economic trends', 'central bank policy', 'monetary funds']

# Only the first keyword is searched (one pass, then stop); `results` keeps
# that keyword's URLs for the scraping cell that follows.
for keyword in keywords[:1]:
    print(f"News articles related to '{keyword}':")
    results = search_news_articles(keyword, num_results=10)
    # Numbered listing, starting at 1
    for position, url in enumerate(results, 1):
        print(f"{position}. {url}")
    print()

News articles related to 'economic trends':
1. https://www.usnews.com/topics/subjects/economy
2. https://www.cnbc.com/economy/
3. https://www.cnbc.com/world-economy/
4. https://www.cnbc.com/federal-reserve/
5. https://www.cnbc.com/central-banks/
6. https://www.cnbc.com/2024/02/23/economic-boost-from-taylor-swifts-eras-tour-could-be-overstated-nomura-warns.html
7. https://www.cnn.com/business/economy
8. https://www2.deloitte.com/us/en/insights/economy/global-economic-outlook/weekly-update.html
9. https://www.nytimes.com/section/business/economy?page=2
10. https://www.cnbc.com/us-economy/



In [None]:
# Web scraping
import requests
from bs4 import BeautifulSoup

# Visit each link in `results` and print its title, meta description and URL.
# Scraping is best-effort: a failing link is reported and skipped, never fatal.
for link in results:
    try:
        # Send a GET request to the link with a timeout of 10 seconds
        response = requests.get(link, timeout=10)

        # Anything other than HTTP 200 is reported and skipped
        if response.status_code != 200:
            print(f"Error: Failed to access link - {link}")
            print()
            continue

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.title is None when the page has no <title>; use a placeholder
        # instead of raising AttributeError and dropping the whole entry.
        title_tag = soup.title
        title = title_tag.text.strip() if title_tag is not None else "No title available"

        # Very short one-liner abstract from <meta name="description">.
        # The tag can be absent, or present without a usable `content` value.
        abstract_tag = soup.find('meta', attrs={'name': 'description'})
        abstract_text = abstract_tag.get('content') if abstract_tag is not None else None
        abstract = abstract_text.strip() if abstract_text else "No abstract available"

        # Print title, abstract, and link
        print(f"Title: {title}")
        print(f"Abstract: {abstract}")
        print(f"Link: {link}")
        print()
    except requests.Timeout:
        print(f"Timeout error occurred while accessing link: {link}")
        print()
    except Exception as e:
        # Move on to the next link, but say which one was skipped and why
        # rather than hiding the failure.
        print(f"Skipped link after {type(e).__name__}: {link} ({e})")
        print()

Timeout error occurred while accessing link: https://www.usnews.com/topics/subjects/economy

Title: Economic News
Abstract: Find the latest economic news, current events and headlines, as well as blogs and video from CNBC.com.
Link: https://www.cnbc.com/economy/

Title: Global Economy
Abstract: Latest news and headlines around the world related to the state of the global economy.
Link: https://www.cnbc.com/world-economy/

Title: Federal Reserve
Abstract: Latest news and headlines related to the Federal Reserve.
Link: https://www.cnbc.com/federal-reserve/

Title: News from Central Banks
Abstract: Latest headlines from central banks around the world, including the Bank of Japan and ECB.
Link: https://www.cnbc.com/central-banks/

Title: Nomura: Economic boost from Swift's Eras Tour could be overstated
Abstract: While the firm said the concert's effect on local economies was undeniable, it may have a smaller imprint on national-level economic data than some think.
Link: https://www.cnbc.co