# Install and Load Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
#installing dependencies
!pip install bnlp_toolkit
!pip install faiss-cpu

Collecting bnlp_toolkit
  Downloading bnlp_toolkit-4.0.3-py3-none-any.whl.metadata (3.3 kB)
Collecting gensim==4.3.2 (from bnlp_toolkit)
  Downloading gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting scipy==1.10.1 (from bnlp_toolkit)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sklearn-crfsuite==0.3.6 (from bnlp_toolkit)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting tqdm==4.66.3 (from bnlp_toolkit)
  Downloading tqdm-4.66.3-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ftfy==6.2.0 (from bnlp_toolkit)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting emoji==1.7

In [27]:
#imporing libraris
 #importing all necessari libraries . For tokenization we will use bnlp tokenizer and create our own embedder
from bnlp import NLTKTokenizer
import string
from bnlp import CleanText
import re
import pandas as pd
import numpy as np
import pickle
import faiss

from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import matplotlib.font_manager as fm

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Text cleaning and Preparing our dataset for Embedder and Creating our own vocabulary


In [4]:
#loading our dataset
#We will load two dataframe . Archieve for archived data and current date data for todays news

archive = pd.read_csv('/content/archive.csv')
current_date = pd.read_csv('/content/current_date.csv')

In [5]:
'''
This cell , we will load our tokenizer . we will use BNLP toolkit .
Also  define some necessary functions like text cleaning and preprocessing for our task
'''

tokenizer = NLTKTokenizer()
clean_text = CleanText(
   fix_unicode=True,
   unicode_norm=True,
   unicode_norm_form="NFKC",
   remove_url=False,
   remove_email=True,
   remove_emoji=True,
   remove_number=True,
   remove_digits=True,
   remove_punct=True,
   replace_with_url="<URL>",
   replace_with_email="<EMAIL>",
   replace_with_number="<NUMBER>",
   replace_with_digit="<DIGIT>",
   replace_with_punct = ""
)

def remove_hyphens(text):
    # Remove hyphens from the text
    cleaned_text = text.replace('\u002D', '')
    return cleaned_text

def remove_unwanted_char(text):
  cleaned_text = text.replace('—','')
  cleaned_text = cleaned_text.replace('<','')
  cleaned_text = cleaned_text.replace('>','')
  cleaned_text = cleaned_text.replace('/','')
  cleaned_text = cleaned_text.replace('...','')

  return cleaned_text

# Chunk text function remains the same
def chunk_text(text, chunk_size=100):
    words = text.split()  # Split the text into words
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def remove_parentheses_and_text(lines):
    cleaned_lines = [re.sub(r'\([^)]*\)', '', line) for line in lines]
    return cleaned_lines

def is_bangla(text):
    bangla_pattern = re.compile(r'^[\u0980-\u09FF\s]+$')
    return bool(bangla_pattern.match(text))

def preprocess(text):
  text = clean_text(text)
  text = remove_hyphens(text)
  text = remove_unwanted_char(text)
  text = remove_parentheses_and_text(text)
  #text = is_bangla(text)


  return text

In [6]:
#now preprocess and make a vocab
#we will create our own embedder using word2vec
#dot products between words will extract sementic meaning as we will not use sentence embedder for this case

archive_list = archive['news_text'].apply(lambda text: [sent.strip() for sent in text.split('।') + text.split('?') + text.split('!') if sent.strip()])
current_list = current_date['text'].apply(lambda text: [sent.strip() for sent in text.split('।') + text.split('?') + text.split('!') if sent.strip()])

In [7]:
#Making our dataseet by joining archive and current news to train our embedder

archive_sentence = [sentence for sublist in archive_list for sentence in sublist]
current_sentence = [sentence for sublist in current_list for sentence in sublist]


main_list = archive_sentence + current_sentence


In [8]:
#cleaning text for create our vocab
#clena unwanted words or chars
texts = []

for line in main_list:
  temp = clean_text(line)
  temp = remove_hyphens(temp)
  temp = remove_unwanted_char(temp)
  texts.append(temp)

texts = [text.replace('\n', ' ') for text in texts]


In [9]:
#removing non bengali words
print(len(texts))
texts = [text for text in texts if is_bangla(text)]
print(len(texts))

15818
12088


In [11]:
#tokenized text to train our embedder
tokenized_text = []

for text in texts:
  token = tokenizer.word_tokenize(text)
  tokenized_text.append(token)

In [13]:
#loading my own vocabulary, i have created earlier for my later use
#to increase our benglai word range
with open('/content/drive/MyDrive/dschatbot/vocab.pkl','rb') as f:
  vocab = pickle.load(f)

print(f"the length of vacab is {len(vocab)}")

the length of vacab is 535477


In [14]:
#checking duplicate words with our loaded vocab and add unique words
flattened_tokens = [word for sublist in tokenized_text for word in sublist]
tokenized_set = set(flattened_tokens)
vocab_set = set(vocab.keys())

duplicates_word = set.intersection(vocab_set,tokenized_set)
print(len(duplicates_word))

new_words = set.difference(tokenized_set,vocab_set)
print(len(new_words))


15664
1075


In [15]:
#remove duplicates from our newly words

unique_list = list(set(new_words))
len(unique_list)

1075

In [16]:
#adding new unique words to our vocab
current_vocab_size = len(vocab)

for i,word in enumerate(unique_list):
  vocab[word] = current_vocab_size + i

print(f'old vocab size {current_vocab_size}')
print(f"new vocab size {len(vocab)}")

old vocab size 535477
new vocab size 536552


In [17]:
#optional
#saving our new vocab for later use
#so when you use you can just load this vocab which will reduce whole process till now


with open('/content/vocab.pkl','wb') as f:
  pickle.dump(vocab,f)


In [18]:
#load our pretrained word embedder
#Trained with bengali words to my later use



model = Word2Vec.load('/content/drive/MyDrive/dschatbot/word2vec_model.bin')
model.build_vocab(tokenized_text, update=True)
model.train(tokenized_text, total_examples=len(tokenized_text), epochs=model.epochs)

#save our embedder for later use
model.save('word2vec_model.bin')
print(f"Vocabulary size after update: {len(model.wv.index_to_key)}")




Vocabulary size after update: 536552


In [19]:
#optional
#check my embedder's performance
#fucntion to find similar words
def find_similar_words(word, topn=5):
    similar_words = model.wv.similar_by_word(word, topn=topn)
    return similar_words

In [20]:
#optional
#you are requested to check with any bengali words and find out how good my embedder is

similar = find_similar_words('খেলা', topn=10)
similar

[('খেলাটা', 0.7625560164451599),
 ('খেলাও', 0.7287689447402954),
 ('ম্যাচটি', 0.7021195292472839),
 ('খেলাই', 0.6814424395561218),
 ('ম্যাচ', 0.6699913144111633),
 ('লড়াইটা', 0.6571086049079895),
 ('খেলাটি', 0.652617871761322),
 ('টুর্নামেন্ট', 0.6470634341239929),
 ('ম্যাচটা', 0.6422958374023438),
 ('খেলাটাও', 0.639151930809021)]

# Preprocess data for Indexing

In [21]:

# Function to filter and return only Bangla words from a list of words
# you might wonder why this?
# our previous functions could not perform in dataframe
# as my target to clean data in dataframe saved in column to reduce my time if i will work later on this projet
# hence , i re wrote the function as per my requirements

def is_bangla_word(line):
    """
    Takes a list of words and returns a list containing only the words that have Bangla characters.

    Args:
        word_list (list): A list of words (strings).

    Returns:
        list: A list containing only words that are in Bangla.
    """
    bangla_pattern = re.compile(r'^[\u0980-\u09FF]+$')  # Pattern to match only Bangla characters

    word_list = tokenizer.word_tokenize(line)

    # Filter and return words that match the Bangla pattern
    return [word for word in word_list if bangla_pattern.fullmatch(word)]



In [22]:
#cleaning function for dataframe too

def full_clean(line):


  texts = []


  temp = clean_text(line)
  #print(temp)
  #print('-------')
  temp = remove_hyphens(temp)
  #print(temp)
  #print('---')
  temp = remove_unwanted_char(temp)
  #print(temp)
  #print('--------')
  temp = is_bangla_word(temp)
  #print(temp)
  #print("--------")
  temp = clean_text(temp)
  texts.append(temp)






  return texts

def main_clean(text):
  line = tokenizer.word_tokenize(text)
  #print(line)
  line = full_clean(line)
  return line


In [23]:
#cleaning category , title and text and add new column into the dataframe
#Remember we will create separate index for archive and current date to save time to interact with user

archive['clean_title'] = archive['title'].apply(main_clean)
archive['clean_cat'] = archive['category'].apply(main_clean)
archive['clean_text'] = archive['news_text'].apply(main_clean)
current_date['clean_title'] = current_date['title'].apply(main_clean)
current_date['clean_text'] = current_date['text'].apply(main_clean)

In [None]:
#optional
# save the cleaned dataframe for later use

archive.to_csv('with_clean_text_archive.csv')
current_date.to_csv('with_clean_text_current.csv')

In [24]:
# chunking function
# i will use chunking this time for text only as news text can be a bit longer
# for creating index . News text will be chunked and then will be indexed using FAiSS maybe

def chunk_list(word_list):
    word_list = ' '.join(word_list)
    word_list = word_list.split()
    chunk_size = 30
    return [word_list[i:i + chunk_size] for i in range(0, len(word_list), chunk_size)]

In [25]:
#chunking our text for archive and current date data frame

archive['chunked_text'] = archive['clean_text'].apply(lambda x : chunk_list(x))
current_date['chunked_text'] = current_date['clean_text'].apply(lambda x : chunk_list(x))

In [28]:
'''
Here in this cell, we will use our own techniques to extract semantics.
As i have chunked all the text and did not use any pretrained popular embedder
i will calculate words tf-idf score for each chunk and then this tf-idf scores will be multiplied each
words embedding to figure words importance in semantics . tf-idf score is popular method to find important words . so
i will try this technique to find our semantics of chunks . Here , dot production of chunks embedding could be more helpful .

'''


def compute_tfidf_weights(chunk):

    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False)
    tfidf_matrix = vectorizer.fit_transform([' '.join(chunk)])
    tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0]))

    return tfidf_scores


def embed_text_chunk(text_chunk, word2vec_model):

    tfidf_scores = compute_tfidf_weights(text_chunk)
    embeddings = []
    for word in text_chunk:
        if word in word2vec_model.wv:
            tfidf_weight = tfidf_scores.get(word, 1.0)  # Default weight is 1.0 if word not in TF-IDF scores
            weighted_embedding = word2vec_model.wv[word] * tfidf_weight  # Multiply embedding by TF-IDF weight
            embeddings.append(weighted_embedding)  # Store the weighted embedding


    if embeddings:
        final_embedding = np.zeros(word2vec_model.vector_size)


        for emb in embeddings:
            final_embedding += emb  # This can be changed to np.dot if you want a different combination logic

        return final_embedding
    else:
        return np.zeros(word2vec_model.vector_size)


def embed_text_chunks(text_chunks, word2vec_model):

    chunk_embeddings = [embed_text_chunk(chunk, word2vec_model) for chunk in text_chunks]


    if chunk_embeddings:
        chunk_weights = [np.linalg.norm(embedding) for embedding in chunk_embeddings]
        total_weight = sum(chunk_weights)


        final_embedding = np.zeros(word2vec_model.vector_size)

        for embedding, weight in zip(chunk_embeddings, chunk_weights):
            if total_weight > 0:
                final_embedding += embedding * (weight / total_weight)

        return final_embedding if total_weight > 0 else np.zeros(word2vec_model.vector_size)
    else:
        return np.zeros(word2vec_model.vector_size)


In [29]:
#get text embeddings for both the dataframe

archive['text_embedding'] = archive['chunked_text'].apply(lambda x: embed_text_chunks(x, model))
current_date['text_embedding'] = current_date['chunked_text'].apply(lambda x: embed_text_chunks(x, model))


In [30]:
# function to embedding category

def get_cat_embeddings(tokens, model):
    tokens = ' '.join(tokens)
    tokens = tokens.split()
    tfidf_scores = compute_tfidf_weights(tokens)
    embeddings = []
    weights = []

    for word in tokens:
        if word in model.wv:
            tfidf_weight = tfidf_scores.get(word, 1.0)
            embeddings.append(model.wv[word] * tfidf_weight)
            weights.append(tfidf_weight)


    if embeddings:
        total_weight = sum(weights)
        weighted_average_embedding = np.sum(embeddings, axis=0) / total_weight
        return weighted_average_embedding if total_weight > 0 else np.zeros(model.vector_size)
    else:
        return np.zeros(model.vector_size)


In [31]:
# title embedding

def get_title_embeddings(tokens, model):


    tokens = ' '.join(tokens).split()
    tfidf_scores = compute_tfidf_weights(tokens)
    embeddings = []
    weights = []

    for word in tokens:
        if word in model.wv:
            tfidf_weight = tfidf_scores.get(word, 1.0)
            weighted_embedding = model.wv[word] * tfidf_weight
            embeddings.append(weighted_embedding)
            weights.append(tfidf_weight)


    if embeddings:
        final_embedding = np.zeros(model.vector_size)
        total_weight = sum(weights)
        for emb, weight in zip(embeddings, weights):
            final_embedding += emb

        if total_weight > 0:
            final_embedding /= total_weight

        return final_embedding
    else:
        return np.zeros(model.vector_size)


In [32]:
# category and title embedding for archive news
# title embedding for current news

archive['title_embedding'] = archive['clean_title'].apply(lambda x : get_title_embeddings(x,model))
current_date['title_embedding'] = current_date['clean_title'].apply(lambda x: get_title_embeddings(x,model))
archive['cat_embdding'] = archive['clean_cat'].apply(lambda x: get_cat_embeddings(x,model))


# Creating FAISS indexs


In [33]:
# Create and save our faiss indexes for archive's category , title and text
# This system will search different indexes based on users query
#


embedding_dim = archive['title_embedding'].iloc[0].shape[0]


a_title_index = faiss.IndexFlatL2(embedding_dim)
a_text_index = faiss.IndexFlatL2(embedding_dim)
a_category_index = faiss.IndexFlatL2(embedding_dim)

# Convert embeddings to a NumPy array and add them to the respective FAISS index
a_title_embeddings = np.vstack(archive['title_embedding'].values)
a_text_embeddings = np.vstack(archive['text_embedding'].values)
a_category_embeddings = np.vstack(archive['cat_embdding'].values)

a_title_index.add(a_title_embeddings)
a_text_index.add(a_text_embeddings)
a_category_index.add(a_category_embeddings)

# save the indexes for later use
# optional
# Save the indices to disk
faiss.write_index(a_title_index, 'a_title_index.index')
faiss.write_index(a_text_index, 'a_text_index.index')
faiss.write_index(a_category_index, 'a_category_index.index')


In [34]:
#similar task for current news
# if user serach for current news, system will use current indexs to interact


c_embedding_dim = current_date['title_embedding'].iloc[0].shape[0]


c_title_index = faiss.IndexFlatL2(c_embedding_dim)
c_text_index = faiss.IndexFlatL2(c_embedding_dim)


c_title_embeddings = np.vstack(current_date['title_embedding'].values)
c_text_embeddings = np.vstack(current_date['text_embedding'].values)

c_title_index.add(c_title_embeddings)
c_text_index.add(c_text_embeddings)


#optional
faiss.write_index(c_title_index, 'c_title_index.index')
faiss.write_index(c_text_index, 'c_text_index.index')

In [35]:
'''
This function will be designed to retrive information based on users query
for details , read documentation

Once we can match index information , we wil retrive index and based on the index
we will return all the other information from archive dataframe

'''

def search_archived_faiss(query_embedding, index_type, k=5):

    if index_type == 'title':
        distances, indices = a_title_index.search(np.array([query_embedding]), k)
    elif index_type == 'text':
        distances, indices = a_text_index.search(np.array([query_embedding]), k)
    elif index_type == 'category':
        distances, indices = a_category_index.search(np.array([query_embedding]), k)

    else:
        raise ValueError("Invalid index_type. Must be one of: 'title', 'text', 'category'.")


    #print(f"distance is {distances}")
    # Retrieve the corresponding rows from the DataFrame based on the indices returned
    result_data = archive.iloc[indices[0]]

    # Return the relevant data (e.g., news link, title, text, etc.)
    results = []
    for idx, row in result_data.iterrows():
        results.append({
            'title': row['title'],
            'category': row['category'],
            'news_link': row['link'],
            'news_text': row['news_text']
        })

    return results


In [36]:
#similar task as we do for archive indexes

def search_current_faiss(query_embedding, index_type, k=5):

    if index_type == 'title':
        distances, indices = c_title_index.search(np.array([query_embedding]), k)
    elif index_type == 'text':
        distances, indices = c_text_index.search(np.array([query_embedding]), k)

    else:
        raise ValueError("Invalid index_type. Must be one of: 'title', 'text', 'category'.")


    result_data = current_date.iloc[indices[0]]

    print(f"distance is {distances}")


    results = []
    for idx, row in result_data.iterrows():
        results.append({
            'title': row['title'],
            'news_link': row['url'],
            'news_text': row['text']
        })

    return results


# Generator funcitons

In [37]:
def check_bengali_query(query):
    """
    This function will find out user's promt related to current date news or not
    The idea is if current date , we will search current date's index
    Else we will search archive index
    """

    current_indicators = ['আজ', 'আজকের', 'আজকে']


    for word in current_indicators:
        if word in query:
            return 'current'

    return 'archive'

In [38]:
def check_cat(query):
  '''
  This functionality will check if the user's prompt is related to title or category or text
  and then retrive the information accordingly to the required faiss index
  '''

  query = tokenizer.word_tokenize(query)

  if (len(query) <=2 ):
    return 'category'
  elif (len(query ) >2 and len(query) <=13):
    return 'title'

  else:
    return 'text'

In [39]:
def process_query(query,idx):

  '''
  This function will process user's prompt
  if the prompt is text/title/category, we will embedded the input accordingly
  '''

  embeddings = []

  #print(query)
  query = query
  word2vec_model = model

  if idx == 'title':

    #print(query)
    query = query
    #print(query)
    avg_embedding = get_title_embeddings(query,word2vec_model)

  elif idx == 'category':
    avg_embedding = get_cat_embeddings(query,word2vec_model)

  elif idx == 'text':
    qry = ' '.join(query)
    #print(qry)
    qry = qry.split()
    #print(qry)

    chunks = chunk_list(qry)

    #print(chunks)

    avg_embedding = embed_text_chunks(chunks, word2vec_model)

  else:
    print(f' Something wrong happend . Try again with appropriate words or text. Thank You')




  return query, avg_embedding





In [40]:
def retrive_info(query):
  '''
  This would be the main function to retrive informations
  It will check the query is a current / archive one
  It will chekc the query is a text/title/categroy

  and then retrive the information accordingly

  '''
  query = tokenizer.word_tokenize(query)
  query = full_clean(query)

  #query,embedding = process_query(query)
  #print(f" embedding is {embedding}")

  st_qry = ' '.join(query)
  #print(st_qry)


  status = check_bengali_query(st_qry)
  #print(status)
  idx = check_cat(st_qry)
  #print(idx)

  query,embedding = process_query(query,idx)

  if status == 'current':
    if idx == 'category':
      print(f"please type more than two keywoords . Thank you"
      )
    else:
      result = search_current_faiss(embedding , idx, k=5)

  elif status == 'archive':
    result = search_archived_faiss(embedding, idx, k=5)

  else:
    print(f'Sorry i do not understand . Would you be more specific please?')

  return result



In [44]:
def generator(query):
  '''
  Till now this is the main generator function which will be replaced with a LLM like bert or T5.
  Generator function will use a text summarizer which i trained on a different notebook, a bert architecture fine tunned with my
  dataset . And will reply the summarized text using all the text it will retrive .
  '''
  result = retrive_info(query)

 # print(type(result))
  #print(result)

  print("--------------------------------------------------")
  print("Dear User,")
  print("Thank you for your query. I have successfully retrieved the following news articles related to your search:")
  print()

    # Ensure each result appears on a new line
  for item in result:
      print(f"- {item}")  # You can format it as desired (e.g., adding bullets or numbers)

  print()
  print("If you need any further assistance or more information, please feel free to ask.")
  print("--------------------------------------------------")


In [45]:
# Chatbot interface
def chatbot():
    print("Chatbot: Hi! Ask me anything, or type 'exit' to quit.\n")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Chatbot: Goodbye!")
            break
        try:
            answer = generator(user_input)
            print(f"Chatbot: {answer}")
        except Exception as e:
            print(f"Chatbot: Sorry, I encountered an error: {e}")



In [46]:
chatbot()

Chatbot: Hi! Ask me anything, or type 'exit' to quit.

You: আজকের রাজনীতির খবর কি?
distance is [[250.94629 277.79846 313.7649  313.9292  323.8869 ]]
--------------------------------------------------
Dear User,
Thank you for your query. I have successfully retrieved the following news articles related to your search:

- {'title': 'টমেটোর এই পুষ্টিগুণগুলো জানেন কি', 'news_link': 'https://bangla.thedailystar.net/life-living/food-recipe/news-634096', 'news_text': "টমেটো একটি জনপ্রিয় সবজি, যা সারা বিশ্বে বিভিন্ন রকম রান্না এবং সালাদে ব্যবহৃত হয়।অনেকে কাঁচা টমেটো খেতে পছন্দ করেন। টমেটো স্বাদে ও পুষ্টিগুণে অনন্য। বর্তমানে এদেশে সারাবছর টমেটো পাওয়া গেলেও শীতকালে এটির ফলন বেশী হয়। তাই টমেটো মোসুমি সবজি হিসেবে বিবেচিত হয়। আর যেকোনো মৌসুমি সবজি শরীরের জন্য ভীষণ উপকারী। চলুন জেনে নিই টমেটোর পুষ্টিগুণ। জানিয়েছেন এমএইচ শমরিতা মেডিকেল কলেজ অ্যান্ড হাসপাতালের পুষ্টিবিদআঞ্জুমান আরা শিমুল। তিনি বলেন, টমেটো তাজা, রসালো, মিষ্টি এবং সামান্য টক জাতীয় ফল যেটি সাধারণত সবজি হিসেবেই বেশি পরিচিত। টমেটো বিভিন্