<a href="https://colab.research.google.com/github/Karthik0510/Karthik0510/blob/main/lda_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pypdf==3.14.0
!pip install tiktoken==0.4.0
!pip install langchain==0.0.353
!pip install openai==0.27.8
!pip install gdown==4.7.3

In [2]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pypdf import PdfReader
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI

In [3]:
def preprocess(text, stop_words):
    """
    Tokenizes and preprocesses the input text, removing stopwords and short
    tokens.

    Parameters:
        text (str): The input text to preprocess.
        stop_words (set): A set of stopwords to be removed from the text.
    Returns:
        list: A list of preprocessed tokens.
    """
    result = []
    for token in simple_preprocess(text, deacc=True):
        if token not in stop_words and len(token) > 3:
            result.append(token)
    return result

In [4]:
def get_topic_lists_from_text(file, num_topics, words_per_topic):
    """
    Extracts topics and their associated words from a text document using the
    Latent Dirichlet Allocation (LDA) algorithm.

    Parameters:
        file (str): The path to the text file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.

    Returns:
        list: A list of num_topics sublists, each containing relevant words
        for a topic.
    """
    # Load the text file
    with open(file, 'r', encoding='utf-8') as f:
        documents = [f.read()]
    print(documents)

    if len(documents)<300:
      num_topics=3
    elif len(documents)>=300 and len(documents)<1000:
      num_topics=5
    else:
      num_topics=8
    # Preprocess the documents
    nltk.download('stopwords')
    stop_words = set(stopwords.words(['english', 'spanish']))
    processed_documents = [preprocess(doc,stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15
    )

    # Retrieve the topics and their corresponding words
    topics = lda_model.print_topics(num_words=words_per_topic)

    # Store each list of words from each topic into a list
    topics_ls = []
    for topic in topics:
        words = topic[1].split("+")
        topic_words = [word.split("*")[1].replace('"', '').strip() for word in words]
        topics_ls.append(topic_words)

    return topics_ls

In [5]:
def topics_from_txt(llm, file, num_topics, words_per_topic):
    """
    Generates descriptive prompts for LLM based on topic words extracted from a
    PDF document.

    This function takes the output of `get_topic_lists_from_pdf` function,
    which consists of a list of topic-related words for each topic, and
    generates an output string in bulleted nested list format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating
        responses.
        file (str): The path to the PDF file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided
        topic words.
    """

    # Extract topics and convert them to string
    list_of_topicwords = get_topic_lists_from_text(file, num_topics,
                                                  words_per_topic)
    string_lda = ""
    for list in list_of_topicwords:
        string_lda += str(list) + "\n"

    # Create the template
    template_string = '''Generate topic in few words in a sentence of the {num_topics}
        double-quote delimited lists in a simple sentence. The lists are the result of an
        algorithm for topic discovery.



        Do not provide an introduction or a conclusion, only Generate the
        topics. Don't mention the word "topic" or anything similar to "the first stopic is about..." when describing the topics.


        Desired example output:
        1: Bearish Trading Indicators
        2: Crypto Market Trends

        1: Borussia Dortmund in the Champions League final
        2: Dani Olmo scores for RB Leipzig against Atletico Madrid


        Lists: """{string_lda}"""
         '''

    # LLM call
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda" : string_lda,
        "num_topics" : num_topics
        })

    return response

In [6]:
openai_key = "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"
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

  warn_deprecated(


In [None]:
#send a zip file over here to get the output


import zipfile
import os
import tempfile
import glob

num_topics = 5
words_per_topic = 10
# Specify the path to your .zip file
zip_file_path = "/content/testing_topic.zip"

# Create a temporary directory to extract the files
with tempfile.TemporaryDirectory() as tempdir:
    # Extract the .zip file to the temporary directory
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(tempdir)

    # Use glob to find all .txt and .pdf files in the temporary directory
    files = glob.glob(f"{tempdir}/*.txt") + glob.glob(f"{tempdir}/*.pdf")

    l = []
    for file_path in files:
        # Determine the file type and call the appropriate function
        if file_path.lower().endswith('.txt'):
            summary = topics_from_txt(llm, file_path, num_topics, words_per_topic)
        elif file_path.lower().endswith('.pdf'):
            summary = topics_from_pdf(llm, file_path, num_topics, words_per_topic)
        l.append([file_path,summary])

In [9]:
promptlist=[]
for x in l:
  prompt=''''''
  prompt+=x[0]+":"
  for i in range(1,len(x)):
    prompt+=x[i]
    prompt+="\n\n"
  promptlist.append(prompt)
  #promptlist is a list of headlines and also the file path which is used to map files and also categorize them using llm
for i in promptlist:
  print(i)


/tmp/tmp0jstlf1y/news_2.txt:
1: Mumbai Firing Incident
2: Salman Khan's Involvement
3: Police Response in the Village
4: District Administration's Actions
5: Night of Chaos in Mumbai's District


/tmp/tmp0jstlf1y/news_3.txt:
1: Indian Political Leaders
2: Election Candidates
3: Personal Sacrifices


/tmp/tmp0jstlf1y/crypto_1.txt:
1: Crypto Regulations and Wildlife Conservation
2: Jail Time for Crypto Scammers and Chairman Rostin's Challenge
3: Warren's Full Accounting Collapse and Government Interactions with Crypto Markets


/tmp/tmp0jstlf1y/news6.txt:
1: Payment Aggregators and Merchant Networth
2: Bank Activity Authorisation and Due Diligence


/tmp/tmp0jstlf1y/news4.txt:
1: Durga Puja violence in West Bengal
2: Chief Minister Mamata Banerjee under attack


/tmp/tmp0jstlf1y/news3.txt:
1: Salman Khan's Involvement in Mumbai Firing Incident
2: Police Investigation into Crime Branch and Bishnoi's Role


/tmp/tmp0jstlf1y/news5.txt:1: Sabha Election Phase
         2: Modi in Bengal
     

In [15]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI
openai_key = "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"
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)
list_of_headings= promptlist


example='''{[Machine Learning:
    [filepath1234],


Artificial Intelligence:
[
    filepath898,
    filepath345],

Data Science:
    [filepath345,
    filepath1234]}'''


prompt=f'''You excel in the art of categorizing diverse topics and news items effectively. Your task is to categorize
a set of main headings into distinct groups, assigning a name to each category. Your goal is to streamline the topics into FEWER,
 yet significant categories and output them in a dictionary format . Below are a few main headings for you to work with. Dive in and create an optimal categorization!
->{list_of_headings}
Do not chaange the filepath assinged to the headlines and only print the filepath under the topic do not print the sentence
give the output in the format of a dictionary
example output->
{example}
 '''


output = llm.invoke(prompt)
print(output)



{News:
    [
        /tmp/tmp0jstlf1y/news_2.txt,
        /tmp/tmp0jstlf1y/news_3.txt,
        /tmp/tmp0jstlf1y/news4.txt,
        /tmp/tmp0jstlf1y/news3.txt,
        /tmp/tmp0jstlf1y/news5.txt
    ],

Crypto:
    [
        /tmp/tmp0jstlf1y/crypto_1.txt
    ],

Politics:
    [
        /tmp/tmp0jstlf1y/news3.txt
    ],

Payment and Banking:
    [
        /tmp/tmp0jstlf1y/news6.txt
    ]
}
