In [1]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI

In [2]:
from langchain.document_loaders.csv_loader import CSVLoader

In [3]:
import pandas as pd
pos_file = 'dataset/pos_reviews.csv'
neg_file = 'dataset/neg_reviews.csv'
df = pd.read_csv(pos_file)
df.columns

Index(['Unnamed: 0', 'review'], dtype='object')

In [4]:
def preprocess(text, stop_words):
    """
    Tokenizes and preprocesses the input text, removing stopwords and short
    tokens.

    Parameters:
        text (str): The input text to preprocess.
        stop_words (set): A set of stopwords to be removed from the text.
    Returns:
        list: A list of preprocessed tokens.
    """
    result = []
    for token in simple_preprocess(text, deacc=True):
        if token not in stop_words and len(token) > 3:
            result.append(token)
    return result

In [5]:
def get_topic_lists_from_csv(file, num_topics, words_per_topic):
    """
    Extracts topics and their associated words from a PDF document using the
    Latent Dirichlet Allocation (LDA) algorithm.

    Parameters:
        file (str): The path to the PDF file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.

    Returns:
        list: A list of num_topics sublists, each containing relevant words
        for a topic.
    """
    # Load the pdf file
    loader = CSVLoader(file_path= file, csv_args={
      'delimiter': ',',
      'quotechar': '"',
      'fieldnames': ['Unnamed: 0', 'statnNm', 'rating', 'review', 'review_count']
    })

    # Extract the text from each page into a list. Each page is considered a document
    documents= []
    for data in loader.load():
        documents.append(data.page_content)

    # Preprocess the documents
    nltk.download('stopwords')
    stop_words = set(stopwords.words(['english','spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15
        )

    # Retrieve the topics and their corresponding words
    topics = lda_model.print_topics(num_words=words_per_topic)

    # Store each list of words from each topic into a list
    topics_ls = []
    for topic in topics:
        words = topic[1].split("+")
        topic_words = [word.split("*")[1].replace('"', '').strip() for word in words]
        topics_ls.append(topic_words)

    return topics_ls

In [6]:
pos_template = '''Describe the postive topic of each of the {num_topics}
        double-quote delimited lists in a simple phrase and also write down
        ten possible words that can describe the topic. The lists are the result of an
        algorithm for topic discovery. While selecting the topic, the pronouns are not allowed.

        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>

        2: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>


        ...

        n: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>


        Lists: """{string_lda}""" '''

neg_template = '''Describe the negative topics of each of the {num_topics}
        double-quote delimited lists in a simple phrase and also write down
        ten possible words that can describe the topic. The lists are the result of an
        algorithm for topic discovery. While selecting the topic, the pronouns are not allowed.

        Do not provide an introduction or a conclusion, only describe the
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>

        2: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>


        ...

        n: <<<(sentence describing the topic)>>>
        - <<<(whether topic is positive or negative)>>>
        - <<<(words describing the topic)>>>


        Lists: """{string_lda}""" '''

In [7]:
def topics_from_csv(llm, template, file, num_topics, words_per_topic):
    """
    Generates descriptive prompts for LLM based on topic words extracted from a
    PDF document.

    This function takes the output of `get_topic_lists_from_pdf` function,
    which consists of a list of topic-related words for each topic, and
    generates an output string in table of content format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating
        responses.
        file (str): The path to the PDF file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided
        topic words.
    """

    # Extract topics and convert to string
    list_of_topicwords = get_topic_lists_from_csv(file, num_topics,
                                                  words_per_topic)
    string_lda = ""
    for list in list_of_topicwords:
        string_lda += str(list) + "\n"

    # Create the template
    template_string = template

    # LLM call
    print(string_lda)

    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda" : string_lda,
        "num_topics" : num_topics
        })

    return response

In [8]:
openai_key = 'YOUR_API_KEY'
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

  warn_deprecated(


In [9]:
num_topics = 6
words_per_topic = 30

summary = topics_from_csv(llm, pos_template, pos_file, num_topics, words_per_topic)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/myeongseop.kim/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['none', 'airport', 'station', 'review', 'unnamed', 'statnnm', 'rating', 'review_count', 'train', 'also', 'ticket', 'sinagawa', 'keikyu', 'people', 'haneda', 'express', 'stop', 'escalator', 'easy', 'elevator', 'good', 'gate', 'kawasaki', 'clean', 'want', 'entrance', 'line', 'narita', 'know', 'take']
['none', 'review', 'unnamed', 'statnnm', 'review_count', 'rating', 'station', 'good', 'line', 'also', 'like', 'think', 'time', 'want', 'place', 'clean', 'tokyo', 'easy', 'home', 'shibuya', 'yokohama', 'city', 'people', 'convenient', 'walk', 'transfer', 'enjoy', 'know', 'feel', 'seems']
['station', 'line', 'none', 'also', 'tokyo', 'many', 'transfer', 'shinjuku', 'convenient', 'time', 'review', 'review_count', 'statnnm', 'unnamed', 'rating', 'people', 'ticket', 'good', 'subway', 'think', 'home', 'exit', 'metro', 'gate', 'around', 'used', 'doei', 'walk', 'place', 'asakusa']
['station', 'none', 'line', 'good', 'review', 'rating', 'unnamed', 'statnnm', 'review_count', 'also', 'train', 'many', 'c

  warn_deprecated(


In [10]:
summary

'\n1: <<<The convenience of traveling to and from the airport>>>\n- Positive\n- Easy, convenient, clean, good, gate, entrance, line, stop, ticket, take\n\n2: <<<The overall experience of using an unnamed station>>>\n- Neutral\n- Review, review count, rating, also, think, time, place, clean, people, walk\n\n3: <<<The convenience of transferring at a busy station>>>\n- Positive\n- Convenient, transfer, time, review, review count, rating, people, good, exit, gate\n\n4: <<<A good experience at a popular and busy station>>>\n- Positive\n- Station, line, good, review, rating, convenient, shopping, clean, place, time\n\n5: <<<The famous and well-known stations in a specific city>>>\n- Neutral\n- Station, city, people, shopping, famous, place, like, many, temperature, intersection\n\n6: <<<The positive experience of using public transportation in a big city>>>\n- Positive\n- Good, subway, train, clean, people, city, great, service, well, crowded'

In [11]:
num_topics = 6
words_per_topic = 30

summary = topics_from_csv(llm, neg_template, neg_file, num_topics, words_per_topic)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/myeongseop.kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['none', 'station', 'rating', 'statnnm', 'unnamed', 'review_count', 'review', 'line', 'good', 'ticket', 'also', 'people', 'time', 'think', 'gate', 'high', 'home', 'store', 'exit', 'difficult', 'infection', 'transfer', 'train', 'groove', 'side', 'stop', 'risk', 'many', 'area', 'walk']
['station', 'none', 'line', 'people', 'train', 'many', 'subway', 'review', 'rating', 'unnamed', 'review_count', 'statnnm', 'think', 'time', 'also', 'good', 'airport', 'elevator', 'express', 'high', 'restaurants', 'tokyo', 'entrance', 'speed', 'walk', 'around', 'convenience', 'shops', 'stores', 'minutes']
['none', 'station', 'review', 'unnamed', 'rating', 'review_count', 'statnnm', 'also', 'convenient', 'line', 'transfer', 'think', 'train', 'ticket', 'good', 'many', 'difficult', 'first', 'time', 'exit', 'near', 'express', 'home', 'make', 'restaurant', 'shops', 'stop', 'place', 'stairs', 'shop']
['station', 'none', 'line', 'review', 'unnamed', 'review_count', 'statnnm', 'rating', 'also', 'home', 'people', 'm

In [12]:
summary

'\n1: <<<The station is difficult to navigate.>>>\n- <<Negative>>\n- <<Difficult, navigate, station, gate, transfer, exit, time, people, area, walk>>\n\n2: <<<The train is always crowded.>>>\n- <<Negative>>\n- <<Crowded, train, people, line, subway, transfer, time, station, ticket, shops>>\n\n3: <<<The station is not convenient.>>>\n- <<Negative>>\n- <<Not convenient, station, transfer, time, home, exit, stop, restaurant, shops, stairs>>\n\n4: <<<The station has confusing signs.>>>\n- <<Negative>>\n- <<Confusing, signs, station, transfer, time, entrance, people, line, gate, ticket>>\n\n5: <<<The train is often delayed.>>>\n- <<Negative>>\n- <<Delayed, train, time, transfer, station, express, people, shops, subway, line>>\n\n6: <<<The station is not well-maintained.>>>\n- <<Negative>>\n- <<Not well-maintained, station, transfer, ticket, time, exit, express, shops, subway, line>>'