In [96]:
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number, Bool
from langchain.chat_models import ChatOpenAI

import os
import pandas as pd

# Setup API Keys
with open('config/config.txt') as f:
    for line in f:
        env_data = line.split(',')
        os.environ[env_data[0]] = env_data[1]

# Initialize Model
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2000,
)

# Get Review Data
reviews = pd.read_csv('data/review_sample.csv', index_col=0)

# Deprecated

## Extract ideas from reviews one by one

In [126]:
# Defines the schema for extracting the relevant ideas from a yelp review. 
idea_schema = Object(
    id="Ideas",
    description="All of the relevat ideas expressed by a customer writing a review for a business.",
    examples=[
        ("As far as diners go, you just don't get much better than this. Worth the wait, great staff behind the counter. A regular spot for us to visit when in the Big Easy.", [{"idea": "one of the best diners"}, {"idea": "worth the wait"}, {"idea": "great staff"}, {"idea": "visits regularly when in the area"}])
    ],
    attributes=[
        Text(
            id="idea",
            description="A short sentence describing an idea expressed in the text. Make sure the idea reflects what the customer is conveying not what they literally say. No more than 10 words.",
        )
    ],
    many=True,
)

# Initialize the chain with the schema.
extraction_chain = create_extraction_chain(llm, idea_schema)

review_sample = reviews.iloc[20:25]

data = {
    'review_id': [],
    'idea': []
}

for i in range(review_sample.shape[0]):
    review = review_sample.iloc[i]
    ideas = extraction_chain.invoke(review['text'])['text']['data']['Ideas']
    for idea in ideas:
        data['review_id'].append(review['review_id'])
        data['idea'].append(idea['idea'])

data = pd.DataFrame(data)
data

Unnamed: 0,review_id,idea
0,Tmg23-TKzaI8RrGz4pVmAA,attentive and abrupt staff
1,Tmg23-TKzaI8RrGz4pVmAA,unique lingo among cooks
2,Tmg23-TKzaI8RrGz4pVmAA,amazing burgers
3,Tmg23-TKzaI8RrGz4pVmAA,best burgers in the area
4,Tmg23-TKzaI8RrGz4pVmAA,reasonably priced
5,Tmg23-TKzaI8RrGz4pVmAA,will return next time in New Orleans
6,hpc9hWgqWl-Mx5Pwb6On1A,light and fluffy eggs
7,hpc9hWgqWl-Mx5Pwb6On1A,best breakfast
8,hpc9hWgqWl-Mx5Pwb6On1A,omelettes above and beyond
9,hpc9hWgqWl-Mx5Pwb6On1A,must try


## Merge Related Ideas

In [93]:
# Defines the schema for merging ideas
merge_schema = Object(
    id="Merge",
    description="Merges two the two ideas presented into one sentence and decides whether they express the same meaning or not using a boolean.",
    examples=[
        ("idea 1: atmosphere sets the place apart \nidea 2: good atmosphere", {'idea':'positive atmosphere', 'equivalent':True}),
        ("idea 1: burgers are great \nidea 2: terrific hamburgers", {'idea':'great hamburgers', 'equivalent':True}),
        ("idea 1: will return \nidea 2: true original gem", {'idea':'positive atmosphere', 'equivalent':False})
    ],
    attributes=[
        Text(
            id="idea",
            description="A short sentence that captures the meaning of both ideas presented.",
        ),
        Bool(
            id="equivalent",
            description="Returns true if the ideas presented express the same core idea returns false otherwise."
        )
    ],
    many=False,
)

# Initialize a chain with the schema.
merge_chain = create_extraction_chain(llm, merge_schema)

In [96]:
# merge_chain.invoke()
a = data.iloc[2].idea
b = data.iloc[25].idea

chain_input = f'idea 1: {a} \nidea 2: {b}'

a, b

('delicious burgers', 'good burgers')

In [97]:
merge_chain.invoke(chain_input)

{'text': {'data': {'Merge': [{'idea': 'tasty burgers', 'equivalent': 'True'}]},
  'raw': 'idea|equivalent\ntasty burgers|True',
  'errors': [],
  'validated_data': {}}}

# Model A

## Extract Ideas from Reviews in Batches

### Schema and LLM Chain

In [51]:
# Defines the schema for extracting the relevant ideas from a batch yelp reviews. 
idea_collection_schema = Object(
    id="Ideas",
    description="Extract all of the relevant ideas expressed by customers in their reviews for a business. Do NOT include the same idea twice.",
    examples=[
        (
            """
            Review 1: 
            As far as diners go, you just don't get much better than this. Worth the wait, great staff behind the counter. A regular spot for us to visit when in the Big Easy. 
            
            Review 2: 
            best breakfast ive had in a very long time. ive been very disappointed with breakfast, maybe its just me. thats why cammellias was so amazing i had the omelet on the bottom of the menu maybe camellia, chef  or something. so good! my brother had the mexican which was also very tasty. then we had a pecan omelet and that my friends is to die for. its thinner than most which i really enjoyed, get you order in asap they get busy. the service however is very personable and awesome.

            Review 3: 
            I recently visited the new French Quarter location and enjoyed it as much as the Carrollton location. Make sure you get the Manhattan Omelette. To die for!
            """, 
            [
                {"idea": "one of the best diners"}, 
                {"idea": "best breakfast"},
                {"idea": "gets busy"},
                {"idea": "service is great"}, 
                {"idea": "visits regularly when in the area"},
                {"idea": "disappointed with brekfast before"}, 
                {"idea": "omelets are very good"}, 
                {"idea": "pecan omelet is very good"}, 
                {"idea": "Manhattan Omelette is very good"},
                {"idea": "Mexican breakfast was tasty"},
                {"idea": "French Quarter location is as good as Carrollton location"},
            ]
        )
    ],
    attributes=[
        Text(
            id="idea",
            description="A short sentence describing an idea expressed in the text. Make sure the idea reflects what the customer is conveying not what they literally say. No more than 10 words. No idea should have the same meaning as any preceding idea.",
        )
    ],
    many=True,
)

batch_extraction_chain = create_extraction_chain(llm, idea_collection_schema)

### Process Reviews

In [52]:
batches = [reviews.iloc[20:100], reviews.iloc[100:180]]
ideas = []

for batch in batches:
    chain_input = ''
    for i in range(batch.shape[0]):
        chain_input += f'Review {i}:\n{batch.iloc[i].text}\n\n\n'
    print('build chain input!')
    result = batch_extraction_chain.invoke(chain_input)
    print('done with result!')
    ideas += list(map(lambda x: x['idea'], result['text']['data']['Ideas']))
    print('formatted ideas!')

build chain input!
done with result!
formatted ideas!
build chain input!
done with result!
formatted ideas!


## Marge Batches of Ideas

In [53]:
len(ideas)

191

In [71]:
reduce_schema = Object(
    id='ReducedIdeas',
    # description='Takes in many ideas that have overlapping meanings and reduces to only the most fundaments. No two ideas outputted should express the same thing.',
    description='Look through all the ideas presented and eliminate ones that have duplicate meaning. There should be fewer outgoing ideas than there are presented in the prompt.',
    examples= [
        (
            """
            Idea 0:
            yummy omelettes

            Idea 1:
            the omelettes were very good

            Idea 2:
            good burgers
            """, 
            [
                {'idea': 'tasty omelettes'}, 
                {'idea': 'good burgers'}
            ]
        )
    ],
    attributes=[
        Text(
            id="idea",
            description="A short sentence describing an idea or group of ideas expressed in the text. No two ideas should express the same meaning.",
        )
    ],
    many=True
)

reduce_chain = create_extraction_chain(llm, reduce_schema)

In [76]:
def reduce_ideas(ideas):
    reduce_chain_input = ''
    for i in range(50):
        reduce_chain_input += f'Idea {i}:\n{ideas[i]}\n\n'

    reduce_chain_result = reduce_chain.invoke(reduce_chain_input)
    ideas = list(
        map(lambda x: x['idea'], 
        reduce_chain_result['text']['data']['ReducedIdeas'])
    )
    
    return ideas 

In [77]:
ideas_reduced = reduce_ideas(ideas)
ideas_reduced

['best burgers in the area',
 'light and fluffy eggs',
 'omelets are above and beyond',
 "chef's special is a bargain",
 'hash browns are flavorful',
 'pecan waffle is good',
 'fries are above average',
 'Mexican omelette is tasty',
 'western omelette is the best',
 'counter service only',
 'chocolate freeze is very good',
 'Manhattan omelette is exceptional',
 'roast beef po-boy is delicious',
 'great prices',
 'great food',
 'friendly service',
 'counter-side seating',
 'fun and friendly atmosphere',
 'great old fashioned place',
 'great food and excellent service',
 'must try in Nola',
 'great late night spot',
 'best comfort food',
 'great for drunk moments',
 'worth the wait for New Orleans charm and delicious food',
 'stellar food, fantastic service',
 'fun experience',
 'affordable, tasty, good dang fun',
 'great hangover food',
 'great for tourists',
 'good southern breakfast served with charm and humor',
 'fast and friendly atmosphere',
 'serious value',
 'classic breakfast sp

In [75]:
ideas

['best burgers in the area',
 'light and fluffy eggs',
 'omelets are above and beyond',
 "chef's special is a bargain",
 'hash browns are flavorful',
 'pecan waffle is good',
 'fries are above average',
 'Mexican omelette is tasty',
 'western omelette is the best',
 'counter service only',
 'chocolate freeze is very good',
 'Manhattan omelette is exceptional',
 'roast beef po-boy is delicious',
 'great prices',
 'great food',
 'friendly service',
 'counter-side seating',
 'fun and friendly atmosphere',
 'great old fashioned place',
 'great food and excellent service',
 'must try in Nola',
 'great late night spot',
 'best comfort food',
 'great for drunk moments',
 'worth the wait for New Orleans charm and delicious food',
 'stellar food, fantastic service',
 'fun experience',
 'affordable, tasty, good dang fun',
 'great hangover food',
 'great for tourists',
 'good southern breakfast served with charm and humor',
 'fast and friendly atmosphere',
 'serious value',
 'classic breakfast sp

## Reverse Mapping

In [78]:
len(ideas_reduced)

47

In [80]:
# Schema to decide whether a given idea is expressed by a review
is_expressed_schema = Object(
    id="is-expressed",
    description="A boolean to decide whether the given idea is represented in the given review",
    examples=[
        (
            """
            Idea:
            enjoyed the service

            Review:
            Missed Marvin this time around. Really enjoyed the chocolate shake. Went later in the day than I normally had in the past. Best time to go is as soon as they open. If you're lucky enough to catch Marvin working then you're in for a treat! He's definitely entertaining!
            """, 
            {"expresses":"True"}
        ),
        (
            """
            Idea:
            great pecan pie

            Review:
            Missed Marvin this time around. Really enjoyed the chocolate shake. Went later in the day than I normally had in the past. Best time to go is as soon as they open. If you're lucky enough to catch Marvin working then you're in for a treat! He's definitely entertaining!
            """,
            {"expresses":"False"}
        ),
    ],
    attributes=[
        Bool(
            id="expresses",
            description="Returns True when the idea is expressed by the review. Returns False otherwise.",
        ),
    ],
    many=False,
)

is_expressed_chain = create_extraction_chain(llm, is_expressed_schema)


In [89]:
def idea_in_reviews(idea, reviews):
    '''
    Takes in one idea and a list of reviews and returns all of the review ids 
    that express the given idea.
    '''
    ids = []

    for i in range(len(reviews)):
        review = reviews.iloc[i].text

        chain_input = f'Idea:\n{idea}\n\nReview:\n{review}'
        # is_expressed_data['chain_input'].append(chain_input)
        result = is_expressed_chain.invoke(chain_input)

        is_expressed = result['text']['data']['is-expressed'][0]['expresses']
        if is_expressed == 'True':
            ids.append(reviews.iloc[i].review_id)
        
        print(f'review {i} is expressed: {is_expressed}')

    return ids

# model_output = {idea: idea_in_reviews(idea, reviews.iloc[20:180]) for idea in ideas_reduced}
model_output = {}

for i, idea in enumerate(ideas_reduced):
    model_output[idea] = idea_in_reviews(idea, reviews.iloc[20:180])
    print(f'finished {i+1}/{len(ideas_reduced)}')
model_output

review 0 is expressed: True
review 1 is expressed: False
review 2 is expressed: True
review 3 is expressed: False
review 4 is expressed: False
review 5 is expressed: False
review 6 is expressed: True
review 7 is expressed: True
review 8 is expressed: False
review 9 is expressed: False
review 10 is expressed: False
review 11 is expressed: False
review 12 is expressed: False
review 13 is expressed: False
review 14 is expressed: False
review 15 is expressed: False
review 16 is expressed: False
review 17 is expressed: False
review 18 is expressed: False
review 19 is expressed: False
review 20 is expressed: False
review 21 is expressed: False
review 22 is expressed: False
review 23 is expressed: False
review 24 is expressed: False
review 25 is expressed: True
review 26 is expressed: False
review 27 is expressed: False
review 28 is expressed: False
review 29 is expressed: False
review 30 is expressed: False
review 31 is expressed: False
review 32 is expressed: False
review 33 is expressed: F

KeyboardInterrupt: 

In [93]:
model_output

{'best burgers in the area': ['Tmg23-TKzaI8RrGz4pVmAA',
  'peC35r9sMcjX0dvCpgQv-g',
  'gk-fmydqtLbdLTs1lP-4LQ',
  'rRu0SYqrdMtNRcEr58mo8A',
  'pPwyaM4a67-aQj9GcpaMlg',
  'z6RWPrj4of5XoQKzeQKbDw',
  '4PKQCo6MbPFqgqGpZ34i5Q',
  'HisAUXzhgM0sv0MOC_hV5Q',
  'WZKDokoJoXYcvY7B0geIfg',
  '4aJY-Q8qE4JJxTcD0uwCrA',
  'R4D6VSyXxyCU31Pj9geCKQ',
  'uutZFDVG9gzU7z3RF4lkww',
  'yQ2XawvLZVR26gb2GJ1MDg',
  'DRvyPlrI-DWvll7cPe09ug',
  '6lm4174XiBQKV9apQTdelQ',
  'IpS_jEl5ZM3u1Bwqtpu1vA',
  'KfNvOanVF3-kKO60aYKHlA',
  'rZJq9-S3JOU2MFosKKN50w'],
 'light and fluffy eggs': ['XjeDKH1wflLtpI60cZL0HQ', 'GKH4BqrvzEpmrfzBYuzFIQ']}

### Parallelization Tests

In [105]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

def process_review(idea, review):
    '''
    Processes a single review to determine if it expresses the given idea.
    Returns the review_id if the idea is expressed.
    '''
    review_text = review['text']
    chain_input = f'Idea:\n{idea}\n\nReview:\n{review_text}'
    result = is_expressed_chain.invoke(chain_input)
    is_expressed = result['text']['data']['is-expressed'][0]['expresses']
    
    if is_expressed == 'True':
        return review['review_id']
    return None

def idea_in_reviews_parallel(idea, reviews):
    '''
    Takes in one idea and a DataFrame of reviews, uses multiple threads to check each review,
    and returns all of the review ids that express the given idea.
    '''
    # Create a ThreadPoolExecutor to manage multiple threads
    with ThreadPoolExecutor() as executor:
        # Map process_review function across all reviews for the given idea
        results = executor.map(lambda review: process_review(idea, review), reviews.to_dict('records'))
    
    # Filter out None results and collect valid review_ids
    ids = [result for result in results if result is not None]
    return ids

# Example usage:
model_output = {}
for i, idea in enumerate(ideas_reduced[0:10]):
    result_ids = idea_in_reviews_parallel(idea, reviews.iloc[20:100])
    model_output[idea] = result_ids
    print(f'finished {i+1}/{len(ideas_reduced)}')

model_output

finished 1/47


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-tuvCutU1KiazOwnsMOF1HZc1 on tokens per min (TPM): Limit 60000, Used 59095, Requested 2447. Please try again in 1.542s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [109]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

# Assuming a callback implementation and an API similar to OpenAI's API is available for rate limit tracking
from langchain_community.callbacks import get_openai_callback

rate_limit_per_minute = 60000  # tokens per minute limit

def is_expressed_chain_invoke(chain_input, callback):
    # This is a placeholder for the actual API call
    result = llm.invoke(chain_input)  # Assume this uses the llm object which is an instance of OpenAI
    
    return result

def process_review(idea, review, callback):
    '''
    Processes a single review to determine if it expresses the given idea.
    Returns the review_id if the idea is expressed.
    '''
    review_text = review['text']
    chain_input = f'Idea:\n{idea}\n\nReview:\n{review_text}'
    
    result = is_expressed_chain_invoke(chain_input, callback)
    is_expressed = result['text']['data']['is-expressed'][0]['expresses']
    
    if is_expressed == 'True':
        return review['review_id']
    return None

def idea_in_reviews_parallel(idea, reviews):
    '''
    Takes in one idea and a DataFrame of reviews, uses multiple threads to check each review,
    and returns all of the review ids that express the given idea, while respecting API rate limits.
    '''
    ids = []
    with get_openai_callback() as cb, ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_review, idea, review, cb) for review in reviews.to_dict('records')]
        
        for future in as_completed(futures):
            if cb.total_tokens >= rate_limit_per_minute:
                print(f"Approaching rate limit: {cb.total_tokens} tokens used.")
                sleep(60)  # Sleep for a minute when approaching the rate limit
            result = future.result()
            if result is not None:
                ids.append(result)
                
    return ids

model_output = {}
for i, idea in enumerate(ideas_reduced[0:10]):
    result_ids = idea_in_reviews_parallel(idea, reviews.iloc[20:100])
    model_output[idea] = result_ids
    print(f'finished {i+1}/{len(ideas_reduced)}')

model_output


AttributeError: 'OpenAICallbackHandler' object has no attribute 'update_tokens'

In [None]:
def process_review(idea, review):
    chain_input = f'Idea:\n{idea}\n\nReview:\n{review["text"]}'
    result = is_expressed_chain.invoke(chain_input)
    is_expressed = result['text']['data']['is-expressed'][0]['expresses']
    print(f'Review {review["review_id"]} is expressed: {is_expressed}')
    if is_expressed == 'True':
        return review['review_id']
    return None


### Example Model Output

In [135]:
import random
import json 

# Dictionary to store the model output
example_model_out = {}

for i, idea in enumerate(ideas_reduced):
    # Randomly sample between 0 and 10 reviews (inclusive) and retrieve their 'review_id'
    ids = list(reviews.iloc[20:180].sample(random.randint(0, 10))['review_id'])
    example_model_out[idea] = ids

# Convert dictionary to JSON string
json_output = json.dumps(example_model_out, indent=4)  # 'indent' for pretty printing

# Saving the JSON output to a file
with open("output/example_output.json", "w") as json_file:
    json_file.write(json_output)
