In [1]:
from dotenv import load_dotenv
load_dotenv()
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Data Scraping


In [None]:
# Build the Chrome option set flag by flag, then start the browser session
# that every scraping cell below drives through `driver`.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

In [None]:
# Alternative target (title tt1517268), currently disabled:
# url = "https://www.imdb.com/title/tt1517268/reviews/?ref_=tt_ov_rt"
# Active target: the reviews page for title tt0111161.
url = "https://www.imdb.com/title/tt0111161/reviews"
# Load the page in the Selenium-controlled browser; later cells read from `driver`.
driver.get(url)


In [None]:
# Parse the page as currently rendered by the browser and read the review
# total from the list header (first whitespace-separated token, commas removed).
sel = Selector(text=driver.page_source)
header_text = sel.css('.lister .header span::text').extract_first()
if header_text is None:
    # extract_first() returns None when nothing matches; fail with a clear
    # message instead of an AttributeError on None.replace(...).
    raise ValueError(
        "Review-count header ('.lister .header span') not found in the page source; "
        "the page may not have loaded or its layout may have changed."
    )
review_counts = header_text.replace(',', '').split(' ')[0]
print(review_counts)
# Number of 25-review pages in the total (25 is presumably the page size -- confirm).
# Integer floor division avoids the round trip through float.
more_review_pages = int(review_counts) // 25

In [None]:
from time import sleep
from selenium.common.exceptions import NoSuchElementException, WebDriverException

# Click the "load more" button up to 25 times, pausing 10 s after each
# successful click before the next attempt.
# for i in tqdm(range(more_review_pages)):
load_more_id = 'load-more-trigger'  # element ID (looked up with By.ID), not a CSS selector
for i in tqdm(range(0,25)):
    try:
        driver.find_element(By.ID, load_more_id).click()
    except NoSuchElementException:
        # The button is not in the page any more, so further attempts cannot succeed.
        break
    except WebDriverException as click_error:
        # Best effort, as before: report the failed click and try again on the
        # next iteration. Only Selenium errors are caught, so KeyboardInterrupt
        # and unrelated bugs are no longer swallowed.
        print(f"Load-more click {i} failed: {type(click_error).__name__}")
        continue
    sleep(10)


In [None]:
# Block (10 s at most) until review containers are present in the DOM, then
# keep the located elements.
review_locator = (By.CSS_SELECTOR, 'div.review-container')
wait = WebDriverWait(driver, 10)
elements = wait.until(
    EC.presence_of_all_elements_located(review_locator)
)


In [None]:
# Number of review containers returned by the explicit wait.
len(elements)

In [None]:
# Query the review containers directly (no wait) and show how many were found.
reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')
len(reviews)

In [None]:
# Per-field accumulators; one entry is appended to each for every review that
# parses without error, so all lists stay the same length.
rating_list = []
review_date_list = []
review_title_list = []
author_list = []
review_list = []
review_url_list = []
# Page URL and exception for every review container that failed to parse.
error_url_list = []
error_msg_list = []
reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

for d in tqdm(reviews):
    try:
        # Parse this container's HTML once, then pull each field from it.
        sel2 = Selector(text=d.get_attribute('innerHTML'))
        # extract_first() does not raise when nothing matches, so the NaN
        # placeholder is supplied through `default` rather than try/except.
        rating = sel2.css('.rating-other-user-rating span::text').extract_first(default=np.nan)
        review = sel2.css('.text.show-more__control::text').extract_first(default=np.nan)
        review_date = sel2.css('.review-date::text').extract_first(default=np.nan)
        author = sel2.css('.display-name-link a::text').extract_first(default=np.nan)
        review_title = sel2.css('a.title::text').extract_first(default=np.nan)
        review_url = sel2.css('a.title::attr(href)').extract_first(default=np.nan)
    except Exception as e:
        print("Going in Exception")
        error_url_list.append(url)
        error_msg_list.append(e)
        continue
    # Append only after every field was read so a failure never leaves the
    # lists with different lengths.
    rating_list.append(rating)
    review_date_list.append(review_date)
    review_title_list.append(review_title)
    author_list.append(author)
    review_list.append(review)
    review_url_list.append(review_url)

# One row per successfully parsed review. 'Review_Url' uses the accumulated
# list: the scalar `review_url` would repeat the last URL on every row and be
# undefined when no reviews were found.
review_df = pd.DataFrame({
    'Review_Date': review_date_list,
    'Author': author_list,
    'Rating': rating_list,
    'Review_Title': review_title_list,
    'Review': review_list,
    'Review_Url': review_url_list,
    })

In [None]:
# Show the scraped reviews table as this cell's output.
review_df

In [None]:
# Persist the scraped reviews to disk. `index=False` is not passed, so the
# DataFrame index is written as an extra leading column.
review_df.to_csv("./reviews.csv")

In [None]:
# Use the freshly scraped frame as `data`, the name the indexing cells read.
data = review_df

In [None]:
import pandas as pd
# Load reviews from a saved CSV. This rebinds `data`, replacing any frame
# assigned to that name earlier in the session.
# NOTE(review): the scraping section writes ./reviews.csv but this reads
# ./barbie.csv -- confirm which file is intended.
data = pd.read_csv("./barbie.csv")
# Preview the first rows.
data.head()


## Data Parsing

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each review into chunks before embedding:
#   chunk_size      -- chunk length, measured by length_function
#   chunk_overlap   -- overlap between consecutive chunks
#   length_function -- Python's len(), i.e. lengths are counted in characters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [2]:
import pinecone
import os
index_name = 'movie-review-index'

# API key and environment are read from environment variables; a missing
# variable raises KeyError here.
pinecone.init(api_key=os.environ['PINECONE_API_KEY'], environment=os.environ['PINECONE_ENV'])

# Create the index only when it does not exist yet. The dimension has to equal
# the embedding model's output size (1536 for text-embedding-ada-002).
index_exists = index_name in pinecone.list_indexes()
if not index_exists:
    pinecone.create_index(name=index_name, metric='cosine', dimension=1536)

In [3]:
# Open a gRPC handle to the index; the upsert cell writes through `index`.
index = pinecone.GRPCIndex(index_name)

# Show the index statistics (dimension, vector counts) as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 858}},
 'total_vector_count': 858}

### Caching

In [4]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.storage import LocalFileStore

# File-backed byte store under ./cache/ that holds the cached embeddings.
store = LocalFileStore("./cache/")

# Underlying embedding model; its model name doubles as the cache namespace.
core_embeddings_model = OpenAIEmbeddings()

# Wrap the model so embeddings are looked up in / written to `store`.
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=core_embeddings_model.model
)

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Embed and upload once at least this many chunks have accumulated.
BATCH_LIMIT = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed ``batch_texts`` and upsert them into ``index`` with their metadata.

    Every vector gets a fresh random UUID as its id. Does nothing when the
    batch is empty.
    """
    if not batch_texts:
        return
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embedder.embed_documents(batch_texts)
    index.upsert(vectors=zip(ids, embeds, batch_metadatas))


texts = []
metadatas = []

for row_label, record in tqdm(data.iterrows(), total=len(data)):

    review_text = record["Review"]
    if not isinstance(review_text, str):
        # A missing review is not a string (e.g. NaN after pd.read_csv) and
        # cannot be split; skip the row instead of aborting the whole upload.
        continue

    # Review-level metadata, stringified and copied onto every chunk.
    metadata = {
        'review-url': str(record["Review_Url"]),
        'review-date' : str(record["Review_Date"]),
        'author' : str(record["Author"]),
        'rating' : str(record["Rating"]),
        'review-title' : str(record["Review_Title"]),
    }

    record_texts = text_splitter.split_text(review_text)

    # "text" carries the chunk itself so it can be read back from the index.
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    if len(texts) >= BATCH_LIMIT:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# Flush whatever is left after the last full batch.
_upsert_batch(texts, metadatas)

In [None]:
# Re-check the index statistics after the upsert cell.
index.describe_index_stats()

## Q and A with Vector Store

In [5]:
from langchain.vectorstores import Pinecone

# Metadata key under which each chunk's text was stored at upsert time.
text_field = "text"

# Rebind `index` to a standard (non-gRPC) index handle for the LangChain wrapper.
index = pinecone.Index(index_name)

# Vector store that embeds queries with the cache-backed embedder.
vectorstore = Pinecone(index, embedder.embed_query, text_field)

In [6]:
# Probe sentence for the retrieval check.
query = "I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow."

# Return the 3 stored chunks most similar to the probe.
vectorstore.similarity_search(query, k=3)

[Document(page_content="I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow. The film was very disjointed. Ryan Gosling as good as he is seemed to old to play Ken and Will Ferrell ruined every scene he was in. I just didn't get it, it seemed hollow artificial and hackneyed. A waste of some great talent. It was predictable without being reassuring and trying so hard to be woke in the most superficial way in that but trying to tick so many boxes it actually ticked none. Margo Robbie looks beautiful throughout, the costumes and the sets were amazing but the story was way too weak and didn't make much sense at all.", metadata={'author': 'agjbull', 'chunk': 0.0, 'rating': '6.0', 'review-date': datetime.datetime(2023, 7, 23, 0, 0), 'review-title': ' Just a little empty\n', 'review-url': '/review/rw9221648/?ref_=tt_urv'}),
 Document(page_content="because what really matters is the story. Well it fell short on that mark 

In [None]:
%%timeit
query = "I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow."
vectorstore.similarity_search(
    query, 
    k=3  
)

## Q and A Chain

In [None]:
from langchain.llms.openai import OpenAIChat

# LLM used by the QA chain, configured with temperature=0.
llm = OpenAIChat(model="gpt-3.5-turbo", temperature=0)
# Retriever view of the Pinecone-backed vector store, with default settings.
retriever = vectorstore.as_retriever()


In [None]:
from langchain.callbacks import StdOutCallbackHandler
from langchain.chains import RetrievalQA

# Callback handler attached to the chain below.
handler = StdOutCallbackHandler()

# Retrieval QA chain over `retriever`; return_source_documents=True makes each
# result carry the retrieved documents alongside the answer.
qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    callbacks=[handler],
)

In [None]:
# Run the chain on one question and show only the answer text ("result").
qa_with_sources_chain({"query" : "How was Will Ferrell in this movie?"})["result"]

In [None]:
# Second question; again only the answer text is displayed.
qa_with_sources_chain({"query" : "Do reviewers consider this movie Kenough?"})["result"]

In [None]:
# Keep the full output mapping this time so the next cells can inspect it.
result = qa_with_sources_chain({"query" : "Was Will Ferrel funny?"})

In [None]:
# Dump every entry of the chain output: key line, value line, blank separator.
for entry_key, entry_value in result.items():
    print(f"Key: {entry_key}\nValue: {entry_value}\n")

In [None]:
# Each source is a Document object, so read its fields by attribute.
# Tuple-unpacking a Document iterates its (field_name, value) pairs, which
# prints tuples instead of the values and fails outright when a Document has
# any number of fields other than two.
for source_doc in result["source_documents"]:
    print(f"Metadata: {source_doc.metadata}")
    print(f"Page Content: {source_doc.page_content}")
    print("")

## Tracing with W and B

In [None]:
# Run the question once before `langchain.llm_cache` is set in the next cell.
qa_with_sources_chain({"query" : "Do reviewers consider this movie Kenough?"})["result"]

In [None]:
import langchain
from langchain.cache import InMemoryCache
# Install an in-memory cache as LangChain's global LLM cache; presumably
# repeated identical prompts are then answered from memory -- the timed cells
# below check this.
langchain.llm_cache = InMemoryCache()

In [None]:
%%time
# First timed run of the question after the LLM cache was installed.
qa_with_sources_chain({"query" : "Do reviewers consider this movie Kenough?"})["result"]

In [None]:
%%time
# Same question again; compare the reported time with the previous timed cell.
qa_with_sources_chain({"query" : "Do reviewers consider this movie Kenough?"})["result"]

In [None]:
%%time
# Slightly reworded question ("this here movie"); presumably included to
# compare its timing with the exact repeat above -- confirm intent.
qa_with_sources_chain({"query" : "Do reviewers consider this here movie Kenough?"})["result"]