In [32]:
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
import textwrap
import sys
import os
from transformers import BitsAndBytesConfig,AutoModelForCausalLM, AutoTokenizer

In [43]:
import pdfplumber
import re
import torch

In [13]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one newline-joined string.

    Args:
        pdf_path: Value handed unchanged to ``pdfplumber.open``.

    Returns:
        The extracted text of all pages, in page order, separated by a single
        newline. Pages that yield no text (``None`` or an empty string) are
        skipped. An empty string is returned when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Two fixes over plain `text += page.extract_text()`:
        #  * a page without text can yield None, which would raise TypeError;
        #  * joining with "\n" keeps the last word of one page from being
        #    glued to the first word of the next.
        return "\n".join(chunk for chunk in page_texts if chunk)

# Raw text of the screenplay PDF; the file is looked up relative to the
# notebook's working directory.
pdf_text = extract_text_from_pdf("blade_runner_2049.pdf")

In [28]:
def preprocess_text(text):
    """Normalise raw text before it is written out and indexed.

    Every character that is not an ASCII letter, an ASCII digit or whitespace
    is deleted, and the remainder is lower-cased. Whitespace, including
    newlines, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Filtering happens before lower-casing, so only characters that were
    # already ASCII alphanumerics survive.
    ascii_alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ascii_alnum_only.lower()


In [26]:
# Cleaned, lower-cased version of the PDF text used for everything downstream.
preprocessed_text = preprocess_text(pdf_text)

In [37]:
# Destination for the cleaned text; it is written to disk so that it can be
# read back through a file-based document loader.
file_path = "pdffile.txt"

# Open the file in write mode (any existing file of that name is overwritten)
with open(file_path, "w", encoding="utf-8") as file:
    # Write the preprocessed text to the file
    file.write(preprocessed_text)

In [38]:
# The text file is written as UTF-8, so it is read back as UTF-8 explicitly.
# Without the argument the loader falls back to the platform's default
# encoding, which can mis-decode or fail on non-ASCII whitespace that survives
# preprocessing.
loader = TextLoader('pdffile.txt', encoding='utf-8')

In [40]:
# Read the text file through the loader; the result is what gets split into
# chunks further down.
documents = loader.load()

In [45]:
# Sentence-embedding model used to vectorise the text chunks. The 'cuda'
# device setting means a GPU is required for this cell.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cuda'},
)

In [46]:
# Split the loaded document on newlines (chunk_size=1000, chunk_overlap=200),
# then embed the chunks and index them in an in-memory FAISS store.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=1000,
    chunk_overlap=200,
)
text_chunks = text_splitter.split_documents(documents)
vectorstore = FAISS.from_documents(text_chunks, embeddings)

In [44]:
# Quantisation settings passed to the model loader: 4-bit weights using the
# "nf4" quantisation type, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [48]:
# Hugging Face access token for the Llama-2 checkpoint. It is read from the
# HF_TOKEN environment variable so that no credential is stored in the
# notebook. When the variable is unset, `token` is None and the library falls
# back to any login cached on this machine (e.g. via `notebook_login()`).
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked.
token = os.environ.get("HF_TOKEN")
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=token,
    trust_remote_code=True,
    padding_side="left",
)

In [49]:
# Load the chat model with the 4-bit quantisation config, letting the library
# place the weights across the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
# The key/value cache is deliberately left at its default. Setting
# `model.config.use_cache = False` is a training-time setting; this notebook
# only generates text, where disabling the cache can only slow decoding down.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [50]:
# Text-generation pipeline wrapping the quantised model and its tokenizer.
# One sequence is returned per prompt and a repetition penalty of 1.2 applies.
# NOTE(review): max_length presumably bounds prompt and generated tokens
# combined; confirm it leaves enough room once retrieved context is stuffed
# into the prompt.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=3000,
    num_return_sequences=1,
    repetition_penalty=1.2,
)

In [52]:
# Expose the transformers pipeline as a LangChain LLM and build a "stuff"
# retrieval-QA chain on top of the FAISS index.
llm = HuggingFacePipeline(pipeline=pipe)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    return_source_documents=False,
    # The number of chunks to retrieve must be passed through `search_kwargs`.
    # A bare `k=2` keyword to `as_retriever` does not reach the similarity
    # search, so the default number of chunks would be retrieved instead.
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
)

In [53]:
def get_answer(qa_pipeline, query):
    """Run a question through the QA chain and return just the answer text.

    Args:
        qa_pipeline: Callable invoked as
            ``qa_pipeline({"query": query}, return_only_outputs=True)`` and
            expected to return a mapping with a ``'result'`` string.
        query: The question to ask.

    Returns:
        The text that follows the first ``"Helpful Answer:"`` marker in the
        result, with surrounding whitespace removed. When the marker is absent
        the whole result text is returned, stripped, instead of discarding it.
        ``None`` is returned only when there is no non-blank text to return.
    """
    # Perform question answering
    result = qa_pipeline({"query": query}, return_only_outputs=True)
    generated = result['result']

    # re.DOTALL lets the capture run across newlines, so multi-line answers
    # (lists, several paragraphs) are not cut off after their first line.
    # `\s*` also accepts an answer that follows the colon with no space.
    match = re.search(r"Helpful Answer:\s*(.+)", generated, flags=re.DOTALL)
    answer = match.group(1) if match else generated

    answer = answer.strip()
    return answer or None

In [54]:
# Ask the QA chain about the film's overall theme and print the extracted answer.
query = "Explain the theme of the movie?"
answer = get_answer(qa, query)
print(answer)

  warn_deprecated(


The theme of Blade Runner is the exploration of what it means to be human, specifically in relation to the blurring of lines between humans and replicants (artificially created beings). Throughout the film, characters struggle with their own identities and the nature of existence, raising questions about the value of life and the ethics of creating and controlling living beings. Additionally, the film touches on themes of nostalgia, memory, and the impact of technology on society.


In [62]:
# Same theme question asked a second time; the recorded output differs from
# the earlier run of this query.
# NOTE(review): presumably generation is sampled and unseeded; confirm, and
# seed it if reproducible answers are wanted.
query = "Explain the theme of the movie?"
answer = get_answer(qa, query)
print(answer)

The theme of Blade Runner (1982) revolves around the exploration of what it means to be human, particularly through the character of Roy Batty, a synthetic life form who seeks to extend his lifespan despite being designed to die after four years. The film delves into questions of identity, free will, and the ethical considerations surrounding artificial intelligence. It also critiques the capitalistic society depicted in the film, where corporate greed and exploitation lead to the dehumanization of both humans and replicants. Ultimately, the film posits that true humanity lies not in biology but rather in emotions such as compassion, love, and empathy, which can transcend even the most advanced technology.


In [58]:
# Ask which characters appear in the script.
query = "Who are the characters in the movie?"
answer = get_answer(qa, query)
print(answer)

The characters in Blade Runner 2049 include Niander Wallace, Joi, K, Deckard, and Mariette.


In [59]:
# Ask for a count of male and female characters.
query = "How many male and female characters are in the movie?"
answer = get_answer(qa, query)
print(answer)

There are 3 males (Wallace, Deckard, and Niander) and 2 females (Luv and Mariette).


In [61]:
# Ask whether the script passes the Bechdel test.
query = "Does the script pass the Bechdel test?"
answer = get_answer(qa, query)
print(answer)

Yes, the script passes the Bechdel test. There are several lines spoken by female characters throughout the script, including Mariette, Freysa, and Doctor Anastelline. These characters engage in conversations with each other and have their own agency within the story.


In [60]:
# Ask about Deckard's role in the story.
query = "What is the role of Deckard in the movie?"
answer = get_answer(qa, query)
print(answer)

Deckard is a bounty hunter tasked with "retiring" advanced androids known as replicants in a futuristic society. He is portrayed by Harrison Ford.
