# Unstructured File Loader
This notebook covers how to use Unstructured to load files of many types. Unstructured currently supports loading text files, PowerPoint presentations, HTML, PDFs, images, and more.

In [None]:
# # Install packages
# !pip install "unstructured[local-inference]"
# !pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
# !pip install layoutparser[layoutmodels,tesseract]

In [None]:
# # Install other dependencies
# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst
# !brew install libmagic
# !brew install poppler
# !brew install tesseract
# # If parsing xml / html documents:
# !brew install libxml2
# !brew install libxslt

In [None]:
# import nltk
# nltk.download('punkt')

In [None]:
from langchain.document_loaders import UnstructuredFileLoader
import os 
from apikey import apikey 

import streamlit as st 
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SequentialChain 
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.utilities import WikipediaAPIWrapper 
from langchain.document_loaders import TextLoader

In [None]:
from langchain.indexes import VectorstoreIndexCreator

# Expose the key from the local `apikey` module through the environment
# variable set here, before any index is built.
os.environ['OPENAI_API_KEY'] = apikey

# One text loader per email file; both are read as UTF-8.
loader = TextLoader('email.txt', encoding='utf8')
loader2 = TextLoader('email2.txt', encoding='utf8')

# Build a single index over both loaders.
index_creator = VectorstoreIndexCreator()
email_loaders = [loader, loader2]
index = index_creator.from_loaders(email_loaders)

In [None]:
# Ask a question against `index` (built from email.txt and email2.txt in the
# previous cell) and print whatever the query call returns.
query = "What was Yuyen's excuse for why his company used the music without a license?"
# verbose=True is forwarded to the query call as a keyword argument.
response = index.query(query, verbose=True)
print(response)

## PDF Example

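Processing PDF documents works the same way: Unstructured detects the file type and extracts the same types of `elements`. The cells below load a PDF with `PyPDFLoader`, embed its pages into a Chroma vector store, and answer questions about it with a local LLaMA model.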

In [None]:
from tqdm.auto import tqdm
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

import torch

In [None]:
# The tokenizer and the model weights are loaded from the same model id.
alpaca_model_id = "chavinlo/alpaca-native"

tokenizer = LlamaTokenizer.from_pretrained(alpaca_model_id)

# Loading options passed through to `from_pretrained` unchanged.
model_load_options = {
    "load_in_8bit": True,
    "device_map": 'auto',
}
base_model = LlamaForCausalLM.from_pretrained(alpaca_model_id, **model_load_options)

In [None]:
from langchain.document_loaders import PyPDFLoader # for loading the pdf
# from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
from langchain.embeddings import LlamaCppEmbeddings

from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chains import ChatVectorDBChain # for chatting with the pdf
# from langchain.llms import OpenAI # the LLM model we'll use (CHatGPT)
from langchain.llms import LlamaCpp

# Location of the PDF to load. The default is the original author's local
# file; set the PDF_PATH environment variable to point at a different PDF
# without editing the notebook.
pdf_path = os.environ.get('PDF_PATH', '/Users/jordandavis/Downloads/jonas.pdf')

# Load the PDF and split it into documents (`pages`), which later cells embed.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

# Preview the first 200 characters of the first document as a sanity check.
print(pages[0].page_content[:200])


In [None]:
# Location of the local llama.cpp model file. The default is the original
# author's local path; set the LLAMA_MODEL_PATH environment variable to use a
# model stored elsewhere without editing the notebook.
model_path = os.environ.get(
    'LLAMA_MODEL_PATH',
    '/Users/jordandavis/dalai/alpaca/models/7B/ggml-model-q4_0.bin',
)

# LLM used by the question-answering chain further down.
model = LlamaCpp(model_path=model_path)

In [None]:
# embeddings = OpenAIEmbeddings()
# Embeddings come from the same local model file as the LLM rather than OpenAI.
embeddings = LlamaCppEmbeddings(model_path=model_path)

# Build the Chroma store from the split PDF pages, using the current
# directory as the persist directory, then persist it.
vectordb = Chroma.from_documents(
    pages,
    embedding=embeddings,
    persist_directory=".",
)
vectordb.persist()

In [None]:
# Build a question-answering chain over the PDF vector store, asking it to
# return the source documents alongside the answer.
pdf_qa = ChatVectorDBChain.from_llm(
    model,
    vectordb,
    return_source_documents=True,
)

query = "Where are the workers going?"
# chat_history is a list of (question, answer) tuples; pass an empty list for
# the first turn. The original passed "" which only worked because an empty
# string iterates as no turns.
result = pdf_qa({"question": query, "chat_history": []})
print("Answer:")
print(result["answer"])