# Exercise 5
### By Gruppe H (Caroline og Maria)

In [1]:
#!pip install streamlit

In [2]:
## !streamlit hello

In [3]:
#!pip install langchain

In [4]:
#!pip install langdetect

In [5]:
#!pip install -U torch

## Set up Enviroment 

In [6]:
import os
import pandas as pd

In [7]:
import langdetect
from langdetect import DetectorFactory, detect, detect_langs

In [8]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [9]:

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [10]:
# Embedding facilities
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
# Pipelines
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

## Load Documents
Our first task is to collect and load documents from various sources and formats in the context of our chosen domain, which is **knitting for beginners**.

We have chosen to load content from pdf files, YouTube and Wikipedia.

In [12]:
import myloadlib
from myloadlib import loadDir, loadFile, loadWiki, loadYoutube, readAPI

In [13]:
import importlib 
importlib.reload(myloadlib)

<module 'myloadlib' from '/Users/mariamcnally/Desktop/SW2Semester/AI/OLA5/myloadlib.py'>

In [14]:
# Collect all here
documents = []

### Load single pdf files

#### File 1

In [15]:
file = "./data/knitting_pdfs/knit.pdf"

In [16]:
#!pip install pypdf

In [17]:
docs = myloadlib.loadFile(file)

In [18]:
documents.extend(docs)
len(documents)

21

In [19]:
# metadata of loaded Document
docs[0].metadata 

{'source': './data/knitting_pdfs/knit.pdf', 'page': 0}

Content of page [0] = page 1. 

In [20]:
documents[0].page_content
#docs[0].page_content[:1000]
# First 1000 charactors.

' \n \nBeginning Knitting  \n \nWHAT IS KNITTING? \n \nHand knitting is to make fabric with yarn \non two or more needles.  A number of loops are first made on one needle, and then the fabric "grows\'\' by drawing other loops through them as they are passed back and forth along the needles from row to row. \n \nAll knitting comes from two kinds of stitches. One is called a "knit stitch," and the other is called a "purl stitch." There \nare several different ways or methods of knitting. Refer to directions in the pattern/instructions for explanations and types of stitches required.  \n \nKnitting can produce something useful, but the process can,  and should be, fun \nand relaxing.  The main thing is to learn to enjoy  knitting—relax while you work, \navoid a cramped position, have a good light to see by, and if your hands become tired, stop and rest a while.  \nHistory of Knitting \n \nKnitting is older than written history. No \none knows exactly when people began to knit, but we do k

#### File 2

In [21]:
file2 = "./data/knitting_pdfs/knitting-handbook.pdf"

In [22]:
docs = myloadlib.loadFile(file2)

In [23]:
documents.extend(docs)
len(documents)

61

In [24]:
docs[1].metadata 

{'source': './data/knitting_pdfs/knitting-handbook.pdf', 'page': 1}

In [25]:
documents[1].page_content

' 2 \nAll knitting patterns tell you the size of \nneedles you will need. Never discard your needles when you have fini shed the project.  \nIf you lose one needle and have to buy another pair the same size, keep the extra \nneedle. You never know when you may lose or break another one.   \n \n\x88    Crochet Hook  \nYou will need a crochet hook to pick \nup dropped stitches and to correct other mistakes. They also come in many sizes. Size C, or 6, is a good one for 4-ply knitting yarn. \n \n\x88    Measuring Tool  \nYou will need a measuring tool. You \ncan use a ruler, a measuring tape, or a metal measuring gauge. Be sure your \nmeasuring tool has both standard and \nmetric measurements. \n \n \n \n  \n\x88 Other helpful items: \n \nScissors  -    You will also want to \ninclude in your knitting basket a pair of small scissors (kept in a case for safety). \n \nYarn needle  -  a blunt-pointed yarn \nneedle for sewing your articles together. \n  Nail file or emery board   -  for catchy

### Load YouTube

In [26]:
url = 'https://www.youtube.com/watch?v=Zjq0MoUZqVY'
save_dir="./youtube/"

In [27]:
url

'https://www.youtube.com/watch?v=Zjq0MoUZqVY'

In [28]:
lang = 'en'

In [29]:
#!pip install youtube-transcript-api

In [30]:
#!pip install pytube

In [31]:
docs = myloadlib.loadYoutube(url, lang)

In [32]:
documents.extend(docs)
len(documents)

62

In [33]:
documents[61].type

'Document'

In [34]:
documents[61].page_content

"In, around, under and off. You'll\xa0 be dreaming this by the end. Hello knitters, i'm Marion from Lovecrafts\xa0 and I'm going to show you how to knit. Now, in this video we're going to do a\xa0little project where we start knitting\xa0from start to finish.  This is the little\xa0swatch we're going to knit. I'm going to\xa0show you how to start, how to cast on, how to knit this lovely knit stitch\xa0\xa0 and then how to cast off. So, it's everything\xa0 in one go.  Now if you learn to make one of these, you can make a scarf or a cushion cover or\xa0a pillow or a blanket - anything at all! \xa0 So, this is a really good tutorial just for some basic\xa0ground work before you start your knitting. So, I'm going to use Paint Box wool mix super chunky.\xa0And the reason i'm using a super chunky yarn\xa0\xa0 is because when you start knitting it's really\xa0important to be able to see the stitches.  Um, and so I always prefer to use a big yarn, and that way your\xa0work grows very fast, so 

### Load wikipedia page

In [35]:
subject = "Knitting"

In [36]:
lang = 'en'

In [37]:
#!pip install wikipedia

In [38]:
docs = myloadlib.loadWiki(subject, lang, 2)

In [39]:
documents.extend(docs)

Should be 4 at the moment, but will update everytime its run, and or other Docs/documents are ran again

In [40]:
len(documents)

64

## Chunking
Now we will be chunking our documents, which means breaking down our texts into smaller, more manageable chunks to prepare it for AI processing.

In [41]:
#!pip install spacy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0.tar.gz

In [42]:
#!pip install wordcloud

In [43]:
import myutils2
from myutils2 import chunkDocs, langDetect, wordCloud

In [44]:
import importlib 
importlib.reload(myutils2)

<module 'myutils2' from '/Users/mariamcnally/Desktop/SW2Semester/AI/OLA5/myutils2.py'>

In [45]:
splits = myutils2.chunkDocs(documents, 350)  
splits

[Document(page_content='Beginning Knitting  \n \nWHAT IS KNITTING? \n \nHand knitting is to make fabric with yarn \non two or more needles.  A number of loops are first made on one needle, and then the fabric "grows\'\' by drawing other loops through them as they are passed back and forth along the needles from row to row.', metadata={'source': './data/knitting_pdfs/knit.pdf', 'page': 0}),
 Document(page_content='All knitting comes from two kinds of stitches. One is called a "knit stitch," and the other is called a "purl stitch." There \nare several different ways or methods of knitting. Refer to directions in the pattern/instructions for explanations and types of stitches required.', metadata={'source': './data/knitting_pdfs/knit.pdf', 'page': 0}),
 Document(page_content='Knitting can produce something useful, but the process can,  and should be, fun \nand relaxing.  The main thing is to learn to enjoy  knitting—relax while you work, \navoid a cramped position, have a good light to se

In [46]:
len(splits)

408

In [47]:
splits[70]

Document(page_content='technique is called binding off.  \nProcedure \n \n1. Slip the first stitch  on the row off the left \nneedle onto the right  needle without \nknitting. \n \n2. Knit the next stitch very loosely. There \nare now two stitches on the right needle. \n \n3. Insert the left needle through the left \nside of the first stitch.', metadata={'source': './data/knitting_pdfs/knit.pdf', 'page': 9})

In [48]:
df = pd.DataFrame(splits, columns=['page_content', 'metadata', 'type'])
df.sample(3)

Unnamed: 0,page_content,metadata,type
49,"(page_content, 2. Be sure that the loop is nea...","(metadata, {'source': './data/knitting_pdfs/kn...","(type, Document)"
18,"(page_content, measuring guide for determining...","(metadata, {'source': './data/knitting_pdfs/kn...","(type, Document)"
104,"(page_content, increasing. It is also used to ...","(metadata, {'source': './data/knitting_pdfs/kn...","(type, Document)"


In [49]:
df['page_content'][0]

('page_content',
 'Beginning Knitting  \n \nWHAT IS KNITTING? \n \nHand knitting is to make fabric with yarn \non two or more needles.  A number of loops are first made on one needle, and then the fabric "grows\'\' by drawing other loops through them as they are passed back and forth along the needles from row to row.')

In [50]:
df['metadata'][0]

('metadata', {'source': './data/knitting_pdfs/knit.pdf', 'page': 0})

In [51]:
#!pip install scapy

In [52]:
#!bash
#!python3 -m spacy download en_core_web_md

### Data Visualization
To visually represent the data of our texts, we have created a word cloud. On the word cloud, we can see which words apear more frequently as they appear bigger.

In [54]:
im, longstring = myutils2.wordCloud(df, 'page_content')

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
im

## Embeddings

In [None]:
model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

In [None]:
#!pip install sentence-transformers

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
from langchain.vectorstores import FAISS

In [None]:
#!pip install faiss-cpu

In [None]:
db = FAISS.from_documents(splits, embeddings)

## Storing the Embeddings in Vector DB

In [None]:
#!pip install chromadb

In [None]:
db = Chroma.from_documents(splits, embeddings)

In [None]:
persist_directory = '../data/chroma/'

# Create the vector store
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)
vectordb.persist()

In [None]:
vectordb._collection.count()

## Similarity Search

In [None]:
query = 'what is the best yarn for beginners?'

In [None]:
answer = vectordb.similarity_search(query, k=3)
answer

In [None]:
for d in answer:
    print(d.page_content)

In [None]:
for d in answer:
    print(d.metadata)

## Information Retrieval

In [None]:
q1 = 'What does "K2tog" mean?'

In [None]:
q2 = 'What are stitch markers and how do I use them?'

In [None]:
q3 = 'How do I cast on stitches?'

In [None]:
q4 = "What's the difference between circular needles and straight needles?"

In [None]:
answer = vectordb.max_marginal_relevance_search(q1, k=2, fetch_k=5)
for d in answer:
    print(d.page_content)

In [None]:
answer = vectordb.max_marginal_relevance_search(q2, k=2, fetch_k=5)
for d in answer:
    print(d.page_content)

In [None]:
answer = vectordb.max_marginal_relevance_search(q3, k=2, fetch_k=5)
for d in answer:
    print(d.page_content)

In [None]:
answer = vectordb.max_marginal_relevance_search(q4, k=2, fetch_k=5)
for d in answer:
    print(d.page_content)

## Large Language Model

In [None]:
!ollama serve

In [None]:
!ollama run

In [None]:
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
!ollama list

In [None]:
llm = Ollama(model="mistral", callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

In [None]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use five sentences maximum. Keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer. 

{context}

Question: {question}

Helpful Answer:
"""

In [None]:
prompt = PromptTemplate.from_template(template)
chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt})

In [None]:
question1 = 'What does "K2tog" mean?'

In [None]:
result1 = chain({"query": question1})
result1["result"]

In [None]:
question2 = 'What are stitch markers and how do I use them?'

In [None]:
result2 = chain({"query": question2})
result2["result"]

In [None]:
question3 = 'How do I cast on stitches?'

In [None]:
result3 = chain({"query": question3})
result3["result"]

In [None]:
question4 = "What's the difference between circular needles and straight needles?"

In [None]:
result4 = chain({"query": question4})
result4["result"]

In [None]:
question5 = 'What does water taste like?'

In [None]:
result5 = chain({"query": question5})
result5["result"]

In [None]:
!bye

In [None]:
## make a steamlit application