# Installing the needed libraries

In [24]:
!pip install langchain openai pypdf chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.29.0-py3-none-any.whl.metadata (6.3 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb)
  Downloading pulsar_client-3.4.0-cp311-cp311-macosx_10_15_universal2.whl.metadata (1.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.17.1-cp311-cp311-macosx_11_0_universal2.whl.metadata (4.2 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.23.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetr

In [2]:
!pip install langchain-openai 

Collecting langchain-openai
  Downloading langchain_openai-0.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting openai<2.0.0,>=1.10.0 (from langchain-openai)
  Downloading openai-1.14.3-py3-none-any.whl.metadata (20 kB)
Downloading langchain_openai-0.1.1-py3-none-any.whl (32 kB)
Downloading openai-1.14.3-py3-none-any.whl (262 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai, langchain-openai
  Attempting uninstall: openai
    Found existing installation: openai 1.4.0
    Uninstalling openai-1.4.0:
      Successfully uninstalled openai-1.4.0
Successfully installed langchain-openai-0.1.1 openai-1.14.3


In [5]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.1.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.1.0-py3-none-any.whl (286 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.1.0


# Setting up the env

After installing the packages, set the `OPENAI_API_KEY` environment variable so that the OpenAI client can authenticate.
One way to do that is to export it in the shell that launches Jupyter:

```
export OPENAI_API_KEY="KEYGOESHERE"
```

# Loading in the model

In [3]:
# Chat model wrapper from the langchain-openai package.
from langchain_openai import ChatOpenAI

# Constructed with library defaults only; the recorded response below reports
# model_name 'gpt-3.5-turbo'.
# NOTE(review): presumably picks up the API key from OPENAI_API_KEY (see the
# "Setting up the env" section) — confirm.
llm = ChatOpenAI()

In [32]:
# A simple prompt: send one plain string to the chat model as a smoke test.
# The call is the last expression in the cell, so the returned AIMessage is displayed.
llm.invoke("skill issue")

AIMessage(content="I'm sorry to hear that you are experiencing a skill issue. Can you please provide more details about the specific skill you are having trouble with so that I can offer you some guidance or assistance?", response_metadata={'token_usage': {'completion_tokens': 39, 'prompt_tokens': 9, 'total_tokens': 48}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': 'fp_3bc1b5746c', 'finish_reason': 'stop', 'logprobs': None})

In [33]:
# Load in a resume:
# point a PyPDFLoader at resume.pdf (a path relative to the notebook's working
# directory); the documents themselves are produced by loader.load() in the next cell.
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("resume.pdf")

In [34]:
# Load the PDF into `pages`, which is later passed to Chroma.from_documents.
# NOTE(review): presumably one Document per PDF page — confirm.
pages = loader.load()

In [35]:
# Vector store and embedding classes, imported from the packages that now own them.
# The former `langchain.vectorstores` and `langchain.embeddings.openai` import paths
# are deprecated in favour of `langchain_community` and `langchain_openai`.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [36]:
# Embed the loaded resume documents with OpenAI embeddings and store the vectors
# in a Chroma collection backed by the local `index_store` directory.
directory = 'index_store'
vector_index = Chroma.from_documents(pages, OpenAIEmbeddings(), persist_directory=directory)
# No explicit vector_index.persist() call: with chromadb >= 0.4 (0.4.24 is what the
# install cell downloaded) a store created with persist_directory is written to disk
# automatically, and LangChain's Chroma.persist() is deprecated.
# NOTE(review): re-running this cell presumably adds the same documents to the
# existing on-disk collection again — confirm, and clear `index_store` if needed.

In [26]:
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

# Similarity search over the resume index, asking for up to 6 documents per query.
# In the recorded run the index held only 4 elements, so Chroma emitted a warning
# and lowered the number of results to 4.
retriever = vector_index.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Question-answering chain over the retriever. It reuses the notebook's `llm`
# instead of constructing a second ChatOpenAI(), so any configuration applied to
# `llm` also applies here. The "stuff" chain type places the retrieved documents
# into a single prompt; return_source_documents=True keeps those documents in the
# chain output alongside the answer.
qa_interface = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [44]:
# Ask the QA chain to list the candidate's computer-science skills and to propose a
# short exercise based on them. Adjacent string literals are concatenated, so the
# prompt text sent to the chain is unchanged.
question = (
    "What skills does the person have in the domain of computer science? "
    "Return each skill separated by new line. "
    "Just list the skill dont say anything else. "
    "Based on these skills create a question that tests them after listed skills "
    "that can be solved within 15 minutes"
)

# Use .invoke() (as done for `llm` earlier) rather than calling the chain object
# directly, which is deprecated. RetrievalQA reads the question from the "query"
# key; only the answer text under "result" is kept here.
result = qa_interface.invoke({"query": question})["result"]

Number of requested results 6 is greater than number of elements in index 4, updating n_results = 4


In [45]:
# Break the answer text into lines for inspection; in the recorded output the list
# holds the skill bullets, an empty separator line, and the generated question.
result.split('\n')

['- Machine Learning',
 '- Python (Pandas, PyTorch, scikit-learn)',
 '- JavaScript (NodeJS, ReactJS, express)',
 '- REST API',
 '- SQL',
 '- MongoDB',
 '- High Performance Computing',
 '- Linux',
 '- Docker',
 '',
 '**Question:**',
 'Given a dataset with customer information stored in a SQL database, develop a Python script that connects to the database, extracts the necessary data, performs a machine learning task (such as clustering or classification) using PyTorch, and then visualizes the results using a React front-end. The final output should be a web application that allows users to interact with the clustering/classification results.']