In [35]:
import os
import torch
from dotenv import load_dotenv
import logging
import pandas as pd
from pdfminer.high_level import extract_text
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import Pinecone as PineconeStores
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import ctransformers, CTransformers
from pinecone import Pinecone
import pinecone

logging.basicConfig(level=logging.INFO)
load_dotenv()

True

In [15]:
def extract_resume_text(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    
    docs = loader.load()
    return docs

In [16]:
# Change this path to your actual dataset folder
data_folder_path = 'G://Auto Recruit AI//data'
resumes_df = extract_resume_text(data_folder_path)

In [17]:
def text_split(resumes_df):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(resumes_df)

    return text_chunks

In [18]:
chunks = text_split(resumes_df)
print("Length of Chunks: ", len(chunks))

Length of Chunks:  35193


In [19]:
def embedding_model():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    return embeddings

In [20]:
embedding = embedding_model()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [36]:
# Initialize Pinecone instance
pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

index_name="recruit"

#Embedding each text chunk and storing in Pinecone
docsearch=PineconeStores.from_texts([t.page_content for t in chunks], embedding, index_name=index_name)

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['g:\\Auto Recruit AI\\env\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['g:\\Auto Recruit AI\\env\\Lib\\site-packages\\pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone


In [37]:
#If we already have an index we can load it like this:
#docsearch = PineconeStores.from_existing_index(index_name, embedding)

query = "Give me skil sets of an Accountant."
docs = docsearch.similarity_search(query, k=3)
print("Results:", docs)

Results: [Document(metadata={}, page_content="GENERAL ACCOUNTANT\nProfessional Summary\nI have around 6 years experiences in the field of Accounting & banking works, with Egyptians and foreign companies, I'm really honored to\nintroduce myself as a hardworking, good Communication skills, with the highest level of integrity, honesty, loyalty, strong beliefs, goal orientated,\nsober habits and the desire to produce the best of the work.\nAreas of Expertise"), Document(metadata={}, page_content="ACCOUNTANT\nSummary\nAccountant with over a decade of diverse professional experience including corporate and small business accounting, tax preparation and\naccounting services. \nDedicated and reliable team member who enjoys learning and taking on new challenges.\nHighlights\nAccounts Receivable/Payable\nMaster's Degree\nGeneral Ledger Accounting\nSales Tax Reporting\nGAAP Knowledge\nPeriod End Close\nTeam Oriented\nTax Preparation\nFinancial Statement Preparation"), Document(metadata={}, page_c

In [38]:
prompt_template = """
Use the following piece of information to answer the user's Questions.
If you don't know an answer, just say you don't know the answer.

Context: {context}
Question: {question}

Only return the helful answer below
Helpful answer:
"""

In [39]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

In [40]:
llm = CTransformers(model="G://Auto Recruit AI//Model//llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':500,
                            'temperature':0.8})

In [41]:
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type='stuff',
                                 retriever=docsearch.as_retriever(search_kwargs={'k':2}),
                                 return_source_documents=True,
                                 chain_type_kwargs=chain_type_kwargs)

In [42]:
while True:
    user_input = input(f"Input Prompt:")
    result = qa({"query":user_input})
    print("Response: ", result["result"])

  result = qa({"query":user_input})


Response:  An accountant has excellent attention to detail, fast learning abilities, and strong problem-solving skills. They are also able to work well under pressure, often meeting tight deadlines to deliver accurate financial reports or perform audits.


KeyboardInterrupt: 