In [17]:
from langchain.llms import Ollama
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate

In [20]:
# Instantiate the Ollama LLM wrapper for the model tagged "mistral_copy"
model = Ollama(
    model="mistral_copy",
)

In [4]:
# Absolute, machine-specific location of the PDF that is loaded, split and indexed below.
pdf_path = "/home/mohamed/Documents/Mohamed/Mohamed_DIALLO_CV.pdf"

In [5]:
# Read the PDF; the returned documents are what the splitter consumes later on
loader = PyPDFLoader(file_path=pdf_path)
doc = loader.load()

In [6]:
# Separator strings handed to RecursiveCharacterTextSplitter; order is significant
# and is kept exactly as listed.
separators = [
    # paragraph break, line break, space
    "\n\n", "\n", " ",
    # ASCII full stop and comma
    ".", ",",
    # zero-width space
    "\u200b",
    # fullwidth comma, ideographic comma
    "\uff0c", "\u3001",
    # fullwidth full stop, ideographic full stop
    "\uff0e", "\u3002",
    # empty string as the last entry
    "",
]

In [7]:
 # Initialize text splitter with specified parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,        # upper bound on chunk length, as measured by length_function
    chunk_overlap=100,     # amount shared between neighbouring chunks
    length_function=len,   # measure length in characters
    separators=separators,
    add_start_index=True,  # record each chunk's offset as 'start_index' in its metadata
)

# Cut the loaded documents into chunks
chunks = text_splitter.split_documents(doc)

In [8]:
# Report how many source documents produced how many chunks
print("Split {} documents into {} chunks.".format(len(doc), len(chunks)))

Split 2 documents into 15 chunks.


In [9]:
# Inspect the first chunk: its text on top, its metadata dict underneath
page = chunks[0]
print(page.page_content, page.metadata, sep="\n")


Kalaban Coura ACI, Bamako, Rue 418  
Tel : +223 62 09 21 57  
Email  : mohameddiallo728@gmail.com  
MOHAMED DIALLO  
Maîtrisant des outils tels que MySQL, PostgreSQL, je suis capable de façonner et d'optimiser des bases de données
{'source': '/home/mohamed/Documents/Mohamed/Mohamed_DIALLO_CV.pdf', 'page': 0, 'start_index': 0}


In [10]:
# Relative directory path passed to Chroma as persist_directory, both when the
# database is created and when it is re-opened for querying
CHROMA_PATH = "embdb"

In [11]:
# Embedding function used for both indexing and querying:
# a HuggingFace sentence-transformers model selected by name
emb_model = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=emb_model)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
2024-07-24 19:26:05.636407: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-24 19:26:05.801754: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-24 19:26:05.887118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-24 19:26:05.910975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07

In [12]:
# Build (or refresh) the persisted Chroma collection from the chunks.
# Every chunk gets a deterministic id derived from its metadata ('source', 'page',
# 'start_index') plus its position in `chunks`. Without explicit ids, each re-run of
# this cell stores a fresh copy of every chunk in the same directory, so later
# similarity searches return duplicates; with stable ids a re-run overwrites the
# existing entries instead.
# NOTE(review): entries written by earlier runs without explicit ids stay in
# CHROMA_PATH until that directory is cleared once.
chunk_ids = [
    f"{chunk.metadata.get('source')}:{chunk.metadata.get('page')}:"
    f"{chunk.metadata.get('start_index')}:{position}"
    for position, chunk in enumerate(chunks)
]
db = Chroma.from_documents(
    chunks,
    embeddings,
    ids=chunk_ids,
    persist_directory=CHROMA_PATH,
)

# Explicit flush to disk, kept for compatibility with older Chroma releases.
# NOTE(review): this call appears to be the source of the deprecation warning in the
# cell output; newer Chroma versions reportedly persist automatically - confirm
# before removing it.
db.persist()
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

Saved 15 chunks to embdb.


  warn_deprecated(


In [13]:
# Question (in French) used both as the similarity-search query and as the
# question inserted into the prompt
query_text = "Quelles competences Mohamed possède t'il ?"

In [14]:
# Prompt skeleton with two placeholders: {context} receives the joined text of the
# retrieved chunks and {question} receives the user query
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

In [15]:
 # Prepare the database
db = Chroma(embedding_function=embeddings, persist_directory=CHROMA_PATH)

# Fetch up to 3 (document, relevance score) pairs for the query
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# Print a notice when nothing came back or the first hit scores under 0.7.
# This is advisory only: nothing here prevents the following cells from
# building a prompt out of `results`.
nothing_found = not results
if nothing_found or results[0][1] < 0.7:
    print("Unable to find matching results.")

Unable to find matching results.


In [18]:
# Stitch the text of every retrieved chunk into one context string
context_text = "\n\n - -\n\n".join(
    hit.page_content for hit, _relevance in results
)

# Fill the template placeholders with the context and the question
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)

In [24]:
# Send the formatted prompt to the LLM and keep its text reply.
# invoke() replaces the deprecated predict() entry point; for an LLM wrapper
# such as Ollama it likewise returns the completion as a plain string.
response_text = model.invoke(prompt)

print(response_text)

 Based on the provided context, it appears that Mohamed has the following key skills:

1. Databases: MySQL, PostgreSQL
2. Programming Languages and Frameworks: PHP, Java, SpringBoot, React.js, TypeScript, Python, Flask
3. Tools: Scrum, Slack, Jira, Trello, Power BI, UML
4. Languages: French (professional level), English (intermediate level), Bambara (maternal level)


In [25]:
# Collect the 'source' metadata entry of each retrieved chunk (None when absent)
sources = [hit.metadata.get("source") for hit, _relevance in results]

In [27]:
# Assemble the model's answer and the list of sources into one two-part string, then show it
formatted_response = "\n".join(
    [
        f"Response: {response_text}",
        f"Sources: {sources}",
    ]
)

print(formatted_response)

Response:  Based on the provided context, it appears that Mohamed has the following key skills:

1. Databases: MySQL, PostgreSQL
2. Programming Languages and Frameworks: PHP, Java, SpringBoot, React.js, TypeScript, Python, Flask
3. Tools: Scrum, Slack, Jira, Trello, Power BI, UML
4. Languages: French (professional level), English (intermediate level), Bambara (maternal level)
Sources: ['/home/mohamed/Documents/Mohamed/Mohamed_DIALLO_CV.pdf', '/home/mohamed/Documents/Mohamed/Mohamed_DIALLO_CV.pdf', '/home/mohamed/Documents/Mohamed/Mohamed_DIALLO_CV.pdf']
