In [6]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


# Append the parent directory to the module search path.
# NOTE(review): presumably this makes the local `itext2kg` package importable
# when the notebook is run from a sub-folder of the repository — confirm.
import sys
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Now, iText2KG is compatible with all language models supported by LangChain. 

To use iText2KG, you will need both a chat model and an embeddings model. 

For available chat models, refer to the options listed at: https://python.langchain.com/v0.2/docs/integrations/chat/. 
For embedding models, explore the choices at: https://python.langchain.com/v0.2/docs/integrations/text_embedding/. 

This notebook will show you how to run iText2KG using Mistral, Ollama, and OpenAI models. 

**Please ensure that you install the necessary package for each chat model before use.**

# Mistral

For Mistral, please set up your model using the tutorial here: https://python.langchain.com/v0.2/docs/integrations/chat/mistralai/. Similarly, for the embedding model, follow the setup guide here: https://python.langchain.com/v0.2/docs/integrations/text_embedding/mistralai/ .

# OpenAI

The same applies for OpenAI. 

Please set up your model using this tutorial: https://python.langchain.com/v0.2/docs/integrations/chat/openai/
For the embedding model, see: https://python.langchain.com/v0.2/docs/integrations/text_embedding/openai/

In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

import os

# Read the API key from the environment rather than committing it to the
# notebook. "##" is kept as the placeholder fallback so the cell still runs
# (and fails at request time, as before) when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "##")

# Chat model. Both `openai_llm_model` and `llm` are bound to the same object,
# as in the original cell; note that the Ollama cell further down rebinds `llm`.
openai_llm_model = llm = ChatOpenAI(
    api_key=openai_api_key,
    model="gpt-4o",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embeddings model created with the same API key.
openai_embeddings_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-large",
)

# Ollama

The same applies for Ollama. 

Please set up your model using this tutorial: https://python.langchain.com/v0.2/docs/integrations/chat/ollama/
For the embedding model, see: https://python.langchain.com/v0.2/docs/integrations/text_embedding/ollama/

In [8]:
from langchain_ollama import ChatOllama, OllamaEmbeddings

# Chat model from langchain_ollama; this rebinds the `llm` name used by the
# rest of the notebook.
llm = ChatOllama(
    model="llama3",
    max_tokens=None,
    max_retries=5,
    temperature=0,
)

# Embeddings are produced with the same "llama3" model name.
embeddings = OllamaEmbeddings(model="llama3")

# iText2KG

* Use Case: We aim to connect an online job description with a generated CV using Knowledge Graphs. 

* The objective is to assess the candidate's suitability for the job offer. You can use a different LLM or embedding model for each module of iText2KG. However, the node and relation embeddings must have the same dimensions across models: cosine similarity is only defined between vectors of equal dimension, so embeddings of different sizes cannot be compared during the matching step.

## Document Distiller

### CV

In [9]:
from langchain.document_loaders import PyPDFLoader, TextLoader

# Load the plain-text CV. `load()` returns a list of Document objects, which
# is displayed at the end of the cell.
cv_path = "../datasets/cvs/Emily_Davis.txt"
loader = TextLoader(cv_path)
pages = loader.load()

# To work from a PDF CV instead, swap in the PDF loader:
#   loader = PyPDFLoader("../datasets/cvs/CV_Jane_Smith.pdf")
#   pages = loader.load_and_split()

pages

[Document(metadata={'source': '../datasets/cvs/Emily_Davis.txt'}, page_content="{'name': 'Emily Davis',\n 'phone_number': '+1 567 890 1234',\n 'email': 'emilydavis@example.com',\n 'linkedin': 'linkedin.com/in/emilydavis',\n 'summary': 'Creative and passionate graphic designer with a keen eye for aesthetics and visual storytelling. Experienced in creating compelling designs for various media, including print and digital platforms.',\n 'work_experience': [{'title': 'Senior Graphic Designer',\n   'company': 'MNO Creative',\n   'location': 'Boston, MA',\n   'start_date': 'June 2017',\n   'end_date': 'Present',\n   'responsibilities': ['Designed logos, brochures, and social media graphics.',\n    'Collaborated with clients to understand their vision and deliver high-quality designs.',\n    'Managed multiple projects simultaneously.']}],\n 'education': [{'degree': 'Bachelor of Fine Arts in Graphic Design',\n   'institution': 'Rhode Island School of Design',\n   'location': '',\n   'start_dat

In [10]:
from itext2kg.documents_distiller import DocumentsDisiller, CV

# Create the document distiller, driven by the chat model currently bound to `llm`.
# The class name is spelled `DocumentsDisiller`, exactly as imported above.
document_distiller = DocumentsDisiller(llm_model = llm)

In [11]:
# Extraction prompt handed to the distiller together with the CV text.
IE_query = '''
# DIRECTIVES : 
- Act like an experienced information extractor. 
- You have a chunk of a CV.
- If you do not find the right information, keep its place empty.
'''

# Curly braces in the raw CV text are replaced with square brackets to avoid
# the error in the query (presumably the braces clash with the prompt
# template's placeholders — confirm). The previous code deleted the braces
# outright, which contradicted its own comment and dropped the delimiters of
# the nested records; the other cells of this notebook use "[" / "]" as well.
cv_documents = [
    page.page_content.replace("{", "[").replace("}", "]")
    for page in pages
]

distilled_cv = document_distiller.distill(
    documents=cv_documents,
    IE_query=IE_query,
    output_data_structure=CV,
)

[{'name': 'Emily Davis', 'phone_number': '+1 567 890 1234', 'email': 'emilydavis@example.com', 'linkedin': 'linkedin.com/in/emilydavis', 'summary': 'Creative and passionate graphic designer with a keen eye for aesthetics and visual storytelling. Experienced in creating compelling designs for various media, including print and digital platforms.', 'work_experience': [{'title': 'Senior Graphic Designer', 'company': 'MNO Creative', 'location': 'Boston, MA', 'start_date': 'June 2017', 'end_date': 'Present', 'responsibilities': ['Designed logos, brochures, and social media graphics.', 'Collaborated with clients to understand their vision and deliver high-quality designs.', 'Managed multiple projects simultaneously.']}], 'education': [], 'skills': ['Adobe Creative Suite (Photoshop, Illustrator, InDesign)', 'Sketch', 'Figma', 'Creativity', 'communication', 'time management', 'Illustration', 'Photography', 'Traveling'], 'certifications': ['Adobe Certified Expert', 'UX Design Certification by N

In [12]:
# Render every non-empty field of the distilled CV as a "key - value" text
# block. Curly braces in the rendered text are swapped for square brackets.
# Empty lists, empty strings and None are skipped (None is tested by identity).
semantic_blocks_cv = [
    f"{key} - {value}".replace("{", "[").replace("}", "]")
    for key, value in distilled_cv.items()
    if value is not None and value != [] and value != ""
]
semantic_blocks_cv

['name - Emily Davis',
 'phone_number - +1 567 890 1234',
 'email - emilydavis@example.com',
 'linkedin - linkedin.com/in/emilydavis',
 'summary - Creative and passionate graphic designer with a keen eye for aesthetics and visual storytelling. Experienced in creating compelling designs for various media, including print and digital platforms.',
 "work_experience - [['title': 'Senior Graphic Designer', 'company': 'MNO Creative', 'location': 'Boston, MA', 'start_date': 'June 2017', 'end_date': 'Present', 'responsibilities': ['Designed logos, brochures, and social media graphics.', 'Collaborated with clients to understand their vision and deliver high-quality designs.', 'Managed multiple projects simultaneously.']]]",
 "skills - ['Adobe Creative Suite (Photoshop, Illustrator, InDesign)', 'Sketch', 'Figma', 'Creativity', 'communication', 'time management', 'Illustration', 'Photography', 'Traveling']",
 "certifications - ['Adobe Certified Expert', 'UX Design Certification by Nielsen Norman 

### Job description

In [39]:
# Raw text of the first job offer (graphic designer role); it is passed
# unchanged to the document distiller in a later cell.
job_offer = """ 
About the Job Offer
THE FICTITIOUS COMPANY

FICTITIOUS COMPANY is a high-end French fashion brand known for its graphic and poetic style, driven by the values of authenticity and transparency upheld by its creator Simon Porte Jacquemus.

Your Role

Craft visual stories that captivate, inform, and inspire. Transform concepts and ideas into visual representations. As a member of the studio, in collaboration with the designers and under the direction of the Creative Designer, you should be able to take written or spoken ideas and convert them into designs that resonate. You need to have a deep understanding of the brand image and DNA, being able to find the style and layout suited to each project.

Your Missions

Translate creative direction into high-quality silhouettes using Photoshop
Work on a wide range of projects to visualize and develop graphic designs that meet each brief
Work independently as well as in collaboration with the studio team to meet deadlines, potentially handling five or more projects simultaneously
Develop color schemes and renderings in Photoshop, categorized by themes, subjects, etc.
Your Profile

Bachelor’s degree (Bac+3/5) in Graphic Design or Art
3 years of experience in similar roles within a luxury brand's studio
Proficiency in Adobe Suite, including Illustrator, InDesign, Photoshop
Excellent communication and presentation skills
Strong organizational and time management skills to meet deadlines in a fast-paced environment
Good understanding of the design process
Freelance contract possibility
"""

In [40]:
# Raw text of the second job offer (visual designer role); it is passed
# unchanged to the document distiller in a later cell.
job_offer_2 = """ 
About the Job Offer
About The Role

Innovate Design Co. is seeking a talented Visual Designer with expertise in web and interactive design to enhance our visual brand identity. This role is responsible for external digital storytelling and communications design.

You'll be in charge of designing and developing engaging web pages, infographics, social media content, and global digital experiences that effectively convey our cutting-edge technology and products to diverse audiences.

Responsibilities

Translate our core values, product, marketing, and sales objectives into beautifully crafted deliverables
Design compelling, brand-aligned digital and print materials, including websites, social media content, ads, third-party marketplaces, presentations, animations, events, prints, etc.
Develop and maintain visual brand identity guidelines, ensuring brand consistency across all media and multichannel platforms
Communicate Innovate Design Co.'s narrative through conversion and data-driven design
Participate in brainstorming sessions and collaborate with stakeholders to articulate a creative vision that enhances our brand’s visual storytelling
Promote design comprehension and sensibility across the organization, refining work methodologies and design processes to enhance efficiency and effectiveness
Required Qualifications

A Bachelor’s degree (or equivalent) in Graphic Design / Visual Arts - or a self-starter with a strong creative project track record
5-7 years of experience in Graphic Design, including brand design, 360° marketing and communications design, product brand design, 0-to-1 projects, front-end development, etc.
Work experience within well-structured design departments operating in the tech/software space (including leading creative agencies, scale-ups, and mature tech companies)
Proficiency in Figma, Adobe CC, print design best practices, and a solid understanding of web technologies (HTML, CSS, JS)
A robust portfolio demonstrating a variety of design projects, showcasing creativity, originality, consistency, and attention to detail
Perfectly fluent in English, both written and spoken
Results-oriented, resourceful, innovative, intellectually curious, no-ego, proactive
Highly collaborative and able to balance multiple projects and stakeholders
Professional behavior with personal accountability, drive, and work ethics
You may be a good fit if you

Share our excitement for the AI/technology space
Drive projects independently, make judgments based on brand and business goals, seek feedback, collaborate cross-functionally, and leverage others' expertise
Maintain high design standards without perfectionism, understanding tradeoffs in a fast-paced environment
Understand audience feelings, cultures, and challenges through an inquisitive and learning-based approach
What We Offer

The ability to shape the exciting journey of technology and be part of the very early days of one of Europe’s hottest startups
A fun, young, international, and multicultural team — based in Paris, London, and San Francisco
Beautiful office space in the heart of Paris (Canal St Martin)
Competitive salary and benefits package
Opportunities for professional growth and development
"""

In [41]:
from pydantic import BaseModel, Field
from typing import List, Optional

# One responsibility entry of a job offer; used as the item type of
# `JobOffer.responsibilities`.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic can surface a model's docstring in the generated JSON schema, which
# would change what is sent to the LLM — confirm before adding one.
class JobResponsibility(BaseModel):
    description: str = Field(..., description="A specific responsibility in the job role")

# One qualification entry of a job offer; used as the item type of
# `JobOffer.qualifications`. The single required field holds the skill text.
class JobQualification(BaseModel):
    skill: str = Field(..., description="A required or preferred skill for the job")

# One certification entry of a job offer; used as the item type of
# `JobOffer.certifications`. The single required field holds the certification text.
class JobCertification(BaseModel):
    certification: str = Field(..., description="Required or preferred certifications for the job")

# Target schema for the job-offer extraction; it is passed to the distiller as
# `output_data_structure`. Fields declared with `Field(...)` are required,
# fields declared with `Field(None, ...)` default to None.
class JobOffer(BaseModel):
    # Required scalar fields.
    job_offer_title: str = Field(..., description="The job title")
    company: str = Field(..., description="The name of the company offering the job")
    location: str = Field(..., description="The job location (can specify if remote/hybrid)")
    job_type: str = Field(..., description="Type of job (e.g., full-time, part-time, contract)")
    # Required lists of nested models.
    responsibilities: List[JobResponsibility] = Field(..., description="List of key responsibilities")
    qualifications: List[JobQualification] = Field(..., description="List of required or preferred qualifications")
    # Optional fields (default None).
    certifications: Optional[List[JobCertification]] = Field(None, description="Required or preferred certifications")
    benefits: Optional[List[str]] = Field(None, description="List of job benefits")
    # Required free-text field.
    experience_required: str = Field(..., description="Required years of experience")
    # Optional fields (default None).
    salary_range: Optional[str] = Field(None, description="Salary range for the position")
    apply_url: Optional[str] = Field(None, description="URL to apply for the job")

In [42]:
# Extraction prompt for job-offer text. This rebinds `IE_query`, replacing the
# CV prompt defined in the earlier cell.
IE_query = '''
# DIRECTIVES : 
- Act like an experienced information extractor. 
- You have a chunk of a job offer description.
- If you do not find the right information, keep its place empty.
'''
# Unlike the CV cell, no curly-brace replacement is applied here: the job-offer
# text is passed to the distiller as-is, with `JobOffer` as the target schema.
distilled_Job_Offer = document_distiller.distill(documents=[job_offer], IE_query=IE_query, output_data_structure=JobOffer)

[{'job_offer_title': 'About the Job Offer', 'company': 'FICTITIOUS COMPANY', 'location': '', 'job_type': 'Contract', 'responsibilities': [{'$ref': '#/$defs/JobResponsibility'}, {'description': 'Translate creative direction into high-quality silhouettes using Photoshop'}, {'description': 'Work on a wide range of projects to visualize and develop graphic designs that meet each brief'}, {'description': 'Work independently as well as in collaboration with the studio team to meet deadlines, potentially handling five or more projects simultaneously'}, {'description': 'Develop color schemes and renderings in Photoshop, categorized by themes, subjects, etc.'}], 'qualifications': [{'$ref': '#/$defs/JobQualification'}, {'skill': 'Bachelor’s degree (Bac+3/5) in Graphic Design or Art'}, {'skill': "3 years of experience in similar roles within a luxury brand's studio"}, {'skill': 'Proficiency in Adobe Suite, including Illustrator, InDesign, Photoshop'}, {'skill': 'Excellent communication and presen

In [43]:
# Render every non-empty field of the first distilled job offer as a
# "key - value" text block, swapping curly braces for square brackets.
# Empty lists, empty strings and None are skipped (None is tested by identity).
semantic_blocks_job_offer = [
    f"{key} - {value}".replace("{", "[").replace("}", "]")
    for key, value in distilled_Job_Offer.items()
    if value is not None and value != [] and value != ""
]

In [44]:
# Distill the second job offer with the same prompt (`IE_query`) and target
# schema (`JobOffer`) as the first one.
distilled_Job_Offer_2 = document_distiller.distill(
    documents=[job_offer_2],
    IE_query=IE_query,
    output_data_structure=JobOffer,
)

# Render every non-empty field as a "key - value" text block, swapping curly
# braces for square brackets. Empty lists, empty strings and None are skipped
# (None is tested by identity).
semantic_blocks_job_offer_2 = [
    f"{key} - {value}".replace("{", "[").replace("}", "]")
    for key, value in distilled_Job_Offer_2.items()
    if value is not None and value != [] and value != ""
]

[{'job_offer_title': 'About the Job Offer', 'company': 'Innovate Design Co.', 'location': 'Paris, London, and San Francisco (Canal St Martin)', 'job_type': 'Full-time', 'responsibilities': [{'$ref': '#/$defs/JobResponsibility', 'description': 'Translate our core values, product, marketing, and sales objectives into beautifully crafted deliverables'}, {'$ref': '#/$defs/JobResponsibility', 'description': 'Design compelling, brand-aligned digital and print materials, including websites, social media content, ads, third-party marketplaces, presentations, animations, events, prints, etc.'}, {'$ref': '#/$defs/JobResponsibility', 'description': 'Develop and maintain visual brand identity guidelines, ensuring brand consistency across all media and multichannel platforms'}, {'$ref': '#/$defs/JobResponsibility', 'description': "Communicate Innovate Design Co.'s narrative through conversion and data-driven design"}, {'$ref': '#/$defs/JobResponsibility', 'description': 'Participate in brainstormin

## iText2KG for graph construction

In [45]:
from itext2kg import iText2KG


# Create the graph builder from the chat model (`llm`) and the embeddings
# model (`embeddings`) defined earlier in the notebook.
# NOTE(review): the variable name `itext2kg` is identical to the package name
# used in the import above.
itext2kg = iText2KG(llm_model = llm, embeddings_model = embeddings)

We now construct the first graph. Note that all the semantic blocks of the CV are passed to the LLM as a single section. You can also pass the job offer as another section, or split the semantic blocks into separate sections while constructing the graph.

You just need to pay attention to how much information is contained within each block. For example, the block "name - Emily Davis" does not contain much information, which will result in some isolated nodes (even if we reprompt it, it won't converge). It is better to concatenate it with other blocks or pass all the semantic blocks at once as we did here :) .

You should also ensure that you pass a substantial amount of information within each block. If there is a lot of information, you will have a well-formed graph. However, the nodes may contain merged concepts (phrases from paragraphs).

You need to find the optimal amount of information to include (you can experiment by iterating).

In [46]:
# Build the CV graph: all CV semantic blocks are passed together as a single
# section. Both thresholds are set to 0.6.
global_ent, global_rel = itext2kg.build_graph(
    sections=[semantic_blocks_cv],
    ent_threshold=0.6,
    rel_threshold=0.6,
)

[INFO] Extracting Entities from the Document 1
{'entities': [{'label': 'Person', 'name': 'Emily Davis'}, {'label': 'Process', 'name': 'Senior Graphic Designer'}]}
[INFO] Extracting Relations from the Document 1
{'relationships': [{'startNode': 'Emily Davis', 'endNode': 'Senior Graphic Designer', 'name': 'Works As'}]}
[INFO] The isolated entities are  [{'label': 'Person', 'name': 'emily davis', 'properties': {'embeddings': array([-0.01584088, -0.00927059,  0.01920192, ..., -0.00695282,
       -0.00056115,  0.00781327])}}, {'label': 'Process', 'name': 'senior graphic designer', 'properties': {'embeddings': array([ 0.00499245,  0.00576018,  0.02761992, ...,  0.00557556,
        0.00663689, -0.00528985])}}]
Some isolated entities without relations were detected ... trying to solve them!
{'relationships': [{'startNode': 'Emily Davis', 'endNode': 'Senior Graphic Designer', 'name': 'work_experience'}]}
[INFO] The isolated entities are  [{'label': 'Person', 'name': 'emily davis', 'properties':

We construct the second graph of the job offers, noting that we already have existing entities and relationships (of the CV).

In [47]:
# Build the job-offer graph on top of the CV graph: each job offer is its own
# section, and the CV entities/relationships are passed in as existing state.
global_ent_, global_rel_ = itext2kg.build_graph(
    sections=[semantic_blocks_job_offer, semantic_blocks_job_offer_2],
    existing_global_entities=global_ent,
    existing_global_relationships=global_rel,
    ent_threshold=0.6,
    rel_threshold=0.6,
)

[INFO] Extracting Entities from the Document 1
{'entities': [{'label': 'Job', 'name': 'Graphic Designer'}, {'label': 'Company', 'name': 'FICTITIOUS COMPANY'}]}
[INFO] Extracting Relations from the Document 1
{'relationships': [{'startNode': 'graphic designer', 'endNode': 'FICTITIOUS COMPANY', 'name': 'employs'}]}
[INFO] The isolated entities are  [{'label': 'Company', 'name': 'fictitious company', 'properties': {'embeddings': array([-0.00447414, -0.02116995,  0.01478735, ...,  0.00205458,
       -0.00464939, -0.0059599 ])}}]
Some isolated entities without relations were detected ... trying to solve them!
{'relationships': [{'startNode': 'graphic designer', 'endNode': 'FICTITIOUS COMPANY', 'name': 'has_experience_in_similar_roles'}]}
[INFO] The isolated entities are  [{'label': 'Company', 'name': 'fictitious company', 'properties': {'embeddings': array([-0.00447414, -0.02116995,  0.01478735, ...,  0.00205458,
       -0.00464939, -0.0059599 ])}}]
Some isolated entities without relations 

# Draw the graph
---

The final section involves visualizing the constructed knowledge graph using GraphIntegrator. The graph database Neo4j is accessed using specified credentials, and the resulting graph is visualized to provide a visual representation of the relationships and entities extracted from the document.

In [59]:
# Keep only the entities of the merged graph whose label is not empty.
global_ent_list = [entity for entity in global_ent_ if entity["label"] != '']

global_ent_list

[{'label': 'Job_Offer',
  'name': 'about the job offer',
  'properties': {'embeddings': array([-0.01519403, -0.01181279,  0.01700064, ..., -0.0080252 ,
           0.0074151 , -0.00961702])}},
 {'label': 'Company',
  'name': 'innovate design co.',
  'properties': {'embeddings': array([ 0.00467274, -0.02104599,  0.02542451, ..., -0.01286071,
          -0.02440203, -0.0166277 ])}},
 {'label': 'Location',
  'name': 'paris, london, and san francisco (canal st martin)',
  'properties': {'embeddings': array([ 0.00131088, -0.01223588,  0.03110223, ..., -0.00199854,
           0.01018634, -0.00377018])}},
 {'label': 'Job_Type',
  'name': 'full time',
  'properties': {'embeddings': array([-1.6611168e-02,  5.2146090e-05,  3.8347840e-03, ...,
          -1.2042833e-02, -1.5450866e-02,  1.5430220e-03])}},
 {'label': 'Responsibilities',
  'name': 'translate our core values, product, marketing, and sales objectives into beautifully crafted deliverables',
  'properties': {'embeddings': array([-0.013751

In [55]:
# Keep only the relationships that have both a start and an end node, and
# replace spaces in each relationship name with underscores.
#
# Changes from the previous version:
# - iterate over `global_rel_`, the merged relationship list that the
#   visualization cell uses, for consistency with the entity cell (which
#   filters `global_ent_`); the previous loop read the CV-only `global_rel`.
# - the unused `rels` list is removed.
#
# The rename is still applied in place on the relationship dicts, as before.
global_rel_list = []
for rel in global_rel_:
    if rel["startNode"] != '' and rel["endNode"] != '':
        rel["name"] = rel["name"].replace(" ", "_")
        global_rel_list.append(rel)

global_rel_list

[{'startNode': 'emily davis',
  'endNode': 'senior graphic designer',
  'name': 'works_as',
  'properties': {'embeddings': array([-0.01144046, -0.01113987,  0.00578057, ...,  0.00079197,
          -0.01015133,  0.01060089])}},
 {'startNode': 'emily davis',
  'endNode': 'senior graphic designer',
  'name': 'work_experience',
  'properties': {'embeddings': array([ 0.02409872, -0.00389711,  0.02718268, ..., -0.01397033,
           0.00905268,  0.0073592 ])}},
 {'startNode': 'senior graphic designer',
  'endNode': 'fictitious company',
  'name': 'works_as',
  'properties': {'embeddings': array([-0.01144046, -0.01113987,  0.00578057, ...,  0.00079197,
          -0.01015133,  0.01060089])}},
 {'startNode': 'senior graphic designer',
  'endNode': 'fictitious company',
  'name': 'has_experience_in_similar_roles',
  'properties': {'embeddings': array([-0.01462248,  0.00032782,  0.02402373, ..., -0.00743022,
           0.00879461,  0.00149123])}},
 {'startNode': 'senior graphic designer',
  'end

In [60]:
from itext2kg.graph_integration import GraphIntegrator

import os

# Neo4j connection settings. They are read from the environment so that real
# credentials do not have to be written into the notebook; the previous
# literal values are kept as fallbacks so the cell behaves as before when the
# variables are not set.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "secretgraph")

# Graph payload handed to the integrator: the merged entities and relationships.
# NOTE(review): the filtered `global_ent_list` / `global_rel_list` built in the
# previous cells are not used here — confirm whether the unfiltered lists are
# intended.
new_graph = {
    "nodes": global_ent_,
    "relationships": global_rel_,
}

# Push the graph to Neo4j and render it. (A bare `new_graph` expression that
# preceded this call was removed: it was not the last statement of the cell,
# so it was never displayed.)
GraphIntegrator(uri=URI, username=USERNAME, password=PASSWORD).visualize_graph(json_graph=new_graph)