In [3]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
import re 
import os

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
llm=AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT_MODEL'),
    api_version=os.getenv('AZURE_OpenAI_API_VERSION'),
    temperature=0,
)
llm

AzureChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000024344188CA0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x00000243441A9E20>, root_client=<openai.lib.azure.AzureOpenAI object at 0x000002433F70C3D0>, root_async_client=<openai.lib.azure.AsyncAzureOpenAI object at 0x0000024344188D00>, temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), disabled_params={'parallel_tool_calls': None}, azure_endpoint='https://easttest123.openai.azure.com/openai/deployments/gpt-35-turbo/chat/completions?api-version=2023-03-15-preview', deployment_name='gpt-35-turbo', openai_api_version='2024-02-15-preview', openai_api_type='azure')

In [5]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s{2,}', ' ', text)
    # Trim leading and trailing whitespace
    text = text.strip()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

In [6]:
url_input=("https://en.wikipedia.org/wiki/Elon_Musk")
loader=WebBaseLoader([url_input])
loader.load().pop().page_content



In [7]:
data=''' 
Elon Reeve Musk FRS (/ˈiːlɒn/; born June 28, 1971) is a businessman and investor known for his key roles in the space company SpaceX 
and the automotive company Tesla, Inc. Other involvements include ownership of X Corp., the company that operates the social media platform X 
(formerly known as Twitter), and his role in the founding of the Boring Company, xAI, Neuralink, and OpenAI. He is one of the wealthiest individuals 
in the world; as of August 2024 Forbes estimates his net worth to be US$247 billion.[3]
'''

In [8]:
data=clean_text(loader.load().pop().page_content)
data



In [9]:
data=''' 
Elon Reeve Musk FRS (/ˈiːlɒn/; born June 28, 1971) is a businessman and investor known for his key roles in the space company SpaceX 
and the automotive company Tesla, Inc. Other involvements include ownership of X Corp., the company that operates the social media platform X 
(formerly known as Twitter), and his role in the founding of the Boring Company, xAI, Neuralink, and OpenAI. He is one of the wealthiest individuals 
in the world; as of August 2024 Forbes estimates his net worth to be US$247 billion.[3]
'''
documents=[Document(page_content=data)]
documents

[Document(metadata={}, page_content=' \nElon Reeve Musk FRS (/ˈiːlɒn/; born June 28, 1971) is a businessman and investor known for his key roles in the space company SpaceX \nand the automotive company Tesla, Inc. Other involvements include ownership of X Corp., the company that operates the social media platform X \n(formerly known as Twitter), and his role in the founding of the Boring Company, xAI, Neuralink, and OpenAI. He is one of the wealthiest individuals \nin the world; as of August 2024 Forbes estimates his net worth to be US$247 billion.[3]\n')]

In [10]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer=LLMGraphTransformer(llm=llm)

In [11]:
graph_documents=llm_transformer.convert_to_graph_documents(documents)

In [12]:
graph_documents

[GraphDocument(nodes=[Node(id='Elon Reeve Musk', type='Person', properties={}), Node(id='Spacex', type='Company', properties={}), Node(id='Tesla, Inc.', type='Company', properties={}), Node(id='X Corp.', type='Company', properties={}), Node(id='X', type='Social media platform', properties={}), Node(id='Boring Company', type='Company', properties={}), Node(id='Xai', type='Company', properties={}), Node(id='Neuralink', type='Company', properties={}), Node(id='Openai', type='Company', properties={})], relationships=[Relationship(source=Node(id='Elon Reeve Musk', type='Person', properties={}), target=Node(id='Spacex', type='Company', properties={}), type='KEY_ROLE', properties={}), Relationship(source=Node(id='Elon Reeve Musk', type='Person', properties={}), target=Node(id='Tesla, Inc.', type='Company', properties={}), type='KEY_ROLE', properties={}), Relationship(source=Node(id='Elon Reeve Musk', type='Person', properties={}), target=Node(id='X Corp.', type='Company', properties={}), type

# connection to graph database

In [13]:
from langchain_community.graphs import Neo4jGraph
import dotenv
import os
dotenv.load_dotenv()
graph=Neo4jGraph(url=os.getenv('NEO4J_URI_ONLINE'),
                 username=os.getenv('NEO4J_USERNAME_ONLINE'),
                 password=os.getenv('NEO4J_PASSWORD_ONLINE')
                #  database=os.getenv('NEO4J_DATABASE')
)

In [14]:
graph.add_graph_documents(graph_documents)


# to add custom relationships to the graphs for better knowledge

In [15]:
for r in graph_documents[0].nodes:
    print(r.type)

Person
Company
Company
Company
Social media platform
Company
Company
Company
Company


In [16]:
llm_transformer_filtered=LLMGraphTransformer(
    llm=llm,
    allowed_nodes=['Person','Country','Organization','Rockets','Planet','City','Company','Concept','Spacecraft'],
    allowed_relationships=['FOUNDED','LAUNCHED','BOUGHT','RECEIVED_CONTRACT','INVOLVED_WITH','ATTEMPTED_LAUNCH','AGREEMENT_FAILED'],
)

graph_documents_filtered=llm_transformer_filtered.convert_to_graph_documents(documents)

In [17]:
graph_documents_filtered[0].relationships

[]

In [18]:
graph.add_graph_documents(graph_documents_filtered)

# for a large webpage - convert to smaller chunks and add to graph database

In [19]:
url_input=("https://en.wikipedia.org/wiki/Elon_Musk")
loader=WebBaseLoader([url_input])
loader.load().pop().page_content



In [20]:
data=clean_text(loader.load().pop().page_content)
documents=[Document(page_content=data)]
documents



In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter=RecursiveCharacterTextSplitter(chunk_size=3000,chunk_overlap=300)
smaller_doc=splitter.split_documents(documents)
smaller_doc


[Document(metadata={}, page_content='Elon Musk WikipediaJump to contentMain menuMain menumove to sidebarhideNavigationMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usContributeHelpLearn to editCommunity portalRecent changesUpload fileSearchSearchDonateAppearanceCreate accountLog inPersonal tools Create account Log inPages for logged out editors learn moreContributionsTalkContentsmove to sidebarhideTop1Early life and educationToggle Early life and education subsection11Childhood and family12Education2Business careerToggle Business career subsection21Zip222Xcom and PayPal23SpaceX231Starlink24Tesla241SEC and shareholder lawsuits regarding tweets242SolarCity and Tesla Energy25Neuralink26The Boring Company27Twitter X28Leadership style3Other activitiesToggle Other activities subsection31Musk Foundation32Hyperloop33OpenAI and xAI34Tham Luang cave rescue and defamation case352018 cannabis incident36Music37Private jet38Company towns4Wealth5Personal actions views and social

In [29]:
graph_documents=[]

for batch in smaller_doc[:5]:
    graph_documents=llm_transformer.convert_to_graph_documents([batch])
    graph.add_graph_documents(graph_documents)

# creating dataframe of all the chunks

In [26]:
graph_documents

[GraphDocument(nodes=[Node(id='Elon Musk', type='Person', properties={}), Node(id='Spacex', type='Company', properties={}), Node(id='Tesla Inc', type='Company', properties={}), Node(id='X', type='Company', properties={}), Node(id='Twitter', type='Company', properties={})], relationships=[Relationship(source=Node(id='Elon Musk', type='Person', properties={}), target=Node(id='Spacex', type='Company', properties={}), type='FOUNDED', properties={}), Relationship(source=Node(id='Elon Musk', type='Person', properties={}), target=Node(id='Tesla Inc', type='Company', properties={}), type='CEO', properties={}), Relationship(source=Node(id='Elon Musk', type='Person', properties={}), target=Node(id='X', type='Company', properties={}), type='OWNER', properties={}), Relationship(source=Node(id='Elon Musk', type='Person', properties={}), target=Node(id='X', type='Company', properties={}), type='CTO', properties={}), Relationship(source=Node(id='Elon Musk', type='Person', properties={}), target=Node(

In [30]:
from langchain.chains import GraphCypherQAChain
chain=GraphCypherQAChain.from_llm(llm=llm,graph=graph,verbose=True,allow_dangerous_requests=True)
chain

GraphCypherQAChain(verbose=True, graph=<langchain_community.graphs.neo4j_graph.Neo4jGraph object at 0x00000243457ADEB0>, cypher_generation_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['question', 'schema'], input_types={}, partial_variables={}, template='Task:Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\nSchema:\n{schema}\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\n{question}'), llm=AzureChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000024344188CA0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x00000243441A9E20>, 

In [36]:
response=chain.invoke({'query':'give some info abot Tesla'})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person)-[:KEY_ROLE|FOUNDING|OWNERSHIP]->(c:Company {id: "Tesla"})
RETURN p.id, c.id[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'give some info abot Tesla',
 'result': "I'm sorry, but I don't have any information about Tesla at the moment."}