In [None]:
!pip install neo4j langchain langchain_openai -q

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data cells below open relative paths (e.g. "T1_binary.tsv"),
# not paths under /content/drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import re
import csv
import pandas as pd
from tqdm import tqdm
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain.chat_models import ChatOpenAI

In [None]:
import os

# Connection to the Neo4j instance that holds the Concept / PREREQUISITE graph.
# Each setting is read from the environment when the variable is set, so real
# credentials never have to be written into (or committed with) the notebook.
# The literals are the original placeholder values and only act as fallbacks.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "neo4j+s://.databases.neo4j.io"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", ""),
)

In [None]:
# Re-read the schema from the database and print it, so the node properties and
# relationship types available to the Cypher-generating chain are visible here.
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Concept {concept_id: INTEGER, concept_name: STRING}
Relationship properties are the following:

The relationships are the following:
(:Concept)-[:PREREQUISITE]->(:Concept)


In [None]:
import os

# Make sure an OpenAI key is available to the LangChain models created below.
# setdefault keeps a key that is already exported in the environment (the
# preferred way to supply the secret) instead of overwriting it with the
# placeholder literal kept here.
os.environ.setdefault("OPENAI_API_KEY", "sk-")

In [None]:
# Question -> Cypher chain over `graph`, driven by gpt-4-1106-preview at temperature 0.
# NOTE(review): return_direct=True presumably makes the chain hand back the raw
# query rows instead of a generated answer (the Task V post-processing below
# treats the result as a list of dict-like rows) — confirm against the LangChain docs.
# NOTE(review): top_k=20 presumably caps the number of rows returned per query;
# if so, the "return all ..." prompts further down are truncated to 20 rows — confirm.
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0, model= "gpt-4-1106-preview"),
    graph=graph,
    verbose=True,
    return_direct=True,
    top_k=20,
)

In [None]:
# Plain chat model used to turn the retrieved graph output into the final answers.
# The API key is deliberately not repeated here: ChatOpenAI falls back to the
# OPENAI_API_KEY environment variable, which is what the model inside `chain`
# already relies on, so the secret is configured in exactly one place.
# `model=` matches the keyword used when building `chain`.
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

***Task I***

In [None]:
# Task I input: the question text is read from the first column of each TSV row.
file_path = "T1_binary.tsv"

# pattern_1 captures the already-learned concept, pattern_2 the concept to learn.
pattern_1 = r"learned about\s(.*?),"
pattern_2 = r"learn about\s([^?']*?)\?"

concepts_1 = []
concepts_2 = []

with open(file_path, "r", encoding='utf-8') as file:
    for row in csv.reader(file, delimiter="\t"):
        known = re.search(pattern_1, row[0])
        target = re.search(pattern_2, row[0])

        # A row is kept only when both concepts can be extracted, which keeps
        # concepts_1 and concepts_2 index-aligned with each other.
        if not (known and target):
            continue

        concepts_1.append(known.group(1))
        concepts_2.append(target.group(1))

In [None]:
# Task I retrieval prompt: filled with `concept_1` / `concept_2` via str.format
# and sent to the graph chain.
# NOTE(review): "Retuen" looks like a typo for "Return"; it is left untouched
# here because the text is sent to the model verbatim.
binary_prompt = """
In the domain of natural language processing, I have already learned about "{concept_1}", based on this, does it help for me to learn about "{concept_2}"?
Retuen the path.
"""

In [None]:
# Retrieve the graph output for each Task I concept pair.
# paths[i] is whatever the chain returned for pair i, or [] when the chain
# raised, so `paths` stays index-aligned with concepts_1 / concepts_2.
paths = []

# Only the first 100 pairs are processed. The bound is clamped to the number of
# pairs actually parsed so a shorter input no longer ends in an IndexError.
num_pairs = min(100, len(concepts_1), len(concepts_2))

for i in tqdm(range(num_pairs)):
    prompt = binary_prompt.format(concept_1=concepts_1[i], concept_2=concepts_2[i])
    try:
        result = chain.run(prompt)
    except Exception as e:
        # Best effort: log the failing pair and record an empty path for it.
        print(f"Error processing concept pair {concepts_1[i]}, {concepts_2[i]}: {e}")
        result = []
    paths.append(result)

In [None]:
# Task I answering prompt: filled with `concept_1`, `concept_2` and the
# retrieved `path` via str.format, then sent to `llm`, which is asked to answer
# with "Yes" or "No" only.
task_I_prompt = """
There is a knowledge graph that includes the relationships between concepts.
Based on the question, the path between concepts has been returned.
If the path is empty, then there is no relationship.
Only use the returned path as the information for answering.
Only return "Yes" or "No".

**Question**:
In the domain of natural language processing, I have already learned about "{concept_1}", based on this, does it help for me to learn about "{concept_2}"?

**Path**:
{path}
"""

In [None]:
# Ask the chat model for the final Task I answer of every pair, using the
# retrieved path as context.
results = []

# Process at most the first 100 pairs, and never more than the number of pairs
# and paths that actually exist, so a short input cannot raise an IndexError.
num_items = min(100, len(concepts_1), len(concepts_2), len(paths))

for i in tqdm(range(num_items)):
    prompt = task_I_prompt.format(concept_1=concepts_1[i], concept_2=concepts_2[i], path=paths[i])
    result = llm.predict(prompt)
    results.append(result)

100%|██████████| 100/100 [00:54<00:00,  1.82it/s]


In [None]:
# Wrap the raw Task I answers in a DataFrame (one row per answer) and save it as CSV.
results = pd.DataFrame(data=results)
results.to_csv(path_or_buf="task1.csv")

***Task II***

In [None]:
# Task II input: the question text is read from the first column of each TSV row.
file_path = "T2_one_hop.tsv"

# Captures the concept named between "learn about " and the next comma.
pattern = r"learn about\s(.*?),"

concepts = []

with open(file_path, "r", encoding='utf-8') as file:
    for row in csv.reader(file, delimiter="\t"):
        found = re.search(pattern, row[0])
        # Rows without a recognisable concept are skipped.
        if found is not None:
            concepts.append(found.group(1))

In [None]:
# Task II retrieval prompt: filled with `concept` via str.format and sent to the
# graph chain.
# NOTE(review): "frist" looks like a typo for "first"; it is left untouched here
# because the text is sent to the model verbatim.
one_hop_prompt = """
In the domain of natural language processing, I want to learn about "{concept}", what concepts should I learn frist?
Return all the paths.
The length of the path cannot exceed 15.
"""

In [None]:
# Retrieve the graph output for every Task II concept.
# paths[k] is whatever the chain returned for concepts[k], or [] when the chain
# raised, so `paths` stays index-aligned with `concepts`.
paths = []
for concept in tqdm(concepts):
    prompt = one_hop_prompt.format(concept=concept)
    try:
        result = chain.run(prompt)
    except Exception as e:
        # Report the concept of THIS iteration. The previous handler indexed
        # `concepts` with `i`, a stale counter left over from an earlier cell
        # (or undefined on a fresh kernel), so it named the wrong concept or
        # raised NameError from inside the handler.
        print(f"Error processing concept {concept}: {e}")
        result = []
    paths.append(result)

In [None]:
# Task II answering prompt: filled with `concept` and the retrieved `path` via
# str.format, then sent to `llm`, which is asked to return a list of concept names.
# NOTE(review): the text contains typos ("unqiue", "fllowing", "cpncept2",
# "frist"); they are left untouched here because the text is sent to the model verbatim.
task_II_prompt = """
There is a knowledge graph that includes the relationships between concepts.
Based on the question, the path between concepts has been returned.
Return all unqiue concept names in the paths, but don't include the concept name in the question.
Answer the question in the fllowing format (only return a list):
[concept1, cpncept2, ...]

**Question**:
In the domain of natural language processing, I want to learn about "{concept}", what concepts should I learn frist?

**Path**:
{path}
"""

In [None]:
# Ask the chat model for the final Task II answer of each concept, using the
# retrieved paths as context.
results = []

# The retrieval cell covers every parsed concept, but only the first 100 are
# answered here. Clamp the bound to what actually exists so a shorter input
# cannot raise an IndexError.
num_items = min(100, len(concepts), len(paths))

for i in tqdm(range(num_items)):
    prompt = task_II_prompt.format(concept=concepts[i], path=paths[i])
    result = llm.predict(prompt)
    results.append(result)

100%|██████████| 100/100 [01:26<00:00,  1.16it/s]


In [None]:
# Wrap the raw Task II answers in a DataFrame (one row per answer) and save it as CSV.
results = pd.DataFrame(data=results)
results.to_csv(path_or_buf="task2.csv")

In [None]:
#results = []
#for concept in tqdm(concepts):
    #prompt = one_hop_prompt.format(concept=concept)
    #try:
        #result = chain.run(prompt)
        #prerequisites = [i["prerequisite.concept_name"] for i in result]
    #except Exception as e:
        #print(f"{concept}:{e}")
        #prerequisites = []
    #results.append(prerequisites)

***Task III***

In [None]:
# Task III input: the question text is read from the first column of each TSV row.
file_path = "T3_multi_hop.tsv"

# pattern_1 captures the concept after "know about", pattern_2 the one after "learn about".
pattern_1 = r"know about\s(.*?),"
pattern_2 = r"learn about\s(.*?),"

concepts_1 = []
concepts_2 = []

with open(file_path, "r", encoding='utf-8') as file:
    for row in csv.reader(file, delimiter="\t"):
        known = re.search(pattern_1, row[0])
        target = re.search(pattern_2, row[0])

        # A row is kept only when both concepts can be extracted, which keeps
        # concepts_1 and concepts_2 index-aligned with each other.
        if not (known and target):
            continue

        concepts_1.append(known.group(1))
        concepts_2.append(target.group(1))

In [None]:
# Task III retrieval prompt: filled with `concept_1` / `concept_2` via str.format
# and sent to the graph chain, asking for the shortest path between them.
multi_hop_prompt = """
In the domain of natural language processing, I have learned "{concept_1}", what is the path to learn "{concept_2}"?
Return the shortest path.
"""

In [None]:
# Retrieve the graph output for each Task III concept pair.
# paths[i] is whatever the chain returned for pair i, or [] when the chain
# raised, so `paths` stays index-aligned with concepts_1 / concepts_2.
paths = []

# Only the first 100 pairs are processed. The bound is clamped to the number of
# pairs actually parsed so a shorter input no longer ends in an IndexError.
num_pairs = min(100, len(concepts_1), len(concepts_2))

for i in tqdm(range(num_pairs)):
    prompt = multi_hop_prompt.format(concept_1=concepts_1[i], concept_2=concepts_2[i])
    try:
        result = chain.run(prompt)
    except Exception as e:
        # Best effort: log the failing pair and record an empty path for it.
        print(f"Error processing concept pair {concepts_1[i]}, {concepts_2[i]}: {e}")
        result = []
    paths.append(result)

In [None]:
# Task III answering prompt: filled with `concept_1`, `concept_2` and the
# retrieved `path` via str.format, then sent to `llm`, which is asked to return
# the concept names followed by their count.
# NOTE(review): the text contains typos ("fllowing", "cpncept"); they are left
# untouched here because the text is sent to the model verbatim.
task_III_prompt = """
There is a knowledge graph that includes the relationships between concepts.
Based on the question, the path between concepts has been returned.
If the path is empty, then there is no relationship.
Only return concept name and the total number of concepts in the list
Answer the question in the fllowing format:
[concept, cpncept, ..., number]

**Question**:
In the domain of natural language processing, I have learned "{concept_1}", what is the shortest path to learn "{concept_2}"?

**Path**:
{path}
"""

In [None]:
# Ask the chat model for the final Task III answer of every pair, using the
# retrieved path as context.
results = []

# Process at most the first 100 pairs, and never more than the number of pairs
# and paths that actually exist, so a short input cannot raise an IndexError.
num_items = min(100, len(concepts_1), len(concepts_2), len(paths))

for i in tqdm(range(num_items)):
    prompt = task_III_prompt.format(concept_1=concepts_1[i], concept_2=concepts_2[i], path=paths[i])
    result = llm.predict(prompt)
    results.append(result)

In [None]:
# Wrap the raw Task III answers in a DataFrame (one row per answer) and save it as CSV.
results = pd.DataFrame(data=results)
results.to_csv(path_or_buf="task3.csv")

In [None]:
#results = []
#for i in tqdm(range(20)):
    #prompt = multi_hop_prompt.format(concept_1=concepts_1[i], concept_2=concepts_2[i])
    #try:
        #result = chain.run(prompt)
        #path_concepts = []
        #for path in result[0]["path"]:
            #if isinstance(path, dict) and "concept_name" in path:
                #path_concepts.append(path["concept_name"])
    #except Exception as e:
        #print(f"Error processing concept pair {concepts_1[i]}, {concepts_2[i]}: {e}")
        #path_concepts = []
    #results.append(path_concepts[1:-1])

***Task IV***

In [None]:
# Task IV input: tab-separated file; its "Title" and "Description" columns are
# read by the Task IV prompt loop further down.
task4 = pd.read_csv("T4_suggestion.tsv", sep='\t')

In [None]:
# Retrieval prompt asking the graph chain for every concept name in the graph.
# NOTE: the name `prompt` is reassigned inside the Task IV answer loop, so this
# cell has to be re-run before the `chain.run(prompt)` cell is executed again.
# NOTE(review): "concpets" / "cpncept2" look like typos; they are left untouched
# here because the text is sent to the model verbatim.
prompt = """
Return all the concpets in the graph.
Only return concept_name in the following format:
[concept1, cpncept2, ...]
"""

In [None]:
# Ask the graph chain for the concept names; the result is inserted as-is into
# the Task IV prompt as the candidate concept list.
# NOTE: this rebinds `concepts`, which previously held the parsed Task II concepts.
# NOTE(review): `chain` was built with top_k=20, which presumably limits the
# rows returned here — confirm that the full concept list actually comes back.
concepts = chain.run(prompt)

In [None]:
# Task IV answering prompt: filled with `title`, `description` and the candidate
# `concepts` list via str.format, then sent to `llm`.
# NOTE(review): the text contains typos ("fllowing", "cpncept2"); they are left
# untouched here because the text is sent to the model verbatim.
task4_prompt = """
Title: {title}
Description: {description}

What NLP (Natural Language Processing) concepts do you think the above content includes?
Here is the concept list you can refer to:
Concepts: {concepts}

Answer the question in the fllowing format, only return the concept name:
[concept1, cpncept2, ...]
"""

In [None]:
# Ask the chat model which concepts each Task IV title/description covers.
results = []

# Process at most the first 100 rows, and never more rows than the TSV actually
# contains, so a shorter file cannot fail on a missing row label.
num_rows = min(100, len(task4))

for i in tqdm(range(num_rows)):
    prompt = task4_prompt.format(title=task4["Title"][i], description=task4["Description"][i], concepts=concepts)
    print(prompt)
    result = llm.predict(prompt)
    results.append(result)

In [None]:
# Wrap the raw Task IV answers in a DataFrame (one row per answer) and save it as CSV.
results = pd.DataFrame(data=results)
results.to_csv(path_or_buf="task4.csv")

***Task V***

In [None]:
# Task V input: the first column of each TSV row, with ';' separators rewritten
# to ',' so each entry becomes one comma-separated concept string.
file_path = "T5_idea.tsv"

concepts = []

with open(file_path, "r", encoding='utf-8') as file:
    rows = csv.reader(file, delimiter="\t")
    next(rows)  # discard the first row (presumably a header)
    concepts.extend(row[0].replace(';', ',') for row in rows)

In [None]:
# Task V retrieval prompt: filled with the comma-separated `concepts` string via
# str.format and sent to the graph chain to fetch the prerequisites of each concept.
task5_prompt = """
In the domain of Natural Language Processing, I already know about {concepts}.
Return the prerequisite nodes of each concept, as a set.
"""

In [None]:
# Retrieve the prerequisite rows for every Task V concept set.
# paths[k] is whatever the chain returned for concepts[k] ([] on failure), and
# query_concepts[k] is the same entry split on ',' into individual names.
paths = []
query_concepts = []
final_concepts = []  # not used by the cells below; kept so the name stays defined
for content in tqdm(concepts):
    query_concepts.append(content.split(','))
    prompt = task5_prompt.format(concepts=content)
    print(prompt)
    try:
        result = chain.run(prompt)
        print(result)
    except Exception as e:
        # Say which input failed and why; the previous handler printed only a
        # constant "Error" and threw the exception details away.
        print(f"Error processing concepts {content}: {e}")
        result = []
    paths.append(result)

In [None]:
# Flatten the chain output of each query into one list of prerequisites.
# The generated Cypher does not alias the column consistently, so both
# spellings of the key are accepted; the capitalised one takes precedence.
combined_prerequisites = []

for rows in paths:
    collected = []
    for row in rows:
        for key in ('Prerequisites', 'prerequisites'):
            if key in row:
                collected.extend(row[key])
                break
    combined_prerequisites.append(collected)

In [None]:
# Dump the retrieved prerequisites and the queried concepts, one Python list
# representation per line, for inspection.
# The encoding is set explicitly to UTF-8 (matching how the input TSVs are
# read) so writing does not depend on the platform's default locale encoding.
with open('task5_prerequisites.txt', 'w', encoding='utf-8') as f:
    for item in combined_prerequisites:
        f.write("%s\n" % item)

with open('task5_concepts.txt', 'w', encoding='utf-8') as f:
    for item in query_concepts:
        f.write("%s\n" % item)


In [None]:
# Task V generation prompt: filled with the queried concepts (`que_concepts`)
# and their retrieved prerequisites (`pre_concepts`), then sent to `llm` to
# propose a project as "Title: ... ; Description: ...".
# The prompt used to be bound only as `ask_IV_prompt` while the loop below reads
# `task_IV_prompt`. Every iteration therefore raised NameError, which the bare
# `except:` swallowed, so each result became 'Error\tError'.
task_IV_prompt = """
I have learned following concepts:
{que_concepts};
moreover, I also know about the following concepts, which might be helpfull:
{pre_concepts}.
I am looking to apply these knowledge and undertake a novel and interesting project. Show me how each concept is applied to this project.
Please provide me with a project I can work on, including title and description (max 200 words).
Return in this format:
Title: xxx ; Description: xxx
"""
# Backward-compatible alias for the original (misspelled) name.
ask_IV_prompt = task_IV_prompt


results = []

# zip() has no length, so give tqdm the total explicitly to get a real progress bar.
num_items = min(len(combined_prerequisites), len(query_concepts))

for content, qcontent in tqdm(zip(combined_prerequisites, query_concepts), total=num_items):
    try:
        prompt = task_IV_prompt.format(que_concepts=qcontent, pre_concepts=content)
        print(prompt)
        result = llm.predict(prompt)
        print(result)
        # Flatten newlines and tabs so each answer occupies exactly one line of the output file.
        results.append(result.replace('\n', ' ').replace('\t', ' '))
    except Exception as e:
        # Best effort: keep one output line per input, but report what went wrong.
        print(f'Error: {e}')
        results.append('Error\tError')


In [None]:
# Write one generated project idea per line.
# The model output can contain non-ASCII characters, so the encoding is set
# explicitly to UTF-8 instead of relying on the platform's default.
save_path = 'task5.tsv'
with open(save_path, 'w', encoding='utf-8') as file:
    for content in results:
        file.write(content + '\n')
print('Done!')