In [130]:
from dotenv import load_dotenv
import os
import numpy as np
import matplotlib.pyplot as plt
from neo4j import GraphDatabase

load_dotenv(override=True)

True

In [132]:
driver = GraphDatabase.driver(os.getenv("NEO4J_URI"), auth=(os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASS")))
driver.get_server_info()

<neo4j.api.ServerInfo at 0x28beeac90>

In [133]:
#import the papers to the database
import json
PAPERS_DATA_PATH = "extract/papers_data.json"

with open(PAPERS_DATA_PATH, 'r') as json_file:
        papers_data = json.load(json_file)
        #remove the papers with no abstract
        papers_data = [paper for paper in papers_data if len(paper['abstract']) > 0]

CIPHR = """
WITH $data as data
UNWIND data as paper
MERGE (p:Paper {title: paper.title})
SET p.abstract = paper.abstract
SET p.url = paper.url
SET p.embeddings = paper.abstract_embedding
"""

with driver.session() as session:
    result = session.run(CIPHR, data=papers_data)

In [134]:
#create a vector index on the embeddings property
#in neo4j you can do this with 
# CALL db.index.vector.createNodeIndex('paper-embeddings', 'Paper', 'embeddings', 1536, 'cosine')

In [135]:
from neo4j import GraphDatabase

# Assuming you have already created a Neo4j driver instance
# driver = GraphDatabase.driver(uri, auth=(user, password))

GET_PAPERS = """
MATCH (p:Paper)
WHERE p.embeddings IS NOT NULL
RETURN p.title as title, p.abstract as abstract, p.url as url, p.embeddings as embeddings
"""

with driver.session() as session:
    papers_result = session.run(GET_PAPERS)
    papers = [dict(record) for record in papers_result]
    print(f"Number of papers: {len(papers)}")

GET_CENTROIDS = """
MATCH (c:Centroid)
RETURN c.coordinates as coordinates, c.title as centroid_title
"""

with driver.session() as session:
    centroids_result = session.run(GET_CENTROIDS)
    centroids = [dict(record) for record in centroids_result]

# papers, centroids

Number of papers: 19


In [137]:
#run TSNE on the node and centroid embeddings
from sklearn.manifold import TSNE

#convert the embeddings to a numpy array
embeddings = np.array([paper['embeddings'] for paper in papers])

#run TSNE on the node embeddings
tsne = TSNE(n_components=2, perplexity=10, verbose=2, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)

#save the TSNE results back to the database
UPDATE_TSNE = """
UNWIND $data as row
MATCH (p:Paper {title: row.title})
SET p.tsne_x = row.x, p.tsne_y = row.y
"""

with driver.session() as session:
    session.run(UPDATE_TSNE, data=[{'title': paper['title'], 'x': tsne_result[0], 'y': tsne_result[1]} for paper, tsne_result in zip(papers, tsne_results)])

[t-SNE] Computing 18 nearest neighbors...
[t-SNE] Indexed 19 samples in 0.000s...
[t-SNE] Computed neighbors for 19 samples in 0.086s...
[t-SNE] Computed conditional probabilities for sample 19 / 19
[t-SNE] Mean sigma: 0.236983
[t-SNE] Computed conditional probabilities in 0.001s
[t-SNE] Iteration 50: error = 53.9376297, gradient norm = 0.7883933 (50 iterations in 0.273s)
[t-SNE] Iteration 100: error = 50.3236237, gradient norm = 0.7446774 (50 iterations in 0.011s)
[t-SNE] Iteration 150: error = 49.1673355, gradient norm = 0.4904715 (50 iterations in 0.011s)
[t-SNE] Iteration 200: error = 53.2442169, gradient norm = 1.0219053 (50 iterations in 0.009s)
[t-SNE] Iteration 250: error = 53.1747208, gradient norm = 0.3859081 (50 iterations in 0.010s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.174721
[t-SNE] Iteration 300: error = 0.7295227, gradient norm = 0.0105972 (50 iterations in 0.009s)
[t-SNE] Iteration 350: error = 0.3333885, gradient norm = 0.0120932 (50 i

In [141]:
#perform k-means clustering
from sklearn.cluster import KMeans
K=3

#convert the centroids to a numpy array
centroid_embeddings = np.array([centroid['coordinates'] for centroid in centroids])

# if there aren't enough centroids, initialize the rest randomly
if len(centroids) < K:
    # Initialize the missing centroids randomly
    if centroid_embeddings.shape[0] == 0:
        centroid_embeddings = "kmeans++"
    else:
        centroid_embeddings = np.vstack([centroid_embeddings, np.random.rand(K - len(centroids), 1536)])

#initialize the k-means algorithm
kmeans = KMeans(n_clusters=K, init=centroid_embeddings, n_init=1)

#fit the algorithm to the data
predictions = kmeans.fit_predict(embeddings)
predictions, kmeans.cluster_centers_

(array([2, 2, 2, 2, 1, 2, 1, 1, 1, 0, 2, 1, 2, 1, 0, 1, 2, 2, 2],
       dtype=int32),
 array([[ 0.00957119,  0.00659721,  0.01460397, ...,  0.00216391,
         -0.02703943, -0.01040091],
        [ 0.00425553,  0.00378272,  0.01124918, ..., -0.0180465 ,
         -0.01018701, -0.0188333 ],
        [-0.00924649,  0.0132581 ,  0.00046706, ..., -0.01001863,
         -0.01145077, -0.01652182]]))

In [142]:
#add the centroids to the database
ADD_CENTROIDS = """
UNWIND $centroids as centroid
MERGE (c:Centroid {id: centroid.id})
SET c.coordinates = centroid.coordinates
"""

centroids = [{'id': i, 'coordinates': centroid } for i, centroid in enumerate(kmeans.cluster_centers_)]

with driver.session() as session:
    session.run(ADD_CENTROIDS, centroids=centroids)

#add the cluster relationships to the database
ADD_CLUSTER_RELATIONSHIPS = """
UNWIND $data as d
MATCH (p:Paper {title: d.title})
MATCH (c:Centroid {id: d.cluster})
MERGE (p)-[:PART_OF_CLUSTER]->(c)
"""

data = [{'title': paper['title'], 'cluster': prediction} for paper, prediction in zip(papers, predictions)]

with driver.session() as session:
    session.run(ADD_CLUSTER_RELATIONSHIPS, data=data)


In [143]:
#compute the title of each centroid
NEAREST_NODES = """
match(c:Centroid)
call db.index.vector.queryNodes('paper-embeddings', 3, c.coordinates) YIELD node, score
RETURN c.id as cluster_id, COLLECT({title: node.title, score: score}) AS nearest_nodes
"""

with driver.session() as session:
    result = session.run(NEAREST_NODES, centroids=centroids)
    centroids_with_nearest_nodes = [dict(record) for record in result]

centroids_with_nearest_nodes

[{'cluster_id': 0,
  'nearest_nodes': [{'title': 'Microbially Induced Carbonate Precipitation Using Microorganisms Enriched from Calcareous Materials in Marine Environments and Their Metabolites',
    'score': 0.989421010017395},
   {'title': 'Characteristics of bio-CaCO 3 from microbial bio-mineralization with different bacteria species',
    'score': 0.9894202947616577},
   {'title': 'Biocement Fabrication and Design Application for a Sustainable Urban Area',
    'score': 0.9602454900741577}]},
 {'cluster_id': 1,
  'nearest_nodes': [{'title': 'Getting into the groove: Opportunities to enhance the ecological value of hard coastal infrastructure using ﬁne-scale surface textures',
    'score': 0.9751214981079102},
   {'title': 'Learning from nature to enhance Blue engineering of marine infrastructure',
    'score': 0.9691004753112793},
   {'title': 'Availability of microhabitats explains a widespread pattern and informs theory on ecological engineering of boulder reefs',
    'score': 0.

In [144]:
from langchain.document_loaders import PyPDFLoader
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

#langchain configuration
response_schemas = [
    ResponseSchema(name="labels", description="An array of objects of schema {cluster_id: int, title: str} representing the labels of each cluster. Only give one title per cluster. You are given data of a handful of papers, as well as the cluster they belong to. The score is a number betweeo 0-1 giving the confidence of the model that the paper belongs to the cluster. Make your labels general, remember you are only given a subset of the papers, there are many more and the label should represent all of them"),
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

format_instructions = output_parser.get_format_instructions()
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(
    "We are clustering research papers based on their embeddings. You are given the titles of some research papers, as well as their distance to the cluster. Give each cluster a unique name that is representative of the data it is close to. .\n{format_instructions}\n{clusters}")]
    ,
    input_variables=["clusters"],
    partial_variables={"format_instructions": format_instructions}
)

chat_model = ChatOpenAI(temperature=0.5)

_input = prompt.format_prompt(clusters=centroids_with_nearest_nodes)
output = chat_model(_input.to_messages())
labels = output_parser.parse(output.content)
labels

{'labels': [{'cluster_id': 0, 'title': 'Microbial Carbonate Precipitation'},
  {'cluster_id': 1,
   'title': 'Enhancing Ecological Value of Coastal Infrastructure'},
  {'cluster_id': 2, 'title': 'Engineered Living Materials'}]}

In [145]:
#using the labels, update the database
UPDATE_CENTROID_TITLES = """
UNWIND $labels as label
MATCH (c:Centroid {id: label.cluster_id})
SET c.title = label.title
"""

with driver.session() as session:
    result = session.run(UPDATE_CENTROID_TITLES, labels=labels.get('labels'))