In [None]:
#r "nuget: Azure.AI.OpenAI, *-*"
#r "nuget: Azure, *-*"
#r "nuget: Azure.Identity, *-*"
#r "nuget: dotenv.net, *-*"
#r "nuget: Microsoft.DotNet.Interactive.AIUtilities, *-*"
#r "nuget: Microsoft.ML.Tokenizers, *-*"
#r "nuget: Microsoft.SemanticKernel.Core, *-*"
#r "nuget: Neo4j.Driver, *-*"

using Microsoft.DotNet.Interactive;
using Microsoft.DotNet.Interactive.AIUtilities;
using dotenv.net;
using Azure.AI.OpenAI;
using Azure;
using Azure.Identity;
using OpenAI.Chat;
using System;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
using System.IO;
using Microsoft.SemanticKernel.Text;
using Microsoft.ML.Tokenizers;
using Neo4j.Driver;


In [4]:
// Load the .env file, then read its entries into a dictionary for direct lookups below.
DotEnv.Load();
var envVars = DotEnv.Read();

// Azure OpenAI client authenticated with the endpoint and API key from the .env entries.
var openAiEndpoint = new Uri(envVars["AZURE_OPENAI_ENDPOINT"]);
var openAiKey = new AzureKeyCredential(envVars["AZURE_OPENAI_API_KEY"]);
AzureOpenAIClient client = new(openAiEndpoint, openAiKey);

// Deployment names for the embeddings model and the chat model.
var embeddings = envVars["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"];
var llm = envVars["AZURE_OPENAI_CHAT_DEPLOYMENT"];

In [33]:
/// <summary>Identifies the ingested document: a generated id plus the path it was read from.</summary>
public record DocunentMetadata(
    string id,
    string source);

/// <summary>One text chunk of a document, with its position (<c>sequence</c>) and owning document id.</summary>
public record ChunkMetadata(
    string id,
    string name,
    int sequence,
    string documentId,
    string text);

/// <summary>
/// One extracted knowledge triplet. The member names match the JSON keys requested from the
/// model (head, head_type, relation, tail, tail_type), so they must not be renamed.
/// </summary>
public record TripletRow(
    string head,
    string head_type,
    string relation,
    string tail,
    string tail_type);
/// <summary>
/// A unique entity (triplet head or tail) together with the chunks that mention it.
/// </summary>
public class EntityMetadata
{
    /// <summary>Normalized identifier for the entity.</summary>
    public string name { get; set; }

    /// <summary>Entity type label taken from the triplet.</summary>
    public string type { get; set; }

    /// <summary>Generated identifier for the entity.</summary>
    public string id { get; set; }

    /// <summary>Entity text as it appeared in the triplet.</summary>
    public string text { get; set; }

    /// <summary>Chunks that mention this entity, keyed by chunk id.</summary>
    public Dictionary<string, ChunkMetadata> mentionedInChunks { get; set; } = new();
}


In [None]:
// Reads the input file, splits it into token-bounded chunks and asks the chat model to
// extract knowledge triplets from every chunk. The result is `chunks`: chunk -> triplets.
ChatClient chatClient = client.GetChatClient(llm);
string fileName = "input/summaries.txt";
string fileText = File.ReadAllText(fileName);

DocunentMetadata documentMetatdata = new (Guid.NewGuid().ToString("N"), fileName);

var tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");
// NOTE(review): SKEXP0050 is presumably the experimental-API diagnostic raised by TextChunker - confirm.
#pragma warning disable SKEXP0050
var lines = TextChunker.SplitPlainTextLines(fileText, 500, text => tokenizer.CountTokens(text));
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, 500, 100, null, text => tokenizer.CountTokens(text));

string entityTypes = "BLOG_POST,BOOK,MOVIE,PRESENTATION,EVENT,ORGANIZATION,PERSON,PLACE,PRODUCT,REVIEW,ACTION";
string relationTypes = "INTRODUCED,USED_FOR,WRITTEN_IN,PART_OF,LOCATED_IN,GIVEN,LIVES_IN,TRAVELED_TO";

Dictionary<ChunkMetadata, List<TripletRow>> chunks = new Dictionary<ChunkMetadata, List<TripletRow>>();
int maxTripletsPerChunk = 10;
for (int i = 0; i < paragraphs.Count; i++)
{
    string text = paragraphs[i];

    ChunkMetadata chunkMetadata = new (Guid.NewGuid().ToString("N"), $"DocumentChunk{i}", i, documentMetatdata.id, text);

    string prompt =  $@"Please extract up to {maxTripletsPerChunk} knowledge triplets from the provied text.
    Each triplet should be in the form of (head, relation, tail) with their respective types.
    ######################
    ONTOLOGY:
    Entity Types: {entityTypes}
    Relation Types: {relationTypes}
    
    Use these entity types and relation types as a starting point, introduce new types if necessary based on the context.
    
    GUIDELINES:
    - Output in JSON format: [{{""head"": """", ""head_type"": """", ""relation"": """", ""tail"": """", ""tail_type"": """"}}]
    - Use the full form for entities (ie., 'Artificial Intelligence' instead of 'AI')
    - Keep entities and relation names concise (3-5 words max)
    - Break down complex phrases into multiple triplets
    - Ensure the knowledge graph is coherent and easily understandable
    ######################
    EXAMPLE:
    Text: Jason Haley, chief engineer of Jason Haley Consulting, wrote a new blog post titled 'Study Notes: GraphRAG - Property Grids' about creating a property grid RAG system using Semantic Kernel. 
    Output:
    [{{""head"": ""Jason Haley"", ""head_type"": ""PERSON"", ""relation"": ""SOFTWARE_DEVELOPER"", ""tail"": ""Jason Haley Consulting"", ""tail_type"": ""COMPANY""}},
     {{""head"": ""Jason Haley Consulting."", ""head_type"": ""COMPANY"", ""relation"": ""EMPLOYES"", ""tail"": ""Jason Haley"", ""tail_type"": ""PERSON""}},
     {{""head"": ""Study Notes: GraphRAG - Property Grids"", ""head_type"": ""BLOG_POST"", ""relation"": ""WRITTEN_BY"", ""tail"": ""Jason Haley"", ""tail_type"": ""PERSON""}},
     {{""head"": ""property grid RAG system"", ""head_type"": ""SOFTWARE_SYSTEM"", ""relation"": ""USES"", ""tail"": ""Semantic Kernel"", ""tail_type"": ""TECHNOLOGY""}}]
    ######################
    Text: {text}
    ######################
    Output:";

    ChatCompletion completion = chatClient.CompleteChat(
        [
            new UserChatMessage(prompt),
        ]);

    string responseText = completion.Content[0].Text;
    Console.WriteLine($"{completion.Role}: {responseText}");

    // Strip markdown code fences the model may wrap around the JSON. Apostrophes are removed
    // as well because entity text is later embedded in single-quoted Cypher literals.
    string json = responseText.Replace("```json", "").Replace("```","").Replace("'", "").Trim();

    List<TripletRow> rows;
    try
    {
        // Deserialize returns null for a literal JSON `null`; treat that as "no triplets".
        rows = JsonSerializer.Deserialize<List<TripletRow>>(json) ?? new List<TripletRow>();
    }
    catch (JsonException ex)
    {
        // One malformed model response should not discard the chunks already processed.
        Console.WriteLine($"WARNING: could not parse triplets for {chunkMetadata.name}: {ex.Message}");
        rows = new List<TripletRow>();
    }

    // Incomplete triplets cannot become graph nodes or relationships and would throw in later cells.
    rows.RemoveAll(r => r is null
        || string.IsNullOrWhiteSpace(r.head)
        || string.IsNullOrWhiteSpace(r.tail)
        || string.IsNullOrWhiteSpace(r.relation));

    chunks.Add(chunkMetadata, rows);
}

Console.WriteLine($"Number of chunks: {chunks.Count}");

In [81]:
/// <summary>
/// Helpers that turn extracted triplets into entity metadata and graph-safe identifiers.
/// </summary>
public class Utilities
{
    // Anything outside ASCII letters/digits is dropped from a name; cached because CreateName runs per triplet.
    private static readonly Regex s_invalidNameChars = new Regex("[^a-zA-Z0-9]", RegexOptions.Compiled);

    /// <summary>
    /// Fills <paramref name="entityMetadata"/> from the head or the tail of <paramref name="triplet"/>
    /// and records <paramref name="chunkMetadata"/> as a chunk that mentions the entity.
    /// </summary>
    /// <param name="chunkMetadata">Chunk the triplet was extracted from.</param>
    /// <param name="triplet">Source triplet.</param>
    /// <param name="entityMetadata">Instance to populate; a new id is assigned on every call.</param>
    /// <param name="isHead">True to use the triplet head, false to use the tail.</param>
    /// <returns>The same <paramref name="entityMetadata"/> instance, populated.</returns>
    /// <exception cref="ArgumentNullException">Any of the reference arguments is null.</exception>
    public static EntityMetadata PopulateEntityMetadata(ChunkMetadata chunkMetadata, TripletRow triplet, EntityMetadata entityMetadata, bool isHead = true)
    {
        ArgumentNullException.ThrowIfNull(chunkMetadata);
        ArgumentNullException.ThrowIfNull(triplet);
        ArgumentNullException.ThrowIfNull(entityMetadata);

        entityMetadata.id = Guid.NewGuid().ToString("N");

        string entityText = isHead ? triplet.head : triplet.tail;
        entityMetadata.name = CreateName(entityText);
        entityMetadata.type = isHead ? triplet.head_type : triplet.tail_type;
        entityMetadata.text = entityText;

        // Indexer assignment rather than Add: populating the same instance twice for one chunk must not throw.
        entityMetadata.mentionedInChunks[chunkMetadata.id] = chunkMetadata;

        return entityMetadata;
    }

    /// <summary>
    /// Builds a lowercase identifier from free text: splits on space, '-' and '_', removes every
    /// character that is not an ASCII letter or digit, prefixes words that start with a digit
    /// with '_' and concatenates the words.
    /// </summary>
    /// <param name="text">Entity text; null or empty is returned unchanged.</param>
    /// <returns>The identifier, e.g. "GPT-4 Turbo" becomes "gpt_4turbo".</returns>
    public static string CreateName(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        string[] words = text.Split(new[] { ' ', '-', '_' }, StringSplitOptions.RemoveEmptyEntries);

        StringBuilder nameText = new StringBuilder();

        foreach (string word in words)
        {
            // Clean first so the digit check sees the first character that actually survives
            // ("(2024)" must become "_2024", not "2024").
            string cleaned = s_invalidNameChars.Replace(word, "");
            if (cleaned.Length == 0)
            {
                continue;
            }

            if (char.IsDigit(cleaned[0]))
            {
                nameText.Append('_');
            }

            // Invariant casing: culture-sensitive ToLower() maps 'I' to a non-ASCII letter under tr-TR.
            nameText.Append(cleaned.ToLowerInvariant());
        }

        return nameText.ToString();
    }
}

In [None]:
// Collapse triplet heads and tails into unique entities keyed by their normalized name,
// remembering every chunk that mentions each entity.
var entities = new Dictionary<string, EntityMetadata>();

foreach (var (chunk, chunkTriplets) in chunks)
{
    foreach (TripletRow triplet in chunkTriplets)
    {
        // Head first, then tail.
        foreach (bool isHead in new[] { true, false })
        {
            string entityKey = Utilities.CreateName(isHead ? triplet.head : triplet.tail);

            if (entities.TryGetValue(entityKey, out EntityMetadata known))
            {
                // Already seen: just record this chunk if it is not listed yet.
                known.mentionedInChunks.TryAdd(chunk.id, chunk);
            }
            else
            {
                entities.Add(entityKey, Utilities.PopulateEntityMetadata(chunk, triplet, new EntityMetadata(), isHead));
            }
        }
    }
}

Console.WriteLine($"Unique entity count: {entities.Count}");

In [None]:
// Report how many chunks mention each entity.
foreach (var (entityKey, metadata) in entities)
{
    Console.WriteLine($"{entityKey} Mentioned In {metadata.mentionedInChunks.Count} chunks");
}

In [93]:

// Builds the list of Cypher MERGE clauses that create the document, its chunks, the entities
// and every relationship between them. The clauses share variable names (Document1,
// DocumentChunkN, entity names), so they are meant to be joined into a single statement.
List<string> entityCypherText = new List<string>(); // Document, DocumentChunk and Entity

// Escapes a value for a single-quoted Cypher string literal. Backslash goes first so the
// backslashes added for the quotes are not escaped again.
Func<string, string> escapeSingleQuoted = value => (value ?? string.Empty).Replace("\\", "\\\\").Replace("'", "\\'");

entityCypherText.Add($"MERGE (Document1:DOCUMENT {{ id: '{documentMetatdata.id}', name:'Document1', type:'DOCUMENT', source: '{escapeSingleQuoted(documentMetatdata.source)}'}})"); 

foreach (var chunk in chunks.Keys)
{
    // The chunk text goes into a double-quoted literal: escape backslashes, then (as before)
    // swap double quotes for single quotes so the text cannot close the literal.
    string chunkText = (chunk.text ?? string.Empty).Replace("\\", "\\\\").Replace("\"", "'");
    entityCypherText.Add($"MERGE (DocumentChunk{chunk.sequence}:DOCUMENT_CHUNK {{ id: '{chunk.id}', name: '{chunk.name}', type: 'DOCUMENT_CHUNK', documentId: '{chunk.documentId}', sequence: '{chunk.sequence}', text: \"{chunkText}\"}})");
    entityCypherText.Add($"MERGE (Document1)-[:CONTAINS]->(DocumentChunk{chunk.sequence})");
}

// Distinct entity types seen while emitting the entity nodes.
HashSet<string> types = new HashSet<string>();
foreach (var (pcEntity, labels) in entities)
{
    entityCypherText.Add($"MERGE ({pcEntity}:ENTITY {{ name: '{pcEntity}', type: '{escapeSingleQuoted(labels.type)}', id: '{labels.id}', text: '{escapeSingleQuoted(labels.text)}'}})");

    // HashSet.Add already ignores duplicates.
    types.Add(labels.type);

    foreach (var documentChunk in labels.mentionedInChunks.Values)
    {
        entityCypherText.Add($"MERGE ({pcEntity})-[:MENTIONED_IN]->(DocumentChunk{documentChunk.sequence})");
    }
}

// Tracks MENTIONS clauses already emitted so each chunk/entity pair appears once.
HashSet<string> relationships = new HashSet<string>();
foreach (var (key, triplets) in chunks)
{
    foreach (var triplet in triplets)
    {
        var pcHead = Utilities.CreateName(triplet.head);
        var pcTail = Utilities.CreateName(triplet.tail);

        if (!string.IsNullOrWhiteSpace(triplet.relation))
        {
            // Relationship types come from the model: map every character that is not a
            // letter, digit or underscore (space and '-' included) to '_'.
            string relationType = Regex.Replace(triplet.relation, "[^A-Za-z0-9_]", "_");
            entityCypherText.Add($"MERGE ({pcHead})-[:{relationType}]->({pcTail})");
        }

        string headRelationship = $"MERGE (DocumentChunk{key.sequence})-[:MENTIONS]->({pcHead})";
        if (relationships.Add(headRelationship))
        {
            entityCypherText.Add(headRelationship);
        }

        string tailRelationship = $"MERGE (DocumentChunk{key.sequence})-[:MENTIONS]->({pcTail})";
        if (relationships.Add(tailRelationship))
        {
            entityCypherText.Add(tailRelationship);
        }
    }
}

In [None]:
// Print every generated Cypher clause, one per line.
entityCypherText.ForEach(clause => Console.WriteLine(clause));

In [None]:
// Print the distinct entity types. `types` is a HashSet<string>, which has no Keys
// property, so it is enumerated directly.
foreach (var t in types)
{
    Console.WriteLine(t);
}

In [5]:
// Neo4j driver using basic authentication; user, password and URI come from the .env entries.
IAuthToken token = AuthTokens.Basic(envVars["NEO4J_USER"], envVars["NEO4J_PASSWORD"]);
IDriver driver = GraphDatabase.Driver(envVars["NEO4J_URI"], token);

// Default query configuration, passed to every ExecutableQuery call in the cells below.
QueryConfig config = new();


In [None]:
// Number of generated Cypher clauses.
Console.WriteLine(entityCypherText.Count);

In [94]:

// Run all generated clauses as one statement: they share variable names, so they only
// resolve when executed together. The query goes through driver.ExecutableQuery, so no
// explicit session is opened here.
string ingestStatement = string.Join(Environment.NewLine, entityCypherText);
await driver.ExecutableQuery(ingestStatement).WithConfig(config).ExecuteAsync();



In [95]:
// Vector index over DOCUMENT_CHUNK.embedding (1536 dimensions, cosine similarity);
// IF NOT EXISTS makes re-running the cell a no-op.
string createVectorIndex = @"CREATE VECTOR INDEX CHUNK_EMBEDDING IF NOT EXISTS
                            FOR (c:DOCUMENT_CHUNK) ON c.embedding
                            OPTIONS {indexConfig: {
                           `vector.dimensions`: 1536,
                            `vector.similarity_function`: 'cosine'
                            }}";

var vectorIndexQuery = driver.ExecutableQuery(createVectorIndex).WithConfig(config);
await vectorIndexQuery.ExecuteAsync();

In [96]:

// Full-text index over ENTITY.text, used for the keyword lookup during retrieval.
string createFulltextIndex = @"CREATE FULLTEXT INDEX ENTITY_TEXT IF NOT EXISTS 
                                FOR (n:ENTITY) ON EACH [n.text]";

var fulltextIndexQuery = driver.ExecutableQuery(createFulltextIndex).WithConfig(config);
await fulltextIndexQuery.ExecuteAsync();

In [97]:

// For every DOCUMENT_CHUNK that has text, compute an embedding inside the database with
// genai.vector.encode (provider 'AzureOpenAI') and store it in the node's `embedding` property.
// The query has no C# placeholders, so a plain verbatim string is enough; $token, $resource
// and $deployment are Cypher parameters.
string populateEmbeddings = @"
                            MATCH (n:DOCUMENT_CHUNK) WHERE n.text IS NOT NULL
                            WITH n, genai.vector.encode(
                                n.text,
                                'AzureOpenAI',
                                {
                                    token: $token,
                                    resource: $resource,
                                    deployment: $deployment
                                }) AS vector
                            CALL db.create.setNodeVectorProperty(n, 'embedding', vector)
                            ";

var embeddingParameters = new Dictionary<string, object>
{
    ["token"] = envVars["AZURE_OPENAI_API_KEY"],
    ["resource"] = envVars["AZURE_OPENAI_RESOURCE"],
    ["deployment"] = envVars["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"],
};

await driver.ExecutableQuery(populateEmbeddings)
    .WithParameters(embeddingParameters)
    .WithConfig(config)
    .ExecuteAsync();

## Retrieval

In [6]:
// Question used by all retrieval cells below.
// Alternative question to try:
//   "what are the blog post titles that are about Semantic Kernel?"
var questionText = "How many blog post did Jason write about Semantic Kernel and what is their titles?";

In [None]:
// Ask the chat model for synonyms/keywords of the question; they drive the full-text entity lookup.
// Uses the configured chat deployment (`llm`) rather than a hard-coded deployment name,
// consistent with the extraction cell.
ChatClient chatClient = client.GetChatClient(llm);

int maxSynonyms = 10;

string prompt = $@"
Given a user question, generate synonyms or related keywords up to {maxSynonyms} in total, considering possible cases of capitalization, pluralization, and common expressions. Provide all synonyms/keywords separated by '~' symbols in a single line format: 'synonym1~synonyms2~...'.

QUERY: {questionText}
######################
KEYWORDS:
";
ChatCompletion completion = chatClient.CompleteChat(
    [
        new UserChatMessage(prompt),
    ]);

Console.WriteLine($"{completion.Role}: {completion.Content[0].Text}");

In [None]:
// Look up each generated synonym in the ENTITY_TEXT full-text index and collect the distinct
// "head -> relation -> tail" triplets around every matching entity into `uniqueNodes`.

// The model separates keywords with '~'; trim them and drop blanks so no empty search is sent.
var synonyms = completion.Content[0].Text.Split('~', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

// The synonym is model-generated text, so it is passed as a query parameter instead of being
// spliced into the Cypher string (a quote in the text would otherwise break or alter the query).
// NOTE(review): the value is still parsed as a Lucene query by the index; characters that are
// special to Lucene may need escaping - confirm against real model output.
string entitySearchCypher = @"
                        CALL db.index.fulltext.queryNodes('ENTITY_TEXT', $synonym)
                        YIELD node AS e1
                        MATCH (e1)-[r]-(e2:ENTITY)
                        RETURN e1.id, e1.type, e1.text, e2.name, e2.type, e2.text, type(r)
                    ";

var uniqueNodes = new Dictionary<string, string>();
foreach (var synonym in synonyms)
{
    Console.WriteLine(synonym);

    var textSearchResult = await driver.ExecutableQuery(entitySearchCypher)
                    .WithParameters(new Dictionary<string, object> { ["synonym"] = synonym })
                    .WithConfig(config)
                    .ExecuteAsync();

    foreach (var r in textSearchResult.Result)
    {
        var tripletText = $"{r["e1.text"]} -> {r["type(r)"]} -> {r["e2.text"]}";
        // Keyed by the triplet text itself, so duplicates are skipped.
        uniqueNodes.TryAdd(tripletText, tripletText);
    }
}

Console.WriteLine("");
Console.WriteLine($"{uniqueNodes.Count} Unique nodes with matches:");
foreach (var key in uniqueNodes.Keys)
{
    Console.WriteLine($"{key}");
}

In [13]:
// Embed the question inside the database with genai.vector.encode and fetch the $top_k
// nearest chunks from the CHUNK_EMBEDDING vector index. The query has no C# placeholders,
// so a plain verbatim string is used; the $names are Cypher parameters.
string question = @"
                    WITH genai.vector.encode(
                        $question,
                        'AzureOpenAI',
                        {
                            token: $token,
                            resource: $resource,
                            deployment: $deployment
                        }) AS question_embedding
                    CALL db.index.vector.queryNodes(
                        'CHUNK_EMBEDDING',
                        $top_k, 
                        question_embedding
                        ) YIELD node AS chunk, score 
                    RETURN chunk.id, chunk.text, score
                    ";

var vectorSearchParameters = new Dictionary<string, object>
{
    ["question"] = questionText,
    ["token"] = envVars["AZURE_OPENAI_API_KEY"],
    ["resource"] = envVars["AZURE_OPENAI_RESOURCE"],
    ["deployment"] = envVars["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"],
    ["top_k"] = 5,
};

var chunkResult = await driver.ExecutableQuery(question)
                .WithParameters(vectorSearchParameters)
                .WithConfig(config)
                .ExecuteAsync();

In [None]:
// Dump the raw vector-search result as indented JSON for inspection.
var indentedJson = new JsonSerializerOptions { WriteIndented = true };
Console.WriteLine(JsonSerializer.Serialize(chunkResult, indentedJson));

// Collect the text of every returned chunk, one "Document: { text: ... }" line per chunk.
StringBuilder chunkTexts = new();
foreach (var record in chunkResult.Result)
{
    chunkTexts
        .Append("Document: { text: ")
        .Append(record["chunk.text"].ToString())
        .Append(" }")
        .AppendLine();
}

Console.WriteLine(chunkTexts.ToString());

In [None]:
// Ask the question with no retrieved context at all, as a comparison point for the
// context-grounded answer in the next cell.
// Uses the configured chat deployment (`llm`) rather than a hard-coded deployment name.
ChatClient chatClient = client.GetChatClient(llm);

string prompt = $@"Question: {questionText}
                ######################
                Answer:";



string sysprompt = @"Be brief in your answers.
                    Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.
                    For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question.";

ChatCompletion completion = chatClient.CompleteChat(
    [
        new SystemChatMessage(sysprompt),
        new UserChatMessage(prompt),
    ]);

Console.WriteLine($"{completion.Role}: {completion.Content[0].Text}");

In [None]:
// Answer the question from the retrieved context: graph triplets ("Structured data")
// plus the nearest text chunks ("Unstructured data").
// Uses the configured chat deployment (`llm`) rather than a hard-coded deployment name.
ChatClient chatClient = client.GetChatClient(llm);

string context = $@"Structured data:
    {string.Join(Environment.NewLine, uniqueNodes.Keys)}
Unstructured data:
{chunkTexts}
";

string prompt = $@"Answer the question based only on the following context:
			    {context}
                ######################
                Question: {questionText}
                ######################
                Answer:";


string sysprompt = @"Be brief in your answers.
                    Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.
                    For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question.";

ChatCompletion completion = chatClient.CompleteChat(
    [
        new SystemChatMessage(sysprompt),
        new UserChatMessage(prompt),
    ]);

Console.WriteLine($"{completion.Role}: {completion.Content[0].Text}");