# Clean data

In this notebook, we are cleaning the data and converting it into AlkgieV1 Entities.

This step is important in generating ids and further processing the data into a coherent set of domain specific entities that the Alkgie project can use.

## Flatten the data

In [None]:
#r "nuget:FSharp.Data"

open FSharp.Data
type ScrappedData = JsonProvider<"../data/scrapped/scrapped-dataset.json">
let datasets = ScrappedData.Load("../data/scrapped/scrapped-dataset.json")

type FlattenedData = {
    Link: Option<string>
    Name: Option<string>
    Description: string
    Headers: string[]
    AwesomeList: string
}

let flattenedData = 
    datasets
    |> Seq.collect (fun dataset ->
        dataset.Data
        |> Seq.map (fun item -> 
            {
                Link = item.Link
                Name = item.Name
                Description = item.Description
                Headers = item.Headers
                AwesomeList = dataset.Filename
            }
        )
    )
    |> Seq.toList

open System.IO
open System.Text.Json
let flattenedDataAsJson = JsonSerializer.Serialize(flattenedData);


In [6]:
#!set --value @fsharp:flattenedDataAsJson --name flattenedData

#r "nuget: Newtonsoft.Json, 13.0.3"
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

var sourceEntries =
    (from d in JArray.Parse(flattenedData)
    select new {
        Id = Guid.NewGuid(),
        SourceName = (string)d["AwesomeList"],
        EntityNameAlias = (string)d["Name"],
        Description = (string)d["Description"],
        Url = (string)d["Link"],
        Relations = d["Headers"].Values<string>()
    }).ToList();

display("Source Entries");
display(sourceEntries.Where(se => se.EntityNameAlias == "Graph Databases"));

// Generate source entries for Headers
var headerSourceEntries =
    sourceEntries
    .Select(se => new {se.SourceName, se.Relations})
    .SelectMany(se => se.Relations, (se, relation) => new {se.SourceName, Relation = relation, Parents = se.Relations.Skip(se.Relations.ToList().FindIndex(r => r == relation))})
    .DistinctBy(x => new {x.SourceName, x.Relation})
    .Select(se => new {
        Id = Guid.NewGuid(),
        SourceName = se.SourceName,
        EntityNameAlias = se.Relation,
        Description = "",
        Url = "",
        Relations = se.Parents
    }).ToList();

display("Header Source Entries");
display(headerSourceEntries.Where(se => se.EntityNameAlias == "Graph Databases"));
display(headerSourceEntries.Where(se => se.EntityNameAlias == "Infrastructure"));

// Combine the source entries and header source entries
var completeSourceEntries =
    from se in sourceEntries
    join hse in headerSourceEntries on new {se.EntityNameAlias, se.SourceName} equals new {hse.EntityNameAlias, hse.SourceName} into hsesGroup
    from hses in hsesGroup.DefaultIfEmpty() 
    select new {
        Id = hses?.Id ?? se.Id,
        SourceName = hses.SourceName,
        EntityNameAlias = se.EntityNameAlias,
        Description = se.Description,
        Url = se.Url,
        Relations = (hsesGroup.SelectMany(h => h.Relations).Concat(se.Relations)).Distinct()
    };
display("Complete Source Entries");
display(completeSourceEntries.Where(se => se.EntityNameAlias == "Graph Databases"));
display(completeSourceEntries.Where(se => se.EntityNameAlias == "Infrastructure"));



// Generate an entity for each source entry, giving each entry an id
// (deduplication occurs in the step 4 notebook)
var partialEntities =
    (from se in completeSourceEntries
    select new {
        Id = Guid.NewGuid(),
        DisplayName = se.EntityNameAlias,
        SourceEntries = new[] { se },
        EntityType = "TODO",
    } into partialEntity
    select partialEntity).ToList();

display("Partial Entities");
display(partialEntities.Where(pe => pe.DisplayName == "ArangoDB"));
display(partialEntities.Where(pe => pe.DisplayName == "Graph Databases"));
display(partialEntities.Where(pe => pe.DisplayName == "Infrastructure"));


// Calculate the relationships between entities
Guid? GetEntityIdForRelationship(string? sourceName, string? relationshipName) {
    var entitiesFromSource = partialEntities.Where(x => x.SourceEntries.Any(se => se.SourceName == sourceName));
    var entitiesWithName = entitiesFromSource.Where(x => x.DisplayName == relationshipName);
    return entitiesWithName.FirstOrDefault()?.Id;
}

display("GetEntityIdForRelationship");
display(GetEntityIdForRelationship("awesome-knowledge-graph.md", "Graph Databases"));

// Map new things
var withRelationships =
    partialEntities
    .Select(pe => new {
        Id = pe.Id,
        DisplayName = pe.DisplayName,
        EntityType = pe.EntityType,
        SourceEntries = pe.SourceEntries.Select(se => new {
            Id = se.Id,
            SourceName = se.SourceName,
            EntityNameAlias = se.EntityNameAlias,
            Description = se.Description,
            Url = se.Url,
            Relations = se.Relations.Select(r => GetEntityIdForRelationship(se.SourceName, r))
        })
    });

display("withRelationship");
display(withRelationships.Where(pe => pe.DisplayName == "ArangoDB"));


//display(relationships.Where(r => r.Key.RelationshipName == "Graph Databases"));




/*var relationships =
    partialEntities

    .SelectMany(pe => pe.SourceEntries)
    .SelectMany(se => se.Relations.Select(r => new {Key = new { SourceName = se.SourceName, RelationshipName = r } }))
    .Distinct()
    .Select(x => new { Key = x.Key, EntityId = x.Key.SourceName is not null ? GetEntityForRelationship(x.Key.SourceName, x.Key.RelationshipName) : null});*/

//display(relationships.Where(r => r.Key.RelationshipName == "Graph Databases"));


/*
display(relationships.Where(r => r.Key.RelationshipName == "Graph Databases"));


//display(GetEntityForRelationship("awesome-fsharp.md", "Table of Contents"));

/*
var relationships =
    partialEntities
        .SelectMany(pe => pe.SourceEntries)
        .Select(se => se.Relations.Select(r => new { SourceName = se.SourceName, RelationshipName = r }))
        .SelectMany(rel => rel)
        .Distinct()
        .Select(rel => new { Key = rel, RelationshipId = partialEntities.FirstOrDefault(pe => pe.DisplayName == rel.RelationshipName && pe.SourceEntries.Select(se => se.SourceName).Contains(rel.SourceName))?.Id })
        .ToList();
        //.Join(partialEntities, rel => rel.RelationshipName, p => p.DisplayName, (rel, p) => new { Key = rel, RelationshipId = p.Id });
        */
/*
display(relationships.Where(r => r.Key.RelationshipName == "Graph databases"));
*/
/*
// Today I learned that you can't override types using the "with" syntax, and you have
// to preserve the original type. Therefore I need to specify the creation of an entirely new anon object
// fully to correctly implement
var entitiesWithCompleteRelationships =
    from pe in partialEntities
    select new {
        Id = pe.Id,
        DisplayName = pe.DisplayName,
        EntityType = pe.EntityType,
        SourceEntries =
            from se in pe.SourceEntries
            select new {
                Id = se.Id,
                SourceName = se.SourceName,
                EntityNameAlias = se.EntityNameAlias,
                Description = se.Description,
                Url = se.Url,
                Relations = 
                    (from rel in relationships
                    where rel.Key.SourceName == se.SourceName && rel.Key.RelationshipName == se.EntityNameAlias
                    select rel.EntityId).Distinct()
            }
    };

// Remove reflexive relationships
//var entitiesNoReflexiveRelationships =
//    entitiesWithCompleteRelationships.Select(e => e with {SourceEntries = e.SourceEntries.Select(se => se with {Relations = se.Relations.Where(r => r != e.Id)})});

// Calculate the entity types using the relationship count
var entitiesWithEntityTypes =
    from e in entitiesWithCompleteRelationships
    select e with {
        EntityType = relationships.Any(x => x.EntityId == e.Id) ? "Theme" : "Code"
    };


var entities = JsonConvert.SerializeObject(entitiesWithEntityTypes.ToList(), Formatting.Indented);
*/

Source Entries

index,value
,
0,"{ Id = 4ea73aef-4499-4556-aa36-a63fa392827a, SourceName = awesome-knowledge-graph.md, EntityNameAlias = Graph Databases, Description = Graph Databases, Url = #graph-databases, Relations = Newtonsoft.Json.Linq.Extensions+<Convert>d__14`2[Newtonsoft.Json.Linq.JToken,System.String] }Id4ea73aef-4499-4556-aa36-a63fa392827aSourceNameawesome-knowledge-graph.mdEntityNameAliasGraph DatabasesDescriptionGraph DatabasesUrl#graph-databasesRelations[ Contents, Awesome Knowledge Graph ]"
,
Id,4ea73aef-4499-4556-aa36-a63fa392827a
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Graph Databases
Description,Graph Databases
Url,#graph-databases
Relations,"[ Contents, Awesome Knowledge Graph ]"

Unnamed: 0,Unnamed: 1
Id,4ea73aef-4499-4556-aa36-a63fa392827a
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Graph Databases
Description,Graph Databases
Url,#graph-databases
Relations,"[ Contents, Awesome Knowledge Graph ]"


Header Source Entries

index,value
,
0,"{ Id = e0501a17-e196-489e-9a07-43f8c367c078, SourceName = awesome-knowledge-graph.md, EntityNameAlias = Graph Databases, Description = , Url = , Relations = System.Linq.Enumerable+EnumerablePartition`1[System.String] }Ide0501a17-e196-489e-9a07-43f8c367c078SourceNameawesome-knowledge-graph.mdEntityNameAliasGraph DatabasesDescriptionUrlRelations[ Graph Databases, Infrastructure, Awesome Knowledge Graph ]"
,
Id,e0501a17-e196-489e-9a07-43f8c367c078
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Graph Databases
Description,
Url,
Relations,"[ Graph Databases, Infrastructure, Awesome Knowledge Graph ]"

Unnamed: 0,Unnamed: 1
Id,e0501a17-e196-489e-9a07-43f8c367c078
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Graph Databases
Description,
Url,
Relations,"[ Graph Databases, Infrastructure, Awesome Knowledge Graph ]"


index,value
,
0,"{ Id = febb655e-2dce-43f9-8c15-30b84223e291, SourceName = awesome-knowledge-graph.md, EntityNameAlias = Infrastructure, Description = , Url = , Relations = System.Linq.Enumerable+EnumerablePartition`1[System.String] }Idfebb655e-2dce-43f9-8c15-30b84223e291SourceNameawesome-knowledge-graph.mdEntityNameAliasInfrastructureDescriptionUrlRelations[ Infrastructure, Awesome Knowledge Graph ]"
,
Id,febb655e-2dce-43f9-8c15-30b84223e291
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Infrastructure
Description,
Url,
Relations,"[ Infrastructure, Awesome Knowledge Graph ]"

Unnamed: 0,Unnamed: 1
Id,febb655e-2dce-43f9-8c15-30b84223e291
SourceName,awesome-knowledge-graph.md
EntityNameAlias,Infrastructure
Description,
Url,
Relations,"[ Infrastructure, Awesome Knowledge Graph ]"


Complete Source Entries

Error: System.NullReferenceException: Object reference not set to an instance of an object.
   at Submission#9.<>c.<<Initialize>>b__0_13(<>f__AnonymousType0#6`2 <>h__TransparentIdentifier0, <>f__AnonymousType0#2`6 hses)
   at System.Linq.Enumerable.SelectManyIterator[TSource,TCollection,TResult](IEnumerable`1 source, Func`2 collectionSelector, Func`3 resultSelector)+MoveNext()
   at System.Linq.Enumerable.WhereEnumerableIterator`1.MoveNext()
   at System.Linq.Enumerable.SelectIterator[TSource,TResult](IEnumerable`1 source, Func`3 selector)+MoveNext()
   at Microsoft.DotNet.Interactive.Formatting.EnumerableExtensions.TakeAndCountRemaining[T](IEnumerable`1 source, Int32 count, Boolean forceCountRemainder) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\EnumerableExtensions.cs:line 23
   at Microsoft.DotNet.Interactive.Formatting.HtmlFormatter`1.<CreateTreeViewFormatterForAnyEnumerable>g__BuildTable|7_4(T source, FormatContext context, Func`2 getKeys, Func`2 getValues, Boolean summarize) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\HtmlFormatter{T}.cs:line 106
   at Microsoft.DotNet.Interactive.Formatting.HtmlFormatter`1.<>c__DisplayClass7_0.<CreateTreeViewFormatterForAnyEnumerable>b__3(T value, FormatContext context) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\HtmlFormatter{T}.cs:line 85
   at Microsoft.DotNet.Interactive.Formatting.HtmlFormatter`1.Format(T value, FormatContext context) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\HtmlFormatter{T}.cs:line 54
   at Microsoft.DotNet.Interactive.Formatting.HtmlFormatter.<>c.<.cctor>b__0_14(Object value, FormatContext context) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\HtmlFormatter.cs:line 225
   at Microsoft.DotNet.Interactive.Formatting.HtmlFormatter`1.Format(T value, FormatContext context) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\HtmlFormatter{T}.cs:line 54
   at Microsoft.DotNet.Interactive.Formatting.Formatter`1.FormatTo(T obj, FormatContext context, String mimeType) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\Formatter{T}.cs:line 68
   at Microsoft.DotNet.Interactive.Formatting.Formatter.ToDisplayString(Object obj, String mimeType) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive.Formatting\Formatter.cs:line 277
   at Microsoft.DotNet.Interactive.FormattedValue.<>c__DisplayClass12_0.<CreateManyFromObject>b__0(String mimeType) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive\FormattedValue.cs:line 49
   at System.Linq.Enumerable.SelectArrayIterator`2.Fill(ReadOnlySpan`1 source, Span`1 destination, Func`2 func)
   at System.Linq.Enumerable.SelectArrayIterator`2.ToArray()
   at Microsoft.DotNet.Interactive.FormattedValue.CreateManyFromObject(Object value, String[] mimeTypes) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive\FormattedValue.cs:line 47
   at Microsoft.DotNet.Interactive.KernelInvocationContextExtensions.Display(KernelInvocationContext context, Object value, String[] mimeTypes) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive\KernelInvocationContextExtensions.cs:line 22
   at System.DisplayExtensions.Display(Object value, String[] mimeTypes) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive\DisplayExtensions.cs:line 23
   at Microsoft.DotNet.Interactive.Kernel.display(Object value, String[] mimeTypes) in D:\a\_work\1\s\src\Microsoft.DotNet.Interactive\Kernel.Static.cs:line 30
   at Submission#9.<<Initialize>>d__0.MoveNext()
--- End of stack trace from previous location ---
   at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)

## Save cleaned data

Saving the results of data cleaning

In [7]:
#!set --value @csharp:entities --name entities

// Actual saving
open System.IO

let filePath = "../data/cleaned/cleaned-dataset.json"

let directoryPath = Path.GetDirectoryName(filePath)
if not <| Directory.Exists(directoryPath) then
    Directory.CreateDirectory(directoryPath) |> ignore

File.WriteAllText(filePath, entities)

Error: Value 'entities' not found in kernel csharp

Error: Value 'entities' not found in kernel csharp