# Clean data

In this notebook, we clean the scraped data and convert it into AlkgieV1 entities.

This step generates ids and processes the data further into a coherent set of domain-specific entities that the Alkgie project can use.

## Flatten the data

In [1]:
#r "nuget:FSharp.Data"

open FSharp.Data
/// Path of the scraped dataset file. It is used twice: as the schema sample
/// for the JSON type provider and as the file that is loaded.
[<Literal>]
let ScrappedDatasetPath = "../data/scrapped/scrapped-dataset.json"

/// JSON type provider whose shape is inferred from the scraped dataset file.
type ScrappedData = JsonProvider<ScrappedDatasetPath>

/// The scraped dataset, loaded from disk.
let datasets = ScrappedData.Load(ScrappedDatasetPath)

/// One scraped item together with the file name of the dataset it came from.
type FlattenedData =
    { /// Link of the item, if the scraped item has one.
      Link: string option
      /// Name of the item, if the scraped item has one.
      Name: string option
      /// Description of the item.
      Description: string
      /// Headers recorded for the item in the scraped data.
      Headers: string array
      /// File name of the dataset the item was taken from.
      AwesomeList: string }

// Flatten the nested datasets into one list, tagging every item with the
// file name of the dataset it belongs to.
let flattenedData =
    [ for dataset in datasets do
        for item in dataset.Data do
            yield { Link = item.Link
                    Name = item.Name
                    Description = item.Description
                    Headers = item.Headers
                    AwesomeList = dataset.Filename } ]

open System.IO
open System.Text.Json
// Serialize the flattened list to a JSON string so it can be shared with another kernel.
let flattenedDataAsJson = JsonSerializer.Serialize<FlattenedData list>(flattenedData)


In [2]:
#!set --value @fsharp:flattenedDataAsJson --name flattenedData

#r "nuget: Newtonsoft.Json, 13.0.3"
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

// Project every flattened record into a source entry with its own id.
// LINQ queries are deferred and Guid.NewGuid() runs on every enumeration, so the
// result is materialized once here; otherwise each later pass over sourceEntries
// would see a different set of ids.
var sourceEntries =
    (from d in JArray.Parse(flattenedData)
     select new {
         Id = Guid.NewGuid(),
         SourceName = (string)d["AwesomeList"],
         EntityNameAlias = (string)d["Name"],
         Description = (string)d["Description"],
         Url = (string)d["Link"],
         // Header names as strings
         Relations = d["Headers"].Values<string>()
     }).ToList();


// Generate an entity for each source entry, giving each entry an id
// (deduplication occurs in the step 4 notebook).
// The query is materialized with ToList(): it is consumed several times further
// down, and a deferred query would call Guid.NewGuid() again on every pass, so
// the ids referenced by relationships would never match the ids of the emitted
// entities.
var partialEntities =
    (from se in sourceEntries
     select new {
         Id = Guid.NewGuid(),
         DisplayName = se.EntityNameAlias,
         SourceEntries = new[] { se },
         // Placeholder value
         EntityType = "TODO",
     }).ToList();

// Calculate the relationships between entities: for every relation name listed on
// a source entry, find the entities whose display name equals that name and record
// the matched entity's id, keyed by the entry's source and the matched name.
// Materialized with ToList() so the join runs once instead of being re-executed
// by every query that reads relationships.
var relationships =
    (from pe in partialEntities
     from se in pe.SourceEntries
     from r in se.Relations
     join p in partialEntities on r equals p.DisplayName
     select new {
         Key = new {
             SourceName = se.SourceName,
             RelationshipName = p.DisplayName
         },
         RelationshipId = p.Id,
     }).ToList();

// Index the relationship ids by their (SourceName, RelationshipName) key once, so
// each source entry does a keyed lookup instead of re-scanning every relationship.
// Anonymous types compare by value, so a freshly built key matches the stored one.
var relationshipIdsByKey = relationships.ToLookup(rel => rel.Key, rel => rel.RelationshipId);

// A `with` expression on an anonymous type cannot change a property's type, and
// Relations changes here from names to ids, so every object is rebuilt in full.
// NOTE(review): the lookup key uses the entry's own EntityNameAlias, i.e. it selects
// relationships whose target name equals this entry's name rather than the names in
// the entry's original Relations list - confirm this is the intended matching.
var entitiesWithCompleteRelationships =
    (from pe in partialEntities
     select new {
         Id = pe.Id,
         DisplayName = pe.DisplayName,
         EntityType = pe.EntityType,
         SourceEntries =
             (from se in pe.SourceEntries
              select new {
                  Id = se.Id,
                  SourceName = se.SourceName,
                  EntityNameAlias = se.EntityNameAlias,
                  Description = se.Description,
                  Url = se.Url,
                  Relations =
                      relationshipIdsByKey[new {
                          SourceName = se.SourceName,
                          RelationshipName = se.EntityNameAlias
                      }].ToList()
              }).ToList()
     }).ToList();

// Ids of every entity that is the target of at least one relationship, collected
// once so the per-entity check is a set lookup rather than a scan of all relationships.
var relatedEntityIds = relationships.Select(rel => rel.RelationshipId).ToHashSet();

// Calculate the entity types from the relationships: an entity that at least one
// relationship points to is a "Theme", every other entity is "Code".
var entitiesWithEntityTypes =
    from e in entitiesWithCompleteRelationships
    select e with {
        EntityType = relatedEntityIds.Contains(e.Id) ? "Theme" : "Code"
    };


// Materialize the typed entities, then render them as indented JSON.
var finalEntities = entitiesWithEntityTypes.ToList();
var entities = JsonConvert.SerializeObject(finalEntities, Formatting.Indented);

## Save cleaned data

Save the results of the data cleaning step to disk.

In [3]:
#!set --value @csharp:entities --name entities

// Actual saving
open System.IO

// Destination file for the cleaned dataset.
let filePath = "../data/cleaned/cleaned-dataset.json"

// Directory.CreateDirectory does nothing when the directory already exists, so no
// separate existence check is needed before writing.
let directoryPath = Path.GetDirectoryName(filePath)
Directory.CreateDirectory(directoryPath) |> ignore

// Write the serialized entities, replacing any previous file at this path.
File.WriteAllText(filePath, entities)