# Clean data

In this notebook, we are cleaning the data and converting it into AlkgieV1 Entities.

This step is important for generating ids and for further processing the data into a coherent set of domain-specific entities that the Alkgie project can use.

## Flatten the data

In [19]:
#r "nuget:FSharp.Data"

open FSharp.Data
// Provided type whose shape is inferred by FSharp.Data from the sample JSON file at this path.
type ScrappedData = JsonProvider<"../data/scrapped/scrapped-dataset.json">
// Load the same file as runtime data; the result is enumerated below as a collection of
// datasets, each exposing a `Filename` and a `Data` collection of scraped items.
let datasets = ScrappedData.Load("../data/scrapped/scrapped-dataset.json")

/// One scraped item paired with the file name of the dataset it was found in.
type FlattenedData =
    {
        /// Link of the item, when the scraped data has one.
        Link: string option
        /// Name of the item, when the scraped data has one.
        Name: string option
        /// Description text of the item.
        Description: string
        /// Header strings recorded for the item in the scraped data.
        Headers: string array
        /// File name of the dataset the item came from.
        AwesomeList: string
    }

// Flatten every dataset's items into a single list, tagging each item with the
// file name of the dataset it belongs to.
let flattenedData =
    [ for dataset in datasets do
        for item in dataset.Data do
            yield
                { Link = item.Link
                  Name = item.Name
                  Description = item.Description
                  Headers = item.Headers
                  AwesomeList = dataset.Filename } ]

// Bare expression so the notebook cell displays the flattened list.
flattenedData


## Transform flattened data into AlkgieV1 entities.

AlkgieV1 entities is the name I'm giving to the data format / schema that this project produces. It's not a final format, hence the V1.

Borrowing inspiration from thematic analysis, these entities define their names as "codes", as in an identified code in thematic analysis.

Themes are groupings of codes that form a cohesive whole.

In most instances I expect codes to be specific software products such as programming languages (like F# or JavaScript), whereas I expect themes to be more related to concepts (such as the concept of programming languages itself).

We are also assigning a unique Id at this stage to make it easier to graph things. These Ids are not stable between versions of the dataset. Hopefully a future version of this project will find a way to keep ids stable between versions.

So essentially this is the Relationship Classification (RC) stage of this data analysis project.

In [20]:
/// Classification of an entity, borrowing the "theme" / "code" vocabulary of thematic analysis.
type AlkgieV1EntityTypes =
    /// A grouping of codes; assigned when the entity's name appears among the headers of its source.
    | Theme
    /// An individual identified item; the default when the entity is not recognised as a theme.
    | Code

/// What a single source says about an entity.
type AlkgieV1SourceEntry =
    {
        /// Identifier generated for this source entry.
        Id: Guid
        /// Name of the source (the dataset file name) that the entry was taken from.
        SourceName: string
        /// Name the source uses for the entity, if it provides one.
        EntityNameAlias: string option
        /// Description text given by the source.
        Description: string
        /// Link given by the source, if any.
        Url: string option
        /// Ids of the entities this entry is related to.
        Relations: Guid array
        // TODO: may want to add metadata like date project was created, to compare timelines
    }

/// An entity of the AlkgieV1 schema: a named theme or code plus the source entries describing it.
type AlkgieV1Entity =
    {
        /// Identifier generated for this entity.
        Id: Guid
        /// Name that is prioritized when displaying the entity.
        /// The source entries may additionally carry aliases.
        DisplayName: string
        /// Data for the entity, aggregated from different sources.
        SourceEntries: AlkgieV1SourceEntry array
        // TODO: If I allow source entries to have entitytype, then it might make it easier
        //       to distinguish between when talking about something as a concept vs something as a code.
        //       YAGNI for now, but it might be useful in the future. Might write up a card on the github.
        /// Whether the entity is treated as a theme or as a code.
        EntityType: AlkgieV1EntityTypes
    }

// Convert the flattened items into source entries and generate one id per entry.
//
// The result is materialised into a list on purpose: the mapping calls Guid.NewGuid(),
// so leaving it as a lazy sequence would hand out a *different* set of ids every time
// the sequence is enumerated (e.g. when displayed in the notebook and again when the
// entities are built from it).
let sourceEntries =
    flattenedData
    |> Seq.map (fun item -> 
        {
            Id = Guid.NewGuid()
            SourceName = item.AwesomeList
            EntityNameAlias = item.Name
            Description = item.Description
            Url = item.Link
            // Relations are filled in later, once every entity has an id.
            Relations = [||]
        }
    )
    |> Seq.toList

// Classify a named entry of a given source.
// An entry is a Theme when its name is used as a header by at least one item of the
// same source; entries without a name, and all other entries, are Codes.
let getEntityType source name =
    // True when some item of `source` lists `header` among its headers.
    let isHeaderInSource (header: string) =
        flattenedData
        |> Seq.exists (fun item -> item.AwesomeList = source && Seq.contains header item.Headers)

    match name with
    | Some actualName when isHeaderInSource actualName -> Theme
    | _ -> Code

// Build one entity per source entry and generate its id.
// Relations are not resolved here; they need the complete list of entity ids first.
let entitiesWithoutRelationships =
    [ for entry in sourceEntries ->
        { Id = Guid.NewGuid()
          // Entries without a name get a visible placeholder.
          DisplayName = defaultArg entry.EntityNameAlias "MISSING DISPLAY NAME"
          SourceEntries = [| entry |]
          EntityType = getEntityType entry.SourceName entry.EntityNameAlias } ]

// Find the entities related to a (sourceName, entityNameAlias) pair.
//
// The related entities are those that come from the same source and whose display name
// is one of the headers recorded for the items matching the pair.
// Returns a (lazy) sequence of entities taken from `entitiesWithoutRelationships`.
let getRelations sourceName entityNameAlias =
    // Collect the headers once into a set. As a lazy sequence they would be recomputed
    // (rescanning all of flattenedData) for every candidate entity checked below.
    let headers =
        flattenedData
        |> Seq.filter (fun item -> item.AwesomeList = sourceName && item.Name = entityNameAlias)
        |> Seq.collect (fun item -> item.Headers)
        |> Set.ofSeq

    entitiesWithoutRelationships
    |> Seq.filter (fun entity ->
        Set.contains entity.DisplayName headers
        && entity.SourceEntries |> Seq.exists (fun entry -> entry.SourceName = sourceName))

// Ids of the entities related to a (sourceName, entityNameAlias) pair, as an array.
let getRelationIds sourceName entityNameAlias =
    [| for related in getRelations sourceName entityNameAlias -> related.Id |]

// Complete entities: identical to entitiesWithoutRelationships except that every
// source entry has its Relations resolved to the ids of its related entities.
let entities =
    // Copy of a source entry with its relation ids looked up.
    let withRelations (entry: AlkgieV1SourceEntry) =
        { entry with Relations = getRelationIds entry.SourceName entry.EntityNameAlias }

    entitiesWithoutRelationships
    |> Seq.map (fun entity ->
        { entity with SourceEntries = Array.map withRelations entity.SourceEntries })

## Save cleaned data

Saving the results of data cleaning

In [21]:

// Temporary workaround: the JSON serializer used below cannot serialize
// discriminated unions, so the entity type is written out as its case name.
let entityTypeToString (entityType: AlkgieV1EntityTypes) : string =
    match entityType with
    | Code -> "Code"
    | Theme -> "Theme"

// Serializable view of the entities: same fields, with EntityType replaced by its string name.
let temp =
    seq {
        for entity in entities ->
            {| entity with EntityType = entityTypeToString entity.EntityType |}
    }


// Actual saving
open System.IO
open System.Text.Json

// Destination of the cleaned dataset.
let filePath = "../data/cleaned/cleaned-dataset.json"
// Serialize the entities as indented JSON.
let json = JsonSerializer.Serialize(temp, JsonSerializerOptions(WriteIndented = true))

// Make sure the target directory exists before writing.
// Directory.CreateDirectory does nothing when the directory is already there, so no
// separate Directory.Exists check is needed (and the check-then-create race is avoided).
let directoryPath = Path.GetDirectoryName(filePath)
Directory.CreateDirectory(directoryPath) |> ignore

File.WriteAllText(filePath, json)