# Dedupe Data

In this notebook we will dedupe entities / perform entity resolution.

## What is entity resolution?

Entity Resolution is a technique to identify data records in a single data source or across multiple data sources that refer to the same real-world entity and to link the records together.

https://towardsdatascience.com/an-introduction-to-entity-resolution-needs-and-challenges-97fba052dde5


## Define types

This section defines the models used in the cleaned dataset. While the data provider does a good job of extracting the schema, the data is easier to work with when we define the types explicitly.

In [25]:
/// The kind of an entity: either a theme or a code.
type AlkgieV1EntityTypes = Theme | Code

/// One description of an entity as reported by a single source.
type AlkgieV1SourceEntry =
    {
        Id: Guid
        SourceName: string
        /// The name this source uses for the entity, when it supplies one.
        EntityNameAlias: string option
        Description: string
        Url: string option
        /// Ids of related items — presumably entity ids; TODO confirm against the cleaned data.
        Relations: Guid array
    }

/// An entity together with every source entry that describes it.
type AlkgieV1Entity =
    { Id: Guid
      DisplayName: string
      SourceEntries: AlkgieV1SourceEntry array
      EntityType: AlkgieV1EntityTypes }

    /// The relation ids of all source entries, concatenated lazily in
    /// source-entry order (repeated ids are kept).
    member entity.Relations =
        entity.SourceEntries |> Seq.collect (fun entry -> entry.Relations)


## Load the data into memory

First step is to take the cleaned data from step 3 and load it into the notebook.

In [26]:
#r "nuget:FSharp.Data"
open FSharp.Data
// Provided type: FSharp.Data's JsonProvider infers the schema from this sample file.
type CleanedData = JsonProvider<"../data/cleaned/cleaned-dataset.json">

/// Converts the entity-type string from the cleaned dataset into its union case.
/// Raises (via `failwithf`) when the string is neither "Theme" nor "Code".
let mapEntityType entityType =
    match entityType with
    | "Theme" -> Theme
    | "Code" -> Code
    // Report the offending value so the bad record can be found in the data file.
    | unknown -> failwithf "Unknown entity type: %s" unknown

/// Copies a provider-typed source entry, field for field, into an
/// `AlkgieV1SourceEntry` record.
let mapAlkgieV1SourceEntry (sourceEntry: CleanedData.SourceEntry) : AlkgieV1SourceEntry =
    { Id = sourceEntry.Id
      SourceName = sourceEntry.SourceName
      EntityNameAlias = sourceEntry.EntityNameAlias
      Description = sourceEntry.Description
      Url = sourceEntry.Url
      Relations = sourceEntry.Relations }

/// Converts a provider-typed root element into an `AlkgieV1Entity`, mapping
/// each of its source entries and parsing its entity-type string.
let mapAlkgieV1Entity (entity: CleanedData.Root) : AlkgieV1Entity =
    let sourceEntries =
        [| for sourceEntry in entity.SourceEntries -> mapAlkgieV1SourceEntry sourceEntry |]

    { Id = entity.Id
      DisplayName = entity.DisplayName
      SourceEntries = sourceEntries
      EntityType = mapEntityType entity.EntityType }

/// The cleaned dataset, mapped to `AlkgieV1Entity` records.
let dataset =
    CleanedData.Load("../data/cleaned/cleaned-dataset.json")
    |> Seq.map mapAlkgieV1Entity
    // `Seq.map` is lazy, so without caching every enumeration of `dataset`
    // would rebuild all the records. `Seq.cache` memoizes them on first use
    // while keeping the value a `seq`.
    |> Seq.cache

// Display the number of loaded entities.
Seq.length dataset

## Defining Dedupe Helpers

Some helper functions that make deduplication easier

In [27]:
/// Merges two entities that were judged to be duplicates into a single entity.
///
/// The result keeps `entity1`'s Id and EntityType and holds the source entries
/// of both entities (`entity1`'s first). Its display name is `entity1`'s,
/// unless that is the "MISSING DISPLAY NAME" placeholder, in which case
/// `entity2`'s display name is used instead.
let mergeEntities (entity1: AlkgieV1Entity) (entity2: AlkgieV1Entity) =
    // Prefer a real name over the placeholder; when both are real, keep entity1's.
    let displayName =
        match entity1.DisplayName with
        | "MISSING DISPLAY NAME" -> entity2.DisplayName
        | realName -> realName

    {
        Id = entity1.Id
        DisplayName = displayName
        SourceEntries = Array.append entity1.SourceEntries entity2.SourceEntries
        EntityType = entity1.EntityType
    }

/// Computation-expression builder for `option` values: `let!` unwraps a `Some`,
/// and the first `None` makes the whole expression evaluate to `None`.
type MaybeBuilder() =

    /// Continues with `f` applied to the wrapped value, or yields `None` when `x` is `None`.
    member builder.Bind(x, f) = Option.bind f x

    /// Ends the expression, wrapping the value in `Some`.
    member builder.Return(x) = Some x

/// Builder instance used as `maybe { ... }`.
let maybe = MaybeBuilder()

## Dedupe on exact name match

NOTE: This currently dedupes both themes and codes.

We dedupe themes whose names match exactly, as we expect them to refer to the same concept.
Note: Because of how pragmatics work, two themes with the same name can mean different things in different contexts (contextual overloading). We will need to investigate whether a more complex analysis is possible.

(pragmatics as in pragmatics, semantics, syntax - language theory, modeling language theory specifically).

However as a first attempt, exact name matching should be pretty good.

In [32]:
/// True when `alias` is present and equals the entity's display name after
/// both are trimmed and lower-cased; false when `alias` is `None`.
let isExactNameMatch entity1 (alias: option<string>) =
    match alias with
    | None -> false
    | Some aliasText -> entity1.DisplayName.Trim().ToLower() = aliasText.Trim().ToLower()

/// Pairs every entity with the set of ids of all dataset entities whose
/// display name exactly matches one of that entity's source-entry aliases.
let exactNameMatches =
    // Ids of the dataset entities whose display name matches any of the given aliases.
    let idsMatchingAnyAlias aliases =
        dataset
        |> Seq.filter (fun candidate -> aliases |> Seq.exists (isExactNameMatch candidate))
        |> Seq.map (fun candidate -> candidate.Id)
        |> Set.ofSeq

    dataset
    |> Seq.map (fun entity ->
        let aliases = entity.SourceEntries |> Seq.map (fun entry -> entry.EntityNameAlias)
        (entity, idsMatchingAnyAlias aliases))

/// Groups of entities that share the same non-empty alias-match set; each
/// element is the shared set plus the (entity, set) pairs that produced it.
let dupes =
    exactNameMatches
    |> Seq.groupBy (fun (_, aliasSet) -> aliasSet)
    |> Seq.filter (fun (aliasSet, entities) ->
        // An empty set means "no alias matched any display name". Entities in that
        // situation have nothing in common, so they must not be merged together.
        not (Set.isEmpty aliasSet) && Seq.length entities > 1)
    |> Seq.toList

/// Every entity that belongs to some duplicate group, in group order.
let duplicates =
    [ for (_, group) in dupes do
        for (entity, _) in group do
            yield entity ]

/// One merged entity per duplicate group, built by folding the group's
/// entities together with `mergeEntities` from left to right.
let dedupes =
    [ for (_, group) in dupes ->
        group
        |> Seq.map fst
        |> Seq.reduce mergeEntities ]

/// Maps the Id of every entity in a duplicate group to the Id carried by that
/// group's merged entity. `mergeEntities` keeps the Id of its first argument,
/// so the surviving Id is that of the first member of each group.
let survivingIdByDuplicateId =
    dupes
    |> Seq.collect (fun (_, group) ->
        let members = group |> Seq.map fst |> Seq.toList
        let survivorId = (List.head members).Id
        members |> List.map (fun duplicate -> (duplicate.Id, survivorId)))
    |> Map.ofSeq

/// Returns a copy of `entity` in which every relation that points at a
/// merged-away duplicate points at the merged entity instead; all other
/// relations are left untouched.
let updateEntity entity =
    let updateRelation relation =
        survivingIdByDuplicateId
        |> Map.tryFind relation
        |> Option.defaultValue relation

    let updateSourceEntry entry =
        { entry with Relations = entry.Relations |> Array.map updateRelation }

    { entity with SourceEntries = entity.SourceEntries |> Array.map updateSourceEntry }

/// The dataset with each duplicate group replaced by its merged entity
/// (merged entities come first) and relations rewritten by `updateEntity`.
let dedupedDataset =
    let withoutDuplicates = Seq.except duplicates dataset

    Seq.append dedupes withoutDuplicates
    |> Seq.map updateEntity
    |> List.ofSeq

// Display the number of entities left after deduplication.
Seq.length dedupedDataset

### Test dedupe worked

This section just contains a little bit of working code to sanity check that the deduplication code is creating working relationships.

In [31]:
// Writing these using the f# query syntax because why not try it out.

// The relation ids of the first entity that has at least one source entry
// with more than one relation.
// NOTE(review): `head` presumably raises when no entity qualifies — confirm.
let entitiesToFind =
    query {
        for entity in dedupedDataset do
        where (entity.SourceEntries |> Seq.exists (fun sourceEntry -> sourceEntry.Relations.Length > 1))
        select entity.Relations
        head
    }

// The distinct (entity, relation id) pairs for which an entity in the deduped
// dataset has the Id named by one of the relations in `entitiesToFind`.
// An inner join drops entities without a matching relation directly, so there
// is no default `Guid.Empty` row to filter out afterwards.
let entitiesFromRelations =
    query {
        for entity in dedupedDataset do
        join relation in entitiesToFind on (entity.Id = relation)
        select (entity, relation)
        distinct
    }

// Test successful, able to retrieve two entries from a record with multiple relations.
entitiesFromRelations

## Dedupe on link

> TODO: Do something intelligent to check whether the base URL is the same and, if it is, deduplicate the entries!