# Dedupe Data

In this notebook we will dedupe entities / perform entity resolution.

## What is entity resolution?

Entity Resolution is a technique to identify data records in a single data source or across multiple data sources that refer to the same real-world entity and to link the records together.

https://towardsdatascience.com/an-introduction-to-entity-resolution-needs-and-challenges-97fba052dde5


## Load the data into memory

The first step is to take the cleaned data from step 3 and load it into the notebook.

In [1]:
#r "nuget:FSharp.Data"
open FSharp.Data
// JSON type provider from FSharp.Data, given the cleaned dataset file as its sample.
type CleanedData = JsonProvider<"../data/cleaned/cleaned-dataset.json">

/// A relation held by an entity.
type AlkgieV1Relation =
    { /// Id that is compared against entity ids when relations are rewritten during dedupe.
      Id: Guid
      /// Copied from the `Source` field of the relation in the cleaned dataset.
      Source: string }

/// The entity kinds recognised when loading the cleaned dataset.
type AlkgieV1EntityTypes = Theme | Code

/// An entity loaded from the cleaned dataset, in the shape used by the dedupe steps.
type AlkgieV1Entity =
    { Id: Guid
      /// Relations held by this entity.
      Relations: AlkgieV1Relation array
      Link: string option
      /// Optional name; the dedupe step groups entities by this value.
      Name: string option
      Description: string
      Source: string
      EntityType: AlkgieV1EntityTypes }

/// Converts the relations parsed from the cleaned JSON into `AlkgieV1Relation` records,
/// returning a new array with one record per input relation.
let mapRelations (relations: CleanedData.Relation[]) =
    let toRelation (parsed: CleanedData.Relation) : AlkgieV1Relation =
        { Id = parsed.Id; Source = parsed.Source }

    Array.map toRelation relations

/// Parses the textual entity type from the cleaned dataset into `AlkgieV1EntityTypes`.
/// Only the exact strings "Theme" and "Code" are accepted; any other value raises
/// an exception whose message names the offending value so bad rows can be located.
let mapEntityType entityType =
    match entityType with
    | "Theme" -> Theme
    | "Code" -> Code
    | unknown -> failwithf "Unknown entity type: '%s'" unknown

/// Every row of the cleaned dataset file converted to an `AlkgieV1Entity`.
/// The sequence is lazy: the conversion runs each time it is enumerated.
let dataset =
    CleanedData.Load("../data/cleaned/cleaned-dataset.json")
    |> Seq.map (fun row ->
        let entity: AlkgieV1Entity =
            { Id = row.Id
              Relations = mapRelations row.Relations
              Link = row.Link
              Name = row.Name
              Description = row.Description
              Source = row.Source
              EntityType = mapEntityType row.EntityType }

        entity)

// Cell output: number of entities loaded.
dataset |> Seq.length

## Dedupe on exact name match

NOTE: At the moment this dedupes both themes and codes.

We dedupe themes whose names match exactly, since we expect them to refer to the same concept.
Note: Due to the way pragmatics work, it is possible for two themes with the same name to mean different things in different contexts (contextual overloading). We will need to investigate whether we can perform a more complex analysis.

(Pragmatics here is meant in the sense of the pragmatics / semantics / syntax distinction from language theory, and modeling-language theory specifically.)

However, as a first attempt, exact name matching should be pretty good.

In [5]:
/// Merges two entities into one.
/// The result keeps `entity1`'s Id, Description, Source and EntityType, concatenates
/// both relation arrays (entity1's first), and takes Link and Name from `entity1`
/// when present, falling back to `entity2`'s value otherwise.
let mergeEntities (entity1: AlkgieV1Entity) (entity2: AlkgieV1Entity) =
    { entity1 with
        Relations = Array.append entity1.Relations entity2.Relations
        Link = entity1.Link |> Option.orElse entity2.Link
        Name = entity1.Name |> Option.orElse entity2.Name }

/// Groups of entities that share the same normalised name (trimmed, lower-cased),
/// restricted to groups with more than one member. Each element is the
/// normalised name paired with the entities in that group, in dataset order.
///
/// Entities without a name (or whose name is blank after trimming) are never
/// grouped: treating a missing name as the key "" would merge every unnamed
/// entity into a single one.
let duplicateGroups =
    // Invariant lower-casing keeps the grouping independent of the machine's culture.
    let normalisedName (entity: AlkgieV1Entity) =
        entity.Name
        |> Option.map (fun name -> name.Trim().ToLowerInvariant())
        |> Option.filter (fun name -> name <> "")

    dataset
    |> Seq.choose (fun entity ->
        normalisedName entity |> Option.map (fun key -> key, entity))
    |> Seq.groupBy fst
    |> Seq.map (fun (key, keyed) -> key, keyed |> Seq.map snd)
    |> Seq.filter (fun (_, entities) -> Seq.length entities > 1)

/// One merged entity per duplicate group, produced by folding the group's
/// members together with `mergeEntities` from first to last.
let deduped =
    seq {
        for (_, entities) in duplicateGroups do
            yield Seq.reduce mergeEntities entities
    }

/// Maps the id of every entity in a duplicate group to the id of the entity that
/// survives the merge. `Seq.reduce mergeEntities` keeps the Id of the first entity
/// in each group, so the survivor is the head of the group.
/// Built once here rather than on every call to `updateRelationships`.
let survivorIdByDuplicateId =
    duplicateGroups
    |> Seq.collect (fun (_, entities) ->
        let survivorId = (Seq.head entities).Id
        entities |> Seq.map (fun duplicate -> duplicate.Id, survivorId))
    |> Map.ofSeq

/// Returns `entity` with its relations redirected away from merged-away duplicates.
/// Any relation whose Id refers to an entity in a duplicate group is pointed at
/// that group's surviving entity; relations that become identical are collapsed.
/// Relations to entities outside any duplicate group are left unchanged.
let updateRelationships (entity: AlkgieV1Entity) =
    let redirect (relation: AlkgieV1Relation) =
        match Map.tryFind relation.Id survivorIdByDuplicateId with
        | Some survivorId when survivorId <> relation.Id -> { relation with Id = survivorId }
        | _ -> relation

    { entity with
        Relations =
            entity.Relations
            |> Array.map redirect
            // Redirecting can make two relations equal (and merging appends arrays).
            |> Array.distinct }


/// Every entity that belongs to some duplicate group, flattened into one sequence.
let duplicates = Seq.collect snd duplicateGroups

/// The dataset after deduplication: the merged entities first, followed by the
/// entities that were not part of any duplicate group, each passed through
/// `updateRelationships`.
let mutable dedupedDataset =
    let untouched = Seq.except duplicates dataset

    Seq.append deduped untouched
    |> Seq.map updateRelationships

// Cell output: number of entities after deduplication.
dedupedDataset |> Seq.length