# Building a Simple Recommendation System in FSharp

## Introduction 

## Preliminaries

In [1]:
//#r "nuget:Microsoft.ML"
//#r "nuget:MathNet.Numerics"
//#r "nuget:MathNet.Numerics.FSharp"
//#r "nuget:FSharp.Data"

#load "Paket.fsx"
Paket.Package [ "FSharp.Data"; "FsLab" ]
#load "Paket.Generated.Refs.fsx"

### Import Packages

In [2]:
open System
open System.Text
open FSharp.Data
open MathNet.Numerics
open System.Collections.Generic
open Microsoft.FSharp.Collections

## Domain

In [3]:
type MovieCompareData = { title : string; soup : string; soupList : string list }

## Data Processing

### Tokenization 

In [4]:
let tokenizeAndClean (words : string) : string list =
    // Tokenize
    let split = words.Split(' ')
    
    // Lowercase
    let lowered = 
        split
        |> Array.map(fun s -> s.ToLower())
    
    // Remove Common Stop Words that don't add meaning
    let commonStopWords = 
        Set.ofList ["ourselves"; "hers"; "between"; "yourself"; "but"; "again"; "there"; "about"; "once"; "during"; "out"; "very"; "having"; "with"; "they"; "own"; "an"; "be"; "some"; "for"; "do"; "its"; "yours"; "such"; "into"; "of"; "most"; "itself"; "other"; "off"; "is"; "s"; "am"; "or"; "who"; "as"; "from"; "him"; "each"; "the"; "themselves"; "until"; "below"; "are"; "we"; "these"; "your"; "his"; "through"; "don"; "nor"; "me"; "were"; "her"; "more"; "himself"; "this"; "down"; "should"; "our"; "their"; "while"; "above"; "both"; "up"; "to"; "ours"; "had"; "she"; "all"; "no"; "when"; "at"; "any"; "before"; "them"; "same"; "and"; "been"; "have"; "in"; "will"; "on"; "does"; "yourselves"; "then"; "that"; "because"; "what"; "over"; "why"; "so"; "can"; "did"; "not"; "now"; "under"; "he"; "you"; "herself"; "has"; "just"; "where"; "too"; "only"; "myself"; "which"; "those"; "i"; "after"; "few"; "whom"; "t"; "being"; "if"; "theirs"; "my"; "against"; "a"; "by"; "doing"; "it"; "how"; "further"; "was"; "here"; "than"]
    let notStopWords =
        lowered
        |> Array.filter(fun s -> not (Set.contains s commonStopWords))

    // Remove Punctuation
    let nonPunctation =  
        notStopWords
        |> Array.map(fun x -> x.Replace("�", "")
                               .Replace("'", "")
                               .Replace(":", "")
                               .Replace(".", "")
                               .Replace(",", "")
                               .Replace("-", "")
                               .Replace("!", "")
                               .Replace("?", "")
                               .Replace("\"", "")
                               .Replace(";", ""))
    
    // Lexicographically sort
    let sorted = Array.sort nonPunctation
    
    // Concatenate the joined data
    sorted
    |> Array.toList

In [5]:
let sample = "The quick brown. fox jumps! over the; Lazy dog" 
let tokenizedSample = tokenizeAndClean sample
tokenizedSample

["brown"; "dog"; "fox"; "jumps"; "lazy"; "quick"; "the"]

### Sanitization Functions

In [6]:
// Grab Genre
[<Literal>]
let SampleGenresJson = "[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 80, \"name\": \"Crime\"}]"

type GenreProvider = JsonProvider< SampleGenresJson >

let sanitizeGenre (genres : string) : string list = 
    let parsed = GenreProvider.Parse(genres)
    parsed
    |> Array.map( fun x -> x.Name.Replace(" ", "")) 
    |> String.concat " "
    |> tokenizeAndClean

// Grab Keywords
[<Literal>]
let SampleKeywordsJson = "[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\": 2964, \"name\": \"future\"}, {\"id\": 3386, \"name\": \"space war\"}]"

type KeywordsProvider = JsonProvider< SampleKeywordsJson >

let sanitizeKeywords (keywords : string) : string list = 
    let parsed = KeywordsProvider.Parse(keywords)
    parsed
    |> Array.map( fun x -> x.Name.Replace(" ", ""))
    |> String.concat " "
    |> tokenizeAndClean

// Grab Overview
let sanitizeOverview (overview : string) : string list = 
    let nonAsciiRemoved = Encoding.ASCII.GetString(Encoding.ASCII.GetBytes(overview))
    tokenizeAndClean nonAsciiRemoved
    |> List.filter( fun x -> x <> String.Empty )

## Load Data

In [7]:
let extractData =
    let data = CsvFile.Load( "../../Desktop/F#/FSharp-Advent-2019/data/tmdb_5000_movies.csv").Cache()
    let mutable output = []
    for row in data.Rows do
        let title    = (row.GetColumn "title")
        
        // Genres
        let genres        = sanitizeGenre (row.GetColumn "genres")
        let getSoupGenres = genres |> String.concat " "
        
        // Keywords
        let keywords       = sanitizeKeywords (row.GetColumn "keywords")
        let getSoupKeyword = keywords |> String.concat " "
        
        // Overview
        let overview        = sanitizeOverview (row.GetColumn "overview")
        let getSoupOverview = overview |> String.concat " "
        
        // Soup
        let soup     = getSoupGenres + " " + getSoupKeyword + " " + getSoupOverview
        let soupList = genres @ keywords @ overview
        
        // Construct data type
        let movieCompareData = { title = title; 
                                 soup  = soup;
                                 soupList = soupList }
        
        // Append the output
        output <- output @ [movieCompareData]
        
    output
    
let getDictOfData = 
    let out = Dictionary<string, MovieCompareData>()
    for w in extractData do
        out.[w.title] <- w
    out

In [8]:
getDictOfData
|> Seq.take 1

seq
  [[Avatar, {title = "Avatar";
 soup =
  "action adventure fantasy sciencefiction 3d alien alienplanet antiwar battle cgi cultureclash future futuristic loveaffair marine mindandsoul powerrelations romance society soldier space spacecolony spacetravel spacewar tribe 22nd alien becomes century civilization dispatched following marine mission moon orders pandora paraplegic protecting torn unique";
 soupList =
  ["action"; "adventure"; "fantasy"; "sciencefiction"; "3d"; "alien";
   "alienplanet"; "antiwar"; "battle"; "cgi"; "cultureclash"; "future";
   "futuristic"; "loveaffair"; "marine"; "mindandsoul"; "powerrelations";
   "romance"; "society"; "soldier"; "space"; "spacecolony"; "spacetravel";
   "spacewar"; "tribe"; "22nd"; "alien"; "becomes"; "century"; "civilization";
   "dispatched"; "following"; "marine"; "mission"; "moon"; "orders"; "pandora";
   "paraplegic"; "protecting"; "torn"; "unique"];}]]

### Word Frequency Table Helpers

In [24]:
let getAllWords = 
    extractData
    |> Seq.map(fun x -> x.soupList)
    |> Seq.concat 
    |> Seq.distinct

let getWordCountForSoup (movieCompare : MovieCompareData) = 
    let words = getAllWords
    let wordCount = new Dictionary<string, int>()
    for w in words do
        let count = 
            movieCompare.soupList
            |> List.filter( fun x -> x = w )
            |> List.length
        if not (wordCount.ContainsKey w) then wordCount.[w] <- count
        else wordCount.[w] <- wordCount.[w] + count
    wordCount
    
let getWordCountVectorForSoup (movieCompare : MovieCompareData) = 
    (getWordCountForSoup movieCompare).Values
    |> Seq.map (fun x -> double(x))
    |> Seq.toArray
    
let getWordCountVector (movieName : string) = 
    let dictOfData = getDictOfData
    if getDictOfData.ContainsKey movieName then Some (getWordCountVectorForSoup getDictOfData.[movieName])
    else
        failwith "Movie Not Found!"

In [32]:
printfn "%A" (getWordCountForSoup (getDictOfData.["Avatar"]))
let avatarVector = getWordCountVector "Avatar"
printfn "%A" avatarVector

printfn "%A" (getWordCountForSoup (getDictOfData.["The Dark Knight Rises"]))
let darkKnightVector = getWordCountVector "The Dark Knight Rises"
printfn "%A" darkKnightVector

seq [[action, 1]; [adventure, 1]; [fantasy, 1]; [sciencefiction, 1]; ...]
Some
  [|1.0; 1.0; 1.0; 1.0; 1.0; 2.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 2.0;
    1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0;
    1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 1.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; ...|]
seq [[action, 1]; [adventure, 0]; [fantasy, 0]; [sciencefiction, 0]; ...]
Some
  [|1.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    1.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0;
    0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.0; 0.

## Cosine Distance

In [33]:
let x : double[] = [| 3.; 1. |]
let y : double[] = [| 3.; 3. |]

printfn "Cosine Distance when values are equal: %A" (Distance.Cosine(x, x))
printfn "Cosine Distance when values are different: %A" (Distance.Cosine(x, y))

Cosine Distance when values are equal: 0.0
Cosine Distance when values are different: 0.105572809


In [34]:
let computeCosineDistance (movie1 : string) (movie2 : string)  = 
    let movie1Vector = getWordCountVector movie1
    let movie2Vector = getWordCountVector movie2
    match movie1Vector, movie2Vector with
    | Some m1, Some m2 -> Distance.Cosine(m1, m2) 
    | None, Some _ -> failwith "Movie 1 not found in the data."
    | Some _, None -> failwith "Movie 2 not found in the data."
    | _ -> failwith "Neither of the movies found in the data."

In [36]:
computeCosineDistance "Avatar" "The Dark Knight Rises"

0.966873067

# Recommendations

In [44]:
let recommendMovies (movie : string) (recommendationCount : int) =
    if getDictOfData.ContainsKey movie then
        let movieData = getDictOfData.[movie]
        getDictOfData
        |> Seq.filter( fun x -> not( x.Value = movieData )) // Don't include the current item in question.
        |> Seq.map( fun x -> (x.Value.title, computeCosineDistance movie x.Value.title ))
        |> Seq.sort
        |> Seq.take (recommendationCount)
    else
        failwith "Movie not found!"

In [45]:
recommendMovies "Avatar" 10