# Building a Simple Recommendation System in FSharp

## Install Nuget Packages

In [3]:
//#r "nuget:Microsoft.ML"
//#r "nuget:MathNet.Numerics"
//#r "nuget:MathNet.Numerics.FSharp"
//#r "nuget:FSharp.Data"
//#r "nuget:Deedle"

#load "Paket.fsx"
Paket.Package [ "FSharp.Data"; "FsLab" ]
#load "Paket.Generated.Refs.fsx"

## Import Packages

In [4]:
// Imports
open System
open FSharp.Data
open MathNet.Numerics
open System.Collections.Generic
open Microsoft.FSharp.Collections
open System.Text

## Cosine Distance

In [5]:
let x : double[] = [| 1.; 3. |]
let y : double[] = [| 3.; 1. |]

Distance.Cosine(x, y)

0.4

## Domain

In [27]:
type MovieData = { title    : string; 
                   genres   : string list; 
                   keywords : string list; 
                   overview : string list;
                   soupList : string list }
                          
type MovieCompareData = { title    : string; 
                          soup     : string; 
                          original : MovieData }

## Word Soup Ingredients

In [7]:
let tokenizeAndClean (words : string) : string list =
    // Tokenize
    let split = words.Split(' ')
    
    // Lowercase
    let lowered = 
        split
        |> Array.map(fun s -> s.ToLower())
    
    // Remove Common Stop Words that don't add meaning.
    let commonStopWords = 
        Set.ofList ["ourselves"; "hers"; "between"; "yourself"; "but"; "again"; "there"; "about"; "once"; "during"; "out"; "very"; "having"; "with"; "they"; "own"; "an"; "be"; "some"; "for"; "do"; "its"; "yours"; "such"; "into"; "of"; "most"; "itself"; "other"; "off"; "is"; "s"; "am"; "or"; "who"; "as"; "from"; "him"; "each"; "the"; "themselves"; "until"; "below"; "are"; "we"; "these"; "your"; "his"; "through"; "don"; "nor"; "me"; "were"; "her"; "more"; "himself"; "this"; "down"; "should"; "our"; "their"; "while"; "above"; "both"; "up"; "to"; "ours"; "had"; "she"; "all"; "no"; "when"; "at"; "any"; "before"; "them"; "same"; "and"; "been"; "have"; "in"; "will"; "on"; "does"; "yourselves"; "then"; "that"; "because"; "what"; "over"; "why"; "so"; "can"; "did"; "not"; "now"; "under"; "he"; "you"; "herself"; "has"; "just"; "where"; "too"; "only"; "myself"; "which"; "those"; "i"; "after"; "few"; "whom"; "t"; "being"; "if"; "theirs"; "my"; "against"; "a"; "by"; "doing"; "it"; "how"; "further"; "was"; "here"; "than"]
    let notStopWords =
        lowered
        |> Array.filter(fun s -> not (Set.contains s commonStopWords))

    let nonPunctation =  
        notStopWords
        |> Array.map(fun x -> x.Replace("�", "")
                               .Replace("'", "")
                               .Replace(":", "")
                               .Replace(".", "")
                               .Replace(",", "")
                               .Replace("-", "")
                               .Replace("!", "")
                               .Replace("?", "")
                               .Replace("\"", "")
                               .Replace(";", ""))
    
    // Lexicographically sort
    let sorted = Array.sort nonPunctation
    
    // Concatenate the joined data
    sorted
    |> Array.toList
 
// Gets the Frequency table.
let countWords = Seq.countBy id

// Gets the freauencies per word.
let getFrequencies (tokenizedSample: string list) =  
    (countWords tokenizedSample)
    |> Seq.map (fun (a,b) -> b)
    
// Gets the words in the frequency table.s
let getWords (tokenizedSample: string list) =  
    (countWords tokenizedSample)
    |> Seq.map (fun (a,b) -> a)

In [8]:
let sample = "The quick brown. fox jumps! over the; Lazy dog" 
let tokenizedSample = tokenizeAndClean sample
tokenizedSample

["brown"; "dog"; "fox"; "jumps"; "lazy"; "quick"; "the"]

## Sanitization Functions

In [9]:
// Grab Genre
[<Literal>]
let SampleGenresJson = "[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 80, \"name\": \"Crime\"}]"

type GenreProvider = JsonProvider< SampleGenresJson >

let sanitizeGenre (genres : string) : string list = 
    let parsed = GenreProvider.Parse(genres)
    parsed
    |> Array.map( fun x -> x.Name.Replace(" ", "")) 
    |> String.concat " "
    |> tokenizeAndClean

// Grab Keywords
[<Literal>]
let SampleKeywordsJson = "[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\": 2964, \"name\": \"future\"}, {\"id\": 3386, \"name\": \"space war\"}]"

type KeywordsProvider = JsonProvider< SampleKeywordsJson >

let sanitizeKeywords (keywords : string) : string list = 
    let parsed = KeywordsProvider.Parse(keywords)
    parsed
    |> Array.map( fun x -> x.Name.Replace(" ", ""))
    |> String.concat " "
    |> tokenizeAndClean

// Grab Overview
let sanitizeOverview (overview : string) : string list = 
    let nonAsciiRemoved = Encoding.ASCII.GetString(Encoding.ASCII.GetBytes(overview))
    tokenizeAndClean nonAsciiRemoved
    |> List.filter( fun x -> x <> String.Empty )

## Load Data

In [29]:
let extractData =
    let data = CsvFile.Load( "../../Desktop/F#/FSharp-Advent-2019/data/tmdb_5000_movies.csv").Cache()
    let mutable output = []
    for row in data.Rows do
        let title    = (row.GetColumn "title")
        
        // Genres
        let genres        = sanitizeGenre (row.GetColumn "genres")
        let getSoupGenres = genres |> String.concat " "
        
        // Keywords
        let keywords       = sanitizeKeywords (row.GetColumn "keywords")
        let getSoupKeyword = keywords |> String.concat " "
        
        // Overview
        let overview        = sanitizeOverview (row.GetColumn "overview")
        let getSoupOverview = overview |> String.concat " "
        
        // Soup
        let soup     = getSoupGenres + " " + getSoupKeyword + " " + getSoupOverview
        let soupList = genres @ keywords @ overview
        
        // Construct data types
        let movieData = { title = title; 
                          genres = genres; 
                          keywords = keywords; 
                          overview = overview;
                          soupList = soupList }
                          
        let movieCompareData = { title    = title; 
                                 soup     = soup; 
                                 original = movieData }
        
        // Append the output
        output <- output @ [movieCompareData]
        
    output
    |> Seq.ofList
        
extractData
|> Seq.take 1

seq
  [{title = "Avatar";
    soup =
     "action adventure fantasy sciencefiction 3d alien alienplanet antiwar battle cgi cultureclash future futuristic loveaffair marine mindandsoul powerrelations romance society soldier space spacecolony spacetravel spacewar tribe 22nd alien becomes century civilization dispatched following marine mission moon orders pandora paraplegic protecting torn unique";
    original =
     {title = "Avatar";
      genres = ["action"; "adventure"; "fantasy"; "sciencefiction"];
      keywords =
       ["3d"; "alien"; "alienplanet"; "antiwar"; "battle"; "cgi"; "cultureclash";
        "future"; "futuristic"; "loveaffair"; "marine"; "mindandsoul";
        "powerrelations"; "romance"; "society"; "soldier"; "space";
        "spacecolony"; "spacetravel"; "spacewar"; "tribe"];
      overview =
       ["22nd"; "alien"; "becomes"; "century"; "civilization"; "dispatched";
        "following"; "marine"; "mission"; "moon"; "orders"; "pandora";
        "paraplegic"; "protec

### Word Frequency Table Helpers

In [58]:
let getAllWords = 
    extractData
    |> Seq.map(fun x -> x.original.soupList)
    |> Seq.concat 
    |> Seq.distinct

let getWordCountForSoup (movieCompare : MovieCompareData) = 

    // I feel like using a _mutable_ dictionary here.. 
    let words = getAllWords
    let counts = 
        getWords
    printfn "%A" words
    printfn "%A" counts
 
let first = 
    extractData
    |> Seq.take 1
    |> Seq.item 0
    
getWordCountForSoup first

seq ["action"; "adventure"; "fantasy"; "sciencefiction"; ...]
seq
  [[(true, 1); (false, 40)]; [(false, 40); (true, 1)]; [(false, 40); (true, 1)];
   [(false, 40); (true, 1)]; ...]
