# Graph Analytics - Collaborative Filtering

<a href="https://colab.research.google.com/github/joerg84/Graph_Powered_ML_Workshop/blob/master/Graph_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We have all seen product recommendations like "People who looked at item x also bought item y."
In this notebook we develop a similar system for an even better cause: figuring out which movie to watch next.
For this we consider a simple dataset with user ratings for movies and then use a technique called [Collaborative Filtering](https://en.wikipedia.org/wiki/Collaborative_filtering) to identify which new movies might be worth watching based on other movies we liked.

![ratings](https://github.com/joerg84/Graph_Powered_ML_Workshop/blob/master/img/user_movie_rating.png?raw=1)

First, setting up our environment.

In [None]:
%%capture
# Fetch the workshop repository (presumably the source of the data/ CSV files
# and the `oasis` helper used further down -- TODO confirm).
!git clone https://github.com/joerg84/Graph_Powered_ML_Workshop.git
# Copy the repository contents into the working directory, leaving out .git.
!rsync -av Graph_Powered_ML_Workshop/ ./ --exclude=.git
# Install the ArangoDB Python driver.
!pip3 install pyarango

In [None]:
import csv
import json
import requests
import sys
import oasis


from pyArango.connection import *
from pyArango.collection import Collection, Edges, Field
from pyArango.graph import Graph, EdgeDefinition
from pyArango.collection import BulkOperation as BulkOperation

Next, create a temporary database instance backed by ArangoDB's Managed Cloud Service Oasis:

In [None]:
# Retrieve tmp credentials from ArangoDB Tutorial Service
login = oasis.getTempCredentials()

# Connect to the temp database
conn = oasis.connect(login)
# Select the database named in the credentials; the following cells work on `db`.
db = conn[login["dbName"]]

In [None]:
# Show the endpoint and the temporary credentials so the database can also be
# opened outside of this notebook.
endpoint = "https://" + login["hostname"] + ":" + str(login["port"])
print(endpoint)
for prefix, key in (
    ("Username: ", "username"),
    ("Password: ", "password"),
    ("Database: ", "dbName"),
):
    print(prefix + login[key])

Let's define a structure for a simple movie-rating graph.

Let us take a short look at our dataset, which – as is often the case in real-world scenarios – comes in CSV format.

In [None]:
# Print a heading followed by the first three lines (header line included)
# of each CSV file.
print("User Data")
!head -n 3 data/users.csv 
print()
print("Movies Data")
!head -n 3 data/movies.csv 
print()
print("Rating Data")
!head -n 3 data/ratings.csv 


Create a graph with Users and Movies as vertices, and Ratings as the edges between them.

In [None]:
from pyArango.collection import Collection, Field
from pyArango.graph import Graph, EdgeDefinition


class Users(Collection):
    """Vertex collection holding one document per user."""

    # Attributes declared for user documents, each with a default Field().
    _fields = dict(
        user_id=Field(),
        age=Field(),
        gender=Field(),
    )
    
class Movies(Collection):
    """Vertex collection holding one document per movie."""

    _fields = {
        "movie_id": Field(),
        "movie_title": Field(),
        # Must match the attribute name written by the CSV import
        # ("release_date"); it was previously misspelled "release_data".
        "release_date": Field()
    }

class Ratings(Edges):
    """Edge collection: one edge per rating, from a user to a movie."""

    _fields = {
        # user_id and item_id are encoded by _from, _to
        # Declared as "ratings" (plural) because that is the attribute the
        # CSV import writes and the AQL queries filter on.
        "ratings": Field(),
        "timestamp": Field()
    }

class IMDBGraph(Graph):
    """Graph definition: Ratings edges lead from Users to Movies."""

    _edgeDefinitions = [
        EdgeDefinition(
            "Ratings",
            fromCollections=["Users"],
            toCollections=["Movies"],
        ),
    ]
    # No vertex collections besides those named in the edge definition.
    _orphanedCollections = []

# Create the two vertex collections and the edge collection (same order as
# the class definitions), then the graph that ties them together.
for collection_name in ("Users", "Movies", "Ratings"):
    db.createCollection(collection_name)
iMDBGraph = db.createGraph("IMDBGraph", replicationFactor=3)

print("Collection/Graph Setup done.")

In [None]:
# Import users: one document per CSV row, keyed by the user id.
collection = db["Users"]
with BulkOperation(collection, batchSize=100) as col:
    with open('data/users.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # Skip header
        next(reader)
        for row in reader:
            # The zip column is bound to `zip_code`, not `zip`: at notebook
            # scope `zip` would overwrite the builtin zip() for every later
            # cell. occupation and zip_code are read but not stored.
            user_id, age, gender, occupation, zip_code = row
            doc = col.createDocument()
            doc["_key"] = user_id
            doc["age"] = age
            doc["gender"] = gender
            doc.save()

# Import movies: one document per CSV row, keyed by the movie id.
collection = db["Movies"]
with BulkOperation(collection, batchSize=100) as col, \
        open('data/movies.csv', newline='') as csvfile:
    movie_rows = csv.reader(csvfile, delimiter=',', quotechar='|')
    # Skip header
    next(movie_rows)
    for movie_row in movie_rows:
        # Unpack all 24 columns so a row with a different column count
        # still fails loudly; only the first three values are stored.
        (
            movie_id, movie_title, release_date, video_release_date, url,
            unknown, action, adventure, animation, childrens, comedy,
            crime, documentary, drama, fantasy, noir, horror, musical,
            mystery, romance, scifi, thriller, war, western,
        ) = movie_row
        movie_doc = col.createDocument()
        movie_doc["_key"] = movie_id
        movie_doc["movie_title"] = movie_title
        movie_doc["release_date"] = release_date
        movie_doc.save()

# Import ratings: one edge per CSV row, pointing from the rating user to the
# rated movie.
collection = db["Ratings"]
with BulkOperation(collection, batchSize=1000) as col, \
        open('data/ratings.csv', newline='') as csvfile:
    rating_rows = csv.reader(csvfile, delimiter=',', quotechar='|')
    # Skip header
    next(rating_rows)
    for rating_row in rating_rows:
        user_id, movie_id, rating, timestamp = rating_row
        edge_doc = col.createDocument()
        # Edge endpoints are full document ids: "<collection>/<key>".
        edge_doc["_from"] = f"Users/{user_id}"
        edge_doc["_to"] = f"Movies/{movie_id}"
        edge_doc["ratings"] = rating
        edge_doc["timestamp"] = timestamp
        edge_doc.save()

print("Import Done")

Let us build the Collaborative Filtering step by step:

1. Find movies I rated with 5 stars
2. Find users who also rated these movies with 5 stars
3. Find additional movies also rated 5 stars by those users


In [None]:
# Step 1: up to ten movies that user 1 rated with 5 stars.
my_ratings = """
WITH Movies, Users, Ratings
FOR movie, edge IN 1..1 
  OUTBOUND 'Users/1'
  GRAPH 'IMDBGraph'
  FILTER TO_NUMBER(edge.ratings) == 5
  LIMIT 10
  RETURN {
        "movie" : movie.movie_title,
        "rating" : edge.ratings
    }
"""

# rawResults=True is passed so each result can be indexed by key below.
queryResult = db.AQLQuery(my_ratings, rawResults=True)
for rated in queryResult:
    print("Movie: " + rated["movie"])
    print("Rating: " + rated["rating"])
    print()

In [None]:
# Step 2: users who gave 5 stars to a movie that user 1 also rated 5 stars.
#
# Changes compared to the first version of this query:
#   * user 1 is filtered out -- every movie traversed here carries user 1's
#     own 5-star edge, so they would otherwise show up as "alike" to
#     themselves;
#   * duplicates are removed with COLLECT *before* LIMIT, so the query yields
#     ten distinct users (when that many exist) rather than the distinct
#     values among the first ten traversal rows;
#   * the inner traversal uses INBOUND, since Ratings edges point from Users
#     to Movies.
alike_users = """
WITH Movies, Users, Ratings
FOR movie, edge IN 1..1
  OUTBOUND 'Users/1'
  GRAPH 'IMDBGraph'
  FILTER TO_NUMBER(edge.ratings) == 5

      FOR user, edge2 IN 1..1 INBOUND movie Ratings
            FILTER TO_NUMBER(edge2.ratings) == 5
            FILTER user._id != 'Users/1'
            COLLECT user_key = user._key, age = user.age
            LIMIT 10
            RETURN {
                "user" : user_key,
                "age" : age
            }
"""

queryResult = db.AQLQuery(alike_users, rawResults=True)
for result in queryResult:
    print("User: " + result["user"])
    print("Age: " + result["age"])
    print()

In [None]:
# Step 3: movies that like-minded users rated 5 stars and that user 1 has
# not rated yet.
#
# Changes compared to the first version of this query:
#   * movies user 1 has already rated (with any score) are excluded, so the
#     result really consists of *new* movies;
#   * user 1 is skipped as a "like-minded" user;
#   * candidates are grouped and ranked by how many 5-star paths lead to
#     them, and only then limited to ten -- previously LIMIT ran before
#     DISTINCT and therefore kept just the first ten traversal rows;
#   * traversal directions are explicit (Ratings edges point Users -> Movies).
new_movies = """
WITH Movies, Users, Ratings
LET seen = (
  FOR m IN 1..1 OUTBOUND 'Users/1' GRAPH 'IMDBGraph'
    RETURN m._id
)
FOR movie, edge IN 1..1
  OUTBOUND 'Users/1'
  GRAPH 'IMDBGraph'
  FILTER TO_NUMBER(edge.ratings) == 5

      FOR user, edge2 IN 1..1 INBOUND movie Ratings
            FILTER TO_NUMBER(edge2.ratings) == 5
            FILTER user._id != 'Users/1'
           // All other users who have also rated that movie with 5 stars
          FOR movie2, edge3 IN 1..1 OUTBOUND user Ratings
              FILTER TO_NUMBER(edge3.ratings) == 5
              FILTER movie2._id NOT IN seen
              COLLECT movie_key = movie2._key, title = movie2.movie_title
                WITH COUNT INTO votes
              SORT votes DESC, title ASC
              LIMIT 10
              RETURN {
                 "title" : title
              }
"""

queryResult = db.AQLQuery(new_movies, rawResults=True)
for result in queryResult:
    print("Movie: " + result["title"])
    print()

In [None]:
# Delete collections
db.dropAllCollections() 
# Reload the database handle after the drop.
# NOTE(review): the IMDBGraph definition is not removed explicitly here --
# confirm whether dropAllCollections() also takes care of it.
db.reload()