# Latihan 1 | The Movie Recommendation Problem

In [11]:
import os
import sys
import pandas as pd

data_folder = os.path.join(os.path.expanduser(".\\"), "Movielens 100k Dataset","ml-100k")
ratings_filename = os.path.join(data_folder, "u.data")

In [12]:
all_ratings = pd.read_csv(ratings_filename, delimiter="\t",
                         header=None, names = ["UserID", "MovieID", "Rating", "Datetime"])

In [13]:
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'], unit='s')

In [14]:
all_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


# Latihan 2 | Data Preprocessin

In [15]:
all_ratings["Favorable"] = all_ratings["Rating"] > 3

In [16]:
all_ratings[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False


In [17]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

In [18]:
favorable_ratings = ratings[ratings["Favorable"]]

In [19]:
favorable_reviews_by_users = dict((k, frozenset(v.values))
                                for k, v in favorable_ratings.groupby("UserID")["MovieID"])

In [20]:
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()

In [21]:
num_favorable_by_movie.sort_values(by="Favorable", ascending=False).head()

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100
100,89
258,83
181,79
174,74


# Latihan 3 | The Apriori Implementation

In [22]:
frequent_itemsets = {}

In [23]:
min_support = 50

In [24]:
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                           for movie_id, row in num_favorable_by_movie.iterrows()
                           if row["Favorable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]),
                                                                      min_support))

There are 16 movies with more than 50 favorable reviews


In [26]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items()
                 if frequency >= min_support])

for k in range(2, 20):
    # Generate candidates of Length k, using the frequent itemsets of Length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users,
                                                   frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets)==0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
        
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        frequent_itemsets[k] = cur_frequent_itemsets
del frequent_itemsets[1]

I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


# Latihan 4 | Extracting Association Rules

In [27]:
candidate_rules = []
for itemsets_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
            
print("There are {} candidate rules".format(len(candidate_rules)))

There are 15285 candidate rules


In [28]:
print(candidate_rules[:5])

[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [29]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

In [30]:
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
                
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / 
                   float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in candidate_rules}

rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items()
                   if confidence > 0.8}

In [32]:
print(candidate_rules[:5])

[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [31]:
from operator import itemgetter

sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{0}".format(index + 1 ))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}\n".format(rule_confidence[(premise, conclusion)]))

Rule #1
Rule: If a person recommends frozenset({98, 181}) they will also recommend 50
 - Confidence: 1.000

Rule #2
Rule: If a person recommends frozenset({172, 79}) they will also recommend 174
 - Confidence: 1.000

Rule #3
Rule: If a person recommends frozenset({258, 172}) they will also recommend 174
 - Confidence: 1.000

Rule #4
Rule: If a person recommends frozenset({1, 181, 7}) they will also recommend 50
 - Confidence: 1.000

Rule #5
Rule: If a person recommends frozenset({1, 172, 7}) they will also recommend 174
 - Confidence: 1.000



In [34]:
movie_name_filename = os.path.join(data_folder, "u.item")
movie_name_data = pd.read_csv(movie_name_filename, delimiter="|", header=None, encoding = "mac-roman")

In [35]:
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>",\
                          "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",\
                          "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",\
                          "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

movie_name_data.head()

Unnamed: 0,MovieID,Title,Release Date,Video Release,IMDB,<UNK>,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [37]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend 50
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend 174
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend 174
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend 50
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend 174
 - Confidence: 1.000



# Latihan 5 | Evaluation

In [38]:
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict((k, frozenset(v.values))
    for k, v in test_favorable.groupby("UserID")["MovieID"])

In [40]:
correct_counts=defaultdict(int)
incorrect_counts=defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise,conclusion=candidate_rule
        
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule]+=1
            else:
                incorrect_counts[candidate_rule]+=1

test_confidence = {candidate_rule:
                  (correct_counts[candidate_rule] / 
                  float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]))
                  for candidate_rule in rule_confidence}

In [41]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule : if a person recommends {0} they will also recommend {1}"
         .format(premise_names, conclusion_name))
    print(" - Train confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print("- Test Confidence: {0:.3f}\n".format(test_confidence.get((premise, conclusion), -1)))

Rule #1
Rule : if a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
 - Train confidence: 1.000
- Test Confidence: 0.936

Rule #2
Rule : if a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
 - Train confidence: 1.000
- Test Confidence: 0.876

Rule #3
Rule : if a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
 - Train confidence: 1.000
- Test Confidence: 0.841

Rule #4
Rule : if a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Train confidence: 1.000
- Test Confidence: 0.932

Rule #5
Rule : if a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Train confidence: 1.000
- Test Confidence