In [1]:
import pandas as pd
import numpy as np

In [3]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)

In [4]:
df.head(7)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488


In [None]:
#https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html

### if a person recommends a given set of movies A, they will also recommend B

In [5]:
df["Favourable"]=df["rating"]>3 #ratings>3 can be deemed favourable

In [6]:
ratings=df[df['user_id'].isin(range(200))]

In [7]:
favourable_ratings=ratings[ratings["Favourable"]]

In [14]:
favourable_reviews_by_users=favourable_ratings.groupby( [ "user_id", "item_id"] )

In [16]:
num_favourable_by_movie=ratings[["item_id","Favourable"]].groupby("item_id").sum()

In [17]:
num_favourable_by_movie.head(6)

Unnamed: 0_level_0,Favourable
item_id,Unnamed: 1_level_1
1,66.0
2,5.0
3,4.0
4,21.0
5,6.0
6,6.0


In [20]:
large=num_favourable_by_movie.sort_values(by="Favourable", ascending=False)#sort in descending order
large.head(n=10)

Unnamed: 0_level_0,Favourable
item_id,Unnamed: 1_level_1
50,100.0
100,89.0
258,83.0
181,79.0
174,74.0
98,70.0
127,70.0
56,67.0
7,67.0
1,66.0


In [21]:
frequent_itemsets = {}

In [22]:
min_support = 50

In [23]:
frequent_itemsets[1]=dict((frozenset ((item_id,)), 
                           row["Favourable"])
                          for item_id, row in num_favourable_by_movie.iterrows()
                          if row["Favourable"] > min_support)

In [19]:
from collections import defaultdict

In [24]:
def find_frequent_itemsets(favourable_reviews_by_users, k_1_itemsets,min_support):
    counts = defaultdict(int)
    for user, reviews in favourable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])                   
                    

In [None]:
import sys
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favourable"])
                                for movie_id, row in num_favourable_by_movie.iterrows()
                                if row["Favourable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favourable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

In [29]:
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 16 frequent itemsets


In [30]:
# Now we create the association rules. First, they are candidates until the confidence has been tested
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

There are 16 candidate rules


In [31]:
print(candidate_rules[:5])

[(frozenset([]), 286), (frozenset([]), 7), (frozenset([]), 64), (frozenset([]), 79), (frozenset([]), 258)]


In [32]:
# Now, we compute the confidence of each of these rules.  
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favourable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

AttributeError: 'DataFrameGroupBy' object has no attribute 'items'

In [33]:
# Choose only rules above a minimum confidence level
min_confidence = 0.9

In [34]:
# Filter out the rules with poor confidence
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

NameError: name 'rule_confidence' is not defined