In [1]:
import os
import pandas as pd
data_folder = os.path.join(os.path.expanduser("~"),"C://Users//user//Desktop//notebook//DATA_MINING//HW1","ml-100k")
ratings_filename = os.path.join(data_folder, "u.data")

In [2]:
all_ratings = pd.read_csv(ratings_filename, delimiter="\t",
 header=None, names = ["UserID", "MovieID", "Rating", "Datetime"])

In [3]:
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],
 unit='s')

In [4]:
all_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
all_ratings["Favorable"] = all_ratings["Rating"] > 3

In [6]:
all_ratings[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,62,257,2,1997-11-12 22:07:14,False
11,286,1014,5,1997-11-17 15:38:45,True
12,200,222,5,1997-10-05 09:05:40,True
13,210,40,3,1998-03-27 21:59:54,False
14,224,29,3,1998-02-21 23:40:57,False


In [7]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

In [8]:
favorable_ratings_mask = ratings["Favorable"]
favorable_ratings = ratings[favorable_ratings_mask]

In [9]:
favorable_reviews_by_users = dict(
    (k, frozenset(v.values))for k, v in 
    favorable_ratings.groupby("UserID")["MovieID"])

In [10]:
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()

In [11]:
num_favorable_by_movie.sort_values(by="Favorable", ascending=False).head()

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
50,100
100,89
258,83
181,79
174,74


In [12]:
frequent_itemsets = {}

In [13]:
min_support = 50

In [14]:
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] > min_support)

In [15]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets,min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [16]:
import sys
for k in range(2, 20):
    cur_frequent_itemsets =find_frequent_itemsets(favorable_reviews_by_users,frequent_itemsets[k-1],min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
del frequent_itemsets[1]

I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11


In [17]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [18]:
print(candidate_rules[:5])

[(frozenset({7}), 1), (frozenset({1}), 7), (frozenset({50}), 1), (frozenset({1}), 50), (frozenset({1}), 56)]


In [19]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [20]:
rule_confidence = {candidate_rule:
                    (correct_counts[candidate_rule] / 
                    float(correct_counts[candidate_rule] +
                    incorrect_counts[candidate_rule]))
                    for candidate_rule in candidate_rules}

In [21]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(),key=itemgetter(1), reverse=True)

for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

    
# from operator import itemgetter
# sorted_confidence = sorted(rule_confidence.items(),
#     key=itemgetter(1), reverse=True)
# for index in range(5):
#     print("Rule #{0}".format(index + 1))
#     (premise, conclusion) = sorted_confidence[index][0]
#     print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
#     print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
#     print("")

Rule #1
Rule: If a person recommends frozenset({98, 181}) they will also recommend 50
 - Confidence:1.000

Rule #2
Rule: If a person recommends frozenset({172, 79}) they will also recommend 174
 - Confidence:1.000

Rule #3
Rule: If a person recommends frozenset({258, 172}) they will also recommend 174
 - Confidence:1.000

Rule #4
Rule: If a person recommends frozenset({1, 181, 7}) they will also recommend 50
 - Confidence:1.000

Rule #5
Rule: If a person recommends frozenset({1, 172, 7}) they will also recommend 174
 - Confidence:1.000



In [22]:
movie_name_filename = os.path.join(data_folder, "u.item")
movie_name_data = pd.read_csv(movie_name_filename, delimiter="|",header=None, encoding = "mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date",
    "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
    "Animation", "Children's", "Comedy", "Crime", "Documentary",
    "Drama", "Fantasy", "Film-Noir",
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
    "War", "Western"]

In [23]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [24]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Confidence: 1.000



In [25]:
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("UserID")["MovieID"])

In [26]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [27]:
test_confidence = {candidate_rule:
                    (correct_counts[candidate_rule] / 
                    float(correct_counts[candidate_rule] + 
                    incorrect_counts[candidate_rule]))
                    for candidate_rule in rule_confidence}
print(len(test_confidence))

15285


In [28]:
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in
    premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
   
    print("")


Rule #1
Rule: If a person recommends Silence of the Lambs, The (1991), Return of the Jedi (1983) they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.936

Rule #2
Rule: If a person recommends Empire Strikes Back, The (1980), Fugitive, The (1993) they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.876

Rule #3
Rule: If a person recommends Contact (1997), Empire Strikes Back, The (1980) they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence: 0.841

Rule #4
Rule: If a person recommends Toy Story (1995), Return of the Jedi (1983), Twelve Monkeys (1995) they will also recommend Star Wars (1977)
 - Train Confidence: 1.000
 - Test Confidence: 0.932

Rule #5
Rule: If a person recommends Toy Story (1995), Empire Strikes Back, The (1980), Twelve Monkeys (1995) they will also recommend Raiders of the Lost Ark (1981)
 - Train Confidence: 1.000
 - Test Confidence

In [45]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

In [46]:
min_confidence = 0.9

In [47]:
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

5152


In [48]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

In [49]:
top_confidence = sorted_confidence[:10]
print(top_confidence)

[((frozenset({98, 181}), 50), 1.0), ((frozenset({172, 79}), 174), 1.0), ((frozenset({258, 172}), 174), 1.0), ((frozenset({1, 181, 7}), 50), 1.0), ((frozenset({1, 172, 7}), 174), 1.0), ((frozenset({56, 1, 50}), 174), 1.0), ((frozenset({56, 1, 181}), 50), 1.0), ((frozenset({1, 98, 181}), 50), 1.0), ((frozenset({1, 172, 181}), 50), 1.0), ((frozenset({56, 1, 64}), 98), 1.0)]


In [50]:
counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for rule, confidence in top_confidence:
        premise, conclusion = rule
        if premise.issubset(reviews) and conclusion in reviews:
            counts[rule] += 1
top_support = {rule: support / len(favorable_reviews_by_users) for rule, support in counts.items()}

In [51]:
top_confidence_dict = {}
for rule, confidence in top_confidence:
    top_confidence_dict[rule] = confidence

In [52]:
# find the lift for each of these rules
conclusion_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for rule, confidence in top_confidence:
        premise, conclusion = rule
        if conclusion in reviews:
            conclusion_counts[rule] += 1
top_lift = {rule: top_confidence_dict[rule] / (count / len(favorable_reviews_by_users)) for rule, count in conclusion_counts.items()}

In [53]:
top_support_dict = {}
for rule, support in top_support.items():
    top_support_dict[rule] = support

top_lift_dict = {}
for rule, lift in top_lift.items():
    top_lift_dict[rule] = lift

In [55]:
# find the Chi-Square for each of these rules
import numpy as np
from scipy.stats import chi2_contingency
chi2 = defaultdict(int)
print(top_confidence)
for rule, confidence in top_confidence:
    premise, conclusion = rule
    print([correct_counts[rule], incorrect_counts[rule]])
    print([conclusion_counts[rule] - correct_counts[rule]])
    print(len(favorable_reviews_by_users))
    print(conclusion_counts[rule])
    observed = np.array([[correct_counts[rule], incorrect_counts[rule]],
                         [conclusion_counts[rule] - correct_counts[rule],
                          len(favorable_reviews_by_users) - conclusion_counts[rule] - incorrect_counts[rule]]])
    print(observed)
    chi2[rule] = chi2_contingency(observed)[0]

[((frozenset({98, 181}), 50), 1.0), ((frozenset({172, 79}), 174), 1.0), ((frozenset({258, 172}), 174), 1.0), ((frozenset({1, 181, 7}), 50), 1.0), ((frozenset({1, 172, 7}), 174), 1.0), ((frozenset({56, 1, 50}), 174), 1.0), ((frozenset({56, 1, 181}), 50), 1.0), ((frozenset({1, 98, 181}), 50), 1.0), ((frozenset({1, 172, 181}), 50), 1.0), ((frozenset({56, 1, 64}), 98), 1.0)]
[34, 0]
[66]
199
100
[[34  0]
 [66 99]]
[34, 0]
[40]
199
74
[[ 34   0]
 [ 40 125]]
[28, 0]
[46]
199
74
[[ 28   0]
 [ 46 125]]
[21, 0]
[79]
199
100
[[21  0]
 [79 99]]
[20, 0]
[54]
199
74
[[ 20   0]
 [ 54 125]]
[26, 0]
[48]
199
74
[[ 26   0]
 [ 48 125]]
[18, 0]
[82]
199
100
[[18  0]
 [82 99]]
[24, 0]
[76]
199
100
[[24  0]
 [76 99]]
[25, 0]
[75]
199
100
[[25  0]
 [75 99]]
[20, 0]
[50]
199
70
[[ 20   0]
 [ 50 129]]


In [56]:
# find Kulczynski for each of these rules
kulczynski = defaultdict(int)
for rule, confidence in top_confidence:
    premise, conclusion = rule
    kulczynski[rule] = (correct_counts[rule] / (correct_counts[rule] + incorrect_counts[rule])) + (correct_counts[rule] / conclusion_counts[rule])
    kulczynski[rule] /= 2

In [57]:
# find cosine for each of these rules

cosine = defaultdict(int)
for rule, confidence in top_confidence:
    premise, conclusion = rule
    cosine[rule] = correct_counts[rule] / np.sqrt(conclusion_counts[rule] * (correct_counts[rule] + incorrect_counts[rule]))

In [58]:
# find all-confidence for each of these rules
all_confidence = defaultdict(int)
for rule, confidence in top_confidence:
    premise, conclusion = rule
    all_confidence[rule] = correct_counts[rule] / (correct_counts[rule] + incorrect_counts[rule] + conclusion_counts[rule] - correct_counts[rule])

In [59]:
# find max-confidence for each of these rules
max_confidence = defaultdict(int)
for rule, confidence in top_confidence:
    premise, conclusion = rule
    max_confidence[rule] = max(correct_counts[rule] / (correct_counts[rule] + incorrect_counts[rule]),
                               correct_counts[rule] / conclusion_counts[rule])

In [60]:
# confidence and support, lift, Chi-Square, Kulczynski, cosine, all-confidence, max-confidence

import pandas as pd
df = pd.DataFrame.from_dict(top_confidence_dict, orient='index')
df = df.rename(columns={0: 'confidence'})
df['support'] = pd.Series(top_support_dict)
df['lift'] = pd.Series(top_lift_dict)
df['chi2'] = pd.Series(chi2)
df['kulczynski'] = pd.Series(kulczynski)
df['cosine'] = pd.Series(cosine)
df['all_confidence'] = pd.Series(all_confidence)
df['max_confidence'] = pd.Series(max_confidence)
df = df.sort_values(by='confidence', ascending=False)
df

Unnamed: 0,confidence,support,lift,chi2,kulczynski,cosine,all_confidence,max_confidence
"((98, 181), 50)",1.0,0.170854,1.99,38.231413,0.67,0.583095,0.34,1.0
"((172, 79), 174)",1.0,0.170854,2.689189,66.061635,0.72973,0.677834,0.459459,1.0
"((258, 172), 174)",1.0,0.140704,2.689189,51.956844,0.689189,0.615125,0.378378,1.0
"((1, 181, 7), 50)",1.0,0.105528,1.99,21.071216,0.605,0.458258,0.21,1.0
"((1, 172, 7), 174)",1.0,0.100503,2.689189,34.628345,0.635135,0.519875,0.27027,1.0
"((56, 1, 50), 174)",1.0,0.130653,2.689189,47.473457,0.675676,0.592749,0.351351,1.0
"((56, 1, 181), 50)",1.0,0.090452,1.99,17.465336,0.59,0.424264,0.18,1.0
"((1, 98, 181), 50)",1.0,0.120603,1.99,24.802982,0.62,0.489898,0.24,1.0
"((1, 172, 181), 50)",1.0,0.125628,1.99,26.075863,0.625,0.5,0.25,1.0
"((56, 1, 64), 98)",1.0,0.100503,2.842857,37.875708,0.642857,0.534522,0.285714,1.0


In [62]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule, confidence in top_confidence:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [68]:
df['test_confidence'] = pd.Series(test_confidence)
df

Unnamed: 0,confidence,support,lift,chi2,kulczynski,cosine,all_confidence,max_confidence,test_confidence
"((98, 181), 50)",1.0,0.170854,1.99,38.231413,0.67,0.583095,0.34,1.0,0.935897
"((172, 79), 174)",1.0,0.170854,2.689189,66.061635,0.72973,0.677834,0.459459,1.0,0.875912
"((258, 172), 174)",1.0,0.140704,2.689189,51.956844,0.689189,0.615125,0.378378,1.0,0.840909
"((1, 181, 7), 50)",1.0,0.105528,1.99,21.071216,0.605,0.458258,0.21,1.0,0.932432
"((1, 172, 7), 174)",1.0,0.100503,2.689189,34.628345,0.635135,0.519875,0.27027,1.0,0.903226
"((56, 1, 50), 174)",1.0,0.130653,2.689189,47.473457,0.675676,0.592749,0.351351,1.0,0.816092
"((56, 1, 181), 50)",1.0,0.090452,1.99,17.465336,0.59,0.424264,0.18,1.0,0.969697
"((1, 98, 181), 50)",1.0,0.120603,1.99,24.802982,0.62,0.489898,0.24,1.0,0.932584
"((1, 172, 181), 50)",1.0,0.125628,1.99,26.075863,0.625,0.5,0.25,1.0,0.970588
"((56, 1, 64), 98)",1.0,0.100503,2.842857,37.875708,0.642857,0.534522,0.285714,1.0,0.794118


    1.在程式裡
    2.在程式裡
    3.對印出的結果，我不是很了解為什麼confident,max confident永遠都是1，除此之外，我們可以發現test confidence都頗高。
    且各項指標基本保持正相關。
    data 來源：https://grouplens.org/datasets/movielens/
    參考網站：https://chwang12341.medium.com/machine-learning-%E9%97%9C%E8%81%AF%E5%88%86%E6%9E%90-apriori%E6%BC%94%E7%AE%97%E6%B3%95-%E8%A9%B3%E7%B4%B0%E8%A7%A3%E8%AA%AA%E5%95%A4%E9%85%92%E8%88%87%E5%B0%BF%E5%B8%83%E7%9A%84%E8%83%8C%E5%BE%8C%E5%8E%9F%E7%90%86-python%E5%AF%A6%E4%BD%9C-scikit-learn%E4%B8%80%E6%AD%A5%E4%B8%80%E6%AD%A5%E6%95%99%E5%AD%B8-76b7778f8f34