# Assignment 2
## Question 1
#### 1.Take a look at the book crossing dataset at http://www2.informatik.uni-freiburg.de/~cziegler/BX/
#### 2.Download the csv files 
#### 3.The format of the book ratings file is User,ISBN,Book-Rating
#### 4.The books file has more descriptive information
#### 5.Run the Apriori algorithm on this dataset. What association rules do you find? 

In [1]:
#importing required packages
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sys

In [2]:
data_folder = os.path.join(os.path.expanduser("~"), "Data", "BX-CSV-Dump")
ratings_filename = os.path.join(data_folder, "BX-Book-Ratings.csv")
#show(ratings_filename)
names = ['UserID','ISBN','Rating']
all_ratings = pd.read_csv(ratings_filename,sep = ';',header = 0,names = names,encoding="ISO-8859-1")
all_ratings[:5]

Unnamed: 0,UserID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
# Not all ratings are favourable! Our goal is "other recommended books", so we only want favourable ratings
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[15:20]

Unnamed: 0,UserID,ISBN,Rating,Favorable
15,276746,786014512,0,False
16,276747,60517794,9,True
17,276747,451192001,0,False
18,276747,609801279,0,False
19,276747,671537458,9,True


In [4]:
all_ratings[all_ratings["UserID"] == 276747][:5]

Unnamed: 0,UserID,ISBN,Rating,Favorable
16,276747,60517794,9,True
17,276747,451192001,0,False
18,276747,609801279,0,False
19,276747,671537458,9,True
20,276747,679776818,8,True


In [5]:
# We start by creating a dataset of each user's favourable reviews
favorable_ratings = all_ratings[all_ratings["Favorable"]]
favorable_ratings[:5]

Unnamed: 0,UserID,ISBN,Rating,Favorable
1,276726,0155061224,5,True
4,276729,0521795028,6,True
6,276736,3257224281,8,True
7,276737,0600570967,6,True
8,276744,038550120X,7,True


In [6]:
# We are only interested in the reviewers who have more than one review
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["ISBN"])
len(favorable_reviews_by_users)

76328

In [7]:
# Find out how many movies have favourable ratings
num_favorable_by_book = all_ratings[["ISBN", "Favorable"]].groupby("ISBN").sum()
num_favorable_by_book.sort("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
ISBN,Unnamed: 1_level_1
316666343,696.0
385504209,478.0
312195516,377.0
971880107,338.0
679781587,331.0


In [8]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_book in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_book,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((book_id,)), row["Favorable"])
                                for book_id, row in num_favorable_by_book.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} books with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("Found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

There are 490 books with more than 50 favorable reviews
Found 92 frequent itemsets of length 2
Found 45 frequent itemsets of length 3
Found 16 frequent itemsets of length 4
Found 2 frequent itemsets of length 5
Did not find any frequent itemsets of length 6


In [9]:
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 155 frequent itemsets


In [10]:
# Now we create the association rules. First, they are candidates until the confidence has been tested
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

There are 393 candidate rules


In [11]:
print(candidate_rules[:7])

[(frozenset({'044022165X'}), '044021145X'), (frozenset({'044021145X'}), '044022165X'), (frozenset({'0439139597'}), '0439064864'), (frozenset({'0439064864'}), '0439139597'), (frozenset({'0515128546'}), '0515128554'), (frozenset({'0515128554'}), '0515128546'), (frozenset({'0312966970'}), '0312971346')]


In [12]:
# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + 
                                                                          incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

# Choose only rules above a minimum confidence level
min_confidence = 0.9

In [13]:
# Filter out the rules with poor confidence
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

22


In [14]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends frozenset({'0590353403', '043935806X'}) they will also recommend 0439064864
 - Confidence: 0.974

Rule #2
Rule: If a person recommends frozenset({'0439136350', '0590353403', '043935806X'}) they will also recommend 0439064864
 - Confidence: 0.972

Rule #3
Rule: If a person recommends frozenset({'0590353403', '043935806X', '0439139597'}) they will also recommend 0439064864
 - Confidence: 0.964

Rule #4
Rule: If a person recommends frozenset({'0590353403', '043935806X', '0439139597'}) they will also recommend 0439136350
 - Confidence: 0.964

Rule #5
Rule: If a person recommends frozenset({'0439064864', '0590353403', '043935806X', '0439139597'}) they will also recommend 0439136350
 - Confidence: 0.963



In [15]:
#Finding the name of the book from ISBN number
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
book_name_filename = os.path.join(data_folder, "BX-Books.csv")
book_name_data = pd.read_csv(book_name_filename, sep=';', delimiter = ';',encoding='ISO-8859-1',warn_bad_lines  = False, error_bad_lines  = False)
print (book_name_data.shape)

(271360, 8)


In [16]:
book_name_data[0:5]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [17]:
def get_book_name(isbn):
    title_object = book_name_data[book_name_data["ISBN"] == isbn]["Book-Title"]
    title = title_object.values[0]
    return title

In [18]:
get_book_name('0374157065')

'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It'

In [19]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_book_name(idx) for idx in premise)
    conclusion_name = get_book_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Confidence: 0.974

Rule #2
Rule: If a person recommends Harry Potter and the Prisoner of Azkaban (Book 3), Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Confidence: 0.972

Rule #3
Rule: If a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Goblet of Fire (Book 4) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Confidence: 0.964

Rule #4
Rule: If a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Goblet of Fire (Book 4) they will also recommend 

## Question 2
#### 1.Take a look at the users.csv file.  Note some of the information in this file: age and location
#### 2.Investigate if it is possible to change the code that you created to group by age and also by location.In other words, users in NYC who buy a particular book are likely to buy another. For age, you can use age bands such as 0-5, 6-9, 10-12, 13-17, 18-25, 26-35, 36-55,55-70,70+
#### 3.Submit your code as well as a writeup of what you found out about the dataset

In [20]:
data_folder = os.path.join(os.path.expanduser("~"), "Data", "BX-CSV-Dump")
file_1 = os.path.join(data_folder, "BX-Book-Ratings.csv")
file_2 = os.path.join(data_folder, "BX-Users.csv")

In [21]:
all_ratings=pd.merge(pd.read_csv(file_1,sep = ';',header = 0,names=['UserID', 'ISBN', 'Rating'],encoding="ISO-8859-1"),
                  pd.read_csv(file_2,sep = ';',header = 0,names = ['UserID', 'Location', 'Age'],encoding="ISO-8859-1"), 
                     on='UserID')
all_ratings[:5]

Unnamed: 0,UserID,ISBN,Rating,Location,Age
0,276725,034545104X,0,"tyler, texas, usa",
1,276726,0155061224,5,"seattle, washington, usa",
2,276727,0446520802,0,"h, new south wales, australia",16.0
3,276729,052165615X,3,"rijeka, n/a, croatia",16.0
4,276729,0521795028,6,"rijeka, n/a, croatia",16.0


In [22]:
#Adding Age Range to the data
all_ratings['Age Range']=pd.cut(all_ratings['Age'],[0,5,9,12,17,25,35,55,70])
all_ratings[4:10]

Unnamed: 0,UserID,ISBN,Rating,Location,Age,Age Range
4,276729,0521795028,6,"rijeka, n/a, croatia",16.0,"(12, 17]"
5,276733,2080674722,0,"paris, n/a, france",37.0,"(35, 55]"
6,276736,3257224281,8,"salzburg, salzburg, austria",,
7,276737,0600570967,6,"sydney, new south wales, australia",14.0,"(12, 17]"
8,276744,038550120X,7,"torrance, california, usa",,
9,276745,342310538,10,"berlin, berlin, germany",27.0,"(25, 35]"


In [23]:
#Checking if the Ages are allocated properly to Age Range
print(pd.crosstab(all_ratings['Age Range'],all_ratings['Age']))

Age        0.0    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    \
Age Range                                                                  
(0, 5]         0    569    341    193    586    209      0      0      0   
(5, 9]         0      0      0      0      0      0     18    217    574   
(9, 12]        0      0      0      0      0      0      0      0      0   
(12, 17]       0      0      0      0      0      0      0      0      0   
(17, 25]       0      0      0      0      0      0      0      0      0   
(25, 35]       0      0      0      0      0      0      0      0      0   
(35, 55]       0      0      0      0      0      0      0      0      0   
(55, 70]       0      0      0      0      0      0      0      0      0   

Age        9.0    ...    212.0  219.0  220.0  223.0  226.0  228.0  229.0  \
Age Range         ...                                                      
(0, 5]         0  ...        0      0      0      0      0      0      0   
(5, 9]     

In [24]:
# Not all ratings are favourable! Our goal is "other recommended books", so we only want favourable ratings
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[15:20]

Unnamed: 0,UserID,ISBN,Rating,Location,Age,Age Range,Favorable
15,276746,786014512,0,"fort worth, ,",,,False
16,276747,60517794,9,"iowa city, iowa, usa",25.0,"(17, 25]",True
17,276747,451192001,0,"iowa city, iowa, usa",25.0,"(17, 25]",False
18,276747,609801279,0,"iowa city, iowa, usa",25.0,"(17, 25]",False
19,276747,671537458,9,"iowa city, iowa, usa",25.0,"(17, 25]",True


In [25]:
all_ratings[all_ratings["UserID"] == 276747][:5]

Unnamed: 0,UserID,ISBN,Rating,Location,Age,Age Range,Favorable
16,276747,60517794,9,"iowa city, iowa, usa",25.0,"(17, 25]",True
17,276747,451192001,0,"iowa city, iowa, usa",25.0,"(17, 25]",False
18,276747,609801279,0,"iowa city, iowa, usa",25.0,"(17, 25]",False
19,276747,671537458,9,"iowa city, iowa, usa",25.0,"(17, 25]",True
20,276747,679776818,8,"iowa city, iowa, usa",25.0,"(17, 25]",True


In [26]:
# We start by creating a dataset of each user's favourable reviews
favorable_ratings = all_ratings[all_ratings["Favorable"]]
favorable_ratings[50:55]

Unnamed: 0,UserID,ISBN,Rating,Location,Age,Age Range,Favorable
128,276814,586207414,7,"bundaberg, queensland, australia",39.0,"(35, 55]",True
129,276814,812571029,9,"bundaberg, queensland, australia",39.0,"(35, 55]",True
132,276820,140260498,9,"scarbrough, ontario, canada",19.0,"(17, 25]",True
133,276822,60096195,10,"calgary, alberta, canada",11.0,"(9, 12]",True
134,276822,141310340,9,"calgary, alberta, canada",11.0,"(9, 12]",True


In [27]:
# We are only interested in the reviewers who have more than one review
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["ISBN"])
len(favorable_reviews_by_users)

76328

### Grouping by Age

In [28]:
# Find out how many movies have favourable ratings by Age
num_favorable_by_book = all_ratings[["Favorable","Age Range"]].groupby('Age Range').sum()
num_favorable_by_book.sort("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
Age Range,Unnamed: 1_level_1
"(35, 55]",116709.0
"(25, 35]",102995.0
"(17, 25]",43955.0
"(55, 70]",19832.0
"(12, 17]",9070.0


In [29]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_book in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_book,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((book_id,)), row["Favorable"])
                                for book_id, row in num_favorable_by_book.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} books with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("Found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

There are 8 books with more than 50 favorable reviews
Did not find any frequent itemsets of length 2
Found a total of 0 frequent itemsets


### Grouping by Location

In [30]:
# Find out how many movies have favourable ratings by Location
num_favorable_by_book = all_ratings[["Favorable","Location"]].groupby('Location').sum()
num_favorable_by_book.sort("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
Location,Unnamed: 1_level_1
"n/a, n/a, n/a",8068.0
"toronto, ontario, canada",6370.0
"morrow, georgia, usa",5805.0
"london, england, united kingdom",4176.0
"ottawa, ontario, canada",3099.0


In [31]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_book in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_book,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((book_id,)), row["Favorable"])
                                for book_id, row in num_favorable_by_book.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} books with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("Found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

There are 1449 books with more than 50 favorable reviews
Did not find any frequent itemsets of length 2
Found a total of 0 frequent itemsets


#### When we try to change the code to group by Age and by Location, it is found that there are no frequent itemsets.So it is not possible to change our code as we will not be able to find any rules about what a user will recommend, thus as our algorithm will not work.