## Collaborative Filtering Recommendation System

### Personalized Books Recommendation System using Correlation Measure

Load required packages

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display,Markdown

Load the required datasets and remove the unnecessary attributes

In [2]:
# Read the book catalogue (';'-separated, latin-1 encoded) and keep only the
# columns that are not dropped below (ISBN and Book-Title).
# `on_bad_lines='warn'` skips rows with the wrong number of fields and reports
# them; it replaces `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0 (where it raises a TypeError).
books = pd.read_csv(
    'F:/RS/BX-Books/BX-Books.csv',
    sep=';',
    on_bad_lines='warn',
    encoding='latin-1',
).drop(
    # Chained, non-inplace drop: the result is assigned once to `books`.
    columns=[
        "Book-Author",
        "Year-Of-Publication",
        "Publisher",
        "Image-URL-S",
        "Image-URL-M",
        "Image-URL-L",
    ]
)
books.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISBN,Book-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [3]:
# Read the user ratings (';'-separated, latin-1 encoded).
# `on_bad_lines='warn'` skips malformed rows and reports them; it replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where it raises a TypeError).
rating = pd.read_csv(
    'F:/RS/BX-Books/BX-Book-Ratings.csv',
    sep=';',
    on_bad_lines='warn',
    encoding='latin-1',
)
rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


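The required CSV files are loaded above. (Checking for null values is mentioned here, but no visible cell performs it — TODO: add an explicit check such as `books.isnull().sum()` and `rating.isnull().sum()`.)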

#### Data Preparation

Merging the two datasets: books & rating

In [4]:
# Join the catalogue with the ratings on their shared ISBN column so that each
# rating row also carries the book title. `how='inner'` is pandas' default join
# type, spelled out here for readability.
book_rating = books.merge(rating, how='inner', on='ISBN')
book_rating.head()

Unnamed: 0,ISBN,Book-Title,User-ID,Book-Rating
0,195153448,Classical Mythology,2,0
1,2005018,Clara Callan,8,5
2,2005018,Clara Callan,11400,0
3,2005018,Clara Callan,11676,8
4,2005018,Clara Callan,41385,0


Data Filtering

In [5]:
# Subsetting the data to avoid getting a memory error.
# NOTE(review): the positional slice starts at 1, so the first merged row is
# left out — confirm whether a start of 0 was intended.
book_rating_filtered = book_rating.iloc[1:229974]

In [6]:
# Build the user x title rating matrix: one row per User-ID, one column per
# Book-Title. Ratings that share the same (user, title) pair are summed, and
# pairs without any rating are filled with 0.
# The aggregation is given as the string alias 'sum' instead of the callable
# np.sum: the result is the same, but passing the NumPy callable raises a
# FutureWarning on pandas >= 2.1.
p_table = (
    book_rating_filtered
    .pivot_table(
        values='Book-Rating',
        index=['User-ID'],
        columns=['Book-Title'],
        aggfunc='sum',
    )
    .fillna(0)
)
p_table.head()

Book-Title,'48,01-01-00: A Novel of the Millennium,100 Best-Loved Poems (Dover Thrift Editions),"100 Great Fantasy Short, Short Stories",101 Bright Ideas: Esl Activities for All Ages,101 Dalmatians,101 Dinosaur Jokes,101 Telephone Jokes,11th Hour,13 99 Euros,...,Zone Perfect Meals in Minutes: 150 Fast and Simple Healthy Recipes from the Bestselling Authorof the Zone and Mastering the Zone,Zone Ã?Â©rogÃ?Â¨ne,Zoot Suit and Other Plays,Zope Bible,"\O\"" Is for Outlaw""","\The Twilight of the Idols (Classics S.)""",de Parte de La Princesa Muerta,iI Paradiso Degli Orchi,"metamorfosis, La (ClÃ¡sicos selecciÃ³n series)",stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Dynamic function for providing recommendations based on item-item similarity (correlation measure)

In [7]:
def corr_rs(df, table, a1, a2, item, min_ratings=600, top_n=10):
    """Display the items whose rating pattern correlates best with `item`.

    Every column of `table` is correlated with the column of `item`. Items
    with an undefined correlation are discarded, as is `item` itself. The
    remaining candidates are kept only if they have more than `min_ratings`
    rating rows in `df`, and the `top_n` highest correlations are displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format ratings; must contain the columns named by `a1` and `a2`.
    table : pandas.DataFrame
        Rating matrix with one column per item (e.g. the pivot table built
        from `df`). Its column labels are matched against the values of
        `df[a1]` when the rating counts are joined.
    a1 : str
        Column of `df` to group by (the item identifier, e.g. "Book-Title").
    a2 : str
        Column of `df` whose non-null entries are counted per group
        (e.g. "Book-Rating").
    item : hashable
        Column label in `table` to find recommendations for.
    min_ratings : int, optional
        A candidate must have strictly more than this many ratings.
        Defaults to 600, the previously hard-coded threshold.
    top_n : int, optional
        Maximum number of recommendations shown. Defaults to 10, the
        previously hard-coded limit.

    Returns
    -------
    None
        The result is rendered with `display`; nothing is returned.

    Raises
    ------
    KeyError
        If `item` is not a column of `table`.
    """
    item_ratings = table[item]

    # Correlation of each item column with the selected item; items for which
    # the correlation is undefined (NaN) are removed.
    correlations = table.corrwith(item_ratings).dropna().to_frame("Correlation")

    # The selected item correlates perfectly with itself and would otherwise
    # top its own recommendation list. `errors="ignore"` covers the case where
    # it was already removed by dropna().
    correlations = correlations.drop(index=item, errors="ignore")

    # Number of ratings per item, used as a popularity filter so that
    # correlations backed by very few ratings are not recommended.
    ratings_count = df.groupby(a1)[a2].count().rename("ratings_count")
    candidates = correlations.join(ratings_count)

    recommended = (
        candidates[candidates["ratings_count"] > min_ratings]
        .sort_values("Correlation", ascending=False)
        .head(top_n)
    )

    if recommended.empty:
        display(Markdown("**No Books Recommended for {}**".format(item)))
    else:
        display(Markdown('**Books Recommended for {}**'.format(item)))
        # Rich display instead of print() so the table renders as HTML.
        display(recommended)

##### Recommendations

In [8]:
# Recommendations for the title "stardust", using the full merged ratings for
# the per-title rating counts and the pivot table for the correlations.
corr_rs(
    df=book_rating,
    table=p_table,
    a1="Book-Title",
    a2="Book-Rating",
    item="stardust",
)

**Books Recommended for stardust**

                            Correlation  ratings_count
Book-Title                                            
The Da Vinci Code              0.053898            898
Life of Pi                     0.035829            664
Bridget Jones's Diary          0.030935            815
The Testament                  0.030238            617
The Summons                    0.021349            655
The Nanny Diaries: A Novel     0.016714            828
Angels &amp; Demons            0.015463            670
The Secret Life of Bees        0.013191            774
Snow Falling on Cedars         0.012686            662
A Painted House                0.010276            838


In [9]:
# Second example: the same lookup for the title "11th Hour".
corr_rs(
    df=book_rating,
    table=p_table,
    a1="Book-Title",
    a2="Book-Rating",
    item="11th Hour",
)

**No Books Recommended for 11th Hour**