## Building a Recommendation Engine from our Search Engine base

The Recommendation system base will be used to understand the interactive system we are building for the bookworm system. This file explains the interactions between the recommendations and the books. 

In [32]:
lmiller_liked_books= ["11069349", "13537029", "26827125", "18209268", "18693763"]

In [33]:
!head data/book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [34]:
#load in by line so you don't use too much memory
csv_book_mapping = {}
with open("data/book_id_map.csv", "r") as f: 
    while True: 
        line = f.readline()
        if not line: 
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [35]:
len(csv_book_mapping) #over 2 million mapping

2360651

In [36]:
!wc -l goodreads_interactions.csv #lots of interactions 4GB

wc: goodreads_interactions.csv: open: No such file or directory


In [37]:
!ls -lh | grep goodreads_interactions

In [38]:
!head goodreads_interactions.csv

head: goodreads_interactions.csv: No such file or directory


In [39]:
overlap_users = set() #make a set so everything is unique

with open("data/goodreads_interactions.csv", 'r') as f: 
    while True: 
        line = f.readline()
        if not line: 
            break 
        user_id, csv_id, _, rating, _ = line.split(",") # _ for what i dont care about

        if user_id in overlap_users:
            continue
        try: 
            rating = int(rating)
        except ValueError: 
            continue

        book_id = csv_book_mapping[csv_id] #book id to mapping csv id

        if book_id in lmiller_liked_books and rating >=4: 
            overlap_users.add(user_id)

In [40]:
rec_lines = [] 

with open("data/goodreads_interactions.csv", 'r') as f: 
    while True: 
        line = f.readline()
        if not line: 
            break 
        user_id, csv_id, _, rating, _ = line.split(",") # _ for what i dont care about
        
        if user_id in overlap_users:
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])


In [41]:
len(overlap_users) # a lot of overlapping users

9866

In [42]:
len(rec_lines) # a lot of books, needs filtering before recommendations

6836480

In [43]:
import pandas as pd

recs_df = pd.DataFrame(rec_lines, columns = ["user_id", "book_id", "rating"])
recs_df["book_id"] = recs_df["book_id"].astype(str)

In [44]:
recs_df

Unnamed: 0,user_id,book_id,rating
0,4,250729,3
1,4,32071,3
2,4,50833,5
3,4,13496,4
4,4,13284283,4
...,...,...,...
6836475,875922,1211950,3
6836476,875922,3273,3
6836477,875922,19486412,5
6836478,875922,1466455,0


In [45]:
top_recs = recs_df["book_id"].value_counts().head(10)
top_recs = top_recs.index.values

In [46]:
top_recs

array(['18693763', '22557272', '18143977', '2767052', '11870085', '2657',
       '4671', '3', '19063', '77203'], dtype=object)

In [47]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)
books_titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,modified_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
3,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
4,378460,The Wanting of Levine,12,https://www.goodreads.com/book/show/378460.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the wanting of levine


In [48]:
#find book titles where bookid is in top recs
books_titles[books_titles["book_id"].isin(top_recs)]

Unnamed: 0,book_id,title,ratings,url,cover_image,modified_title
60363,77203,The Kite Runner,1848782,https://www.goodreads.com/book/show/77203.The_...,https://images.gr-assets.com/books/1484565687m...,the kite runner
325058,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
426763,19063,The Book Thief,1193697,https://www.goodreads.com/book/show/19063.The_...,https://images.gr-assets.com/books/1390053681m...,the book thief
530012,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
757036,18693763,Everything I Never Told You,115500,https://www.goodreads.com/book/show/18693763-e...,https://images.gr-assets.com/books/1386795198m...,everything i never told you
904234,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
1004013,18143977,All the Light We Cannot See,498685,https://www.goodreads.com/book/show/18143977-a...,https://images.gr-assets.com/books/1451445646m...,all the light we cannot see
1004474,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
1247951,22557272,The Girl on the Train,1076144,https://www.goodreads.com/book/show/22557272-t...,https://images.gr-assets.com/books/1490903702m...,the girl on the train
1252139,11870085,The Fault in Our Stars,2429317,https://www.goodreads.com/book/show/11870085-t...,https://images.gr-assets.com/books/1360206420m...,the fault in our stars


This is a very generic list, we want to see what top book from similar readers are rather than top books overall. 

In [49]:
all_recs = recs_df["book_id"].value_counts()

In [50]:
all_recs

18693763    8918
22557272    6673
18143977    6357
2767052     6279
11870085    6030
            ... 
1235617        1
18774905       1
910381         1
1202527        1
17554307       1
Name: book_id, Length: 622617, dtype: int64

In [51]:
all_recs = all_recs.to_frame().reset_index()

In [52]:
all_recs

Unnamed: 0,index,book_id
0,18693763,8918
1,22557272,6673
2,18143977,6357
3,2767052,6279
4,11870085,6030
...,...,...
622612,1235617,1
622613,18774905,1
622614,910381,1
622615,1202527,1


In [53]:
all_recs.columns = ["book_id", "book_count"]

In [54]:
all_recs

Unnamed: 0,book_id,book_count
0,18693763,8918
1,22557272,6673
2,18143977,6357
3,2767052,6279
4,11870085,6030
...,...,...
622612,1235617,1
622613,18774905,1
622614,910381,1
622615,1202527,1


In [55]:
all_recs = all_recs.merge(books_titles, how="inner", on="book_id")

In [56]:
all_recs

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,modified_title
0,18693763,8918,Everything I Never Told You,115500,https://www.goodreads.com/book/show/18693763-e...,https://images.gr-assets.com/books/1386795198m...,everything i never told you
1,22557272,6673,The Girl on the Train,1076144,https://www.goodreads.com/book/show/22557272-t...,https://images.gr-assets.com/books/1490903702m...,the girl on the train
2,18143977,6357,All the Light We Cannot See,498685,https://www.goodreads.com/book/show/18143977-a...,https://images.gr-assets.com/books/1451445646m...,all the light we cannot see
3,2767052,6279,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
4,11870085,6030,The Fault in Our Stars,2429317,https://www.goodreads.com/book/show/11870085-t...,https://images.gr-assets.com/books/1360206420m...,the fault in our stars
...,...,...,...,...,...,...,...
570646,1235617,1,The Screwtape Letters,52,https://www.goodreads.com/book/show/1235617.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the screwtape letters
570647,18774905,1,"Midnight Secretary, Vol. 7",54,https://www.goodreads.com/book/show/18774905-m...,https://images.gr-assets.com/books/1398189641m...,midnight secretary vol 7
570648,910381,1,Clarence the Copy Cat,144,https://www.goodreads.com/book/show/910381.Cla...,https://s.gr-assets.com/assets/nophoto/book/11...,clarence the copy cat
570649,1202527,1,"Snow Drop, Volume 1",438,https://www.goodreads.com/book/show/1202527.Sn...,https://images.gr-assets.com/books/1328727107m...,snow drop volume 1


In [57]:
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])

In [58]:
all_recs.sort_values("score", ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,modified_title,score
0,18693763,8918,Everything I Never Told You,115500,https://www.goodreads.com/book/show/18693763-e...,https://images.gr-assets.com/books/1386795198m...,everything i never told you,688.577697
13354,34019109,75,"Ninth House (Alex Stern, #1)",11,https://www.goodreads.com/book/show/34019109-n...,https://s.gr-assets.com/assets/nophoto/book/11...,ninth house alex stern 1,511.363636
8054,26856502,123,"Vengeful (Villains, #2)",35,https://www.goodreads.com/book/show/26856502-v...,https://s.gr-assets.com/assets/nophoto/book/11...,vengeful villains 2,432.257143
4499,28170940,213,"Lethal White (Cormoran Strike, #4)",106,https://www.goodreads.com/book/show/28170940-l...,https://s.gr-assets.com/assets/nophoto/book/11...,lethal white cormoran strike 4,428.009434
102,34273236,2804,Little Fires Everywhere,21135,https://www.goodreads.com/book/show/34273236-l...,https://images.gr-assets.com/books/1490351351m...,little fires everywhere,372.009274
6267,34927828,157,The Great Alone,70,https://www.goodreads.com/book/show/34927828-t...,https://images.gr-assets.com/books/1501852384m...,the great alone,352.128571
6196,24909347,158,"Obsidio (The Illuminae Files, #3)",82,https://www.goodreads.com/book/show/24909347-o...,https://images.gr-assets.com/books/1501704611m...,obsidio the illuminae files 3,304.439024
15370,36300633,64,The Iron Season,14,https://www.goodreads.com/book/show/36300633-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the iron season,292.571429
16237,36301023,61,"My Plain Jane (The Lady Janies, #2)",13,https://www.goodreads.com/book/show/36301023-m...,https://images.gr-assets.com/books/1507936746m...,my plain jane the lady janies 2,286.230769
567,32920226,1135,"Sing, Unburied, Sing",4592,https://www.goodreads.com/book/show/32920226-s...,https://images.gr-assets.com/books/1499340866m...,sing unburied sing,280.536803


In [59]:
popular_recs = all_recs = all_recs[all_recs["book_count"] > 75].sort_values("score", ascending=False)

In [60]:
def make_clickable(val): 
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val): 
    return '<img src="{}" width=50></img>'.format(val)

popular_recs[~popular_recs["book_id"].isin(lmiller_liked_books)].head(10).style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,modified_title,score
8054,26856502,123,"Vengeful (Villains, #2)",35,Goodreads,,vengeful villains 2,432.257143
4499,28170940,213,"Lethal White (Cormoran Strike, #4)",106,Goodreads,,lethal white cormoran strike 4,428.009434
102,34273236,2804,Little Fires Everywhere,21135,Goodreads,,little fires everywhere,372.009274
6267,34927828,157,The Great Alone,70,Goodreads,,the great alone,352.128571
6196,24909347,158,"Obsidio (The Illuminae Files, #3)",82,Goodreads,,obsidio the illuminae files 3,304.439024
567,32920226,1135,"Sing, Unburied, Sing",4592,Goodreads,,sing unburied sing,280.536803
3550,34217599,267,Future Home of the Living God,263,Goodreads,,future home of the living god,271.060837
3194,24493732,297,Solutions and Other Problems,334,Goodreads,,solutions and other problems,264.098802
525,25810500,1192,What is Not Yours is Not Yours,5470,Goodreads,,what is not yours is not yours,259.755759
176,28815371,2228,The Mothers,22346,Goodreads,,the mothers,222.141949


## Build this Recommendation Engine into a Function

Taking each part of the recommendation system and making it into a function so we can integrate it into a widget format for search queries later on. This will be integrated with our search function as well. 

In [61]:
import pandas as pd

def recommend_books(liked_books, csv_book_mapping_path, interactions_path, books_titles_path):
    """
    Recommend books based on user interactions and a list of liked books.

    Parameters:
    - liked_books (list of str): List of book IDs the user likes.
    - csv_book_mapping_path (str): Path to the book ID mapping CSV file.
    - interactions_path (str): Path to the Goodreads interactions CSV file.
    - books_titles_path (str): Path to the books titles JSON file.

    Returns:
    - pd.DataFrame: Top recommended books with scores, links, and images.
    """
    #book ID mapping
    csv_book_mapping = {}
    with open(csv_book_mapping_path, "r") as f:
        for line in f:
            csv_id, book_id = line.strip().split(",")
            csv_book_mapping[csv_id] = book_id

    #overlap users who rated liked books highly
    overlap_users = set()
    with open(interactions_path, 'r') as f:
        for line in f:
            user_id, csv_id, _, rating, _ = line.split(",")
            try:
                rating = int(rating)
            except ValueError:
                continue

            book_id = csv_book_mapping.get(csv_id)
            if book_id in liked_books and rating >= 4:
                overlap_users.add(user_id)

    #recommendations based on overlap users
    rec_lines = []
    with open(interactions_path, 'r') as f:
        for line in f:
            user_id, csv_id, _, rating, _ = line.split(",")
            if user_id in overlap_users:
                book_id = csv_book_mapping.get(csv_id)
                rec_lines.append([user_id, book_id, int(rating)])

    #DataFrame for recommendations
    recs_df = pd.DataFrame(rec_lines, columns=["user_id", "book_id", "rating"])
    recs_df["book_id"] = recs_df["book_id"].astype(str)

    #calculate top recommendations
    top_recs = recs_df["book_id"].value_counts()
    books_titles = pd.read_json(books_titles_path)
    books_titles["book_id"] = books_titles["book_id"].astype(str)
    all_recs = top_recs.to_frame().reset_index()
    all_recs.columns = ["book_id", "book_count"]

    #book details
    all_recs = all_recs.merge(books_titles, how="inner", on="book_id")
    all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])

    #recommendations
    popular_recs = all_recs[all_recs["book_count"] > 75].sort_values("score", ascending=False)

    #books already liked by the user
    popular_recs = popular_recs[~popular_recs["book_id"].isin(liked_books)]

    #links and images for better presentation
    def make_clickable(val):
        return f'<a target="_blank" href="{val}">Goodreads</a>'

    def show_image(val):
        return f'<img src="{val}" width=50></img>'

    return popular_recs.head(10).style.format({'url': make_clickable, 'cover_image': show_image})




## Using our Recommendation Engine: 

The book recommendation system uses a list of bookids. An actual recomendation system would take a book name, convert it to a book id, then provide recomendations. 

In [62]:
liked_books = ["11069349", "13537029", "26827125", "18209268", "18693763"]
recommend_books(
    liked_books, 
    "data/book_id_map.csv", 
    "data/goodreads_interactions.csv", 
    "books_titles.json")

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,modified_title,score
8054,26856502,123,"Vengeful (Villains, #2)",35,Goodreads,,vengeful villains 2,432.257143
4499,28170940,213,"Lethal White (Cormoran Strike, #4)",106,Goodreads,,lethal white cormoran strike 4,428.009434
102,34273236,2804,Little Fires Everywhere,21135,Goodreads,,little fires everywhere,372.009274
6267,34927828,157,The Great Alone,70,Goodreads,,the great alone,352.128571
6196,24909347,158,"Obsidio (The Illuminae Files, #3)",82,Goodreads,,obsidio the illuminae files 3,304.439024
567,32920226,1135,"Sing, Unburied, Sing",4592,Goodreads,,sing unburied sing,280.536803
3550,34217599,267,Future Home of the Living God,263,Goodreads,,future home of the living god,271.060837
3194,24493732,297,Solutions and Other Problems,334,Goodreads,,solutions and other problems,264.098802
525,25810500,1192,What is Not Yours is Not Yours,5470,Goodreads,,what is not yours is not yours,259.755759
176,28815371,2228,The Mothers,22346,Goodreads,,the mothers,222.141949


In [64]:
book_ids = ["11069349", "13537029", "26827125", "18209268", "18693763"]

# Load the titles from JSON
titles = pd.read_json("books_titles.json")

# Filter the titles DataFrame for the given book IDs
matched_books = titles[titles["book_id"].isin(book_ids)]

# Display the results
print(matched_books[["book_id", "title"]])

Empty DataFrame
Columns: [book_id, title]
Index: []


In [68]:
titles[titles["book_id"] == "13537029"]

Unnamed: 0,book_id,title,ratings,url,cover_image,modified_title
