# Part 3: ML Recommender System

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the cleaned book dataset.
df = pd.read_csv("./cleaned_datasets/books_clean.csv")
# Only the last expression of a cell is rendered, so the shape has to be printed
# explicitly; a bare `df.shape` in the middle of the cell is silently discarded.
print(df.shape)
df.head(3)

Unnamed: 0,author,avg_rating,genres,language,num_pages,num_ratings,num_reviews,title,url
0,jon krakauer,4.0,environment travel survival biography memoir a...,English,215.0,983231.0,24367.0,Into the Wild,https://www.goodreads.com/book/show/1845.Into_...
1,bell hooks,4.14,social movement politic sociology race women s...,English,123.0,18885.0,1586.0,Feminism Is for Everybody: Passionate Politics,https://www.goodreads.com/book/show/168484.Fem...
2,mark bowden,4.28,war africa north american cultural politic mil...,English,386.0,59451.0,1727.0,Black Hawk Down: A Story of Modern War,https://www.goodreads.com/book/show/55403.Blac...


## Preprocessing
We first create some features to train a model.

In [3]:
# Feature 1: Group the ratings
# Bucket the numeric average rating into five labels:
#   x <= 1 -> 'very low', 1 < x <= 2 -> 'low', 2 < x <= 3 -> 'neutral',
#   3 < x <= 4 -> 'high', x > 4 -> 'very high'
ratings = ['very low','low','neutral','high','very high']
rating_bin_edges = [float('-inf'), 1, 2, 3, 4, float('inf')]

# pd.cut uses right-closed intervals by default, which matches the `<=` bounds above.
# A missing rating stays NaN instead of falling through every comparison and being
# labelled 'very high'.
# .astype(object) turns the categorical result back into plain labels.
# NOTE: this overwrites the numeric column in place, so reload df before re-running the cell.
df['avg_rating'] = pd.cut(df['avg_rating'], bins=rating_bin_edges, labels=ratings).astype(object)

In [4]:
# Feature 2: Group the languages
# The five languages listed here are kept; every other value becomes 'others'.
languages = ['English','German','Spanish','French','Dutch']

df['language'] = df['language'].where(df['language'].isin(languages), 'others')

In [5]:
# Feature 3: Group average number of reviews and ratings

# Using MinMaxScaler and a ball-tree nearest-neighbours model
Features used:
- avg_rating
- language
- num_reviews
- num_pages

**Variables have been renamed**

# https://thecleverprogrammer.com/2021/01/17/book-recommendation-system/

In [6]:
# One-hot encode the two categorical columns: one indicator column per distinct value.
rating_df, language_df = (pd.get_dummies(df[column]) for column in ('avg_rating', 'language'))

In [7]:
# Assemble the model inputs side by side: rating indicators, language indicators,
# then the two numeric columns.
features = pd.concat(
    [rating_df, language_df, df[['num_reviews', 'num_pages']]],
    axis='columns',
)

In [8]:
# Rescale every feature column to the scaler's default [0, 1] range so the raw
# review and page counts are on the same scale as the 0/1 indicator columns.
mms = MinMaxScaler()
mms.fit(features)
# `features` is rebound from the DataFrame to the scaler's output.
features = mms.transform(features)

In [9]:
# Nearest-neighbour search backed by a ball tree; each query returns 6 neighbours.
# NOTE(review): 6 looks like "the query book itself + 5 recommendations" — confirm.
ball_tree = neighbors.NearestNeighbors(n_neighbors=6, algorithm='ball_tree')

In [10]:
# Build the ball tree over the feature matrix.
ball_tree.fit(features)

NearestNeighbors(algorithm='ball_tree', n_neighbors=6)

In [11]:
# Query the neighbours of every row of the feature matrix in one call.
# dist[i] holds the distances and id_list[i] the row positions of the neighbours of row i.
dist,id_list = ball_tree.kneighbors(features)

In [12]:
def ball_tree_recommender(book,df,id_list,col='title'):
    '''
    Return the titles of the nearest neighbours of ``book``.

    book = title of the book user would use to search for recommendation
    df = original dataframe used to train the model
    id_list = neighbour ids generated from the model; entry ``p`` holds the
              0-based row positions of the neighbours of the book stored in
              row ``p`` of ``df``
    col = column name of the book's titles in the dataframe used

    Returns a list with one ``col`` value per neighbour, in the order given
    by ``id_list``. When several rows share the title, the first one is used.

    Raises IndexError when ``book`` does not occur in ``df[col]``.
    '''
    # Work with row positions, not index labels: id_list is positional, so
    # label-based lookups only happened to work with a default 0..n-1 index.
    match_positions = (df[col] == book).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"{book!r} not found in column {col!r}")
    position = match_positions[0]
    # One positional lookup on the title column instead of materialising a
    # full row per neighbour.
    return df[col].iloc[list(id_list[position])].tolist()

In [13]:
# Example query; in the output shown below, the first entry is the query book itself.
# NOTE(review): a later cell defines a function that is also named `recommendations`;
# running that cell replaces this list.
recommendations = ball_tree_recommender("Becoming",df=df,id_list=id_list)
recommendations

['Becoming',
 'Cinder',
 'Water for Elephants',
 'The Maze Runner',
 'The Cruel Prince',
 'A Thousand Splendid Suns']

# Sample: fetching each recommended book's image URL to show its cover

In [None]:
#Recommending top 5 similar books

def recommendations(title):
    """Plot the cover images of the 5 books most similar to ``title``.

    Similarity is the cosine similarity between the row of the global
    ``tfidf_vectors`` that belongs to ``title`` and every other row.
    Nothing is returned; one matplotlib figure is created per recommendation.

    Raises KeyError when ``title`` is not in ``df['title']``. Errors from the
    image download (timeout, HTTP error status) propagate to the caller.

    NOTE(review): ``requests``, ``Image`` (presumably PIL.Image), ``plt``
    (presumably matplotlib.pyplot) and ``tfidf_vectors`` are not imported or
    defined in the visible part of this notebook, and the displayed columns
    of ``df`` do not include ``image_link`` — confirm before running.
    """
    from io import BytesIO
    from sklearn.metrics.pairwise import cosine_similarity

    # taking the title and book image link and store in new data frame called books
    books = df[['title', 'image_link']]

    # Reverse mapping title -> row position. Positions (not index labels) are
    # needed because they address rows of tfidf_vectors and are used with .iloc.
    indices = pd.Series(range(len(df)), index=df['title'])
    # Keep the first row of a repeated title so the lookup yields one position
    # (drop_duplicates() would compare the values, which are already unique).
    indices = indices[~indices.index.duplicated(keep='first')]
    idx = int(indices[title])

    # Only the similarity row of the requested book is needed; computing the
    # full n x n matrix on every call is what exhausts memory.
    row_scores = cosine_similarity(tfidf_vectors[idx:idx + 1], tfidf_vectors).ravel()

    sim_scores = sorted(enumerate(row_scores), key=lambda pair: pair[1], reverse=True)
    # Drop the query book by position instead of assuming it is ranked first
    # (a book with identical text can tie with it).
    book_indices = [pos for pos, _ in sim_scores if pos != idx][:5]

    recommend = books.iloc[book_indices]
    for _, row in recommend.iterrows():
        # A timeout keeps one unreachable image host from hanging the cell.
        response = requests.get(row['image_link'], timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        plt.figure()
        plt.imshow(img)
        plt.title(row['title'])

# Not enough RAM to run this!

# https://analyticsindiamag.com/how-to-build-a-content-based-movie-recommendation-system-in-python/

Now we will transform the `genres` column into vector form so that we can compute similarity. We use `TfidfVectorizer` for this.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Word-level TF-IDF over unigrams and bigrams of the genre text, with English stop words removed.
# min_df=1 keeps every term, exactly as min_df=0 did, but is accepted by current
# scikit-learn releases, which reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words = 'english')

# Treat a missing genre string as an empty document so vectorising does not fail on NaN.
matrix = tf.fit_transform(df['genres'].fillna(''))

Now we are ready to compute cosine similarity
to find which books have similar content,
based on the `genres` column of the dataset.

In [20]:
#from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix; with a
# single argument the matrix is compared against itself.
# The result is a dense (n_rows x n_rows) array, so memory grows quadratically
# with the number of books.
cosine_similarities = cosine_similarity(matrix)

After this, we build a lookup from each book title to its row and define a recommendation function that ranks the other books by cosine similarity and returns the most similar titles.

In [None]:
movie_title = df['title']

# Map each title to its row position (not its index label): the position is
# used below to slice `matrix` and to select titles with .iloc.
indices = pd.Series(range(len(df)), index=df['title'])
# Keep the first row of a repeated title so a lookup always yields one position.
indices = indices[~indices.index.duplicated(keep='first')]

def movie_recommend(original_title, top_n=30):
    """Return the titles most similar to ``original_title``.

    Similarity is the cosine similarity between the row of ``matrix`` that
    belongs to ``original_title`` and every row of ``matrix``.

    original_title -- a value from ``df['title']``
    top_n          -- number of titles to return (default 30)

    Returns a Series of titles ordered from most to least similar, with the
    query row itself excluded. Raises KeyError for an unknown title.
    """
    idx = int(indices[original_title])

    # Compute just the one similarity row that is needed, so the full
    # n x n similarity matrix never has to fit in memory.
    sim_row = cosine_similarity(matrix[idx:idx + 1], matrix).ravel()

    # A stable sort on the negated scores gives descending similarity with ties
    # kept in row order.
    ranked = (-sim_row).argsort(kind='stable')

    # Remove the query row by position rather than assuming it is ranked first.
    movie_indices = ranked[ranked != idx][:top_n]

    return movie_title.iloc[movie_indices]

Now we will compute the top 30 recommendations for one book and check the results. Use the code below to do this.

In [None]:
# Example: list the titles ranked most similar to this book.
movie_recommend('Feminism Is for Everybody: Passionate Politics')