# Step 7: Experiment with Various Models

__1. Load, Clean, and Wrangle Data__

In [18]:
import os

# Data ETL
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import glob


# Plots/visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Vector computations 
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Deep learning
import tensorflow as tf
import keras
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# Stacking sparse matrices
from scipy.sparse import vstack


In [29]:
# The following loading code was copied from user Oli Hill on Kaggle, who themself copied from user Sandy1112.
# Filepaths have been changed

book_rating_raw = pd.DataFrame()
df_list = []
for file in glob.glob("./data/book*.csv"):
    df_list.append(pd.read_csv(file))
           
book_rating_raw = pd.concat(df_list).set_index('Id')

        
user_rating_raw = pd.DataFrame()
df_list = []
for file in glob.glob("./data/user_rating*.csv"):
    df_list.append(pd.read_csv(file))

user_rating_raw = pd.concat(df_list, ignore_index=True)

# This is directly copied from Oli Hill, who uses sklearn's ordinal encoder to convert ratings to a number scale from 0-5,
# 0 represents no rating, users who only have 0 ratings will be dropped.
order = [["This user doesn't have any rating",'did not like it', 'it was ok', 'liked it', 'really liked it', 'it was amazing']]

encoder = OrdinalEncoder(categories=order)
user_rating_raw['NumberRating'] = encoder.fit_transform(user_rating[['Rating']])

# Drop users with no ratings
user_rating_count = user_rating_raw.groupby('Name')['ID'].count()
user_rating_count = user_rating_count.to_frame()
cols = user_rating_count.columns.tolist()
cols[0] = 'Number of reviews'
user_rating_count.columns = cols


user_rating_count = user_rating_count.sort_values('Number of reviews', ascending =False)

user_rating_raw = user_rating_raw[user_rating_raw['NumberRating'] != 0]


user_rating_count2 = user_rating_raw.groupby('Rating')['ID'].count()
user_rating_count2 = user_rating_count2.to_frame()
cols = user_rating_count2.columns.tolist()
cols[0] = 'Number of reviews'
user_rating_count2.columns = cols

print(user_rating_count2)

user_rating_count2 = user_rating_raw.groupby('NumberRating')['ID'].count()
user_rating_count2 = user_rating_count2.to_frame()
cols = user_rating_count2.columns.tolist()
cols[0] = 'Number of reviews'
user_rating_count2.columns = cols

print(user_rating_count2)

                 Number of reviews
Rating                            
did not like it               7811
it was amazing               92354
it was ok                    28811
liked it                     96047
really liked it             132808
              Number of reviews
NumberRating                   
1.0                        7811
2.0                       28811
3.0                       96047
4.0                      132808
5.0                       92354


In [None]:
# The following taken from the recommendation system mini-project

# Find empty rows to slice dataframe for each book
tmp_books = user_ratings_raw[user_ratings_raw['Rating'].isna()]['Id'].reset_index()
book_indices = [[index, int(book[:-1])] for index, book in tmp_books.values]

# Shift the book_indices by one to get start and endpoints of all books
shifted_book_indices = deque(book_indices)
shifted_book_indices.rotate(-1)

# Gather all dataframes
user_data = []

# Iterate over all books
for [df_id_1, book_id], [df_id_2, next_book_id] in zip(book_indices, shifted_book_indices):
    
    # Check if it is the last book in the file
    if df_id_1<df_id_2:
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df_raw.loc[df_id_1+1:].copy()
        
    # Create book_id column
    tmp_df['Book'] = book_id
    
    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all dataframes
df = pd.concat(user_data)
del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(10)


In [24]:
book_rating_raw.head()

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,...,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3,Description,Count of text reviews,PagesNumber
0,1,Harry Potter and the Half-Blood Prince (Harry ...,1:9896,652.0,4:556485,total:2298124,16,9,Scholastic Inc.,28062,...,eng,J.K. Rowling,4.57,2:25317,5:1546466,,3:159960,,,
1,2,Harry Potter and the Order of the Phoenix (Har...,1:12455,870.0,4:604283,total:2358637,1,9,Scholastic Inc.,29770,...,eng,J.K. Rowling,4.5,2:37005,5:1493113,0439358078,3:211781,,,
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,1:108202,309.0,4:1513191,total:6587388,1,11,Scholastic Inc,75911,...,eng,J.K. Rowling,4.47,2:130310,5:4268227,,3:567458,,,
3,4,Harry Potter and the Chamber of Secrets (Harry...,1:11896,352.0,4:706082,total:2560657,1,11,Scholastic,244,...,eng,J.K. Rowling,4.42,2:49353,5:1504505,0439554896,3:288821,,,
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,1:10128,435.0,4:630534,total:2610317,1,5,Scholastic Inc.,37093,...,eng,J.K. Rowling,4.57,2:24849,5:1749958,043965548X,3:194848,,,


In [21]:
book_rating_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850310 entries, 0 to 1850309
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Id                     int64  
 1   Name                   object 
 2   RatingDist1            object 
 3   pagesNumber            float64
 4   RatingDist4            object 
 5   RatingDistTotal        object 
 6   PublishMonth           int64  
 7   PublishDay             int64  
 8   Publisher              object 
 9   CountsOfReview         int64  
 10  PublishYear            int64  
 11  Language               object 
 12  Authors                object 
 13  Rating                 float64
 14  RatingDist2            object 
 15  RatingDist5            object 
 16  ISBN                   object 
 17  RatingDist3            object 
 18  Description            object 
 19  Count of text reviews  float64
 20  PagesNumber            float64
dtypes: float64(4), int64(5), object(12)
memory usage: 296.

In [22]:
user_rating.head()

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it


In [23]:
user_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362596 entries, 0 to 362595
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      362596 non-null  int64 
 1   Name    362596 non-null  object
 2   Rating  362596 non-null  object
dtypes: int64(1), object(2)
memory usage: 8.3+ MB


__Filtering__

In [17]:
# Filter sparsely rated books
min_book_ratings = 50
book_ratings_filter = (book_rating['Id'].value_counts()>min_book_ratings)
book_ratings_filter = book_ratings_filter[book_ratings_filter].index.tolist()
book_ratings_filter

[]

In [10]:
# Filter ratings to contain only ratings by users who have rated more than X books
# Going to go with 50 books this time around simply for the sake of retaining more data

dfRatingsLessUsers = dfRatings['User-ID'].value_counts() > 50
y = x[x].index  #User-IDs
print(y.shape)
dfRatings = dfRatings[dfRatings['User-ID'].isin(y)]
dfRatings.info()

(1295,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 173173 entries, 1202 to 1149637
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      173173 non-null  int64 
 1   ISBN         173173 non-null  object
 2   Book-Rating  173173 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 5.3+ MB


In [11]:
dfBookPivot = dfRatings.pivot_table(columns='User-ID', index='Book-Title', values='Book-Rating')
dfBookPivot.fillna(0, inplace=True)
dfBookPivot.shape

NameError: name 'dfRatingsFinal' is not defined