In [0]:
# Google Cloud setup: authenticate, select the project, and copy the input CSVs to /tmp.

project_id = 'cs229-project-258114'
bucket_name = 'goodreads_reviews'

# Authenticate this session via the google.colab helper before using gcloud/gsutil.
from google.colab import auth
auth.authenticate_user()

# {project_id} / {bucket_name} below are interpolated from the Python variables by IPython.
!gcloud config set project {project_id}

# Download the files from the given Google Cloud Storage bucket into /tmp.
!gsutil cp gs://{bucket_name}/finalbooks.csv /tmp/finalbooks.csv
!gsutil cp gs://{bucket_name}/finalratings.csv /tmp/finalratings.csv 
!gsutil cp gs://{bucket_name}/train.csv /tmp/train.csv 
!gsutil cp gs://{bucket_name}/test.csv /tmp/test.csv 
# Disabled download of the best-books archive, kept for reference.
## !gsutil cp gs://{bucket_name}/Popularity/goodreads-best-books.zip /tmp/best.zip

# basics
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tools
import scipy
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


# Load the CSVs that the gsutil commands earlier in this cell copied into /tmp.
# Absolute paths are used (instead of '../tmp/...') so the reads hit exactly the
# download location regardless of the kernel's current working directory.
finalbooks = pd.read_csv('/tmp/finalbooks.csv')
ratings = pd.read_csv('/tmp/finalratings.csv')
train = pd.read_csv('/tmp/train.csv')
test = pd.read_csv('/tmp/test.csv')

In [0]:
## Done once already (train.csv / test.csv are loaded from storage instead); kept for reference:
## train, test = train_test_split(ratings,
##                                stratify=ratings['newuser_id'],
##                                test_size=0.20,
##                                random_state=42)

In [0]:
##test['newbookid'].value_counts()

In [0]:
## NDCG (Normalized Discounted Cumulative Gain)

def dcg_k(r, k):
    """ Discounted Cumulative Gain (DCG)

    The item at 1-based rank i contributes ``2**rating / log2(i + 1)``.
    Note that the gain term is ``2**rating``, not the ``2**rating - 1`` used
    in some formulations of DCG.

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG (0.0 when there are no ratings to score)
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype performs the same conversion on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    # Ranks 1..n map to discounts log2(2)..log2(n + 1).
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2)))



def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG)

    The DCG of the given ordering divided by the DCG of the same ratings
    sorted from highest to lowest.

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG, or 0. when the ideal DCG is zero
    """
    ideal_order = sorted(r, reverse=True)
    best_possible = dcg_k(ideal_order, k)
    if best_possible:
        return dcg_k(r, k) / best_possible
    # Nothing to normalise against.
    return 0.

def mean_ndcg(rs):
    """Mean NDCG for all users

    Each user's NDCG is computed over that user's full list (k = len(r)).

    Args:
        rs: Iterator / For each user: True Ratings in Predicted Rank Order
    Returns:
        Mean NDCG
    """
    per_user_scores = []
    for user_ratings in rs:
        per_user_scores.append(ndcg_k(user_ratings, len(user_ratings)))
    return np.mean(per_user_scores)

In [0]:
## MAP

def precision_k(r, k):
    """Score is precision @ k

    Fraction of the first k entries of r that are non-zero.

    Args:
        r: Binary Y/N in Predicted Rank Order (1st element is top recommendation)
        k: Number of top results to score (must be >= 1)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    top_k_hits = np.asarray(r)[:k] != 0
    if top_k_hits.size == k:
        return np.mean(top_k_hits)
    # Slicing returned fewer than k entries, so r was too short.
    raise ValueError('Relevance score length < k')


def average_precision(r):
    """Average Precision

    Mean of precision @ (i + 1) taken over every position i whose entry is
    non-zero.

    Args:
        r: Binary Y/N in Predicted Rank Order (1st element is top recommendation)
    Returns:
        Average Precision, or 0. when no entry is non-zero
    """
    relevant = np.asarray(r) != 0
    precisions = []
    for position in range(relevant.size):
        if relevant[position]:
            precisions.append(precision_k(relevant, position + 1))
    if precisions:
        return np.mean(precisions)
    return 0.


def mean_average_precision(rs):
    """Mean Average Precision (MAP)

    Args:
        rs: Iterator / For each user: Binary Y/N in Predicted Rank Order
    Returns:
        MAP, the mean of the per-user Average Precision values
    """
    per_user_ap = list(map(average_precision, rs))
    return np.mean(per_user_ap)


In [0]:
def rmse(y,h):
    """RMSE (root mean squared error)
    Args:
        y: real y
        h: predicted y
    Returns:
        Square root of the mean of the squared differences y - h
    """
    residuals = y - h
    mean_squared_error = sum(residuals ** 2) / len(residuals)
    return np.sqrt(mean_squared_error)

In [0]:
# Distribution of the # of Ratings per Book
# Count ratings per book once so the histogram and the statistics quoted in
# the title are derived from the same data.
book_rating_counts = ratings['newbookid'].value_counts()

facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(book_rating_counts, 200, facecolor='blue', alpha=0.5) #, log = True)
# mean / max / min are computed rather than hard-coded, so the title cannot
# go stale when the ratings file changes.
axes.set_title(
    'Distribution of # of reviews per Book in Dataset (mean = {:.0f} reviews / max = {} / min = {} )'.format(
        book_rating_counts.mean(), book_rating_counts.max(), book_rating_counts.min()
    )
)
plt.show()

In [0]:
## DEFINING THE TAIL
# Number of ratings per book, most-rated books first.
tailcomp = (
    ratings
    .groupby(by='newbookid', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values(by='rating', ascending=False)
)
tot = sum(tailcomp['rating'])
# Cumulative share of all ratings, accumulated from the most-rated book down.
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()
# Books below the 0.95 cumulative share are 'Head'; everything else is 'Tail'.
tailcomp['category'] = tailcomp['popshare'].lt(0.95).map({True: 'Head', False: "Tail"})

#finalbooks.loc[finalbooks.popshare >= 0.8].describe()
tail = tailcomp[tailcomp['popshare'] >= 0.95]
tail

In [0]:
# Distribution of the # of Ratings per Book
# NOTE(review): this cell repeats the ratings-per-book histogram plotted a few cells earlier.
book_rating_counts = ratings['newbookid'].value_counts()

facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(book_rating_counts, 200, facecolor='blue', alpha=0.5) #, log = True)
# mean / max / min are computed from the plotted counts instead of being
# hard-coded, so the title always matches the data.
axes.set_title(
    'Distribution of # of reviews per Book in Dataset (mean = {:.0f} reviews / max = {} / min = {} )'.format(
        book_rating_counts.mean(), book_rating_counts.max(), book_rating_counts.min()
    )
)
plt.show()

In [0]:
## BASELINE APPROACH: POPULARITY MODEL

# Per-book popularity table, most-rated books first.
# The two columns are selected with a list (double brackets): indexing a
# GroupBy with a bare tuple of names was deprecated in pandas 1.0 and raises
# in pandas 2.x.
# NOTE(review): .sum() leaves average_rating meaningful only if each newbookid
# appears once in finalbooks — confirm.
popular = (
    finalbooks
    .groupby('newbookid')[['ratings_count', 'average_rating']]
    .sum()
    .sort_values(by='ratings_count', ascending=False)
    .reset_index()
)
popular.head(5)

In [0]:
# Attach each book's popularity statistics to the test ratings, then derive:
#   pred     - the book's average rating rounded to the nearest integer
#   ratingYN - 1 when the user's rating is at least 4, else 0
poprank = test.merge(popular, on='newbookid').assign(
    pred=lambda frame: frame['average_rating'].round(),
    ratingYN=lambda frame: frame['rating'].ge(4).astype('int64'),
)

# Highest user ids first; within a user, most-rated books first.
poprank = poprank.sort_values(by=['newuser_id', 'ratings_count'], ascending=[False, False])
poprank.head(5)

In [0]:

# Per-user list of true ratings, in poprank's row order.
# One groupby pass replaces 15000 boolean filters over the whole frame;
# groupby keeps each user's rows in their original relative order.
ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in poprank.groupby('newuser_id')['rating']
}
# As in the original loop, user ids 1..15000 are scored; an id with no rows
# in poprank gets an empty list.
poplista = [ratings_by_user.get(user_id, []) for user_id in range(1, 15001)]
   



In [0]:
# For every user, keep the 10 rows with the highest ratings_count.
by_popularity = poprank.sort_values('ratings_count', ascending=False)
top = by_popularity.groupby('newuser_id').head(10)
top

In [0]:
# NDCG of the popularity ranking for every user (one entry per list in poplista).
b = np.array([ndcg_k(r, len(r)) for r in poplista])

facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5) #, log = True)
# Title spells the metric correctly (NDCG, previously "NDGC").
axes.set_title('Distribution of NDCG among Users for the Popularity model')
plt.show()

# [ndcg_k(r, len(r)) for r in poplista]
# Share of users whose NDCG is exactly 1; the denominator is the number of
# users actually scored rather than a hard-coded 15000.
d = b[b == 1]
len(d) / len(b)

In [0]:

# print('Popularity Model MAP: ', mean_average_precision(poplistb))
# Metrics for the popularity baseline, each printed rounded to 3 decimals.
_popularity_metrics = (
    ('(1) Popularity Model RMSE: ', rmse(poprank['rating'],poprank['average_rating'])),
    ('(2) Popularity Model NDCG: ', mean_ndcg(poplista)),
    ("(2) Median NDCG: ", np.median(b)),
    ("(2) Share of NDCG =1 among Users: ", sum(d)/15000),
)
for _metric_label, _metric_value in _popularity_metrics:
    print(_metric_label, np.round(_metric_value, decimals=3))
#print('(3) Popularity Model Div10 Score: ',np.round(sum(np.in1d(top.newbookid, tail.newbookid))/len(top), decimals=2))
#print('(3) Popularity Model Div50 Score: ',np.round(sum(np.in1d(top.newbookid, tail.newbookid))/len(top), decimals=2))

In [0]:
# Popularity baseline evaluated on the TRAIN ratings.
# Fixes two copy-paste slips: the frame was built from `test`, and `pred` was
# taken from `poprank` (the test frame) instead of this frame.
popranktrain = train.merge(popular,on = 'newbookid')
popranktrain['pred']= np.round(popranktrain['average_rating'])

# Same ordering as the test-side poprank frame, so per-user rating lists taken
# from this frame are in popularity-rank order (most-rated book first).
popranktrain = popranktrain.sort_values(by=['newuser_id', 'ratings_count'], ascending=False)

In [0]:
# Per-user list of true ratings, in popranktrain's row order.
# One groupby pass replaces 15000 boolean filters over the whole frame;
# groupby keeps each user's rows in their original relative order.
train_ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in popranktrain.groupby('newuser_id')['rating']
}
# As in the original loop, user ids 1..15000 are scored; an id with no rows
# in popranktrain gets an empty list.
poplisttrain = [train_ratings_by_user.get(user_id, []) for user_id in range(1, 15001)]

In [0]:
# RMSE is computed against the unrounded average_rating, the same predictor the
# test-set "(1) Popularity Model RMSE" line uses, so the two numbers are comparable.
# NOTE(review): the original used the rounded 'pred' column here — confirm which
# predictor is intended and keep train and test consistent.
print('(1) Pop Train Model RMSE: ', np.round(rmse(popranktrain['rating'],popranktrain['average_rating']), decimals=3))
print('(2) Pop Train Model NDCG: ', np.round(mean_ndcg(poplisttrain), decimals=3))

In [0]:
# Mount Google Drive at /content/drive via the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
test.to_csv( '../tmp/test.csv' , index = False )
!cp /tmp/test.csv drive/My\ Drive/