In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from os import path
import zipfile
import pandas as pd

In [3]:
# Unpack the bundled archive on first run. Both CSVs read by the loading cell
# must be present; checking only one of them would skip extraction after a
# partial/aborted unpack and fail later at read time.
required_csvs = ('./goodreads10k/books.csv', './goodreads10k/ratings.csv')
if not all(path.exists(csv_file) for csv_file in required_csvs):
    with zipfile.ZipFile("./goodreads10k/goodreads10k.zip", "r") as zip_ref:
        zip_ref.extractall("./goodreads10k")

### Load data

In [5]:
# Locations of the two extracted CSV tables.
books_csv = './goodreads10k/books.csv'
ratings_csv = './goodreads10k/ratings.csv'

# Read each table into its own DataFrame.
books = pd.read_csv(books_csv)
ratings = pd.read_csv(ratings_csv)

In [6]:
# Order the ratings by book and renumber the rows 0..n-1.
# NOTE(review): a later cell attaches the learned embeddings to `books` purely
# by row position, so this ordering is presumably what keeps item indices in
# ascending book_id order — confirm before changing it.
ratings_by_book = ratings.sort_values(by='book_id')
ratings = ratings_by_book.reset_index(drop=True)
ratings

Unnamed: 0,user_id,book_id,rating
0,29300,1,4
1,6590,1,3
2,7546,1,5
3,43484,1,1
4,18689,1,5
...,...,...,...
5976474,31293,10000,3
5976475,12272,10000,4
5976476,35330,10000,4
5976477,46337,10000,5


In [7]:
# Fetch category information through the project-local helper.
# NOTE(review): the return type of get_categories() is not visible here, and
# `book_categories` is not referenced by any later cell in this notebook
# section — confirm it is still needed.
from goodreads10k.categories import get_categories
book_categories = get_categories()

In [11]:
from surprise import Reader, Dataset

# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Select the columns in (user, item, rating) order — presumably the order
# Dataset.load_from_df requires; verify against the Surprise docs — and build a
# trainset from every rating (no hold-out split).
rating_triples = ratings[['user_id', 'book_id', 'rating']]
surprise_dataset = Dataset.load_from_df(rating_triples, reader)
trainset = surprise_dataset.build_full_trainset()

# Matrix Factorization

![title](https://miro.medium.com/max/4320/1*b4M7o7W8bfRRxdMxtFoVBQ.png)




In [16]:
import random

import numpy as np
import torch
from SVD import NeuralNetworkSR

# Training is stochastic; seed the common RNGs so a Restart & Run All gives
# comparable losses and embeddings.
# NOTE(review): NeuralNetworkSR's internals are not visible here. The tensor
# calls used on its layers in the embedding-extraction cell suggest a PyTorch
# model — confirm which RNGs it actually draws from. GPU kernels may still be
# non-deterministic.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

nn = NeuralNetworkSR(
    ratings,
    num_epochs=2,
    n_features=30,
    log_freq=50,
    batch_size=10000,
    hiddens=[400, 100],
    lr=0.001,
    wd=1e-1,
)
# Trailing semicolon suppresses the object repr (a memory address that changes
# on every run and only adds noise to the saved notebook).
nn.fit(trainset);

[1,    50] loss: 38.947
[1,   100] loss: 37.829
[1,   150] loss: 36.807
[1,   200] loss: 35.811
[1,   250] loss: 34.872
[1,   300] loss: 33.886
[1,   350] loss: 32.939
[1,   400] loss: 32.114
[1,   450] loss: 31.238
[1,   500] loss: 30.407
[1,   550] loss: 29.675
[2,    50] loss: 28.141
[2,   100] loss: 27.452
[2,   150] loss: 26.682
[2,   200] loss: 26.109
[2,   250] loss: 25.390
[2,   300] loss: 24.687
[2,   350] loss: 24.078
[2,   400] loss: 23.482
[2,   450] loss: 22.903
[2,   500] loss: 22.293
[2,   550] loss: 21.762
Finished Training


<SVD.NeuralNetworkSR at 0x23e3d186148>

In [18]:
import numpy as np

# Copy the learned item parameters out of the trained network as NumPy arrays.
# `.detach().cpu()` is applied to the weight *tensor* instead of calling
# `.cpu()` on the layer itself: calling it on a torch module moves that layer
# in place, which would leave `nn.net` split across devices after a GPU run and
# break any later use of the model. (Assumes i_embeddings / i_biases are torch
# layers exposing `.weight`, as the original calls imply.)
i_weights = nn.net.i_embeddings.weight.detach().cpu().numpy()
i_bias = nn.net.i_biases.weight.detach().cpu().numpy()

# One row per item: embedding columns followed by the bias column(s).
book_embeddings = np.hstack([i_weights, i_bias])

### t-SNE dimensionality reduction

Each book embedding (the learned latent features plus the item bias) is projected onto a 2D space.

In [22]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: without a fixed random_state every run yields a
# different 2D layout, so the saved coordinates could not be reproduced.
TSNE_SEED = 42

# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# `n_iter` is kept here to stay compatible with the version this notebook was
# written against — switch when upgrading.
book_embeddings_2D = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate=200,
    n_iter=1000,
    verbose=1,
    random_state=TSNE_SEED,
).fit_transform(book_embeddings)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.074s...
[t-SNE] Computed neighbors for 10000 samples in 7.272s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.305080
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.204346
[t-SNE] KL divergence after 1000 iterations: 4.369103


### Save results

In [23]:
# Build a copy of the book table with the two t-SNE coordinates appended.
# The coordinates are attached by row position, so this assumes row i of
# `book_embeddings_2D` describes row i of `books` — TODO confirm the item
# ordering used by the model matches the row order of books.csv.
books_final_df = books.assign(
    x_tsne=book_embeddings_2D[:, 0],
    y_tsne=book_embeddings_2D[:, 1],
)

# Persist the enriched table without the DataFrame index.
books_final_df.to_csv('books_with_embeddings.csv', index=False)