# BookFindr
### Ignacio Díaz-Guardamino García

Esta aplicación escanea los libros de una biblioteca en .txt y recomienda, a partir de un libro, otro para leer a continuación.

In [1]:
import pandas as pd
import numpy as np
import gensim.models.word2vec as w2v
import multiprocessing
import glob, os
import re
import pprint
import sklearn.manifold
import matplotlib.pyplot as plt

In [2]:
text_corpus = []
booksdf = pd.DataFrame(columns = ['name', 'content']) 
for file in glob.glob("*.txt"):
    with open(file, 'r', encoding="utf8") as f:
        print(file)
        i=0
        cont = ""
        for line in f:
            l=re.sub("[^a-z\s]+", "", line.lower().rstrip("\n"))  #minusculas y espacios.
            words = l.split()
            text_corpus.append(words)
            cont=cont+" "+l
        df=pd.DataFrame([[file, cont]],columns = ['name', 'content']) 
        booksdf=pd.concat([df, booksdf])

(Trilogia El Siglo 02) El Invie - Ken Follett.txt
Duenas, Maria - El tiempo entre - Author.txt
El Abuelo Que Salto Por La Vent - Jonas Jonasson.txt
El aprendiz - Clive Cussler.txt
El laberinto azul - Douglas Preston.txt
Huesos en el jardin - Henning Mankell.txt
Inferno - Dan Brown.txt
La Cupula - Stephen King.txt
La De Los Tristes Destinos - Perez Galdos, Benito.txt
Los Dientes Del Tigre I - Clancy, Tom.txt
Los inocentes - David Baldacci.txt
Ola de calor NH1 - Richard Castle.txt
Pisando los talones - Henning Mankell.txt
Vargas Llosa, Mario - La tia Ju - Sin determinar.txt


In [3]:
booksdf.head()

Unnamed: 0,name,content
0,"Vargas Llosa, Mario - La tia Ju - Sin determin...",vargas llosa mario la tia julia y el escribi...
0,Pisando los talones - Henning Mankell.txt,la noche de san juan alguien agazapado tras u...
0,Ola de calor NH1 - Richard Castle.txt,richard castle nikki heat ola de calor ...
0,Los inocentes - David Baldacci.txt,los inocentes david baldacci traducci...
0,"Los Dientes Del Tigre I - Clancy, Tom.txt",los dientes del tigre i sobrecubierta n...


In [4]:
# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 50
# Minimum word count threshold.
min_word_count = 1

# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 7


downsampling = 1e-1

# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

songs2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

songs2vec.build_vocab(text_corpus)
print (len(text_corpus))

166375


In [5]:
import time
start_time = time.time()

songs2vec.train(text_corpus, total_examples=songs2vec.corpus_count, epochs=10)

if not os.path.exists("trained"):
    os.makedirs("trained")

songs2vec.save(os.path.join("trained", "songs2vectors_ES.w2v"))

print("--- %s seconds ---" % (time.time() - start_time))

--- 34.17405676841736 seconds ---


In [6]:
songs2vec = w2v.Word2Vec.load(os.path.join("trained", "songs2vectors_ES.w2v"))

#### Let's explore our model

Find similar words

In [7]:
songs2vec.wv.most_similar("escalera")

[('caracol', 0.8572943806648254),
 ('rampa', 0.8272514343261719),
 ('escalinata', 0.8189619183540344),
 ('consola', 0.816263735294342),
 ('ventana', 0.8154088854789734),
 ('pasillo', 0.8115946650505066),
 ('barandilla', 0.8095236420631409),
 ('verja', 0.8056825995445251),
 ('esquina', 0.8041695356369019),
 ('acera', 0.8040857911109924)]

In [8]:
songs2vec.wv.most_similar("gris")

[('negro', 0.8709320425987244),
 ('color', 0.8564888834953308),
 ('oscuro', 0.8563015460968018),
 ('terno', 0.8539880514144897),
 ('rojo', 0.8518260717391968),
 ('blanco', 0.8457569479942322),
 ('inmaculado', 0.8456488251686096),
 ('marrn', 0.8438411951065063),
 ('prpura', 0.8423329591751099),
 ('oliva', 0.835707426071167)]

In [9]:
songs2vec.wv.most_similar("tiempo")

[('llevabais', 0.7659791707992554),
 ('lapso', 0.7491129636764526),
 ('apremia', 0.7344460487365723),
 ('cavilar', 0.7288386821746826),
 ('mucho', 0.7262729406356812),
 ('ausentars', 0.7255872488021851),
 ('cunto', 0.71293705701828),
 ('durara', 0.7094148993492126),
 ('orlos', 0.7093833684921265),
 ('prolongado', 0.7042495012283325)]

Words out of context

In [10]:
songs2vec.wv.doesnt_match("perro gato queso elefante".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'queso'

In [11]:
songs2vec.most_similar(positive=['amarillo', 'oscuro'], negative=['claro'])
#queen

  """Entry point for launching an IPython kernel.


[('amarillento', 0.782090425491333),
 ('gris', 0.7743739485740662),
 ('inmaculado', 0.7706445455551147),
 ('negro', 0.7696190476417542),
 ('rojo', 0.7580102682113647),
 ('rectngulo', 0.753308892250061),
 ('fieltro', 0.7430537939071655),
 ('flanqueado', 0.7399861216545105),
 ('tallado', 0.7354665994644165),
 ('oliva', 0.730684757232666)]

Semantic distance between words

In [12]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = songs2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, start2, end2))

In [13]:
nearest_similarity_cosmul("beso", "amor", "tiempo")

beso es a amor, lo que caf es a tiempo


With the word vector embeddings in place, it is now time to calculate the normalised vector sum of each song. This process can take some time since it has to be done for each of 57,000 songs.

In [14]:

def songVector(row):
    vector_sum = 0
    words = row.lower().split()
    for word in words:
        vector_sum = vector_sum + songs2vec[word]
    vector_sum = vector_sum.reshape(1,-1)
    normalised_vector_sum = sklearn.preprocessing.normalize(vector_sum)
    return normalised_vector_sum


import time
start_time = time.time()

booksdf['bookV'] = booksdf['content'].apply(songVector)

  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """


**t-sne and random song selection** 

The songs have 50 dimensions each. Application of t-sne is memory intensive and hence it is slightly easier on the computer to use a random sample of the 57,000 songs.

In [15]:
book_vectors = []
from sklearn.model_selection import train_test_split

train, test = train_test_split(booksdf, test_size = 0.1)


for book_vector in train['bookV']:
    book_vectors.append(book_vector)

train.head(10)

Unnamed: 0,name,content,bookV
0,Pisando los talones - Henning Mankell.txt,la noche de san juan alguien agazapado tras u...,"[[-0.027191596, 0.18668945, -0.19717966, -0.05..."
0,"La De Los Tristes Destinos - Perez Galdos, Ben...",annotation el gran friso narrativo de los e...,"[[-0.034949593, 0.19055082, -0.22733259, -0.06..."
0,"Los Dientes Del Tigre I - Clancy, Tom.txt",los dientes del tigre i sobrecubierta n...,"[[-0.044030853, 0.16540188, -0.19034635, -0.04..."
0,El Abuelo Que Salto Por La Vent - Jonas Jonass...,annotation momentos antes de que empiece la ...,"[[-0.04785654, 0.20633532, -0.2063438, -0.0536..."
0,La Cupula - Stephen King.txt,la cpula la cpula stephen king la c...,"[[-0.0655239, 0.16675738, -0.20991945, -0.0483..."
0,El laberinto azul - Douglas Preston.txt,a las nueve y veinte de la noche llaman al ti...,"[[-0.038612917, 0.16618598, -0.22943357, -0.07..."
0,Inferno - Dan Brown.txt,el smbolo perdido sobrecubierta none ...,"[[-0.07164767, 0.18479377, -0.20860207, -0.015..."
0,Los inocentes - David Baldacci.txt,los inocentes david baldacci traducci...,"[[-0.05155166, 0.1710463, -0.21186417, -0.0446..."
0,(Trilogia El Siglo 02) El Invie - Ken Follett.txt,sin determinar la cada de los gigantes po...,"[[-0.08367297, 0.1891148, -0.22900401, -0.0411..."
0,"Vargas Llosa, Mario - La tia Ju - Sin determin...",vargas llosa mario la tia julia y el escribi...,"[[-0.039699826, 0.18477929, -0.21834657, -0.05..."


I had a fairly measly 4gb machine and wasn't able to generate a more accurate model. However, one can play around with the number of iterations, learning rate and other factors to fit the model better. If you have too many dimensions (~300+), it might make sense to use PCA first and then t-sne.

In [16]:
X = np.array(book_vectors).reshape((12, 50))

start_time = time.time()
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=250, random_state=0, verbose=2)

all_word_vectors_matrix_2d = tsne.fit_transform(X)

print("--- %s seconds ---" % (time.time() - start_time))

[t-SNE] Computing 11 nearest neighbors...
[t-SNE] Indexed 12 samples in 0.000s...
[t-SNE] Computed neighbors for 12 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 12 / 12
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] Computed conditional probabilities in 0.003s
[t-SNE] Iteration 50: error = 57.3040390, gradient norm = 0.1503435 (50 iterations in 0.008s)
[t-SNE] Iteration 100: error = 49.2206573, gradient norm = 0.0624859 (50 iterations in 0.006s)
[t-SNE] Iteration 150: error = 48.9843292, gradient norm = 0.2331342 (50 iterations in 0.006s)
[t-SNE] Iteration 200: error = 45.8773422, gradient norm = 0.2595071 (50 iterations in 0.007s)
[t-SNE] Iteration 250: error = 43.7331009, gradient norm = 0.2320879 (50 iterations in 0.006s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 43.733101
[t-SNE] KL divergence after 251 iterations: 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458

In [17]:
df=pd.DataFrame(all_word_vectors_matrix_2d,columns=['X','Y'])

df.head(10)

train.head()

df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

Joining two dataframes to obtain each song's corresponding X,Y co-ordinate.

In [18]:
two_dimensional_songs = pd.concat([train, df], axis=1)

two_dimensional_songs.head()

Unnamed: 0,name,content,bookV,X,Y
0,Pisando los talones - Henning Mankell.txt,la noche de san juan alguien agazapado tras u...,"[[-0.027191596, 0.18668945, -0.19717966, -0.05...",25.193199,5.747581
1,"La De Los Tristes Destinos - Perez Galdos, Ben...",annotation el gran friso narrativo de los e...,"[[-0.034949593, 0.19055082, -0.22733259, -0.06...",-7.722825,-4.567806
2,"Los Dientes Del Tigre I - Clancy, Tom.txt",los dientes del tigre i sobrecubierta n...,"[[-0.044030853, 0.16540188, -0.19034635, -0.04...",41.580544,35.345367
3,El Abuelo Que Salto Por La Vent - Jonas Jonass...,annotation momentos antes de que empiece la ...,"[[-0.04785654, 0.20633532, -0.2063438, -0.0536...",24.602962,9.269307
4,La Cupula - Stephen King.txt,la cpula la cpula stephen king la c...,"[[-0.0655239, 0.16675738, -0.20991945, -0.0483...",19.401373,-8.756375


**Plotting the results**

Using plotly, I plotted the results so that it becomes easier to explore similar songs based on their colors and clusters.

In [19]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

import plotly.graph_objs as go

trace1 = go.Scatter(
    y = two_dimensional_songs['Y'],
    x = two_dimensional_songs['X'],
    text = two_dimensional_songs['name'],
    mode='markers',
    marker=dict(
        size= 16,#[10],#'7',
        color = np.random.randn(5717), #set color equal to a variable
        colorscale='Viridis',
        showscale=True
    )
)
data = [trace1]

iplot(data)

In [20]:
#esta funcion busca libros cercanos en el grafico.
def closest(df,book):
    x=df[df['name'] == book]["X"].values[0]
    y=df[df['name'] == book]["Y"].values[0]
    dmin=100000000
    name=""
    
    for index, b in df.iterrows():
        dist=(x - b["X"])**2+(y - b["Y"])**2   #distancia euclidea, no hace falta raiz para comparar
        if dist!=0 and dist<dmin:
            dmin=dist
            name=b["name"]
    return name

In [21]:
book="El aprendiz - Clive Cussler.txt"    
print("un libro similar a "+book+" en tu colección es "+closest(two_dimensional_songs,book))

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
book="Los Dientes Del Tigre I - Clancy, Tom.txt"    
print("un libro similar a "+book+" en tu colección es "+closest(two_dimensional_songs,book))