# TSC_UC3M_2019  

Author: Jon Lérida  
Description: This notebook covers the first steps of classification algorithms based on word embeddings.

In [57]:
from gensim.models import KeyedVectors
import gensim
from gensim.test.utils import common_texts

import os
import subprocess

import wikipediaapi

from time import monotonic as timer

import pandas as pd

import numpy as np


from nltk import download
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import sys

from pprint import pprint

# Load the FastText file  
This file contains the whole Wikipedia vocabulary in embedding format (about 6 GB). Each word is mapped to a high-dimensional (i.e., 300-dimensional) vector, so the final matrix will be $N \times 300$, where $N$ is the number of words in the file.

In [3]:
# Location of the pre-trained embedding file, stored in word2vec format.
model_path = 'wiki_data/wiki.en.vec'

# Time the load with a monotonic clock so the cost of this cell is visible.
start = timer()
wv = KeyedVectors.load_word2vec_format(model_path)
elapsed_minutes = (timer() - start) / 60

print('Elapsed time (mins): {:.2f}'.format(elapsed_minutes))

Elapsed time (mins): 7.24


In [5]:
# Summarise the loaded model: number of tokens and length of each vector.
# NOTE(review): `vocab` is the gensim 3.x attribute; gensim 4 replaced it with
# `key_to_index` — confirm which gensim version this notebook is pinned to.
vocab_size = len(wv.vocab)
vector_size = wv.vector_size
print("The model has %s tokens with size %s" % (vocab_size, vector_size))

The model has 2519370 tokens with size 300


The _wv_ variable contains the full file (_i.e._, all the embeddings) as a dictionary-like object, so we can do things like:

In [181]:
# Peek at the leading ten components of the vector stored for 'car'.
car_vector = wv['car']
pprint(car_vector[:10])

array([-0.092271, -0.14855 , -0.14696 ,  0.013   , -0.40305 , -0.31004 ,
        0.1022  , -0.42087 , -0.22948 ,  0.12853 ], dtype=float32)


to get the first 10 components of the vector representation of _car_. The _KeyedVectors_ object also allows us to check for word similarities:

In [187]:
# List the words most similar to 'car'; each result is a (word, score) pair.
for neighbour, score in wv.similar_by_word('car'):
    print("Word:", neighbour, "similarity score:", score)

  if np.issubdtype(vec.dtype, np.int):


Word: cars similarity score: 0.8341586589813232
Word: automobile similarity score: 0.7181168794631958
Word: truck similarity score: 0.7055484056472778
Word: motorcar similarity score: 0.6987981796264648
Word: vehicle similarity score: 0.6951144337654114
Word: driver similarity score: 0.6925972700119019
Word: drivecar similarity score: 0.6851067543029785
Word: minivan similarity score: 0.6729590892791748
Word: roadster similarity score: 0.6720188856124878
Word: racecars similarity score: 0.6717766523361206


# Download some Wikipedia definitions  

This trial will include _Mathematics, Economics, Philosophy_ and _Art_

In [121]:
# Wikipedia API client created with the 'en' language code
wiki_wiki = wikipediaapi.Wikipedia('en')

categories = ['Mathematics', 'Economics', 'Philosophy', 'Art']

# `docs` collects one single-entry dict per category: {category: page text}
docs = []
for category in categories:
    print("Getting definition of ", category)
    page = wiki_wiki.page(category)
    docs.append({category: page.text})

print("Number of Documents: ", len(docs))

Getting definition of  Mathematics
Getting definition of  Economics
Getting definition of  Philosophy
Getting definition of  Art
Number of Documents:  4


# Corpus acquisition  
Using the _gensim_ Python module

In [153]:
# Text-normalisation tools: WordNet lemmatizer and the NLTK English stopword list.
wnl = WordNetLemmatizer()
stopwords_en = stopwords.words('english')
# Set copy of the stopword list so the per-token membership test is O(1).
stopwords_set = set(stopwords_en)

corpus_clean = []

# Each element of `docs` is a single-entry dict: {article title: article text}.
for doc in docs:
    for title, text in doc.items():
        print("Tokenizing", title)
        # Tokenize, keep only purely alphanumeric tokens, and lowercase them.
        tokens = word_tokenize(text)
        tokens_filtered = [el.lower() for el in tokens if el.isalnum()]
        # Lemmatize, then drop stopwords.
        # NOTE(review): stopwords are filtered *after* lemmatization, so a stopword
        # whose lemma differs from its surface form may survive the filter —
        # confirm this ordering is intended.
        tokens_lemmatized = [wnl.lemmatize(el) for el in tokens_filtered]
        tokens_clean = [token for token in tokens_lemmatized if token not in stopwords_set]
        corpus_clean.append(tokens_clean)


print("\nDone\n")

# Delete digit-only tokens (optionally preceded by a minus sign).
# `startswith` is used instead of indexing so an empty token cannot raise IndexError.
corpus_clean_no_number = []
for doc, doc_tokens in zip(docs, corpus_clean):
    print("Cleaning corpus", list(doc.keys())[0])
    corpus_clean_no_number.append([
        tok for tok in doc_tokens
        if not (tok.isdigit() or (tok.startswith('-') and tok[1:].isdigit()))
    ])


# Build the token dictionary from the number-free corpus.
D = gensim.corpora.Dictionary(corpus_clean_no_number)
n_tokens = len(D)
print("\nDictionary w/o numbers contains", n_tokens, "unique tokens")

print('\nFirst terms in the dictionary (not by frequency):')
# Cap the preview at the dictionary size so a tiny corpus does not raise KeyError.
for n in range(min(15, n_tokens)):
    print(str(n), ':', D[n])


# Prune the vocabulary by document frequency: keep tokens that appear in at
# least `no_below` documents and in no more than a `no_above` fraction of them.
no_below = 1
no_above = .75
D.filter_extremes(no_below=no_below, no_above=no_above, keep_n=500000)
n_tokens = len(D)
print("\n\nFiltering dictionary...")
print('The dictionary contains', n_tokens, 'terms')
print('First terms in the dictionary:')
for n in range(min(10, n_tokens)):
    print(str(n), ':', D[n])


print('The dictionary contains', n_tokens, 'terms')


# Bag-of-words representation of every document, built from the same
# number-free corpus that was used to create the dictionary.
corpus_bow = [D.doc2bow(doc_tokens) for doc_tokens in corpus_clean_no_number]

Tokenizing Mathematics
Tokenizing Economics
Tokenizing Philosophy
Tokenizing Art

Done

Cleaning corpus Mathematics
Cleaning corpus Economics
Cleaning corpus Philosophy
Cleaning corpus Art

Dictionary w/o numbers contains 4726 unique tokens

First terms in the dictionary (not by frequency):
0 : 10th
1 : 16th
2 : 17th
3 : 18th
4 : 1930s
5 : 19th
6 : 20th
7 : 2nd
8 : 3rd
9 : 6th
10 : 9th
11 : abel
12 : abstract
13 : abstraction
14 : abstractness


Filtering dictionary...
The dictionary contains 4448 terms
First terms in the dictionary:
0 : 10th
1 : 16th
2 : 17th
3 : 18th
4 : 1930s
5 : 2nd
6 : 3rd
7 : 6th
8 : 9th
9 : abel
The dictionary contains 4448 terms


### Now, we have a dictionary ($D$) which contains all the tokens used in the definitions  

Additionally, each article (definition) has been converted to a sparse vector, where each position contains a tuple of the form (word_index, frequency). This reduces memory usage, since only a few entries per article are stored in memory. For example, for the _Mathematics_ definition, the first 10 terms are:

In [159]:
# Show the first ten entries of the bag-of-words vector of the first document.
first_doc_bow = corpus_bow[0]
print(first_doc_bow[:10])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1)]


Summarizing, we have downloaded some Wikipedia definitions and saved them as token vectors (_i.e._, each document is a list of words). After that, some language processing has been applied. Finally, each document is represented as a sparse vector, which contains the token indexes and their frequencies in the given article.

# Getting the vector representation of the tokens

In [172]:
# Collect the pre-trained vector of every token in the dictionary `D`,
# following the order in which `D` yields its token ids.
embedding = []
for index in D:
    token = D[index]
    try:
        embedding.append(wv[token])
    except KeyError:
        # The token has no vector in `wv`: store an all-NaN placeholder so the
        # list keeps exactly one row per dictionary entry.
        embedding.append(np.full(wv.vector_size, np.nan))


print("First 10 components..\n", embedding[10][0:10])


First 10 components..
 [ 0.17572  0.3706  -0.31275  0.289    0.17087  0.39907  0.14805 -0.34401
 -0.3345   0.41444]


# Pandas Dataframe format

In [178]:
# Token strings, listed in the same iteration order used to build `embedding`.
token_column = [D[index] for index in D]

# One row per token: the embedding components plus a 'Token' column.
# Rows containing NaN (tokens without a vector representation) are dropped
# and the index is renumbered from zero.
df = (
    pd.DataFrame(embedding)
    .assign(Token=token_column)
    .dropna()
    .reset_index(drop=True)
)

print("NaN number:", df.isnull().sum().sum())
print("Number of stored tokens", df.shape[0])

df.head()


NaN number: 0
Number of stored tokens 4404


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Token
0,-0.074886,0.038086,-0.20421,-0.088999,-0.074444,0.15969,0.28693,0.002115,0.018306,0.48267,...,-0.021786,0.25102,0.30669,-0.16479,0.36931,0.078496,-0.012397,0.052154,0.16901,abel
1,0.17572,0.3706,-0.31275,0.289,0.17087,0.39907,0.14805,-0.34401,-0.3345,0.41444,...,-0.065901,0.034094,0.25057,0.25822,-0.42897,-0.17585,-0.012072,0.088724,-0.051181,abstract
2,-0.028491,0.33054,-0.62929,0.25551,0.16547,0.3752,0.28396,-0.36109,-0.43001,0.54781,...,-0.16995,0.041594,0.26723,0.39555,-0.25651,-0.26778,0.21825,0.36796,0.10891,abstraction
3,0.045932,0.24893,-0.52394,0.46715,0.10393,0.23662,0.14605,-0.23357,-0.24333,0.31687,...,-0.084081,0.07557,0.037863,0.4609,-0.46469,-0.23977,0.18029,0.24079,0.01081,abstractness
4,-0.30977,0.15833,-0.19837,0.44775,-0.30332,-0.22506,0.005589,-0.23027,-0.046159,0.2115,...,0.16468,0.037584,-0.11141,-0.012267,-0.13096,-0.094362,0.2869,0.62665,-0.48962,accelerating


# Save the model as a KeyedVector object

so that next time we want to work with the same corpus data, it won't be necessary to repeat the whole process (which is slow, since big files are loaded into memory).

In [188]:
# Start from an empty KeyedVectors container with the same vector size as `wv`
wv_corpus = KeyedVectors(vector_size=wv.vector_size)

# Add every token of the dataframe together with its embedding columns
vector_columns = range(wv.vector_size)
wv_corpus.add(entities=df.Token, weights=df.loc[:, vector_columns])

print("Corpus succesfully saved in the object")

# Persist the reduced model to disk storage
wv_corpus.save('TSC_Corpus1')

Corpus succesfully saved in the object
