In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raise the per-column display width so the full text of each document is
# shown when a DataFrame is rendered.
pd.set_option('display.max_colwidth', 200)

In [3]:
# Toy corpus: every document is written next to its topic label, and the
# (document, label) pairs are then transposed into two parallel lists.
corpus, etiquetas = map(list, zip(
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
))

In [4]:
# Hold the documents in a NumPy array from here on.
corpus = np.array(corpus)

# One row per document with its topic label; the column order is fixed
# directly in the constructor.
corpus_df = pd.DataFrame(
    {'Documento': corpus, 'Categoria': etiquetas},
    columns=['Documento', 'Categoria'],
)
corpus_df

Unnamed: 0,Documento,Categoria
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, eggs, toast and beans",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beautiful today,weather
7,The dog is lazy but the brown fox is quick!,animals


In [5]:
import nltk
# Fetch the NLTK resources this notebook relies on:
#   stopwords -> the English stop-word list
#   punkt     -> tokenizer models read by nltk.word_tokenize on older NLTK
#   punkt_tab -> the resource nltk.word_tokenize loads instead of punkt on
#                recent NLTK releases (3.9+); without it tokenization raises
#                LookupError on a fresh environment
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
import re
# English stop-word list from NLTK; normalize_document filters tokens against it.
# Needs the 'stopwords' resource to be available locally (e.g. via nltk.download).
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize a single document before vectorization.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and strips the text, tokenizes it with NLTK and drops the
    tokens that appear in the module-level ``stop_words`` list.

    Args:
        doc: Raw document text (``str``).

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional `re.I|re.A` would be interpreted as a
    # limit on the number of replacements instead of as regex flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # Tokenize the cleaned text.
    tokens = nltk.word_tokenize(doc)
    # Keep only the tokens that are not stop words.
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Rebuild the document from the surviving tokens.
    return ' '.join(filtered_tokens)

# Element-wise wrapper around normalize_document so that a whole array of
# documents can be normalized with a single call.
normalize_corpus = np.vectorize(pyfunc=normalize_document)

# Normalized corpus; the trailing expression displays it as the cell output.
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

## Modelo Bag of Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df / max_df are floats here, so they are read as fractions of the corpus:
# a term is kept only if it occurs in at least 20% and at most 80% of the documents.
cv = CountVectorizer(min_df=0.2, max_df=0.8)
# Learn the vocabulary and produce the dense document-term count matrix.
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

array([[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [8]:
# Unique words of the corpus, in the same order as the matrix columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() and fall back only on older releases.
try:
    vocabulario = list(cv.get_feature_names_out())
except AttributeError:
    vocabulario = cv.get_feature_names()
# Show the count matrix with one labelled column per vocabulary term.
pd.DataFrame(cv_matrix, columns=vocabulario)


Unnamed: 0,bacon,beautiful,blue,brown,dog,eggs,fox,ham,lazy,love,quick,sausages,sky
0,0,1,1,0,0,0,0,0,0,0,0,0,1
1,0,1,1,0,0,0,0,0,0,1,0,0,1
2,0,0,0,1,1,0,1,0,1,0,1,0,0
3,1,0,0,0,0,1,0,1,0,0,0,1,0
4,1,0,0,0,0,1,0,1,0,1,0,1,0
5,0,0,1,1,1,0,1,0,1,0,1,0,0
6,0,1,1,0,0,0,0,0,0,0,0,0,2
7,0,0,0,1,1,0,1,0,1,0,1,0,0


## Modelo Bag of N-grams

In [None]:
# ngram_range=(2, 2) restricts the features to bigrams (two-word sequences).
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)

bv_matrix = bv_matrix.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() and fall back only on older releases.
try:
    vocabulario = list(bv.get_feature_names_out())
except AttributeError:
    vocabulario = bv.get_feature_names()
# Show the bigram count matrix with one labelled column per bigram.
pd.DataFrame(bv_matrix, columns=vocabulario)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting over the whole vocabulary: min_df=0. and max_df=1. apply no
# document-frequency filtering.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() and fall back only on older releases.
try:
    vocabulario = list(tv.get_feature_names_out())
except AttributeError:
    vocabulario = tv.get_feature_names()
# Show the TF-IDF weights rounded to two decimals, one column per term.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocabulario)

## Document Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF row vectors of the documents;
# entry (i, j) compares document i with document j.
similarity_matrix = cosine_similarity(X=tv_matrix)
# Wrap the result in a DataFrame for tabular display.
similarity_df = pd.DataFrame(data=similarity_matrix)
similarity_df