# Librerías

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Corpus etiquetado

In [None]:
# Category labels. Each label is also the stem of the text file that holds
# the corresponding document ("<label>.txt", resolved against the working
# directory).
etiquetas = ["dickens_a_christmas_carol",
             "dickens_a_tale_of_two_cities",
             "dickens_oliver_twist",
             "shakespeare_hamlet",
             "shakespeare_romeo_juliet",
             "shakespeare_the_merchant_of_venice"]

# Read each document in full. The context manager closes the handle even if
# read() raises, and the explicit encoding keeps decoding independent of the
# platform default.
# NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
corpus = []
for etiqueta in etiquetas:
  with open(etiqueta + ".txt", "r", encoding="utf-8") as archivo:
    corpus.append(archivo.read())

# Keep the documents as a NumPy array (reused by later cells) and pair each
# one with its label for display.
corpus = np.array(corpus)
df_corpus = pd.DataFrame({"documento": corpus,
                          "categoria": etiquetas})
df_corpus

Unnamed: 0,documento,categoria
0,"project,gutenberg,ebook,christmas,carol,charle...",dickens_a_christmas_carol
1,"project,gutenberg,ebook,tale,two,city,charles,...",dickens_a_tale_of_two_cities
2,"project,gutenberg,ebook,oliver,twist,charles,d...",dickens_oliver_twist
3,"project,gutenberg,ebook,hamlet,william,shakesp...",shakespeare_hamlet
4,"project,gutenberg,ebook,romeo,juliet,william,s...",shakespeare_romeo_juliet
5,"project,gutenberg,ebook,merchant,venice,willia...",shakespeare_the_merchant_of_venice


# Modelo de Bolsa de Palabras

In [None]:
# Bag-of-words model stored as a sparse document-term count matrix.
# CountVectorizer is imported in the notebook's imports cell so that every
# dependency is declared in one place.
# Float min_df/max_df values are document-frequency proportions, so the
# 0.0-1.0 range keeps every term found in the corpus.
count_vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
matriz_conteo = count_vectorizer.fit_transform(corpus)
matriz_conteo

<6x15164 sparse matrix of type '<class 'numpy.int64'>'
	with 30424 stored elements in Compressed Sparse Row format>

In [None]:
# Show the non-zero entries of the sparse matrix. Each printed line has the
# form "(row, column)  count".
# Run this before the later cell that rebinds matriz_conteo to a dense array;
# after that, the same call prints the dense array instead.
print(matriz_conteo)

  (0, 10315)	89
  (0, 6108)	95
  (0, 4317)	20
  (0, 2370)	97
  (0, 2094)	9
  (0, 2250)	5
  (0, 3735)	2
  (0, 2159)	1
  (0, 15082)	2
  (0, 1591)	1
  (0, 8214)	26
  (0, 13475)	19
  (0, 178)	2
  (0, 13227)	13
  (0, 567)	13
  (0, 7486)	2
  (0, 1710)	7
  (0, 6117)	1
  (0, 6118)	1
  (0, 6119)	1
  (0, 6120)	1
  (0, 6121)	1
  (0, 6122)	1
  (0, 2172)	14
  (0, 6283)	5
  :	:
  (5, 12639)	1
  (5, 4629)	1
  (5, 1976)	1
  (5, 15036)	1
  (5, 4513)	1
  (5, 1070)	1
  (5, 3253)	1
  (5, 13697)	1
  (5, 730)	1
  (5, 5710)	1
  (5, 10074)	2
  (5, 11587)	1
  (5, 1329)	5
  (5, 9004)	1
  (5, 15012)	1
  (5, 3920)	1
  (5, 4522)	1
  (5, 848)	1
  (5, 1303)	2
  (5, 8513)	1
  (5, 450)	1
  (5, 14086)	1
  (5, 8185)	1
  (5, 5686)	1
  (5, 5687)	1


In [None]:
# Show the dense representation of the count matrix.
# The name matriz_conteo is rebound from the sparse matrix to a dense array,
# so the conversion is guarded: re-running this cell would otherwise fail
# because the dense array has no toarray() method.
if hasattr(matriz_conteo, "toarray"):
  matriz_conteo = matriz_conteo.toarray()
matriz_conteo

array([[1, 1, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [3, 0, 1, ..., 2, 1, 0],
       [3, 0, 0, ..., 1, 0, 2],
       [1, 0, 0, ..., 0, 0, 0]])

In [None]:
# Unique terms learned by the vectorizer; they label the matrix columns.
vocabulario = count_vectorizer.get_feature_names_out()
# Display the per-document feature vectors as a table with one column per term.
pd.DataFrame(data=matriz_conteo, columns=vocabulario)

Unnamed: 0,000,00033,04,08,0em,10,100,101,102,103,...,yourn,youth,youthful,youthfulness,zeal,zealous,zenith,zip,zone,zounds
0,1,1,0,0,0,2,2,1,1,1,...,0,1,0,0,1,0,0,1,0,0
1,1,0,0,0,0,2,0,0,0,0,...,1,10,3,1,0,2,0,1,0,0
2,1,0,0,0,2,0,0,0,0,0,...,0,9,6,0,0,2,1,0,0,0
3,3,0,1,0,0,2,3,0,0,0,...,0,14,0,0,0,0,0,2,1,0
4,3,0,0,1,0,2,1,0,0,0,...,0,6,3,0,0,0,0,1,0,2
5,1,0,0,0,2,0,0,0,0,0,...,0,8,1,0,1,0,0,0,0,0


# Vocabulario (número de columnas de la matriz)

In [None]:
# Report the vocabulary size (the number of matrix columns) and its terms.
print(len(vocabulario), vocabulario)

# Persist the vocabulary, one term per line. The encoding is set explicitly so
# that non-ASCII terms are written as UTF-8 rather than with NumPy's default
# output encoding.
np.savetxt("vocab.txt", vocabulario, fmt="%s", delimiter=",", encoding="utf-8")

15164 ['000' '00033' '04' ... 'zip' 'zone' 'zounds']
