In [4]:
import numpy as np
import pandas as pd
import re
import csv
import contractions     #pip install contractions
from IPython.display import display, HTML
from collections import defaultdict, Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import ngrams

# nltk.download('words')
# nltk.download('wordnet')
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


# words = set(nltk.corpus.words.words())

from tokenizers import Tokenizer, CharBPETokenizer, BertWordPieceTokenizer  # conda install -c conda-forge tokenizers 

In [5]:
'''
Lectura de datos
'''

df = pd.read_csv('datosFinal.csv',sep=';',usecols=['Rating','Text'])
df = df.fillna(' ')  # Realizamos esto para quitarnos posibles errores de codigo
df.head()

Unnamed: 0,Rating,Text
0,4.0 out of 5 stars,These magnetic chargers are doing the job and ...
1,4.0 out of 5 stars,Was hesitant to believe these would work but t...
2,4.0 out of 5 stars,I like there was different size cords. Cords a...
3,4.0 out of 5 stars,"For apple device, just make sure you have the ..."
4,4.0 out of 5 stars,So far seem like decent cables. Just wish they...


# Pasos para realizar la limpieza

1. **Poner todo en minusculas**

    `txt.lower()`

2. **Tratar cosas como (n't) // (I'm) // (there's) porque borro los (') y queda mal, hay que separarlos previamente a realizar el borrado**

    `contractions.fix(txt)`

3. **Eliminar signos de puntuacion y numeros (no tratar espacios en blanco)**

    `re.sub(r'[^\w\s]+|\d+',' ',txt)`

4. **Eliminar caracteres non-ASCII (tildes, caracteres chinos, ...). Solo nos quedamos con caracteres ASCII ya que trabajamos con vocabulario ingles el cual se encuentra solamente en ese rango**

    `re.sub(r'\b\w*[^\x00-\x7F]+\w*\b', '', txt)`

5. **Elimino palabras sobrantes, stop-word removal**

    `words = nltk.word_tokenize(txt)`

    `filtered_words = [word for word in words if word not in stop_words]`

    `txtProcesado = ' '.join(filtered_words)`

6. **Realizo un Lemmitazitation o Porter Stemmer**

    `words = nltk.word_tokenize(txt)`
    
    - **Lemmitazitation**
    
        `lemmatized_words = [lemmatizer.lemmatize(word) for word in words]`

        `txtProcesado = ' '.join(lemmatized_words)`

    - **Porter Stemmer**

        `stemmed_words = [stemmer.stem(word) for word in words]`

        `txtProcesado = ' '.join(stemmed_words)`

7. **Elimino posibles ejemplos que se hayan quedado vacios**

    `mask = df['Text'] == ''`

    `df = df[~mask]`

In [6]:
'''
Creamos la tabla pandas con los textos limpios con la opción LEMMATIZATION
'''

dfLemmatizer = pd.DataFrame(columns=df.columns)

for index, row in df.iterrows():
    # Minusculas
    textProcesado = row['Text'].lower()
    # Contracciones
    textProcesado = contractions.fix(textProcesado)
    # Eliminar signos de puntuación y números
    textProcesado = re.sub(r'[^\w\s]+|\d+',' ',textProcesado)
    # Eliminar non-ASCII
    textProcesado = re.sub(r'\b\w*[^\x00-\x7F]+\w*\b', '', textProcesado)
    textProcesado = re.sub(r'\s+',' ',textProcesado)
    # if(textProcesado != ' ' and detect(textProcesado) != 'en'):
    #     textProcesado = traductor.translate(textProcesado)
    # Stop-words removal
    words = nltk.word_tokenize(textProcesado)
    filtered_words = [word for word in words if word not in stop_words]
    textProcesado = ' '.join(filtered_words)
    # Lemmitazitation
    words = nltk.word_tokenize(textProcesado)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    textProcesado = ' '.join(lemmatized_words)
    # Crear tabla limpia
    linea = [row['Rating'],textProcesado]
    dfLemmatizer = dfLemmatizer.append(pd.Series(linea,index=df.columns),ignore_index=True)

# Eliminamos ejemplos que se nos han quedado vacios tras la limpieza de los textos
ejVacios = dfLemmatizer['Text'].astype(str).eq('').sum()
print('Obtenemos {} ejemplos vacios'.format(ejVacios))

mask = dfLemmatizer['Text'] == ''
dfLemmatizer = dfLemmatizer[~mask]

ejVacios = dfLemmatizer['Text'].astype(str).eq('').sum()
print('Obtenemos {} ejemplos vacios'.format(ejVacios))

dfLemmatizer.head()

Obtenemos 16 ejemplos vacios
Obtenemos 0 ejemplos vacios


Unnamed: 0,Rating,Text
0,4.0 out of 5 stars,magnetic charger job charge iphone ipad mini c...
1,4.0 out of 5 stars,hesitant believe would work tell charge time i...
2,4.0 out of 5 stars,like different size cord cord durable blue lig...
3,4.0 out of 5 stars,apple device make sure arrow correctly inserte...
4,4.0 out of 5 stars,far seem like decent cable wish give included ...


In [None]:
'''
Creamos los arrays para trabjar mas adelante
'''

XLem = dfLemmatizer['Text'].values.copy()
yLem = dfLemmatizer['Rating'].values.copy()

In [None]:
'''
Escribimos los textos limpios en un csv
'''

# Abrir el archivo CSV para escritura
with open('textosLimpiosLemmatization.csv', mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)

    # Escribir los datos de X y y en el archivo CSV
    for x, y in zip(XLem, yLem):
        writer.writerow([x, y])

In [7]:
'''
Creamos la tabla pandas con los textos limpios con la opción PORTER STEMMER
'''

dfPorterStemmer = pd.DataFrame(columns=df.columns)

for index, row in df.iterrows():
    # Minusculas
    textProcesado = row['Text'].lower()
    # Contracciones
    textProcesado = contractions.fix(textProcesado)
    # Eliminar signos de puntuación y números
    textProcesado = re.sub(r'[^\w\s]+|\d+',' ',textProcesado)
    # Eliminar non-ASCII
    textProcesado = re.sub(r'\b\w*[^\x00-\x7F]+\w*\b', '', textProcesado)
    textProcesado = re.sub(r'\s+',' ',textProcesado)
    # if(textProcesado != ' ' and detect(textProcesado) != 'en'):
    #     textProcesado = traductor.translate(textProcesado)
    # Stop-words removal
    words = nltk.word_tokenize(textProcesado)
    filtered_words = [word for word in words if word not in stop_words]
    textProcesado = ' '.join(filtered_words)
    # Porter Stemmer
    words = nltk.word_tokenize(textProcesado)
    stemmed_words = [stemmer.stem(word) for word in words]
    textProcesado = ' '.join(stemmed_words)
    # Crear tabla limpia
    linea = [row['Rating'],textProcesado]
    dfPorterStemmer = dfPorterStemmer.append(pd.Series(linea,index=df.columns),ignore_index=True)

# Eliminamos ejemplos que se nos han quedado vacios tras la limpieza de los textos
ejVacios = dfPorterStemmer['Text'].astype(str).eq('').sum()
print('Obtenemos {} ejemplos vacios'.format(ejVacios))

mask = dfPorterStemmer['Text'] == ''
dfPorterStemmer = dfPorterStemmer[~mask]

ejVacios = dfPorterStemmer['Text'].astype(str).eq('').sum()
print('Obtenemos {} ejemplos vacios'.format(ejVacios))

dfPorterStemmer.head()

Obtenemos 16 ejemplos vacios
Obtenemos 0 ejemplos vacios


Unnamed: 0,Rating,Text
0,4.0 out of 5 stars,magnet charger job charg iphon ipad mini charg...
1,4.0 out of 5 stars,hesit believ would work tell charg time impact...
2,4.0 out of 5 stars,like differ size cord cord durabl blue light e...
3,4.0 out of 5 stars,appl devic make sure arrow correctli insert ch...
4,4.0 out of 5 stars,far seem like decent cabl wish give includ tip...


In [7]:
'''
Creamos los arrays para trabjar mas adelante
'''

XStem= dfPorterStemmer['Text'].values.copy()
yStem = dfPorterStemmer['Rating'].values.copy()

In [8]:
'''
Escribimos los textos limpios en un csv
'''

# Abrir el archivo CSV para escritura
with open('textosLimpiosPorterStemmer.csv', mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)

    # Escribir los datos de X y y en el archivo CSV
    for x, y in zip(XStem, yStem):
        writer.writerow([x, y])


# Mostramos los tokens mas repetidos de cada clase y de los datos en general

Vamos ha suponer que solamente vamos a utiliza **2000 tokens**. De esta manera nos es mas facil poder comparar con todos los distintos metodos de extracción de tokens.

In [9]:
def crearDictTokensPorClases(tipoTokenizer,dfDatos,numTokens=2000,clase=None):
    '''
    :tipoTokenizer: STRING por el cual poder definir que tipo de tokenizador usar\n
    :numTokens: La cantidad de tokens que se quieren extraer\n
    :dfDatos: DataFrame donde se encuentran todos los ejemplos con sus clases correspondientes\n
    :clase: Utilizado o no, para saber que tokens de que clase extraer\n
    :Return: Diccionario en orden descendiente donde la KEY sea el token y el VALUE las veces repetidas dentro de la clase (mas o menos importancia)
    '''
    dfArgupado = dfDatos.groupby('Rating').agg({'Text': ' '.join}).reset_index()
    dictTokens = {}
    if(tipoTokenizer == 'Unigram'):
        numTokens = 10000
        
    if(tipoTokenizer == 'Palabras'):
        if(clase != 'Total'):
            token,rep = np.unique(dfArgupado.loc[dfArgupado['Rating'] == clase, 'Text'].values[0].split(' '),return_counts=True)
        else:
            valores = dfArgupado['Text'].values
            valoresFinal = np.array([])
            for valor in valores:
                valor = valor.split(' ')
                valoresFinal = np.append(valor,valoresFinal)
            token,rep = np.unique(valoresFinal,return_counts=True)

        for i in zip(token,rep):
            dictTokens[i[0]] = i[1]
    
    elif(tipoTokenizer == 'BPE'):
        if(clase == 'Total'):
            valores = dfArgupado['Text'].values
            reviews = np.array([])
            for valor in valores:
                reviews = np.append(valor,reviews)
            reviews = reviews[0]
        else:
            reviews = list(dfArgupado.loc[dfArgupado['Rating'] == clase,'Text'])[0]
        tokenizer = CharBPETokenizer()
        tokenizer.train_from_iterator([reviews], vocab_size=numTokens)
        encoding = tokenizer.encode(reviews)
        for token in encoding.tokens:
            if(token not in dictTokens):
                dictTokens[token] = 0
            dictTokens[token] += 1
    
    elif(tipoTokenizer == 'Unigram'):
        if(clase == 'Total'):
            valores = dfArgupado['Text'].values
            reviews = np.array([])
            for valor in valores:
                reviews = np.append(valor,reviews)
            reviews = reviews[0]
        else:
            reviews = list(dfArgupado.loc[dfArgupado['Rating'] == clase,'Text'])[0]
        token_counts = Counter()
        text_ngrams = ngrams(reviews.split(), 2)
        token_counts.update(text_ngrams)
        dictTokens = dict(token_counts)
    
    elif(tipoTokenizer == 'Wordpiece'):
        if(clase == 'Total'):
            valores = dfArgupado['Text'].values
            reviews = np.array([])
            for valor in valores:
                reviews = np.append(valor,reviews)
            reviews = reviews[0]
        else:
            reviews = list(dfArgupado.loc[dfArgupado['Rating'] == clase,'Text'])[0]
        tokenizer = BertWordPieceTokenizer()
        tokenizer.train_from_iterator([reviews], vocab_size=numTokens)
        encoding = tokenizer.encode(reviews)
        for token in encoding.tokens:
            if(token not in dictTokens):
                dictTokens[token] = 0
            dictTokens[token] += 1
    dictTokens = dict(list(sorted(dictTokens.items(), key=lambda item: item[1], reverse=False)))
    return dictTokens

In [13]:
def crearDictTokensPorEjemplos(tipoTokenizer,dfDatos,numTokens=2000,clase=None):
    '''
    :tipoTokenizer: STRING por el cual poder definir que tipo de tokenizador usar\n
    :numTokens: La cantidad de tokens que se quieren extraer\n
    :dfDatos: DataFrame donde se encuentran todos los ejemplos con sus clases correspondientes\n
    :clase: Utilizado o no, para saber que tokens de que clase extraer\n
    :Return: Diccionario en orden descendiente donde la KEY sea el token y el VALUE las veces repetidas dentro de la clase (mas o menos importancia)
    '''
    dictTokens = {}
    if(tipoTokenizer == 'Unigram'):
        numTokens = 10000
        
    if(tipoTokenizer == 'Palabras'):
        if (clase != 'Total'):
            for review in dfDatos.loc[dfDatos['Rating'] == clase, 'Text']:
                tokens = np.unique(review.split())
                for token in tokens:
                    if (token not in dictTokens):
                        dictTokens[token] = 0
                    dictTokens[token] += 1
        else:
            for review in dfDatos['Text']:
                tokens = np.unique(review.split())
                for token in tokens:
                    if (token not in dictTokens):
                        dictTokens[token] = 0
                    dictTokens[token] += 1
    
    elif(tipoTokenizer == 'BPE'):
        if(clase != 'Total'):
            arrayReviews = dfDatos.loc[dfDatos['Rating'] == clase, 'Text'].to_numpy().tolist()
        else:
            arrayReviews = dfDatos['Text'].to_numpy().tolist()
        # Concatenar todas las cadenas en un solo string por array
        array_de_cadenas = [' '.join(array.split()) for array in arrayReviews]

        # Crear el tokenizador y entrenarlo con las cadenas concatenadas
        tokenizer = CharBPETokenizer()
        tokenizer.train_from_iterator(array_de_cadenas)

        # Obtener los tokens de cada cadena en el array original
        tokens_por_array = []
        for array in arrayReviews:
            array = array.split()
            tokens_por_cadena = []
            for cadena in array:
                tokens = tokenizer.encode(cadena).tokens
                tokens_por_cadena.append(tokens)
            flat_list = [item for sublist in tokens_por_cadena for item in sublist]
            tokens_por_array.append(flat_list)
            
        for review in tokens_por_array:
            for token in review:
                if (token not in dictTokens):
                    dictTokens[token] = 0
                dictTokens[token] += 1
    
    elif(tipoTokenizer == 'Unigram'):
        if(clase != 'Total'):
            arrayReviews = dfDatos.loc[dfDatos['Rating'] == clase, 'Text'].to_numpy().tolist()
        else:
            arrayReviews = dfDatos['Text'].to_numpy().tolist()
        tokens_por_array = []
        for array in arrayReviews:
            tokens_por_cadena = []
            tokens = [tuple(ngram) for ngram in ngrams(array.split(), 2)]
            tokens_por_cadena += tokens
            tokens_por_array.append(tokens_por_cadena)
        
        for review in tokens_por_array:
            for token in review:
                if (token not in dictTokens):
                    dictTokens[token] = 0
                dictTokens[token] += 1
    
    elif(tipoTokenizer == 'Wordpiece'):
        if(clase != 'Total'):
            arrayReviews = dfDatos.loc[dfDatos['Rating'] == clase, 'Text'].to_numpy().tolist()
        else:
            arrayReviews = dfDatos['Text'].to_numpy().tolist()
        # Concatenar todas las cadenas en un solo string por array
        array_de_cadenas = [' '.join(array.split()) for array in arrayReviews]

        # Crear el tokenizador y entrenarlo con las cadenas concatenadas
        tokenizer = BertWordPieceTokenizer()
        tokenizer.train_from_iterator(array_de_cadenas)

        # Obtener los tokens de cada cadena en el array original
        tokens_por_array = []
        for array in arrayReviews:
            array = array.split()
            tokens_por_cadena = []
            for cadena in array:
                tokens = tokenizer.encode(cadena).tokens
                tokens_por_cadena.append(tokens)
            flat_list = [item for sublist in tokens_por_cadena for item in sublist]
            tokens_por_array.append(flat_list)
            
        for review in tokens_por_array:
            for token in review:
                if (token not in dictTokens):
                    dictTokens[token] = 0
                dictTokens[token] += 1
    dictTokens = dict(list(sorted(dictTokens.items(), key=lambda item: item[1], reverse=True)))
    return dictTokens

## TOKENS

In [14]:
def crearCuadroAparicionNumTokens(dictBusqueda,columns,conjuntoDicts):
    '''
    :dictBusqueda: Diccionario sobre el que iterar las busquedas de Tokens sobre los demas\n
    :columns: Los demas diccionarios sobre los que buscar\n
    :Return: Una matriz donde la FILA sean los tokens de clase DESDE donde se busque y la COLUMNA sea la clase SOBRE la que se busque
    '''
    matrizApariciones = pd.DataFrame()
    dictAux = dict.fromkeys(columns)

    for key in dictAux.keys():
        for estrellas in dictAux.keys():
            if(key == estrellas):
                dictAux[estrellas] = '-'
            else:
                dictAux[estrellas] = str(np.round((len(set(conjuntoDicts[key]).intersection(set(conjuntoDicts[estrellas])))/len(set(conjuntoDicts[key])))*100,2)) + '%'
        matrizApariciones = pd.concat([matrizApariciones,pd.DataFrame(dictAux,index=[key])])

    matrizApariciones.columns.name = 'Sobre'
    matrizApariciones.index.name = 'Desde'
    
    return matrizApariciones

In [15]:
def crearMatrizAparicionTokens(dictBusqueda,columns,conjuntoDicts):
    '''
    :dictBusqueda: Diccionario sobre el que iterar las busquedas de Tokens sobre los demas\n
    :columns: Los demas diccionarios sobre los que buscar\n
    :Return: Una matriz donde la FILA sea el token a tratar y las COLUMNAS correspondan al % de aparición en esa clase
    '''
    matrizApariciones = pd.DataFrame()
    dictAux = dict.fromkeys(columns)
    
    for key,value in dictBusqueda.items():
        for estrellas in dictAux:
            if(key not in conjuntoDicts[estrellas]):
                dictAux[estrellas] = 0
            else:
                dictAux[estrellas] = np.round((conjuntoDicts[estrellas][key]/sum(conjuntoDicts[estrellas].values()))*100,3)
        matrizApariciones = pd.concat([matrizApariciones,pd.DataFrame(dictAux,index=[key])])
        
    matrizApariciones = matrizApariciones.head(10).copy()
    
    return matrizApariciones

In [16]:
def crearColumnaNoAparicionTokens(dictBusqueda,dictSobreBusqueda):
    '''
    :dictBusqueda: Diccionario sobre el que iterar las busquedas de Tokens sobre los demas\n
    :dictSobreBusqueda: Diccionario sobre el cual buscar si NO se encuentran los tokens\n
    :Return: Una matriz donde la FILA sea el token a tratar y las COLUMNAS correspondan al % de aparición en esa clase
    '''
    columnaApariciones = pd.DataFrame(columns=['% Aparición'])

    for key,value in dictBusqueda.items():
        if(key not in dictSobreBusqueda):
            columnaApariciones = pd.concat([columnaApariciones,pd.DataFrame({list(columnaApariciones.columns)[0]: np.round((value/sum(dictBusqueda.values()))*100,5)},index=[key])])
            
    columnaApariciones = columnaApariciones.head(10).copy()
    
    return columnaApariciones

In [None]:
'''
Suponemos que los tokens son palabras
'''
dfUtilizar = dfLemmatizer.copy()
tokenizer = 'Palabras'

#Creación de los diccionarios de Tokens
dict1E = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='1.0 out of 5 stars')
dict2E = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='2.0 out of 5 stars')
dict3E = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='3.0 out of 5 stars')
dict4E = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='4.0 out of 5 stars')
dict5E = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='5.0 out of 5 stars')
dictTotal = crearDictTokensPorEjemplos(tokenizer,dfUtilizar,clase='Total')

conjuntoDicts = {'dict1E': dict1E, 'dict2E': dict2E, 'dict3E': dict3E, 'dict4E': dict4E, 'dict5E': dict5E, 'dictTotal': dictTotal}

In [178]:
'''
Representación en cuanto al conjunto total de Tokens
'''

# Construir matriz aparición tokens en % por las clases  
repTokensTotalTF = crearMatrizAparicionTokens(dictTotal,['dict1E','dict2E', 'dict3E', 'dict4E', 'dict5E'],conjuntoDicts)

#Construir matriz aparición num (si / no) por las clases
repTokensTotalCV = crearCuadroAparicionNumTokens(dictTotal,['dict1E','dict2E', 'dict3E', 'dict4E', 'dict5E'],conjuntoDicts)

# # Printear tablas con titulo de lado en lado
# html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens' + '</h3>' + repTokensTotalTF.to_html() + '</div>'
# html2 = '<div style="text-align:center;"><h3>' + 'Num Aparicion Tokens' + '</h3>' + repTokensTotalCV.to_html() + '</div>'
# html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td></tr></table>'

# display(HTML(html))
display(repTokensTotalTF)
display(repTokensTotalCV)

Unnamed: 0,dict1E,dict2E,dict3E,dict4E,dict5E
charge,2.491,2.108,1.735,1.15,1.426
charging,1.884,1.709,1.418,1.058,1.327
cable,1.362,1.4,1.25,1.166,1.628
phone,1.748,1.694,1.271,0.895,1.105
work,1.465,1.174,1.195,1.128,1.455
cord,1.211,1.212,1.121,0.975,1.678
charger,1.387,1.016,1.011,0.863,1.064
one,1.06,1.076,1.039,0.818,0.899
great,0.508,0.662,0.773,1.061,1.966
good,0.438,0.576,0.755,1.345,1.08


Sobre,dict1E,dict2E,dict3E,dict4E,dict5E
Desde,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dict1E,-,57.7%,60.21%,57.65%,50.0%
dict2E,56.4%,-,61.91%,60.87%,52.1%
dict3E,50.04%,52.64%,-,57.66%,48.84%
dict4E,43.28%,46.76%,52.1%,-,45.66%
dict5E,50.2%,53.51%,59.01%,61.05%,-


In [179]:
'''
Representación en cuanto a valoraciones de 1 Estrella
'''

# Construir matriz aparición tokens en % por las clases  
repTokens1E = crearMatrizAparicionTokens(dict1E,['dict2E', 'dict3E', 'dict4E', 'dict5E', 'dictTotal'],conjuntoDicts)

# Construir columna de NO aparicion de Tokens en la clase 2E
tokensNotIn2E = crearColumnaNoAparicionTokens(dict1E,dict2E)
# Construir columna de NO aparicion de Tokens en la clase 3E
tokensNotIn3E = crearColumnaNoAparicionTokens(dict1E,dict3E)
# Construir columna de NO aparicion de Tokens en la clase 4E
tokensNotIn4E = crearColumnaNoAparicionTokens(dict1E,dict4E)
# Construir columna de NO aparicion de Tokens en la clase 5E
tokensNotIn5E = crearColumnaNoAparicionTokens(dict1E,dict5E)


# Printear tablas con titulo de lado en lado
html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens Principales' + '</h3>' + repTokens1E.to_html() + '</div>'
html2 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 2E' + '</h3>' + tokensNotIn2E.to_html() + '</div>'
html3 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 3E' + '</h3>' + tokensNotIn3E.to_html() + '</div>'
html4 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 4E' + '</h3>' + tokensNotIn4E.to_html() + '</div>'
html5 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 5E' + '</h3>' + tokensNotIn5E.to_html() + '</div>'

html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td><td>' + html3 + '</td><td>' + html4 + '</td><td>' + html5 + '</td></tr></table>'

display(HTML(html))

Unnamed: 0_level_0,dict2E,dict3E,dict4E,dict5E,dictTotal
Unnamed: 0_level_1,% Aparición,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,% Aparición,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,% Aparición,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,% Aparición,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
charge,2.108,1.735,1.150,1.426,1.768
charging,1.709,1.418,1.058,1.327,1.467
phone,1.694,1.271,0.895,1.105,1.33
work,1.174,1.195,1.128,1.455,1.272
charger,1.016,1.011,0.863,1.064,1.06
cable,1.400,1.250,1.166,1.628,1.346
cord,1.212,1.121,0.975,1.678,1.218
one,1.076,1.039,0.818,0.899,0.977
month,0.907,0.513,0.230,0.210,0.575
would,0.892,0.868,0.751,0.540,0.807

Unnamed: 0,dict2E,dict3E,dict4E,dict5E,dictTotal
charge,2.108,1.735,1.15,1.426,1.768
charging,1.709,1.418,1.058,1.327,1.467
phone,1.694,1.271,0.895,1.105,1.33
work,1.174,1.195,1.128,1.455,1.272
charger,1.016,1.011,0.863,1.064,1.06
cable,1.4,1.25,1.166,1.628,1.346
cord,1.212,1.121,0.975,1.678,1.218
one,1.076,1.039,0.818,0.899,0.977
month,0.907,0.513,0.23,0.21,0.575
would,0.892,0.868,0.751,0.54,0.807

Unnamed: 0,% Aparición
sell,0.0368
drained,0.03312
burn,0.02944
absolute,0.02944
fried,0.02944
smell,0.02576
gas,0.02576
dangerous,0.02208
offered,0.02208
heated,0.02208

Unnamed: 0,% Aparición
overheated,0.04416
anyway,0.02576
policy,0.02208
glad,0.02208
call,0.02208
heated,0.02208
wurde,0.02208
hooked,0.0184
unsafe,0.0184
forgot,0.0184

Unnamed: 0,% Aparición
trash,0.10304
period,0.04784
percent,0.04784
hazard,0.04416
overheated,0.04416
worthless,0.0368
sell,0.0368
burn,0.02944
slowest,0.02944
intermittent,0.02944

Unnamed: 0,% Aparición
junk,0.15823
poor,0.15455
refund,0.13983
useless,0.13247
none,0.12144
trash,0.10304
window,0.08832
neither,0.08464
disappointing,0.0736
worst,0.06992


In [180]:
'''
Representación en cuanto a valoraciones de 2 Estrella
'''

# Construir matriz aparición tokens en % por las clases  
repTokens2E = crearMatrizAparicionTokens(dict2E,['dict1E', 'dict3E', 'dict4E', 'dict5E', 'dictTotal'],conjuntoDicts)

# Construir columna de NO aparicion de Tokens en la clase 2E
tokensNotIn1E = crearColumnaNoAparicionTokens(dict2E,dict1E)
# Construir columna de NO aparicion de Tokens en la clase 3E
tokensNotIn3E = crearColumnaNoAparicionTokens(dict2E,dict3E)
# Construir columna de NO aparicion de Tokens en la clase 4E
tokensNotIn4E = crearColumnaNoAparicionTokens(dict2E,dict4E)
# Construir columna de NO aparicion de Tokens en la clase 5E
tokensNotIn5E = crearColumnaNoAparicionTokens(dict2E,dict5E)


# Printear tablas con titulo de lado en lado
html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens Principales' + '</h3>' + repTokens2E.to_html() + '</div>'
html2 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 1E' + '</h3>' + tokensNotIn2E.to_html() + '</div>'
html3 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 3E' + '</h3>' + tokensNotIn3E.to_html() + '</div>'
html4 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 4E' + '</h3>' + tokensNotIn4E.to_html() + '</div>'
html5 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 5E' + '</h3>' + tokensNotIn5E.to_html() + '</div>'

html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td><td>' + html3 + '</td><td>' + html4 + '</td><td>' + html5 + '</td></tr></table>'

display(HTML(html))

Unnamed: 0_level_0,dict1E,dict3E,dict4E,dict5E,dictTotal
Unnamed: 0_level_1,% Aparición,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,% Aparición,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,% Aparición,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,% Aparición,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
charge,2.491,1.735,1.150,1.426,1.768
charging,1.884,1.418,1.058,1.327,1.467
phone,1.748,1.271,0.895,1.105,1.33
cable,1.362,1.250,1.166,1.628,1.346
cord,1.211,1.121,0.975,1.678,1.218
work,1.465,1.195,1.128,1.455,1.272
one,1.060,1.039,0.818,0.899,0.977
charger,1.387,1.011,0.863,1.064,1.06
fast,0.751,0.831,0.693,1.035,0.839
month,1.049,0.513,0.230,0.210,0.575

Unnamed: 0,dict1E,dict3E,dict4E,dict5E,dictTotal
charge,2.491,1.735,1.15,1.426,1.768
charging,1.884,1.418,1.058,1.327,1.467
phone,1.748,1.271,0.895,1.105,1.33
cable,1.362,1.25,1.166,1.628,1.346
cord,1.211,1.121,0.975,1.678,1.218
work,1.465,1.195,1.128,1.455,1.272
one,1.06,1.039,0.818,0.899,0.977
charger,1.387,1.011,0.863,1.064,1.06
fast,0.751,0.831,0.693,1.035,0.839
month,1.049,0.513,0.23,0.21,0.575

Unnamed: 0,% Aparición
sell,0.0368
drained,0.03312
burn,0.02944
absolute,0.02944
fried,0.02944
smell,0.02576
gas,0.02576
dangerous,0.02208
offered,0.02208
heated,0.02208

Unnamed: 0,% Aparición
provides,0.02258
nylon,0.02258
anyway,0.01882
physically,0.01882
christmas,0.01505
measure,0.01505
overheated,0.01505
reconnecting,0.01505
asked,0.01505
supplied,0.01505

Unnamed: 0,% Aparición
intermittently,0.03387
frustrating,0.03011
telling,0.02634
iphones,0.02634
awful,0.02258
trash,0.02258
occasional,0.02258
january,0.02258
onto,0.01882
ridiculous,0.01882

Unnamed: 0,% Aparición
useless,0.07151
window,0.07151
send,0.06774
poor,0.06398
disappointing,0.05645
angle,0.05645
defective,0.04893
disconnecting,0.03764
unable,0.03764
unusable,0.03387


In [181]:
'''
Representación en cuanto a valoraciones de 3 Estrella
'''

# Construir matriz aparición tokens en % por las clases  
repTokens3E = crearMatrizAparicionTokens(dict3E,['dict1E', 'dict2E', 'dict4E', 'dict5E', 'dictTotal'],conjuntoDicts)

# Construir columna de NO aparicion de Tokens en la clase 2E
tokensNotIn1E = crearColumnaNoAparicionTokens(dict3E,dict1E)
# Construir columna de NO aparicion de Tokens en la clase 3E
tokensNotIn2E = crearColumnaNoAparicionTokens(dict3E,dict2E)
# Construir columna de NO aparicion de Tokens en la clase 4E
tokensNotIn4E = crearColumnaNoAparicionTokens(dict3E,dict4E)
# Construir columna de NO aparicion de Tokens en la clase 5E
tokensNotIn5E = crearColumnaNoAparicionTokens(dict3E,dict5E)


# Printear tablas con titulo de lado en lado
html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens Principales' + '</h3>' + repTokens3E.to_html() + '</div>'
html2 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 1E' + '</h3>' + tokensNotIn1E.to_html() + '</div>'
html3 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 2E' + '</h3>' + tokensNotIn2E.to_html() + '</div>'
html4 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 4E' + '</h3>' + tokensNotIn4E.to_html() + '</div>'
html5 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 5E' + '</h3>' + tokensNotIn5E.to_html() + '</div>'

html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td><td>' + html3 + '</td><td>' + html4 + '</td><td>' + html5 + '</td></tr></table>'

display(HTML(html))

Unnamed: 0_level_0,dict1E,dict2E,dict4E,dict5E,dictTotal
Unnamed: 0_level_1,% Aparición,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,% Aparición,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,% Aparición,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,% Aparición,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
charge,2.491,2.108,1.150,1.426,1.768
charging,1.884,1.709,1.058,1.327,1.467
phone,1.748,1.694,0.895,1.105,1.33
cable,1.362,1.400,1.166,1.628,1.346
work,1.465,1.174,1.128,1.455,1.272
cord,1.211,1.212,0.975,1.678,1.218
one,1.060,1.076,0.818,0.899,0.977
charger,1.387,1.016,0.863,1.064,1.06
would,0.953,0.892,0.751,0.540,0.807
fast,0.751,0.933,0.693,1.035,0.839

Unnamed: 0,dict1E,dict2E,dict4E,dict5E,dictTotal
charge,2.491,2.108,1.15,1.426,1.768
charging,1.884,1.709,1.058,1.327,1.467
phone,1.748,1.694,0.895,1.105,1.33
cable,1.362,1.4,1.166,1.628,1.346
work,1.465,1.174,1.128,1.455,1.272
cord,1.211,1.212,0.975,1.678,1.218
one,1.06,1.076,0.818,0.899,0.977
charger,1.387,1.016,0.863,1.064,1.06
would,0.953,0.892,0.751,0.54,0.807
fast,0.751,0.933,0.693,1.035,0.839

Unnamed: 0,% Aparición
space,0.06416
compartment,0.04583
carplay,0.03666
powershot,0.03666
mostly,0.03361
ability,0.03055
carrying,0.03055
zip,0.03055
hood,0.03055
constructed,0.03055

Unnamed: 0,% Aparición
excellent,0.03055
gaming,0.02139
installed,0.02139
poco,0.02139
hx,0.02139
eine,0.02139
kann,0.02139
mir,0.02139
app,0.01833
sense,0.01833

Unnamed: 0,% Aparición
middle,0.0275
register,0.0275
gently,0.02444
period,0.02139
listed,0.02139
percent,0.02139
sorry,0.02139
june,0.02139
hazard,0.01833
intermittently,0.01833

Unnamed: 0,% Aparición
angle,0.06111
useless,0.05805
disappointing,0.04278
body,0.03972
carplay,0.03666
lack,0.03361
poor,0.03361
none,0.03361
hoped,0.03055
randomly,0.03055


In [182]:
'''
Representación en cuanto a valoraciones de 4 Estrella
'''

# Construir matriz aparición tokens en % por las clases  
repTokens4E = crearMatrizAparicionTokens(dict4E,['dict1E', 'dict2E', 'dict3E', 'dict5E', 'dictTotal'],conjuntoDicts)

# Construir columna de NO aparicion de Tokens en la clase 2E
tokensNotIn1E = crearColumnaNoAparicionTokens(dict4E,dict1E)
# Construir columna de NO aparicion de Tokens en la clase 3E
tokensNotIn2E = crearColumnaNoAparicionTokens(dict4E,dict2E)
# Construir columna de NO aparicion de Tokens en la clase 4E
tokensNotIn3E = crearColumnaNoAparicionTokens(dict4E,dict3E)
# Construir columna de NO aparicion de Tokens en la clase 5E
tokensNotIn5E = crearColumnaNoAparicionTokens(dict4E,dict5E)


# Printear tablas con titulo de lado en lado
html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens Principales' + '</h3>' + repTokens4E.to_html() + '</div>'
html2 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 1E' + '</h3>' + tokensNotIn1E.to_html() + '</div>'
html3 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 2E' + '</h3>' + tokensNotIn2E.to_html() + '</div>'
html4 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 3E' + '</h3>' + tokensNotIn3E.to_html() + '</div>'
html5 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 5E' + '</h3>' + tokensNotIn5E.to_html() + '</div>'

html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td><td>' + html3 + '</td><td>' + html4 + '</td><td>' + html5 + '</td></tr></table>'

display(HTML(html))

Unnamed: 0_level_0,dict1E,dict2E,dict3E,dict5E,dictTotal
Unnamed: 0_level_1,% Aparición,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,% Aparición,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,% Aparición,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,% Aparición,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
good,0.438,0.576,0.755,1.080,0.846
cable,1.362,1.400,1.250,1.628,1.346
charge,2.491,2.108,1.735,1.426,1.768
work,1.465,1.174,1.195,1.455,1.272
great,0.508,0.662,0.773,1.966,0.969
charging,1.884,1.709,1.418,1.327,1.467
camera,0.052,0.335,0.746,0.837,0.611
cord,1.211,1.212,1.121,1.678,1.218
like,0.486,0.595,0.779,0.829,0.727
phone,1.748,1.694,1.271,1.105,1.33

Unnamed: 0,dict1E,dict2E,dict3E,dict5E,dictTotal
good,0.438,0.576,0.755,1.08,0.846
cable,1.362,1.4,1.25,1.628,1.346
charge,2.491,2.108,1.735,1.426,1.768
work,1.465,1.174,1.195,1.455,1.272
great,0.508,0.662,0.773,1.966,0.969
charging,1.884,1.709,1.418,1.327,1.467
camera,0.052,0.335,0.746,0.837,0.611
cord,1.211,1.212,1.121,1.678,1.218
like,0.486,0.595,0.779,0.829,0.727
phone,1.748,1.694,1.271,1.105,1.33

Unnamed: 0,% Aparición
compartment,0.12143
space,0.09906
storage,0.08948
pleased,0.0703
carrying,0.06072
padded,0.05433
hood,0.04793
protection,0.04474
powershot,0.04474
nicely,0.04154

Unnamed: 0,% Aparición
excellent,0.0703
snugly,0.04793
tough,0.03196
delivery,0.02876
offered,0.02876
zippered,0.02876
ease,0.02556
divider,0.02556
hiking,0.02556
eine,0.02556

Unnamed: 0,% Aparición
glad,0.04154
christmas,0.02876
provides,0.02556
nylon,0.02237
anyway,0.02237
momento,0.02237
tiene,0.02237
storing,0.01917
instruction,0.01598
mounted,0.01598

Unnamed: 0,% Aparición
angle,0.05433
useless,0.03835
removed,0.03515
tad,0.02876
edit,0.02556
tape,0.02556
body,0.02556
divider,0.02556
gut,0.02556
beware,0.02237


In [183]:
'''
Representación en cuanto a valoraciones de 5 Estrella
'''

# Construir matriz aparición tokens en % por las clases  
repTokens5E = crearMatrizAparicionTokens(dict4E,['dict1E', 'dict2E', 'dict3E', 'dict4E', 'dictTotal'],conjuntoDicts)

# Construir columna de NO aparicion de Tokens en la clase 2E
tokensNotIn1E = crearColumnaNoAparicionTokens(dict5E,dict1E)
# Construir columna de NO aparicion de Tokens en la clase 3E
tokensNotIn2E = crearColumnaNoAparicionTokens(dict5E,dict2E)
# Construir columna de NO aparicion de Tokens en la clase 4E
tokensNotIn3E = crearColumnaNoAparicionTokens(dict5E,dict3E)
# Construir columna de NO aparicion de Tokens en la clase 5E
tokensNotIn4E = crearColumnaNoAparicionTokens(dict5E,dict4E)


# Printear tablas con titulo de lado en lado
html1 = '<div style="text-align:center;"><h3>' + '% Aparicion Tokens Principales' + '</h3>' + repTokens5E.to_html() + '</div>'
html2 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 1E' + '</h3>' + tokensNotIn1E.to_html() + '</div>'
html3 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 2E' + '</h3>' + tokensNotIn2E.to_html() + '</div>'
html4 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 3E' + '</h3>' + tokensNotIn3E.to_html() + '</div>'
html5 = '<div style="text-align:center;"><h3>' + '% Tokens NO en 4E' + '</h3>' + tokensNotIn4E.to_html() + '</div>'

html = '<table><tr><td>' + html1 + '</td><td>' + html2 + '</td><td>' + html3 + '</td><td>' + html4 + '</td><td>' + html5 + '</td></tr></table>'

display(HTML(html))

Unnamed: 0_level_0,dict1E,dict2E,dict3E,dict4E,dictTotal
Unnamed: 0_level_1,% Aparición,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,% Aparición,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Unnamed: 0_level_3,% Aparición,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Unnamed: 0_level_4,% Aparición,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
good,0.438,0.576,0.755,1.345,0.846
cable,1.362,1.400,1.250,1.166,1.346
charge,2.491,2.108,1.735,1.150,1.768
work,1.465,1.174,1.195,1.128,1.272
great,0.508,0.662,0.773,1.061,0.969
charging,1.884,1.709,1.418,1.058,1.467
camera,0.052,0.335,0.746,1.016,0.611
cord,1.211,1.212,1.121,0.975,1.218
like,0.486,0.595,0.779,0.914,0.727
phone,1.748,1.694,1.271,0.895,1.33

Unnamed: 0,dict1E,dict2E,dict3E,dict4E,dictTotal
good,0.438,0.576,0.755,1.345,0.846
cable,1.362,1.4,1.25,1.166,1.346
charge,2.491,2.108,1.735,1.15,1.768
work,1.465,1.174,1.195,1.128,1.272
great,0.508,0.662,0.773,1.061,0.969
charging,1.884,1.709,1.418,1.058,1.467
camera,0.052,0.335,0.746,1.016,0.611
cord,1.211,1.212,1.121,0.975,1.218
like,0.486,0.595,0.779,0.914,0.727
phone,1.748,1.694,1.271,0.895,1.33

Unnamed: 0,% Aparición
storage,0.13604
space,0.10306
velcro,0.10306
pleased,0.08245
protection,0.08245
compartment,0.07832
nicely,0.07008
duty,0.07008
carrying,0.06596
padded,0.05771

Unnamed: 0,% Aparición
excellent,0.20612
delivery,0.06184
tough,0.04535
priced,0.04122
wrap,0.04122
electronics,0.03298
guy,0.02886
chew,0.02886
delivered,0.02886
roomy,0.02886

Unnamed: 0,% Aparición
glad,0.06596
guy,0.02886
excelente,0.02886
beautiful,0.02886
glove,0.02886
dad,0.02473
surprise,0.02473
anyway,0.02473
shipped,0.02473
nylon,0.02473

Unnamed: 0,% Aparición
tangled,0.02473
promised,0.02473
saved,0.02061
packed,0.02061
chewing,0.02061
figured,0.01649
withstand,0.01649
investment,0.01649
speaker,0.01649
sell,0.01649
