**A Semantic-Proximity Term-Weighting Scheme for Aspect Category Detection**

Vázquez-Hernández, M., Villaseñor-Pineda, L., & Montes, M. (2022). A Semantic-Proximity Term-Weighting Scheme for Aspect Category Detection. Procesamiento del Lenguaje Natural, 69, 117-127.

**Librerías, Clases y Métodos**
---

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Github/ABAA-model/

/content/drive/MyDrive/Github/ABAA-model


In [None]:
# SS3 Model https://github.com/sergioburdisso/pyss3
pip install pySS3

Collecting pySS3
  Downloading pyss3-0.6.4-py3-none-any.whl.metadata (13 kB)
Collecting iterative-stratification (from pySS3)
  Downloading iterative_stratification-0.1.7-py3-none-any.whl.metadata (1.3 kB)
Downloading pyss3-0.6.4-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification, pySS3
Successfully installed iterative-stratification-0.1.7 pySS3-0.6.4


In [None]:
from pyss3 import SS3
from pyss3.util import Dataset, Evaluation, span
from pyss3.server import Live_Test

from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

#-----------------------
import numpy as np
import math

#-----------------------
#undersampling
import re
import pandas
from collections import defaultdict
from pandas import DataFrame

#-----------------------
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

#Classes used
class Category:
  """Everything stored about one aspect category.

  Keeps the category label, empty placeholders for its entity and aspect,
  the list of its reviews and one ``Polaridad`` bucket per polarity.
  """

  def __init__(self, category):
    # Identification; entity and aspect start out as empty strings.
    self.category = category
    self.entity = ""
    self.aspect = ""

    # All the reviews of this category.
    self.reviews = []

    # One bucket per polarity (reviews, vocabulary, vector).
    self.positivos = Polaridad("positivos")  # positive class
    self.negativos = Polaridad("negativos")  # negative class
    self.neutros = Polaridad("neutros")      # neutral class

class Polaridad:
  """Per-polarity bucket: reviews, vocabulary, vector and threshold statistics."""

  def __init__(self, Polaridad):
    # Name of the polarity this bucket represents.
    self.polaridad = Polaridad

    # Collections start empty; each instance gets its own fresh lists.
    self.reviews, self.vocabulario, self.vector = [], [], []

    # Mean, standard deviation and threshold all start at zero.
    self.promedio = self.dev_std = self.umbral = 0

    # Dictionary for global values.
    self.diccionario_gv = {}

class Review:
  """A single review text together with its derived fields."""

  def __init__(self, review):
    # Text exactly as received.
    self.review = review
    # Cleaned text and polarity label are filled in later; both start empty.
    self.review_clean, self.polaridad = '', ''
    # Category vector associated with this review; starts as an empty list.
    self.vector_cat = list()

class PCategory:
  """Prediction results for one category: its label plus a list of reviews."""

  def __init__(self, category):
    # ``Reviews`` starts empty and is assigned by the classification routines.
    self.category, self.Reviews = category, []

class PReview:
  """Prediction record for one review.

  Stores the review, its number and its polarity label, plus one
  probability slot per polarity and the classification result.
  """

  def __init__(self, review, no_review_, polaridad_):
    # Values supplied by the caller.
    self.no_review_ = no_review_
    self.review_ = review
    self.polaridad_ = polaridad_

    # Probabilities start at zero until a classifier fills them in.
    self.prob_neg = self.prob_pos = self.prob_neu = 0

    # Classification result; empty until set.
    self.res_clasificacion = ""

def CleanReview(review):
    """Lower-case a review, strip punctuation/digits/accents and drop stop words.

    The text is lower-cased, then a fixed sequence of substring replacements
    is applied in order, the result is split on single spaces, and every
    token found in the stop-word list is discarded.

    Returns the surviving tokens, each followed by one space, with one final
    pass collapsing double spaces into single ones.
    """
    # NLTK English stop words plus a fixed list of extra words.
    ST = stopwords.words('english')
    ST.extend([
        "iban", "has", "ahi", "ven", "si", "otro", "esa", "etc", "mas", "asi",
        "uno", "unas", "un", "vez", "ser", "dia", "aun", "pues", "de", "hola",
        "haber",
    ])

    # (old, new) pairs, applied strictly in this order: later entries see the
    # output of earlier ones, so the sequence must not be reordered.
    replacements = (
        ("\n", ""), (',', ' '), ('!', ' '), (':', ' '), ('(', ' '), (')', ' '),
        ('.', ' '), ('$t$s', ' '), ('$t$', ''), ('"', ' '), ('?', ' '),
        ('¿', ' '), ('!', ' '), ('¡', ' '), ('(…)', ' '), ('“”', ' '),
        ('″', ' '), (';', ' '), ('-', ' '), ('//', ' '),
        ('<', ' '), ('+', ' '), ('>', ' '), ('”', ' '), ('“', ' '), ('”', ' '),
        ('//', ' '), ('…', ' '), ('’', ' '), ('º', ' '), ('‘', ' '),
        ('[', ' '), (']', ' '), ('%', ' '), ("/", " "), ("d´", "d"),
        ('  ', ' '), ('€', ''), ('0', ''), ('1', ''), ('2', ''), ('3', ''),
        ("l'o", "o"), ('4', ''), ('5', ''), ('6', ''), ('7', ''), ('8', ''),
        ('9', ''),
        ('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u'),
        ("*", " "), ("-", " "),
    )

    text = review.lower()
    for old, new in replacements:
        text = text.replace(old, new)

    # Tokenise on single spaces and keep everything that is not a stop word.
    kept = [piece for piece in text.split(' ') if piece not in ST]

    # Each kept token is followed by a space (so the result ends with one).
    rebuilt = "".join(piece + ' ' for piece in kept)
    return rebuilt.replace("  ", " ")

def CleanDocs(documentos):
    """Return a new list with ``CleanReview`` applied to every document, in order."""
    return [CleanReview(texto) for texto in documentos]

def cosine_distance(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Computed as ``u.v / (|u| * |v|)``. Despite the function name, the value is
    the cosine *similarity* (1 for parallel vectors, 0 for orthogonal ones),
    not a distance.

    The previous implementation nested the second square root inside the
    first, i.e. divided by ``sqrt(u.u * sqrt(v.v))``, which is only correct
    when ``|v|`` is 1.
    """
    norm_u = math.sqrt(np.dot(u, u))
    norm_v = math.sqrt(np.dot(v, v))
    return np.dot(u, v) / (norm_u * norm_v)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#-----------------------
#SVM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

#random undersampling
import numpy as np
import re
import pandas
from collections import defaultdict
from pandas import DataFrame

#random oversampling
from imblearn.over_sampling import RandomOverSampler
#---------------------------------
import re
from scipy import spatial



In [None]:
#-------------------------
#CNN-LSTM
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.layers import *
from keras.initializers import Constant


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from tensorflow.keras import layers
from tensorflow.keras import regularizers

In [None]:
#SIMILITUD COSENO
import re
from scipy import spatial

def cos_sim(u,v):
  """Return the cosine similarity of ``u`` and ``v``: one minus SciPy's cosine distance."""
  return 1 - spatial.distance.cosine(u, v)

In [None]:
"""SVM"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#Count words frequency in documents
def tokens(doc):
  """Return a generator over the lower-cased ``\\w+`` matches found in ``doc``."""
  # The regex runs immediately; only the lower-casing is lazy.
  palabras = re.findall(r"\w+", doc)
  return (palabra.lower() for palabra in palabras)

def frequency(tokens, review,vector_categoria, vocabulario):
  """Build the weighted bag-of-words vector of one document.

  Every token that is present both in the global embedding ``model`` and in
  ``vocabulario`` adds ``cos_sim(vector_categoria, model[token])`` to that
  term's entry. ``review`` is accepted but not used.

  Returns a list with one weight per item of ``vocabulario``, in the same
  order; terms that were never seen keep the value 0.
  """
  pesos = dict.fromkeys(vocabulario, 0)

  for palabra in tokens:
    # Skip words without an embedding, and words outside the vocabulary.
    if palabra not in model or palabra not in pesos:
      continue
    pesos[palabra] += cos_sim(vector_categoria, model[palabra])

  return [pesos[termino] for termino in vocabulario]

def tokens_frequency(doc,vector_categoria,Vocabulario):
  """Tokenise ``doc`` and return its weighted vector over ``Vocabulario``."""
  # Two independent token streams: ``frequency`` takes one as its token
  # argument and one as its (unused) ``review`` argument.
  flujo_tokens = tokens(doc)
  flujo_review = tokens(doc)
  return frequency(flujo_tokens, flujo_review, vector_categoria, Vocabulario)


#Todo el vocabulario observado en los reviews de entrenamiento
def Vocabulario_train(Reviews_train):
  """Return the vocabulary observed in the training reviews.

  ``Reviews_train`` is an iterable of objects exposing a ``review`` string.
  Each text is split on single spaces; the result lists every distinct token
  once, in order of first appearance.

  A dict is used as an ordered set, so de-duplication is linear instead of
  re-scanning the growing list for every token.
  """
  vistos = dict.fromkeys(
      token
      for review in Reviews_train
      for token in review.review.split(' ')
  )
  return list(vistos)


#Todo el vocabulario observado en los reviews de entrenamiento
def Vocabulario_train_Over(Reviews_train):
  """Return the vocabulary observed in a collection of plain review strings.

  Each string is split on single spaces; the result lists every distinct
  token once, in order of first appearance.

  A dict is used as an ordered set, so de-duplication is linear instead of
  re-scanning the growing list for every token.
  """
  vistos = dict.fromkeys(
      token
      for review in Reviews_train
      for token in review.split(' ')
  )
  return list(vistos)

GloVe: Embeddings
---

GloVe: Embeddings Wikipedia

---

GloVe: Embeddings Twitter


---

In [None]:
#-------------------------------------
# Load the pre-trained GloVe Twitter embeddings (already converted to the
# word2vec text format) into the module-level ``model``. Other cells read
# ``model`` as a global, e.g. ``frequency`` and ``VectorUnico``.
"""Twitter"""
word2vec_output_file = './Files/glove.twitter.27B.200d.txt.word2vec'
# Alternative 25-dimensional file, and the one-off GloVe -> word2vec conversion
# step, kept for reference:
# word2vec_output_file = '/content/drive/MyDrive/glove.twitter.27B.25d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Matches the "200d" file loaded above; keep the two in sync when switching files.
embedding_size = 200



Métodos para la extracción de léxico por categoría-polaridad
---

**Extracción** de léxico por categoría por polaridad

---

In [None]:
import statistics

# category = 'LOCATION#GENERAL'
# if True:
def _leer_lineas(ruta):
  """Return the lines of the UTF-8 text file ``ruta`` with newline characters removed."""
  with open(ruta, 'r', encoding='utf-8') as archivo:
    return [linea.replace("\n", "") for linea in archivo]


def _cargar_particion(prefijo, category):
  """Load one data split and keep only the usable samples of ``category``.

  Reads the three parallel files ``prefijo + '.reviews.gold'``,
  ``prefijo + '.class.gold'`` and ``prefijo + '.polarities.gold'`` (one sample
  per line) and keeps the samples whose class equals ``category`` and whose
  polarity is neither "neutral" nor "conflict".

  Returns ``(cleaned_documents, polarities)``.
  """
  documentos = _leer_lineas(prefijo + '.reviews.gold')
  clases = _leer_lineas(prefijo + '.class.gold')
  polaridades = _leer_lineas(prefijo + '.polarities.gold')

  x, y = [], []
  for indice in range(len(documentos)):
    polaridad = polaridades[indice]
    if clases[indice] == category and polaridad != "neutral" and polaridad != "conflict":
      x.append(documentos[indice])
      y.append(polaridad)

  return CleanDocs(x), y


def Lexicos_polaridad_categoria(category):
  """Extract the per-polarity lexicon of ``category`` with an SS3 classifier.

  Steps:
    1. Load the train and test splits for ``category`` (neutral and conflict
       samples are discarded) and clean the documents.
    2. Train SS3, grid-search its ``s``/``l``/``p`` hyperparameters, apply the
       best values and train again.
    3. For every learned polarity, write ``<polarity>.txt`` with one
       ``term value`` line per vocabulary entry and build a ``Polaridad``
       object holding the vocabulary, the term->value dictionary and the
       mean / standard deviation of the collected values.

  Returns:
    ``(Vocabulario_polaridades_train, x_train, y_train, x_test, y_test)``
    where the first element is the list of ``Polaridad`` objects.
  """
  print(category)
  clf = SS3()

  ruta_base = '/content/drive/MyDrive/ProyectoTesis_INAOE/restaurante_EN/SLOT3/S3-V3_EN_REST_SB1_'
  x_train, y_train = _cargar_particion(ruta_base + 'TRAIN', category)
  x_test, y_test = _cargar_particion(ruta_base + 'TEST', category)

  clf.train(x_train, y_train)  # train to compute the SS3 term weights

  # NOTE(review): the grid search is evaluated on the test split, so the
  # selected hyperparameters are tuned on test data - confirm this is intended.
  s_vals=[0.2 , 0.32, 0.44, 0.56, 0.68, 0.8]
  l_vals=[0.1 , 0.48, 0.86, 1.24, 1.62, 2]
  p_vals=[1.75, 1.95, 2.15, 2.35, 2.55, 2.75]
  best_s, best_l, best_p, _ = Evaluation.grid_search(clf,x_test, y_test, s=s_vals,l=l_vals,p=p_vals)

  # The second argument is the significance (l) hyperparameter; the previous
  # code passed best_s twice and silently discarded the tuned best_l.
  clf.set_hyperparameters(best_s, best_l, best_p, 0.0)
  clf.train(x_train, y_train)  # train again with the selected hyperparameters

  polaridades = clf.get_categories()  # categories identified by the classifier

  # Helpers used to derive the threshold statistics.
  # NOTE(review): both containers are shared by every polarity, so the mean /
  # deviation of a later polarity also include the values of earlier ones and
  # every Polaridad object references the same dictionary (a term present in
  # several polarities keeps the value written last). Kept as-is to preserve
  # the existing results - confirm whether per-polarity containers were meant.
  ValoresGlobales = []
  Diccionario_terminos = dict()

  Vocabulario_polaridades_train = []

  # ``ctr`` is the index of the category inside the classifier.
  for ctr, polaridad in enumerate(polaridades):
    if polaridad == 'conflict' or polaridad == "neutral":
      continue

    CPolaridad = Polaridad(polaridad)
    vocab_ss3 = clf.__get_category_vocab__(ctr)

    vocabulario = []
    with open(polaridad + '.txt','w', encoding='utf-8') as freqFile:
      for item in vocab_ss3:
        # item[0] is the term; item[4] is the value used as its weight
        # (presumably SS3's confidence value - verify against PySS3).
        vocabulario.append(item[0])
        freqFile.write(item[0] + ' ' + str(item[4]) + '\n')
        ValoresGlobales.append(item[4])
        Diccionario_terminos[item[0]] = item[4]

    # Statistics over the values collected so far.
    promedio = sum(ValoresGlobales)/len(ValoresGlobales)
    if len(ValoresGlobales) > 1:
      dev_std = statistics.stdev(ValoresGlobales)
    else:
      # stdev needs at least two points; fall back to the single value.
      dev_std = ValoresGlobales[0]

    CPolaridad.diccionario_gv = Diccionario_terminos
    CPolaridad.promedio = promedio
    CPolaridad.dev_std = dev_std
    CPolaridad.vocabulario = vocabulario
    Vocabulario_polaridades_train.append(CPolaridad)

  return Vocabulario_polaridades_train, x_train, y_train, x_test, y_test

Vector Único

---

In [None]:
def VectorUnico(vocabulario, gv, x, promedio, dev_std, e_umbral):
  """Average the embeddings of the vocabulary terms that pass a threshold.

  Args:
    vocabulario: iterable of terms.
    gv: mapping term -> value that is compared against the threshold.
    x: multiplier applied to ``dev_std`` when the threshold is enabled.
    promedio: mean used as the base of the threshold.
    dev_std: standard deviation used in the threshold.
    e_umbral: when equal to ``True`` the threshold is
      ``promedio + x * dev_std`` (and is printed); otherwise it is 0.

  Returns:
    A NumPy array: the element-wise mean of ``model[term]`` over every term
    whose value is >= the threshold and that exists in the global ``model``.

  Raises:
    ValueError: if no term qualifies. Previously this case failed with an
      ``UnboundLocalError`` on the never-assigned accumulator.
  """
  import numpy as np

  # Equality test kept on purpose so non-boolean arguments behave as before.
  if e_umbral == True:
    umbral = promedio + (x*dev_std)
    print("Umbral: " + str(umbral))
  else:
    umbral = 0

  contador = 0
  Vector = None

  # Accumulate the sum of the embeddings of the qualifying terms.
  for item in vocabulario:
    if gv[item] >= umbral and item in model:
      contador += 1
      if Vector is None:
        Vector = model[item]
      else:
        Vector = np.asarray(Vector + model[item])

  if Vector is None:
    raise ValueError(
        "VectorUnico: no vocabulary term reaches the threshold and has an embedding"
    )

  # Mean vector; computed element by element to keep the original result dtype.
  CVector = np.asarray([n * (1/contador) for n in Vector])
  return CVector

# SVM

In [None]:
import re
from collections import defaultdict

def SVM(categoria, x_train, y_train, x_test, y_test):
  """Train and evaluate a linear multi-class SVM for one category.

  Reviews labelled "neutral" or "conflict" are ignored. Each remaining review
  is turned into a weighted vocabulary vector by ``tokens_frequency``, using
  the negative or positive category vector that matches the review's label.
  Prints the category, the confusion matrix, the accuracy and the
  classification report. Returns ``None``; nothing is trained when no
  training review remains after filtering.

  NOTE(review): test reviews also pick their category vector from their gold
  label, so the test features depend on the label being predicted.
  """
  print(categoria.category)

  descartadas = ("neutral", "conflict")  # polarities that are not modelled
  vec_negative = categoria.negativos.vector
  vec_positive = categoria.positivos.vector

  # Training data: one Review object (with its category vector) per kept doc.
  Reviews_train_, Label_train_ = [], []
  for posicion, doc in enumerate(x_train):
    etiqueta = y_train[posicion]
    if etiqueta in descartadas:
      continue

    review = Review(doc)
    review.polaridad = etiqueta.lower()
    if etiqueta == "negative":
      review.vector_cat = vec_negative
    elif etiqueta == "positive":
      review.vector_cat = vec_positive

    Label_train_.append(etiqueta)
    Reviews_train_.append(review)

  if not Reviews_train_:
    return

  Vocabulario = Vocabulario_train(Reviews_train_)

  # Test data, prepared the same way.
  Reviews_test_, Label_test_ = [], []
  for posicion, doc in enumerate(x_test):
    etiqueta = y_test[posicion]
    if etiqueta in descartadas:
      continue

    review = Review(doc)
    review.polaridad = etiqueta.lower()
    review.vector_cat = vec_negative if etiqueta == "negative" else vec_positive

    Label_test_.append(etiqueta)
    Reviews_test_.append(review)

  print("INICIA SVM Y PREPARACIÓN DE DATOS")

  X_train = [tokens_frequency(r.review, r.vector_cat, Vocabulario) for r in Reviews_train_]
  svc = SVC(kernel='linear', C=2.0, decision_function_shape='ovo',probability=True).fit(X_train,Label_train_)

  X_test = [tokens_frequency(r.review, r.vector_cat, Vocabulario) for r in Reviews_test_]
  pred = svc.predict(X_test)

  print(confusion_matrix(Label_test_, pred))
  print('accuracy score %0.3f' % svc.score(X_test, Label_test_))
  print(metrics.classification_report(Label_test_,pred) )

In [None]:
import re
from collections import defaultdict

def SVM_binario(categoria, x_train, y_train, x_test, y_test):
  """Train one binary (one-vs-rest) linear SVM per polarity and collect probabilities.

  For "negative" and then "positive", the non-neutral / non-conflict training
  reviews are labelled 1 when they have that polarity and 0 otherwise, and
  are vectorised with ``tokens_frequency`` using the category vector of the
  polarity being modelled. The probability of class 1 for every kept test
  review is stored in ``prob_neg`` / ``prob_pos`` of its ``PReview``.

  Returns:
    A ``PCategory`` whose ``Reviews`` holds one ``PReview`` per element of
    ``x_test``, in order. Test reviews labelled "neutral" or "conflict" are
    not scored and keep probability 0.
  """
  print(categoria.category)

  # One result record per test review, indexed like x_test.
  PCategory_ = PCategory(categoria.category)
  PReviews = [PReview(review, numero, y_test[numero]) for numero, review in enumerate(x_test)]

  polaridades = ["negative","positive"]  # NEUTRAL must not be considered

  for polaridad in polaridades:
    print(polaridad)

    if polaridad == 'negative':
      vec_categoria = categoria.negativos.vector
    else:
      vec_categoria = categoria.positivos.vector

    # Binary training set: 1 = current polarity, 0 = the other one.
    Reviews_train_, Label_train_ = [], []
    for posicion, doc in enumerate(x_train):
      etiqueta = y_train[posicion]
      if etiqueta != "neutral" and etiqueta != "conflict":
        Label_train_.append(1 if etiqueta == polaridad else 0)
        Reviews_train_.append(doc)

    if not Reviews_train_:
      continue

    Vocabulario = Vocabulario_train_Over(Reviews_train_)

    # Kept test reviews plus their positions in x_test, so each probability
    # row is written back to the PReview it belongs to.
    Reviews_test_, indices_test = [], []
    for posicion, doc in enumerate(x_test):
      etiqueta = y_test[posicion]
      if etiqueta != "neutral" and etiqueta != "conflict":
        Reviews_test_.append(doc)
        indices_test.append(posicion)

    print("INICIA SVM Y PREPARACIÓN DE DATOS")

    X_train = [tokens_frequency(d,vec_categoria,Vocabulario) for d in Reviews_train_]
    svc = SVC(C=2.5, kernel='linear', degree=2, gamma='auto',probability=True).fit(X_train,Label_train_)

    if not Reviews_test_:
      # Nothing to score for this polarity.
      continue

    X_test = [tokens_frequency(d,vec_categoria,Vocabulario) for d in Reviews_test_]

    # Column 1 is the probability of label 1 (the current polarity).
    # The previous code indexed the rows with range(len(x_test)), which
    # misaligns or overruns as soon as one test review was filtered out.
    probabilidades = svc.predict_proba(X_test)
    for fila, posicion in enumerate(indices_test):
      if polaridad == 'negative':
        PReviews[posicion].prob_neg = probabilidades[fila][1]
      else:
        PReviews[posicion].prob_pos = probabilidades[fila][1]

  PCategory_.Reviews = PReviews
  return PCategory_

In [None]:
import re
from collections import defaultdict
from collections import Counter

def SVM_OverSampling(categoria, x_train, y_train, x_test, y_test):
  """Train and evaluate a linear SVM after random oversampling of the training set.

  Reviews labelled "neutral" or "conflict" are ignored. The remaining
  training reviews are balanced with ``RandomOverSampler``; every review is
  vectorised by ``tokens_frequency`` with the negative or positive category
  vector that matches its label. Prints the category, the resampled class
  distribution, the confusion matrix, the accuracy and the classification
  report. Returns ``None``; nothing is trained when no training review
  remains after filtering.

  NOTE(review): test reviews also pick their category vector from their gold
  label, so the test features depend on the label being predicted.
  """
  print(categoria.category)

  vec_negative = categoria.negativos.vector
  vec_positive = categoria.positivos.vector

  # Keep only the reviews with a usable polarity.
  Reviews_train_, Label_train_ = [], []
  for posicion, doc in enumerate(x_train):
    etiqueta = y_train[posicion]
    if etiqueta != "neutral" and etiqueta != "conflict":
      Label_train_.append(etiqueta)
      Reviews_train_.append(doc)

  if not Reviews_train_:
    return

  Vocabulario = Vocabulario_train_Over(Reviews_train_)

  # Oversampling strategy: random over-sampling of the minority class.
  ros = RandomOverSampler()
  X = DataFrame(Reviews_train_,columns=['review'])
  X_ros, y_ros = ros.fit_resample(X, Label_train_)

  # New class distribution.
  print(Counter(y_ros))

  # fit_resample may hand back a DataFrame or an array depending on the
  # imbalanced-learn version; X_ros[i][0] is a column lookup (KeyError) on a
  # DataFrame, so read the single text column positionally instead.
  textos_ros = DataFrame(X_ros).iloc[:, 0].tolist()
  etiquetas_ros = list(y_ros)

  Reviews_train_Over = []
  Label_train_over = []
  for texto, etiqueta in zip(textos_ros, etiquetas_ros):
    review = Review(texto)
    if etiqueta == "negative":
      review.vector_cat = vec_negative
    elif etiqueta == "positive":
      review.vector_cat = vec_positive

    Label_train_over.append(etiqueta)
    Reviews_train_Over.append(review)

  # Test data.
  Reviews_test_ = []
  Label_test_ = []
  for posicion, doc in enumerate(x_test):
    etiqueta = y_test[posicion]
    if etiqueta != "neutral" and etiqueta != "conflict":
      review = Review(doc)
      review.polaridad = etiqueta.lower()
      review.vector_cat = vec_negative if etiqueta == "negative" else vec_positive

      Label_test_.append(etiqueta)
      Reviews_test_.append(review)

  print("INICIA SVM Y PREPARACIÓN DE DATOS")

  X_train = [tokens_frequency(d.review,d.vector_cat,Vocabulario) for d in Reviews_train_Over]
  svc = SVC(kernel='linear', C=2, decision_function_shape='ovo',probability=True).fit(X_train,Label_train_over)

  X_test = [tokens_frequency(d.review,d.vector_cat,Vocabulario) for d in Reviews_test_]

  pred = svc.predict(X_test)
  print(confusion_matrix(Label_test_, pred))
  print('accuracy score %0.3f' % svc.score(X_test, Label_test_))
  print(metrics.classification_report(Label_test_,pred) )

In [None]:
import re
from collections import defaultdict
from collections import Counter

def SVM_OverSampling_binario(categoria, x_train, y_train, x_test, y_test):
  """Train one oversampled binary linear SVM per polarity and collect probabilities.

  For "negative" and then "positive", the non-neutral / non-conflict training
  reviews are labelled 1 when they have that polarity and 0 otherwise,
  balanced with ``RandomOverSampler`` and vectorised with ``tokens_frequency``
  using the category vector of the polarity being modelled. The probability
  of class 1 for every kept test review is stored in ``prob_neg`` /
  ``prob_pos`` of its ``PReview``.

  Returns:
    A ``PCategory`` whose ``Reviews`` holds one ``PReview`` per element of
    ``x_test``, in order. Test reviews labelled "neutral" or "conflict" are
    not scored and keep probability 0.
  """
  print(categoria.category)

  # One result record per test review, indexed like x_test.
  PCategory_ = PCategory(categoria.category)
  PReviews = [PReview(review, numero, y_test[numero]) for numero, review in enumerate(x_test)]

  polaridades = ["negative","positive"]  # NEUTRAL must not be considered

  for polaridad in polaridades:
    print(polaridad)

    if polaridad == 'negative':
      vec_categoria = categoria.negativos.vector
    else:
      vec_categoria = categoria.positivos.vector

    # Binary training set: 1 = current polarity, 0 = the other one.
    Reviews_train_, Label_train_ = [], []
    for posicion, doc in enumerate(x_train):
      etiqueta = y_train[posicion]
      if etiqueta != "neutral" and etiqueta != "conflict":
        Label_train_.append(1 if etiqueta == polaridad else 0)
        Reviews_train_.append(doc)

    if not Reviews_train_:
      continue

    Vocabulario = Vocabulario_train_Over(Reviews_train_)

    # Oversampling strategy: random over-sampling of the minority class.
    ros = RandomOverSampler()
    X = DataFrame(Reviews_train_,columns=['review'])
    X_ros, y_ros = ros.fit_resample(X, Label_train_)

    # fit_resample may hand back a DataFrame or an array depending on the
    # imbalanced-learn version; X_ros[i][0] is a column lookup (KeyError) on a
    # DataFrame, so read the single text column positionally instead.
    Reviews_train_Over = DataFrame(X_ros).iloc[:, 0].tolist()
    Label_train_over = list(y_ros)

    # Kept test reviews plus their positions in x_test, so each probability
    # row is written back to the PReview it belongs to.
    Reviews_test_, indices_test = [], []
    for posicion, doc in enumerate(x_test):
      etiqueta = y_test[posicion]
      if etiqueta != "neutral" and etiqueta != "conflict":
        Reviews_test_.append(doc)
        indices_test.append(posicion)

    print("INICIA SVM Y PREPARACIÓN DE DATOS")

    X_train = [tokens_frequency(texto,vec_categoria,Vocabulario) for texto in Reviews_train_Over]
    svc = SVC(C=2.5, kernel='linear', degree=2, gamma='auto', probability=True).fit(X_train,Label_train_over)

    if not Reviews_test_:
      # Nothing to score for this polarity.
      continue

    X_test = [tokens_frequency(texto,vec_categoria,Vocabulario) for texto in Reviews_test_]

    # Column 1 is the probability of label 1 (the current polarity).
    # The previous code indexed the rows with range(len(x_test)), which
    # misaligns or overruns as soon as one test review was filtered out.
    probabilidades = svc.predict_proba(X_test)
    for fila, posicion in enumerate(indices_test):
      if polaridad == 'negative':
        PReviews[posicion].prob_neg = probabilidades[fila][1]
      else:
        PReviews[posicion].prob_pos = probabilidades[fila][1]

  PCategory_.Reviews = PReviews
  return PCategory_


CNN
---

In [None]:
import re
import pandas
from collections import defaultdict
from pandas import DataFrame

def CNN_oversampling_binario(categoria, x_train, y_train, x_test, y_test):

  print(categoria.category)

  ##############################################################
  #Datos de clasificación
  PCategory_ = PCategory(categoria.category)

  PReviews = []
  contador = 0
  for review in x_test:
    PReview_ = PReview(review,contador,y_test[contador])
    PReviews.append(PReview_)
    contador += 1


  polaridades = ["negative","positive"] #No se debe considerar la polaridad NEUTRAL

  for polaridad in polaridades:
    print(polaridad)
    # Agrupar información de reviews en un solo conjunto de datos
    Reviews_train_ = []
    Reviews_train_1 = []
    Label_train_ = []
    contador = 0
    flag = 0
    positivos = 0
    negativos = 0


    if polaridad == 'negative':
      vec_categoria = categoria.negativos.vector
    else:
      vec_categoria = categoria.positivos.vector

     # print(y_train)

    for doc in x_train:
      if y_train[contador] != "neutral" and y_train[contador] != "conflict":
      # if y_train[contador] != "conflict":
        if y_train[contador] == polaridad:
          Reviews_train_1.append(doc)
          Label_train_.append(1)
        else:
          Label_train_.append(0)

        Reviews_train_.append(doc)

      contador += 1


    if len(Reviews_train_) > 0:
      Vocabulario = Vocabulario_train_Over(Reviews_train_1)

      ###############################################################
      # Oversampling strategy - 21/03/21
      # instantiating the random over sampler
      ros = RandomOverSampler()

      # resampling X, y
      X = DataFrame(Reviews_train_,columns=['review'])

      X_ros, y_ros = ros.fit_resample(X, Label_train_)

      # new class distribution
      # print(Counter(y_ros))
      # print(y_ros)

      Reviews_train_Over = []
      Label_train_over = []

      for i in range(len(X_ros)):
        # review = Review(X_ros[i][0])
        # review.vector_cat = vec_categoria
        Label_train_over.append(y_ros[i])
        Reviews_train_Over.append(X_ros[i][0])

      # print(Reviews_train_Over)

      ################################################################

      # Max-Lenght
      max_length = max([len(s.split()) for s in Reviews_train_Over])
      # ################################################################


      # Datos de test
      Reviews_test_ = []
      Label_test_ = []
      contador = 0

      for doc in x_test:
        if y_test[contador] != "neutral" and y_test[contador] != "conflict":
        # if y_test[contador] != "conflict":
          # review = Review(doc)
          # review.polaridad = y_test[contador].lower()
          # review.vector_cat = vec_categoria

          if y_test[contador] == polaridad:
            Label_test_.append(1)
          else:
            Label_test_.append(0)

          Reviews_test_.append(doc)
        contador += 1



      embedding_dim = 200
      vocabulary_size = len(Vocabulario) #len(Vocabulario) #20000
      tokenizer = Tokenizer(num_words = vocabulary_size)
      tokenizer.fit_on_texts(Reviews_train_1)
      sequences = tokenizer.texts_to_sequences(Reviews_train_Over)
      data_train = pad_sequences(sequences, maxlen=max_length)

      tokenizer = Tokenizer(num_words = vocabulary_size)
      tokenizer.fit_on_texts(Reviews_train_1)

      sequences = tokenizer.texts_to_sequences(Reviews_test_)
      data_test = pad_sequences(sequences, maxlen = max_length)


      #Embedding Matrix
      embeddings_index = dict()
      for word in model.index2word:
        embeddings_index[word] = np.asarray(model[word],dtype='float32')


      print('Loaded %s word vectors.' %len(embeddings_index))

      # create a weight matrix for words in training reviews
      embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
      for word, index in tokenizer.word_index.items():
        if index > vocabulary_size - 1:
          break
        else:
          embedding_vector = embeddings_index.get(word)
          if embedding_vector is not None:
            # wordVector = VectorRelation(word,embedding_vector)
            cosine = cos_sim(embedding_vector,vec_categoria)
            wordVector = [n * cosine for n in embedding_vector]
            embedding_matrix[index] =  wordVector #embedding_vector

      # print(embedding_matrix)

      # model_glove = Sequential()
      # model_glove.add(Embedding(vocabulary_size,embedding_dim,input_length=max_length,weights=[embedding_matrix],trainable=False))

      #--------------------------------------------------
      """https://unipython.com/proyecto-desarrollar-un-modelo-de-incrustacion-cnn-para-el-analisis-de-sentimientos/"""
      #configuración CNN

      # model_glove.add(Conv1D(filters=256,kernel_size=3,activation='relu'))
      # model_glove.add(MaxPooling1D(pool_size=3))

      # model_glove.add(Flatten())
      # # model_glove.add(Dense(10, activation="relu"))
      # model_glove.add(Dense(1, activation="sigmoid"))


      #------------------------------------------------

            #--------------------------------------------------
      #03-05-2021
      #CNN con multiples kernels (capas a mismo nivel de convolución)
      num_filters = 256

      inputs_2 = Input(shape=(max_length,), dtype='int32')
      embedding_layer_2 = Embedding(vocabulary_size,embedding_dim, embeddings_initializer=Constant(embedding_matrix),input_length=max_length, trainable=False)(inputs_2)

      conv_0 = Conv1D(num_filters, kernel_size=1, activation='relu')(embedding_layer_2)
      conv_1 = Conv1D(num_filters, kernel_size=2, activation='relu')(embedding_layer_2)
      conv_2 = Conv1D(num_filters, kernel_size=3, activation='relu')(embedding_layer_2)

      maxpool_0 = MaxPool1D(pool_size=(max_length - 1 + 1), strides=1, padding='valid')(conv_0)
      maxpool_1 = MaxPool1D(pool_size=(max_length - 2 + 1), strides=1, padding='valid')(conv_1)
      maxpool_2 = MaxPool1D(pool_size=(max_length - 3 + 1), strides=1, padding='valid')(conv_2)

      concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
      flatten_2 = Flatten()(concatenated_tensor)

      dropout_2 = Dropout(0.5)(flatten_2)
      output_2 = Dense(units=1, activation='sigmoid')(dropout_2)

      # model_glove.add(Dense(10, activation="relu"))
      # model_glove.add(Dense(1, activation="sigmoid"))
      #------------------------------------------------

      #------------------------------------------------
      """"https://www.tensorflow.org/api_docs/python/tf/keras/layers/Bidirectional"""
      # #Configruacion BiLSTM
      # forward_layer = LSTM(10, return_sequences=True)
      # backward_layer = LSTM(10, activation='relu',return_sequences=True, go_backwards=True)
      # model_glove.add(Bidirectional(forward_layer,backward_layer=backward_layer, input_shape=(5,10)))
      # model_glove.add(Flatten())
      # model_glove.add(Dense(1, activation="sigmoid"))
      # # model_glove.add(Dense(5))
      # model_glove.add(Activation('softmax'))

      #-----------------------------------------------------------------------


      # model_glove.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
      model_2 = Model(inputs=inputs_2, outputs=output_2)
      model_2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

      # model_glove.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy']) #for CNN-BiLSTM

      # model_glove.fit(data_train,np.array(Label_train_over),validation_split=0.4,epochs=9)
      # model_glove.fit(data_train,np.array(Label_train_over),validation_split=0.4,epochs=11)
      batch_size = 64
      model_2.fit(data_train,np.array(Label_train_over), epochs=9, batch_size=batch_size, verbose=1, validation_split=0.2)




      scores = model_2.evaluate(data_test,np.array(Label_test_), verbose=0)
      print("Accuracy: %.2f%%" % (scores[1]*100))

      y_pred = model_2.predict(data_test,batch_size=64,verbose=1)

      #Almacenar información de predicción
      for i in range(len(x_test)):
        if polaridad == 'negative':
          PReviews[i].prob_neg = y_pred[i]
        else:
          PReviews[i].prob_pos = y_pred[i]

      # print(y_pred)
      y_pred = (y_pred>0.2)
      # print(y_pred)
      list(y_pred)




      cm = confusion_matrix(np.array(Label_test_), y_pred)
      print(cm)

      print(classification_report(np.array(Label_test_),y_pred))

  PCategory_.Reviews = PReviews
  return PCategory_

In [None]:
import re
import pandas
from collections import defaultdict
from pandas import DataFrame

def CNN_binario(categoria, x_train, y_train, x_test, y_test):
  """Train one binary CNN per polarity and store its scores on the test reviews.

  For each of "negative" and "positive", reviews labelled "neutral" or
  "conflict" are dropped and the remaining ones are relabelled 1 (target
  polarity) or 0 (the other polarity). A multi-kernel 1-D CNN is then trained
  on a frozen embedding matrix in which every word vector is multiplied by
  its cosine similarity to the category vector of that polarity.

  Args:
    categoria: object exposing ``category``, ``negativos.vector`` and
      ``positivos.vector``.
    x_train: training review texts (strings).
    y_train: polarity labels, indexed by position in parallel with x_train.
    x_test: test review texts (strings).
    y_test: polarity labels, indexed by position in parallel with x_test.

  Returns:
    A ``PCategory`` whose ``Reviews`` attribute holds one ``PReview`` per item
    of ``x_test``. ``prob_neg`` / ``prob_pos`` are set from the sigmoid output
    of the matching model for every review that was actually scored; reviews
    filtered out as neutral/conflict keep whatever ``PReview`` initialised.
  """
  print(categoria.category)

  # Container for the classification results of this category.
  PCategory_ = PCategory(categoria.category)
  PReviews = [PReview(review, idx, y_test[idx]) for idx, review in enumerate(x_test)]

  # The NEUTRAL (and CONFLICT) polarity is never modelled.
  excluidas = ("neutral", "conflict")
  polaridades = ["negative", "positive"]

  # The word -> vector lookup does not depend on the polarity, so it is built
  # lazily once and reused instead of being rebuilt on every iteration.
  embeddings_index = None

  for polaridad in polaridades:
    print(polaridad)

    if polaridad == 'negative':
      vec_categoria = categoria.negativos.vector
    else:
      vec_categoria = categoria.positivos.vector

    # Training data: every non-excluded review, plus the subset that belongs
    # to the target polarity (used for the vocabulary and the tokenizer).
    Reviews_train_ = []
    Reviews_train_1 = []
    Label_train_ = []
    for idx, doc in enumerate(x_train):
      if y_train[idx] in excluidas:
        continue
      if y_train[idx] == polaridad:
        Reviews_train_1.append(doc)
        Label_train_.append(1)
      else:
        Label_train_.append(0)
      Reviews_train_.append(doc)

    if not Reviews_train_:
      continue

    Vocabulario = Vocabulario_train_Over(Reviews_train_1)

    # Longest training review, in whitespace-separated tokens.
    max_length = max(len(s.split()) for s in Reviews_train_)

    # Test data. `indices_test` remembers where each kept review sits in
    # x_test so predictions can be written back to the right PReview.
    Reviews_test_ = []
    Label_test_ = []
    indices_test = []
    for idx, doc in enumerate(x_test):
      if y_test[idx] in excluidas:
        continue
      Label_test_.append(1 if y_test[idx] == polaridad else 0)
      Reviews_test_.append(doc)
      indices_test.append(idx)

    embedding_dim = 200
    vocabulary_size = len(Vocabulario)

    # One tokenizer, fitted on the target-polarity reviews, encodes both sets.
    tokenizer = Tokenizer(num_words = vocabulary_size)
    tokenizer.fit_on_texts(Reviews_train_1)
    data_train = pad_sequences(tokenizer.texts_to_sequences(Reviews_train_), maxlen=max_length)
    data_test = pad_sequences(tokenizer.texts_to_sequences(Reviews_test_), maxlen=max_length)

    # Embedding lookup over the globally loaded word-vector `model`.
    if embeddings_index is None:
      embeddings_index = dict()
      for word in model.index2word:
        embeddings_index[word] = np.asarray(model[word],dtype='float32')

    print('Loaded %s word vectors.' %len(embeddings_index))

    # Weight matrix for the words seen in the training reviews: each vector is
    # scaled by its cosine similarity to the category vector.
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
    for word, index in tokenizer.word_index.items():
      # NOTE(review): the early exit relies on word_index being iterated in
      # ascending index order - confirm against the Tokenizer in use.
      if index > vocabulary_size - 1:
        break
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        cosine = cos_sim(embedding_vector,vec_categoria)
        embedding_matrix[index] = [n * cosine for n in embedding_vector]

    # CNN with several kernel sizes applied in parallel on the same embedding
    # (03-05-2021). Background reading:
    # https://unipython.com/proyecto-desarrollar-un-modelo-de-incrustacion-cnn-para-el-analisis-de-sentimientos/
    num_filters = 256

    inputs_2 = Input(shape=(max_length,), dtype='int32')
    embedding_layer_2 = Embedding(vocabulary_size,embedding_dim, embeddings_initializer=Constant(embedding_matrix),input_length=max_length, trainable=False)(inputs_2)

    # One convolution + global max-pool branch per kernel size; the pool
    # window spans the whole convolution output (max_length - k + 1 steps).
    ramas = []
    for kernel_size in (1, 2, 3):
      conv = Conv1D(num_filters, kernel_size=kernel_size, activation='relu')(embedding_layer_2)
      ramas.append(MaxPool1D(pool_size=(max_length - kernel_size + 1), strides=1, padding='valid')(conv))

    concatenated_tensor = Concatenate(axis=1)(ramas)
    flatten_2 = Flatten()(concatenated_tensor)

    dropout_2 = Dropout(0.5)(flatten_2)
    output_2 = Dense(units=1, activation='sigmoid')(dropout_2)

    model_2 = Model(inputs=inputs_2, outputs=output_2)
    model_2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

    batch_size = 64
    model_2.fit(data_train,np.array(Label_train_), epochs=9, batch_size=batch_size, verbose=1, validation_split=0.2)

    scores = model_2.evaluate(data_test,np.array(Label_test_), verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    y_pred = model_2.predict(data_test,batch_size=64,verbose=1)

    # Store the prediction of every scored review. y_pred only has rows for
    # the reviews kept in Reviews_test_, so the write-back goes through their
    # original positions; indexing with range(len(x_test)) would misalign (and
    # eventually overflow) as soon as one test review is neutral/conflict.
    atributo = "prob_neg" if polaridad == 'negative' else "prob_pos"
    for fila, idx in enumerate(indices_test):
      setattr(PReviews[idx], atributo, y_pred[fila])

    # The report below uses a 0.2 decision threshold, so it can disagree with
    # the "Accuracy" line printed from model_2.evaluate above.
    y_pred = (y_pred>0.2)

    cm = confusion_matrix(np.array(Label_test_), y_pred)
    print(cm)

    print(classification_report(np.array(Label_test_),y_pred))

  PCategory_.Reviews = PReviews
  return PCategory_

In [None]:
# Manual list of ENTITY#ATTRIBUTE categories - used to walk the directories and
# build the hierarchical lexicon. Each category is trained and evaluated on
# its own.
categorias_list = ["AMBIENCE#GENERAL", "DRINKS#QUALITY","DRINKS#STYLE_OPTIONS","DRINKS#PRICES","FOOD#QUALITY", "FOOD#STYLE_OPTIONS",
             "FOOD#PRICES","RESTAURANT#GENERAL", "RESTAURANT#MISCELLANEOUS","RESTAURANT#PRICES","SERVICE#GENERAL","LOCATION#GENERAL"]

# Upper bounds of the buckets of the error histogram (difference between the
# winning and the losing probability of a misclassified review). Differences
# above the last bound fall into one extra, final bucket.
_limites_error = (.10, .20, .30, .40, .50, .60, .70, .80, .90)

# Walk the categories one by one:
for cat in categorias_list:
  print(cat)
  # Third argument handed to VectorUnico (1.46 was also tried).
  x = .0

  Categoria = Category(cat)
  Vocabulario_polaridades_train, x_train, y_train, x_test, y_test = Lexicos_polaridad_categoria(cat)

  # Attach the vocabulary and the single representative vector of each
  # polarity lexicon to the category; anything not "negative" is stored as
  # positive.
  for item in Vocabulario_polaridades_train:
    destino = Categoria.negativos if item.polaridad == "negative" else Categoria.positivos
    destino.vocabulario = item.vocabulario
    destino.vector = VectorUnico(item.vocabulario, item.diccionario_gv, x , item.promedio, item.dev_std, False)

  PCategory_ = CNN_oversampling_binario(Categoria, x_train, y_train, x_test, y_test)
  # Alternative classifier:
  # PCategory_ = SVM_OverSampling_binario(Categoria, x_train, y_train, x_test, y_test)

  # Probability analysis - overall results
  total_reviews = 0
  total_correctos = 0

  # Misclassified reviews, overall, by gold polarity and by probability gap
  total_errores = 0
  total_errores_neg = 0
  total_errores_pos = 0
  errores_por_rango = [0] * (len(_limites_error) + 1)

  for review in PCategory_.Reviews:
    total_reviews += 1

    # Classify by the highest of the two probabilities; ties go to "positive".
    if review.prob_neg > review.prob_pos:
      res_clasificacion = "negative"
      diff = review.prob_neg - review.prob_pos
    else:
      res_clasificacion = "positive"
      diff = review.prob_pos - review.prob_neg
    review.res_clasificacion = res_clasificacion

    if res_clasificacion == review.polaridad_:
      total_correctos += 1
      continue

    total_errores += 1
    if review.polaridad_ == "negative":
      total_errores_neg += 1
    else:
      total_errores_pos += 1

    # First bucket whose upper bound is not exceeded by the gap.
    rango = next((k for k, limite in enumerate(_limites_error) if diff <= limite), len(_limites_error))
    errores_por_rango[rango] += 1

  # Keep the per-bucket counters available under their historical names.
  (error_10, error_20, error_30, error_40, error_50,
   error_60, error_70, error_80, error_90, error_100) = errores_por_rango

  print(total_reviews)
  print(total_correctos)
  # A category without test reviews reports 0.0 instead of dividing by zero.
  accuracy_ = (total_correctos*100)/total_reviews if total_reviews else 0.0
  print(accuracy_)

  print("--------------------------------------------------------------------------------------------------------")

AMBIENCE#GENERAL
AMBIENCE#GENERAL


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 42531.79it/s]


AMBIENCE#GENERAL
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 96.83%
[[50 11]
 [ 1  1]]
              precision    recall  f1-score   support

           0       0.98      0.82      0.89        61
           1       0.08      0.50      0.14         2

    accuracy                           0.81        63
   macro avg       0.53      0.66      0.52        63
weighted avg       0.95      0.81      0.87        63

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 85.71%
[[ 0  2]
 [ 4 57]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.97      0.93      0.95        61

    accuracy                           0.90        63
   macro avg       0.48      0.47      0.47        63
weighted avg       0.94      0.90      0.92        63

63
61
96.82539682539682
--------------------------------------------------------------------------------------------------------
DRINKS#QUALITY
DRINKS#QUALITY


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 54731.45it/s]


DRINKS#QUALITY
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 90.91%
[[ 3 18]
 [ 0  1]]
              precision    recall  f1-score   support

           0       1.00      0.14      0.25        21
           1       0.05      1.00      0.10         1

    accuracy                           0.18        22
   macro avg       0.53      0.57      0.17        22
weighted avg       0.96      0.18      0.24        22

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 90.91%
[[ 0  1]
 [ 1 20]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.95      0.95      0.95        21

    accuracy                           0.91        22
   macro avg       0.48      0.48      0.48        22
weighted avg       0.91      0.91      0.91        22

22
20
90.9090909090909
--------------------------------------------------------------------------------------------------------
DRINKS#STYLE_OPTIONS
DRINKS#STYLE_OPTIONS


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 43470.55it/s]


DRINKS#STYLE_OPTIONS
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 91.67%
[[4 7]
 [0 1]]
              precision    recall  f1-score   support

           0       1.00      0.36      0.53        11
           1       0.12      1.00      0.22         1

    accuracy                           0.42        12
   macro avg       0.56      0.68      0.38        12
weighted avg       0.93      0.42      0.51        12

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 83.33%
[[ 0  1]
 [ 0 11]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.92      1.00      0.96        11

    accuracy                           0.92        12
   macro avg       0.46      0.50      0.48        12
weighted avg       0.84      0.92      0.88        12

12
10
83.33333333333333
--------------------------------------------------------------------------------------------------------
DRINKS#PRICES
DRINKS#PRICES


  _warn_prf(average, modifier, msg_start, len(result))
  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 53725.30it/s]


DRINKS#PRICES
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 0.00%
[[4]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 75.00%
[[0 4]
 [0 0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       4.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0

4
0
0.0
--------------------------------------------------------------------------------------------------------
FOOD#QUALITY
FOOD#QUALITY


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 50655.28it/s]


FOOD#QUALITY
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 89.33%
[[217  52]
 [ 13  18]]
              precision    recall  f1-score   support

           0       0.94      0.81      0.87       269
           1       0.26      0.58      0.36        31

    accuracy                           0.78       300
   macro avg       0.60      0.69      0.61       300
weighted avg       0.87      0.78      0.82       300

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 72.33%
[[ 14  17]
 [ 34 235]]
              precision    recall  f1-score   support

           0       0.29      0.45      0.35        31
           1       0.93      0.87      0.90       269

    accuracy                           0.83       300
   macro avg       0.61      0.66      0.63       300
weighted avg       0.87      0.83      0.85       300

300
270
90.0
--------------------------------------------------------------------------------------------------------
FOOD#STYLE_OPTIONS
FOOD#STYLE_OPTIONS


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 47323.95it/s]


FOOD#STYLE_OPTIONS
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 65.22%
[[17 14]
 [ 7  8]]
              precision    recall  f1-score   support

           0       0.71      0.55      0.62        31
           1       0.36      0.53      0.43        15

    accuracy                           0.54        46
   macro avg       0.54      0.54      0.53        46
weighted avg       0.60      0.54      0.56        46

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 56.52%
[[ 6  9]
 [ 5 26]]
              precision    recall  f1-score   support

           0       0.55      0.40      0.46        15
           1       0.74      0.84      0.79        31

    accuracy                           0.70        46
   macro avg       0.64      0.62      0.62        46
weighted avg       0.68      0.70      0.68        46

46
31
67.3913043478261
--------------------------------------------------------------------------------------------------------
FOOD#PRICES
FOOD#PRICES


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 40463.14it/s]


FOOD#PRICES
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 45.00%
[[ 4  2]
 [ 4 10]]
              precision    recall  f1-score   support

           0       0.50      0.67      0.57         6
           1       0.83      0.71      0.77        14

    accuracy                           0.70        20
   macro avg       0.67      0.69      0.67        20
weighted avg       0.73      0.70      0.71        20

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 75.00%
[[5 9]
 [1 5]]
              precision    recall  f1-score   support

           0       0.83      0.36      0.50        14
           1       0.36      0.83      0.50         6

    accuracy                           0.50        20
   macro avg       0.60      0.60      0.50        20
weighted avg       0.69      0.50      0.50        20

20
12
60.0
--------------------------------------------------------------------------------------------------------
RESTAURANT#GENERAL
RESTAURANT#GENERAL


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 42916.61it/s]


RESTAURANT#GENERAL
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 78.72%
[[71 36]
 [14 20]]
              precision    recall  f1-score   support

           0       0.84      0.66      0.74       107
           1       0.36      0.59      0.44        34

    accuracy                           0.65       141
   macro avg       0.60      0.63      0.59       141
weighted avg       0.72      0.65      0.67       141

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 65.25%
[[  4  30]
 [  6 101]]
              precision    recall  f1-score   support

           0       0.40      0.12      0.18        34
           1       0.77      0.94      0.85       107

    accuracy                           0.74       141
   macro avg       0.59      0.53      0.52       141
weighted avg       0.68      0.74      0.69       141

141
112
79.43262411347517
--------------------------------------------------------------------------------------------------------
RESTAURANT#MISCELLANEOUS
RESTAURANT#MISCELLANEOUS


  return concat([self.open(f).read() for f in fileids])
  frame = None


Grid search: 100%|██████████| 216/216 [00:00<00:00, 32212.25it/s]


RESTAURANT#MISCELLANEOUS
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 65.52%
[[11  5]
 [ 5  8]]
              precision    recall  f1-score   support

           0       0.69      0.69      0.69        16
           1       0.62      0.62      0.62        13

    accuracy                           0.66        29
   macro avg       0.65      0.65      0.65        29
weighted avg       0.66      0.66      0.66        29

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 51.72%
[[ 0 13]
 [ 2 14]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.52      0.88      0.65        16

    accuracy                           0.48        29
   macro avg       0.26      0.44      0.33        29
weighted avg       0.29      0.48      0.36        29

29
18
62.06896551724138
--------------------------------------------------------------------------------------------------------
RESTAURANT#PRICES
RESTAURANT#PRICES


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 41012.66it/s]


RESTAURANT#PRICES
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 63.16%
[[ 3  3]
 [ 1 12]]
              precision    recall  f1-score   support

           0       0.75      0.50      0.60         6
           1       0.80      0.92      0.86        13

    accuracy                           0.79        19
   macro avg       0.78      0.71      0.73        19
weighted avg       0.78      0.79      0.78        19

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 63.16%
[[ 0 13]
 [ 0  6]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.32      1.00      0.48         6

    accuracy                           0.32        19
   macro avg       0.16      0.50      0.24        19
weighted avg       0.10      0.32      0.15        19

19
13
68.42105263157895
--------------------------------------------------------------------------------------------------------
SERVICE#GENERAL
SERVICE#GENERAL


  _warn_prf(average, modifier, msg_start, len(result))
  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 38057.96it/s]


SERVICE#GENERAL
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 72.30%
[[48 24]
 [10 66]]
              precision    recall  f1-score   support

           0       0.83      0.67      0.74        72
           1       0.73      0.87      0.80        76

    accuracy                           0.77       148
   macro avg       0.78      0.77      0.77       148
weighted avg       0.78      0.77      0.77       148

positive




Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 75.68%
[[40 36]
 [11 61]]
              precision    recall  f1-score   support

           0       0.78      0.53      0.63        76
           1       0.63      0.85      0.72        72

    accuracy                           0.68       148
   macro avg       0.71      0.69      0.68       148
weighted avg       0.71      0.68      0.67       148

148
114
77.02702702702703
--------------------------------------------------------------------------------------------------------
LOCATION#GENERAL
LOCATION#GENERAL


  return concat([self.open(f).read() for f in fileids])


Grid search: 100%|██████████| 216/216 [00:00<00:00, 52223.29it/s]


LOCATION#GENERAL
negative
Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 100.00%
[[ 0 11]
 [ 0  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      11.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00      11.0
   macro avg       0.00      0.00      0.00      11.0
weighted avg       0.00      0.00      0.00      11.0

positive


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Loaded 1193514 word vectors.
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Accuracy: 100.00%
[[11]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        11

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11

11
11
100.0
--------------------------------------------------------------------------------------------------------


In [None]:
  # Polarity census per aspect category for the SLOT3 restaurant gold files.
  #
  # For the TRAIN split and then the TEST split, this cell loads three files that
  # are read as line-aligned (line i of each file describes the same review):
  # the review text, its category label and its polarity label. For every
  # category in `categorias_list` it prints how many reviews are neutral,
  # positive and negative, followed by the sum of those three counts.
  #
  # Names left in the notebook namespace: d_train_/l_train_/p_train_ and
  # d_test_/l_test_/p_test_ (full files), plus x_train/y_train and x_test/y_test,
  # which hold the reviews and polarities of the LAST category iterated.
  categorias_list = ["AMBIENCE#GENERAL", "DRINKS#QUALITY","DRINKS#STYLE_OPTIONS","DRINKS#PRICES","FOOD#QUALITY", "FOOD#STYLE_OPTIONS",
             "FOOD#PRICES","RESTAURANT#GENERAL", "RESTAURANT#MISCELLANEOUS","RESTAURANT#PRICES","SERVICE#GENERAL","LOCATION#GENERAL"]

  # Common prefix of the six gold files; the split name and the file kind are appended.
  _SLOT3_PREFIX = '/content/drive/MyDrive/ProyectoTesis_INAOE/restaurante_EN/SLOT3/S3-V3_EN_REST_SB1_'


  def _read_gold_lines(split, kind):
    """Return the lines of one gold file with their newline characters removed.

    Args:
      split: 'TRAIN' or 'TEST'.
      kind: 'reviews', 'class' or 'polarities'.

    Raises:
      OSError: if the file cannot be opened.
      UnicodeDecodeError: if the file is not valid UTF-8.
    """
    path = _SLOT3_PREFIX + split + '.' + kind + '.gold'
    # The context manager closes the handle even when reading or decoding fails.
    with open(path, 'r', encoding='utf-8') as handle:
      return [line.replace("\n", "") for line in handle.readlines()]


  def _select_category(reviews, categories, polarities, category):
    """Return (texts, polarity_labels) for the rows labelled with `category`."""
    texts = []
    labels = []
    # Index-based access (instead of zip) keeps a misaligned file failing with
    # IndexError rather than being silently truncated.
    for position, text in enumerate(reviews):
      if categories[position] == category:
        texts.append(text)
        labels.append(polarities[position])
    return texts, labels


  def _polarity_counts(labels):
    """Return (neutral, positive, negative) counts; any other label is ignored."""
    return labels.count('neutral'), labels.count('positive'), labels.count('negative')


  def _print_summary(category, neutral, positive, negative):
    """Print the count report of one category."""
    print(category)
    print('NEU:' + str(neutral))
    print('POS:' + str(positive))
    print('NEG:' + str(negative))
    print(neutral + positive + negative)
    print("---------------------------")


  #------------------------------------------------------------------------------------------------------------
  # Load the training reviews, category labels and polarity labels.
  d_train_ = _read_gold_lines('TRAIN', 'reviews')
  l_train_ = _read_gold_lines('TRAIN', 'class')
  p_train_ = _read_gold_lines('TRAIN', 'polarities')

  # NOTE: the reviews are counted as loaded; no text cleaning is applied in this cell.
  for category in categorias_list:
    x_train, y_train = _select_category(d_train_, l_train_, p_train_, category)
    neutral, positive, negative = _polarity_counts(y_train)
    _print_summary(category, neutral, positive, negative)

  #------------------------------------------------------------------------------------------------------------

  print("#------------------------------------------------------------------------------------------------------------")
  #------------------------------------------------------------------------------------------------------------
  # Load the test reviews, category labels and polarity labels.
  d_test_ = _read_gold_lines('TEST', 'reviews')
  l_test_ = _read_gold_lines('TEST', 'class')
  p_test_ = _read_gold_lines('TEST', 'polarities')

  for category in categorias_list:
    x_test, y_test = _select_category(d_test_, l_test_, p_test_, category)
    neutral, positive, negative = _polarity_counts(y_test)
    _print_summary(category, neutral, positive, negative)


AMBIENCE#GENERAL
NEU:16
POS:197
NEG:42
255
---------------------------
DRINKS#QUALITY
NEU:2
POS:40
NEG:5
47
---------------------------
DRINKS#STYLE_OPTIONS
NEU:0
POS:29
NEG:3
32
---------------------------
DRINKS#PRICES
NEU:0
POS:13
NEG:7
20
---------------------------
FOOD#QUALITY
NEU:28
POS:617
NEG:204
849
---------------------------
FOOD#STYLE_OPTIONS
NEU:9
POS:83
NEG:45
137
---------------------------
FOOD#PRICES
NEU:1
POS:41
NEG:48
90
---------------------------
RESTAURANT#GENERAL
NEU:8
POS:313
NEG:101
422
---------------------------
RESTAURANT#MISCELLANEOUS
NEU:13
POS:58
NEG:27
98
---------------------------
RESTAURANT#PRICES
NEU:6
POS:34
NEG:40
80
---------------------------
SERVICE#GENERAL
NEU:12
POS:211
NEG:226
449
---------------------------
LOCATION#GENERAL
NEU:6
POS:21
NEG:1
28
---------------------------
#------------------------------------------------------------------------------------------------------------
AMBIENCE#GENERAL
NEU:3
POS:61
NEG:2
66
---------------------