In [36]:
import os
import pickle
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

from Preprocesamiento.lematizador import lematizar as lematyze

In [37]:
class data_set_polarity:
	def __init__(self, X_train, y_train, X_test, y_test):
		self.X_train = X_train
		self.y_train = y_train
		self.X_test = X_test
		self.y_test = y_test

class data_set_attraction:
	def __init__(self, X_train, y_train, X_test, y_test):
		self.X_train = X_train
		self.y_train = y_train
		self.X_test = X_test
		self.y_test = y_test

In [48]:
"""
A - Adjective
C - Conjunction
D - Determiner
N - Noun
	C - Common
	P - Proper
P - Pronoun
R - Adverb
S - Adposition
V - Verb
Z - Number
W - Date
I - Interjection
F - Symbols
"""
TAGS = ['A']

def preprocessor(slist:list, i, sz):
	print(f"\r{i}/{sz}", end="")
	result = []
	preprocessing = lematyze("\n".join(slist))
	for ln in preprocessing:
		for w in ln:
			#print(f"{w.get_form()} [{w.get_tag()}] -> {w.get_lemma()}")
			for tag in TAGS:
				if w.get_tag().startswith(tag):
					result.append(w.get_lemma())
	return " ".join(result)

def generate_train_test(file_name, test_size=0.1):
	pd.options.display.max_colwidth = 200				

	#Lee el corpus original del archivo de entrada y lo pasa a una DataFrame
	df = pd.read_excel(file_name, dtype=str, nrows=None)
	df=df.replace(to_replace=np.NaN,value="")

	count_polarity = df['Polarity'].value_counts().sort_values(ascending=False).to_frame()
	max_rows = count_polarity.iloc[0]

	print(count_polarity)
	
	for pol in count_polarity.index:
		print(f"pol {pol}")
		rows_number = count_polarity[0]
		print(f"rows number {rows_number}")
		polarity_df = df.loc[df['Polarity']==pol]
		while rows_number<max_rows:
			idx = random.randint(0, rows_number-1)
			df.append(polarity_df.iloc[idx])
			rows_number += 1

	
	X = df.drop(['Polarity', 'Attraction'],axis=1).values   
	X = np.array(list(map(lambda y: preprocessor(y[1], y[0], len(X)), enumerate(X))))
	y_polarity = df['Polarity'].values
	y_attraction = df['Attraction'].values



	
	#~ #Separa el corpus cargado en el DataFrame en el 80% para entrenamiento y el 20% para pruebas
	X_train, X_test, y_train_polarity, y_test_polarity = train_test_split(X, y_polarity, test_size=test_size, random_state=0)
	X_train, X_test, y_train_attraction, y_test_attraction = train_test_split(X, y_attraction, test_size=test_size, random_state=0)
	
	#	Polarity,	Attraction
	return (data_set_polarity(X_train, y_train_polarity, X_test, y_test_polarity), data_set_attraction(X_train, y_train_attraction, X_test, y_test_attraction))

In [49]:
corpus_polarity, corpus_attraction = generate_train_test('Corpus/Rest_Mex_2022_Sentiment_Analysis_Track_Train.xlsx')

with open ('Corpus/Rest_Mex_2022_Sentiment_Analysis_Track_Train-Polarity-Preprocessed-Sesg.pkl','wb') as dataset_file:
	pickle.dump(corpus_polarity, dataset_file)

with open ('Corpus/Rest_Mex_2022_Sentiment_Analysis_Track_Train-Attraction-Preprocessed-Sesg.pkl','wb') as dataset_file:
	pickle.dump(corpus_attraction, dataset_file)

   Polarity
5     20936
4      5878
3      2121
2       730
1       547
pol 5


KeyError: 0

In [None]:
len(corpus_polarity.X_train)


In [None]:
if not (os.path.exists('Corpus/Rest_Mex_2022_Sentiment_Analysis_Track_Train-Polarity-Preprocessed.pkl')):
	raise("No se ha generado el corpus lematizado para Polarity")
else:
	with open ('Corpus/Rest_Mex_2022_Sentiment_Analysis_Track_Train-Polarity-Preprocessed.pkl','rb') as corpus_file:
		corpus_polarity = pickle.load(corpus_file)

#~ print (corpus_attraction.X_train[0])

In [None]:
# Representación vectorial binarizada
vectorizador_binario = CountVectorizer(binary=True)
vectorizador_binario_fit = vectorizador_binario.fit(corpus_attraction.X_train)
X_train = vectorizador_binario_fit.transform(corpus_attraction.X_train)
y_train = corpus_attraction.y_train
print (vectorizador_binario.get_feature_names_out())
print (X_train.shape)#sparse matrix
#~ clf = LogisticRegression()
clf = LogisticRegression(max_iter=10000)
clf.fit(X_train, y_train)


X_test = vectorizador_binario_fit.transform(corpus_attraction.X_test)
y_test = corpus_attraction.y_test
print (vectorizador_binario_fit.get_feature_names_out())
print (X_test.shape)#sparse matrix

y_pred = clf.predict(X_test)
print (y_pred)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred,labels=['Restaurant','Hotel','Attractive']))
target_names = ['Restaurant','Hotel','Attractive']
print(classification_report(y_test, y_pred, target_names=target_names))

y_train_polarity = corpus_polarity.y_train
clf_polarity = LogisticRegression(max_iter=10000)
clf_polarity.fit(X_train, y_train_polarity)
y_test = corpus_polarity.y_test
y_pred = clf_polarity.predict(X_test)
print (y_pred)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred,labels=[1,2,3,4,5]))
target_names = ['1','2','3','4','5']
print(classification_report(y_test, y_pred, target_names=target_names))


#~ print (type(X.toarray()))#dense ndarray
#~ print ('Representación vectorial binarizada')
#~ print (X.toarray())#dense ndarray

#~ #Representación vectorial por frecuencia
#~ vectorizador_frecuencia = CountVectorizer()
#~ X = vectorizador_frecuencia.fit_transform(corpus_lematizado)
#~ print('Representación vectorial por frecuencia')
#~ print (X.toarray())

#Representación vectorial tf-idf
#~ vectorizador_tfidf = TfidfVectorizer()
#~ X = vectorizador_tfidf.fit_transform(corpus_lematizado)
#~ print ('Representación vectorial tf-idf')
#~ print (X.toarray())