In [1]:
# Librerías

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting
import string # string manipulation
import re # regular expressions
import nltk # text manipulation
nltk.download('punkt')
nltk.download('stopwords')

from tqdm import trange # progress bar
from nltk import tokenize # text manipulation
from nltk.corpus import stopwords # text manipulation
from nltk.stem import WordNetLemmatizer # text manipulation
from nltk.probability import FreqDist # text manipulation
from collections import Counter # text manipulation
from sklearn.feature_extraction.text import CountVectorizer # text manipulation
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # wordcloud generator
from IPython.display import display # image display
from PIL import Image

#hito 2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn import tree

import joblib # guardar modelos

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("threads.csv") # carga del dataset a usar como variable

In [3]:
# función para convertir el rating a palabras
def ratingTransform(rating):
    if rating <= 2:
        return "NEGATIVE"
    elif rating <= 4:
        return "NEUTRAL"
    else:
        return "POSITIVE"

# creacion del grafico de pie de las reviews segun rating
data["rating"] = data["rating"].apply(ratingTransform)

In [4]:
def clean(review):

    review = review.lower()
    review = re.sub('[^a-z A-Z 0-9-]+', '', review)
    review = " ".join([word for word in review.split() if word not in stopwords.words('english')])

    return review

data['review_description'] = data['review_description'].apply(clean)

In [5]:
Y = data['rating']

In [6]:
def subsampling(X, Y):
    X['evaluacion'] = Y

    clase_mayoritaria_1 = X[X['evaluacion'] == "POSITIVE"]
    clase_mayoritaria_2 = X[X['evaluacion'] == "NEGATIVE"]
    clase_minoritaria = X[X['evaluacion'] == "NEUTRAL"]

    # Determinar el número de muestras de la clase minoritaria
    num_muestras_minoritaria = len(clase_minoritaria)
    # Realizar submuestreo de las clases mayoritarias
    clase_mayoritaria_1_resample = clase_mayoritaria_1.sample(n=num_muestras_minoritaria, random_state=42)
    clase_mayoritaria_2_resample = clase_mayoritaria_2.sample(n=num_muestras_minoritaria, random_state=42)

    # Concatenar las clases remuestreadas con la clase minoritaria
    df_resample = pd.concat([clase_mayoritaria_1_resample, clase_mayoritaria_2_resample, clase_minoritaria], axis=0)

    # Separar de nuevo las características y las etiquetas
    X_resample = df_resample.drop('evaluacion', axis=1)
    y_resample = df_resample['evaluacion']
    return (X_resample, y_resample)


## 1-gram

In [7]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(data["review_description"])

In [8]:
vectorized = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
X_1gram = vectorized

In [18]:
X_1gram, y1 = subsampling(X_1gram, Y)

In [11]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X_1gram, y1, test_size=0.33, random_state=37,stratify=y1)
joblib.dump(X1_train, 'modelos/X1_train.pkl')
joblib.dump(X1_test, 'modelos/X1_test.pkl')
joblib.dump(y1_train, 'modelos/y1_train.pkl')
joblib.dump(y1_test, 'modelos/y1_test.pkl')

['modelos/y1_test.pkl']

### Entrenar Modelos

In [3]:
X1_train = joblib.load('modelos/X1_train.pkl')
y1_train = joblib.load('modelos/y1_train.pkl')

In [13]:
clf = DecisionTreeClassifier()
clf.fit(X1_train, y1_train) 
joblib.dump(clf, "modelos/1-gram_trained_DecisionTreeClassifier.joblib")

['modelos/1-gram_trained_DecisionTreeClassifier.joblib']

In [14]:
clf = MultinomialNB()
clf.fit(X1_train, y1_train)
joblib.dump(clf, "modelos/1-gram_trained_NV.joblib")

['modelos/1-gram_trained_NV.joblib']

In [4]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X1_train, y1_train)
joblib.dump(clf, "modelos/1-gram_logistic_regesion.joblib")

['modelos/1-gram_logistic_regesion.joblib']

In [5]:
clf = SVC()
clf.fit(X1_train, y1_train)
joblib.dump(clf, "modelos/1-gram_trained_svc.joblib")

### Cargar modelos para evaluar

In [3]:
X1_test = joblib.load('modelos/X1_test.pkl')
y1_test = joblib.load('modelos/y1_test.pkl')

In [4]:
clf = joblib.load("modelos/1-gram_trained_DecisionTreeClassifier.joblib")
y1_pred = clf.predict(X1_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y1_test, y1_pred))   ## Evaluamos la predicción comparando y_test con y1_pred
print(classification_report(y1_test, y1_pred))

Accuracy en test set: 0.5784092878184024
              precision    recall  f1-score   support

    NEGATIVE       0.65      0.60      0.62      1924
     NEUTRAL       0.53      0.40      0.45      1924
    POSITIVE       0.56      0.74      0.64      1923

    accuracy                           0.58      5771
   macro avg       0.58      0.58      0.57      5771
weighted avg       0.58      0.58      0.57      5771



In [5]:
clf = joblib.load("modelos/1-gram_trained_NV.joblib")
y1_pred = clf.predict(X1_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y1_test, y1_pred))   ## Evaluamos la predicción comparando y_test con y1_pred
print(classification_report(y1_test, y1_pred))

Accuracy en test set: 0.6575983365101369
              precision    recall  f1-score   support

    NEGATIVE       0.76      0.66      0.71      1924
     NEUTRAL       0.61      0.55      0.58      1924
    POSITIVE       0.62      0.76      0.69      1923

    accuracy                           0.66      5771
   macro avg       0.66      0.66      0.66      5771
weighted avg       0.66      0.66      0.66      5771



In [6]:
clf = joblib.load("modelos/1-gram_logistic_regesion.joblib")
y1_pred = clf.predict(X1_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y1_test, y1_pred))   ## Evaluamos la predicción comparando y_test con y1_pred
print(classification_report(y1_test, y1_pred))

Accuracy en test set: 0.6510136891353319
              precision    recall  f1-score   support

    NEGATIVE       0.74      0.67      0.71      1924
     NEUTRAL       0.63      0.45      0.53      1924
    POSITIVE       0.60      0.83      0.70      1923

    accuracy                           0.65      5771
   macro avg       0.66      0.65      0.64      5771
weighted avg       0.66      0.65      0.64      5771



In [7]:
clf = joblib.load("modelos/1-gram_trained_svc.joblib")
y1_pred = clf.predict(X1_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y1_test, y1_pred))   ## Evaluamos la predicción comparando y_test con y1_pred
print(classification_report(y1_test, y1_pred))

FileNotFoundError: [Errno 2] No such file or directory: 'modelos/1-gram_trained_svc.joblib'

## 2-gram

In [7]:
cv = CountVectorizer(ngram_range=(2,2))
bigrams = cv.fit_transform(data['review_description'])

In [8]:
bigrams = pd.DataFrame(bigrams.toarray(), columns=cv.get_feature_names_out())
X_2gram = bigrams

In [9]:
X_2gram, y2 = subsampling(X_2gram, Y)

In [10]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_2gram, y2, test_size=0.33, random_state=37,stratify=y2)
joblib.dump(X2_train, 'modelos/X2_train.pkl')
joblib.dump(X2_test, 'modelos/X2_test.pkl')
joblib.dump(y2_train, 'modelos/y2_train.pkl')
joblib.dump(y2_test, 'modelos/y2_test.pkl')

['modelos/y2_test.pkl']

### Entrenar Modelos

In [3]:
X2_train = joblib.load('modelos/X2_train.pkl')
y2_train = joblib.load('modelos/y2_train.pkl')

In [4]:
clf = DecisionTreeClassifier()
clf.fit(X2_train, y2_train) 
joblib.dump(clf, "modelos/2-gram_trained_DecisionTreeClassifier.joblib")

['modelos/2-gram_trained_DecisionTreeClassifier.joblib']

In [5]:
clf = MultinomialNB()
clf.fit(X2_train, y2_train)
joblib.dump(clf, "modelos/2-gram_trained_NV.joblib")

['modelos/2-gram_trained_NV.joblib']

In [6]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X2_train, y2_train)
joblib.dump(clf, "modelos/2-gram_logistic_regesion.joblib")

['modelos/2-gram_logistic_regesion.joblib']

In [None]:
clf = SVC()
clf.fit(X2_train, y2_train)
joblib.dump(clf, "modelos/2-gram_trained_svc.joblib")

### Cargar modelos para evaluar

In [8]:
X2_test = joblib.load('modelos/X2_test.pkl')
y2_test = joblib.load('modelos/y2_test.pkl')

In [10]:
clf = joblib.load("modelos/2-gram_trained_DecisionTreeClassifier.joblib")
y2_pred = clf.predict(X2_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y2_test, y2_pred))   ## Evaluamos la predicción comparando y2_test con y2_pred
print(classification_report(y2_test, y2_pred))

Accuracy en test set: 0.5146421763992376
              precision    recall  f1-score   support

    NEGATIVE       0.66      0.42      0.52      1924
     NEUTRAL       0.57      0.23      0.33      1924
    POSITIVE       0.45      0.89      0.60      1923

    accuracy                           0.51      5771
   macro avg       0.56      0.51      0.48      5771
weighted avg       0.56      0.51      0.48      5771



In [11]:
clf = joblib.load("modelos/2-gram_trained_NV.joblib")
y2_pred = clf.predict(X2_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y2_test, y2_pred))   ## Evaluamos la predicción comparando y2_test con y2_pred
print(classification_report(y2_test, y2_pred))

Accuracy en test set: 0.5803153699532143
              precision    recall  f1-score   support

    NEGATIVE       0.73      0.49      0.59      1924
     NEUTRAL       0.61      0.41      0.49      1924
    POSITIVE       0.51      0.84      0.63      1923

    accuracy                           0.58      5771
   macro avg       0.62      0.58      0.57      5771
weighted avg       0.62      0.58      0.57      5771



In [12]:
clf = joblib.load("modelos/2-gram_logistic_regesion.joblib")
y2_pred = clf.predict(X2_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y2_test, y2_pred))   ## Evaluamos la predicción comparando y2_test con y2_pred
print(classification_report(y2_test, y2_pred))

Accuracy en test set: 0.5687055969502686
              precision    recall  f1-score   support

    NEGATIVE       0.74      0.46      0.57      1924
     NEUTRAL       0.63      0.35      0.45      1924
    POSITIVE       0.49      0.90      0.63      1923

    accuracy                           0.57      5771
   macro avg       0.62      0.57      0.55      5771
weighted avg       0.62      0.57      0.55      5771



In [None]:
clf = joblib.load("modelos/2-gram_trained_svc.joblib")
y2_pred = clf.predict(X2_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y2_test, y2_pred))   ## Evaluamos la predicción comparando y2_test con y2_pred
print(classification_report(y2_test, y2_pred))

##  3-gram

In [7]:
cv1 = CountVectorizer(ngram_range=(3,3))
trigrams = cv1.fit_transform(data['review_description'])

In [8]:
trigrams = pd.DataFrame(trigrams.toarray(), columns=cv1.get_feature_names_out())

In [9]:
X_3gram = trigrams

In [10]:
X_3gram, y3 = subsampling(X_3gram, Y)

In [11]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X_3gram, y3, test_size=0.33, random_state=37,stratify=y3)
joblib.dump(X3_train, 'modelos/X3_train.pkl')
joblib.dump(X3_test, 'modelos/X3_test.pkl')
joblib.dump(y3_train, 'modelos/y3_train.pkl')
joblib.dump(y3_test, 'modelos/y3_test.pkl')

['modelos/y3_test.pkl']

In [2]:
X3_train = joblib.load('modelos/X3_train.pkl')
y3_train = joblib.load('modelos/y3_train.pkl')

In [5]:
clf = DecisionTreeClassifier()
clf.fit(X3_train, y3_train) 
joblib.dump(clf, "modelos/3-gram_trained_DecisionTreeClassifier.joblib")

['modelos/3-gram_trained_DecisionTreeClassifier.joblib']

In [3]:
clf = MultinomialNB()
clf.fit(X3_train, y3_train)
joblib.dump(clf, "modelos/3-gram_trained_NV.joblib")

['modelos/3-gram_trained_NV.joblib']

In [4]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X3_train, y3_train)
joblib.dump(clf, "modelos/3-gram_logistic_regesion.joblib")

['modelos/3-gram_logistic_regesion.joblib']

In [None]:
clf = SVC()
clf.fit(X3_train, y3_train)
joblib.dump(clf, "modelos/3-gram_trained_svc.joblib")

### Cargar modelos para evaluar

In [14]:
X3_test = joblib.load('modelos/X3_test.pkl')
y3_test = joblib.load('modelos/y3_test.pkl')

In [15]:
clf = joblib.load("modelos/3-gram_trained_DecisionTreeClassifier.joblib")
y3_pred = clf.predict(X3_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y3_test, y3_pred))   ## Evaluamos la predicción comparando y3_test con y3_pred
print(classification_report(y3_test, y3_pred))

Accuracy en test set: 0.3990642869520014
              precision    recall  f1-score   support

    NEGATIVE       0.73      0.14      0.23      1924
     NEUTRAL       0.59      0.09      0.16      1924
    POSITIVE       0.36      0.97      0.53      1923

    accuracy                           0.40      5771
   macro avg       0.56      0.40      0.31      5771
weighted avg       0.56      0.40      0.31      5771



In [16]:
clf = joblib.load("modelos/3-gram_trained_NV.joblib")
y3_pred = clf.predict(X3_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y3_test, y3_pred))   ## Evaluamos la predicción comparando y3_test con y3_pred
print(classification_report(y3_test, y3_pred))

Accuracy en test set: 0.4049558135505112
              precision    recall  f1-score   support

    NEGATIVE       0.71      0.16      0.27      1924
     NEUTRAL       0.59      0.09      0.16      1924
    POSITIVE       0.37      0.96      0.53      1923

    accuracy                           0.40      5771
   macro avg       0.55      0.41      0.32      5771
weighted avg       0.55      0.40      0.32      5771



In [17]:
clf = joblib.load("modelos/3-gram_logistic_regesion.joblib")
y3_pred = clf.predict(X3_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y3_test, y3_pred))   ## Evaluamos la predicción comparando y3_test con y3_pred
print(classification_report(y3_test, y3_pred))

Accuracy en test set: 0.41032749956679954
              precision    recall  f1-score   support

    NEGATIVE       0.72      0.16      0.27      1924
     NEUTRAL       0.63      0.10      0.17      1924
    POSITIVE       0.37      0.97      0.54      1923

    accuracy                           0.41      5771
   macro avg       0.57      0.41      0.32      5771
weighted avg       0.57      0.41      0.32      5771



In [None]:
clf = joblib.load("modelos/3-gram_trained_svc.joblib")
y3_pred = clf.predict(X3_test)   ## Predecimos con nuevos datos (los de test X_test)
print("Accuracy en test set:", accuracy_score(y3_test, y3_pred))   ## Evaluamos la predicción comparando y3_test con y3_pred
print(classification_report(y3_test, y3_pred))