In [127]:
pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


# Importar las librerías (deben de estar previamente instaladas)

In [128]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Descargar herramientas para NLP

In [129]:
# stopwords (elimina palabras vacias) y WordNetLemmatizer(lematización de palabras en un texto - La lematización es el proceso de reducir una palabra a su forma base o raíz)
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jisaza53\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jisaza53\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Descargo todas las tablas de datos de Kaggle (21 G) :P Bastante info

In [130]:
#path = kagglehub.dataset_download("cynthiarempel/amazon-us-customer-reviews-dataset")

# Cargamos los datos

In [131]:
path = rf'C:\Users\jisaza53\OneDrive - Cementos Argos S.A\Escritorio\PruebaTecnica_GestorSr\DataSet\amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv'
try:
    df = pd.read_csv(path, sep='\t',on_bad_lines='skip')
except Exception as e:
    print(f"Error al leer el archivo: {e}")

In [132]:
df.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,32114233,R1QX6706ZWJ1P5,B00OYRW4UE,223980852,Elite Sportz Exercise Sliders are Double Sided...,Personal_Care_Appliances,5,0,0,N,Y,Good quality. Shipped,Exactly as described. Good quality. Shipped fast,2015-08-31
1,US,18125776,R3QWMLJHIW6P37,B0000537JQ,819771537,Ezy Dose Weekly,Personal_Care_Appliances,5,0,0,N,Y,Five Stars,It is great,2015-08-31
2,US,19917519,R14Z1VR1N0Z9G6,B00HXXO332,849307176,"Pulse Oximeter, Blood Oxygen Monitor",Personal_Care_Appliances,5,1,1,N,Y,It's really nice it works great,It's really nice it works great. You have the ...,2015-08-31
3,US,18277171,R25ZRJL0GH0U0,B00EOB0JA2,700864740,SE Tools Tool Kit Watch Watch Repair Kit (20 P...,Personal_Care_Appliances,2,0,0,N,Y,Two Stars,The kit works fine... simple cheap plastic tho,2015-08-31
4,US,2593270,R3837KYH7AZNIY,B00OC2O1UC,794298839,"doTERRA HD Clear Facial Kit - Facial Lotion, F...",Personal_Care_Appliances,4,0,1,N,Y,Four Stars,It works better than anything else ive tried,2015-08-31


# Organizamos la información del  las reseñas (Calidad del dato)

In [133]:
# Preprocesar el texto stop(stopwords - Elimina los espacios en blanco y WordNetLemmatizer - que corta palabras hasta obtener una raíz )
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):  # Si el texto no es una cadena (e.g., NaN o float), lo convertimos en vacío
        return ''
    text = text.lower()  # Convertir a minúsculas
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Eliminar caracteres especiales
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# Aplicar preprocesamiento a la columna 'review_headline' (puede variar dependiendo del nombre de la columna)
df['cleaned_review'] = df['review_headline'].apply(preprocess_text)

# Verificar resultados
df[['review_headline', 'cleaned_review']].head()

Unnamed: 0,review_headline,cleaned_review
0,Good quality. Shipped,good quality shipped
1,Five Stars,five star
2,It's really nice it works great,really nice work great
3,Two Stars,two star
4,Four Stars,four star


# Creamos un dataframe donde se contenga las clasificaciones para analizar 

In [134]:
try:
    feeling = pd.read_excel(r'C:\Users\jisaza53\OneDrive - Cementos Argos S.A\Escritorio\PruebaTecnica_GestorSr\DataSet\Feelings.xlsx',header=0)
except Exception as e:
    print(f"Error al leer el archivo: {e}")

In [135]:
# Verifica que ambos DataFrames tengan el mismo número de filas
print(f"Longitud de df: {len(df)}")
print(f"Longitud de feeling: {len(feeling)}")

Longitud de df: 85924
Longitud de feeling: 119


In [136]:
# Limpiar las reseñas
df['cleaned_review'] = df['cleaned_review'].fillna('')  # Lidiar con NaN

# Vectorización TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limitar el número de características
X = vectorizer.fit_transform(df['cleaned_review'])

# Etiquetas (suponiendo que 'sentiment' está en el dataframe de 'feeling')
y = feeling['sentiment']  # Asegúrate de que esta columna existe

# División en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenamiento del modelo Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Predicciones
y_pred = model.predict(X_test)

# Evaluación del modelo
print(classification_report(y_test, y_pred))

# Análisis de las palabras clave para las reseñas positivas, negativas y neutras
vectorizer = CountVectorizer(stop_words='english')
X_pos = vectorizer.fit_transform(df[df['sentiment'] == 'positive']['cleaned_review'])
X_neg = vectorizer.fit_transform(df[df['sentiment'] == 'negative']['cleaned_review'])
X_neu = vectorizer.fit_transform(df[df['sentiment'] == 'neutral']['cleaned_review'])

# Mostrar las palabras clave más frecuentes en cada clase
def top_n_keywords(X, n=10):
    feature_names = vectorizer.get_feature_names_out()
    sum_words = X.sum(axis=0)
    word_freq = [(word, sum_words[0, idx]) for word, idx in zip(feature_names, range(len(feature_names)))]
    sorted_word_freq = sorted(word_freq, key=lambda x: x[1], reverse=True)
    return sorted_word_freq[:n]

print("Top palabras positivas:", top_n_keywords(X_pos))
print("Top palabras negativas:", top_n_keywords(X_neg))
print("Top palabras neutras:", top_n_keywords(X_neu))

ValueError: Found input variables with inconsistent numbers of samples: [85924, 119]