In [109]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from google.colab import drive

# NLP

## Análisis de Sentimiento con las Bibliotecas Tensorflow, keras y sklearn, por arquitectura RNN Bidireccional (Red Neuronal Recurrente Bidireccional)

MONTAJE DE ARCHIVOS GOOGLE DRIVE:

In [110]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [111]:
user_reviews_content = '/content/drive/MyDrive/Colab Notebooks/HotelWiseML/hoteles_unificado.parquet'
user_reviews_dataset = pd.read_parquet(user_reviews_content)

MONTAJE DE ARCHIVOS LOCAL:

In [None]:
user_reviews_content = 'hoteles_unificado.parquet'
user_reviews_dataset = pd.read_parquet(user_reviews_content)

### Tokenizado, Secuenciación y Armado

In [112]:
# Preprocesamiento de datos
reviews = user_reviews_dataset['reviews'].values

positive_keywords = ['clean', 'comfortable', 'quiet', 'pleasant', 'modern',
    'spacious', 'friendly', 'affordable', 'luxurious', 'inviting',
    'safe', 'relaxing', 'efficient', 'organized', 'welcoming',
    'satisfying', 'excellent', 'reliable', 'enjoyable', 'responsive',
    'beautiful', 'attentive', 'sanitary', 'inviting', 'well-maintained',
    'cared-for', 'convenient', 'accommodating', 'problem-free', 'stellar'
]

negative_keywords = ['dirty', 'uncomfortable', 'noisy', 'smelly', 'outdated',
    'small', 'unfriendly', 'expensive', 'overpriced', 'unhygienic',
    'unsafe', 'crowded', 'inefficient', 'unorganized', 'rude',
    'disappointing', 'terrible', 'unreliable', 'dull', 'unresponsive',
    'unpleasant', 'inattentive', 'unsanitary', 'uninviting', 'dilapidated',
    'neglected', 'inconvenient', 'unaccommodating', 'problematic'
]

# Función para determinar la etiqueta basada en las palabras clave
def determine_label(review):
    for word in positive_keywords:
        if word in review.lower():
            return 1
    for word in negative_keywords:
        if word in review.lower():
            return 0
    return 0  # Si no se encuentra ninguna palabra clave, se etiqueta como 'otro'

labels = [determine_label(review) for review in reviews]


In [113]:
# Tokenización y secuenciación
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, maxlen=100, truncating='post')

# División de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42)

### Construccion del Modelo de Red Neuronal Recurrente de Capa Bidireccional

In [114]:
# Construcción del modelo de análisis de sentimientos
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=100),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

### Compilado, Entrenamiento y Evaluacion del Modelo

In [115]:
# Convertir las listas de etiquetas en matrices NumPy
y_train = np.array(y_train)
y_test = np.array(y_test)

In [116]:
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f0f9fa99db0>

In [117]:
# Evaluación del modelo
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 81.82%


#### Aplicado del Modelo al Dataset

In [118]:
# 1. Preprocesar los nuevos datos
nuevas_reviews = user_reviews_dataset['reviews'].values
nuevas_sequences = tokenizer.texts_to_sequences(nuevas_reviews)
nuevas_padded_sequences = pad_sequences(nuevas_sequences, maxlen=100, truncating='post')

# 3. Generar predicciones sobre los nuevos datos
nuevas_predicciones = model.predict(nuevas_padded_sequences)

# 4. Interpretar las predicciones (por ejemplo, etiquetarlas como positivas o negativas)
nuevos_sentimientos = [1 if pred > 0.5 else 0 for pred in nuevas_predicciones]

# Agregar los sentimientos predichos como una nueva columna en el DataFrame de nuevos datos
user_reviews_dataset['sentiment_analysis'] = nuevos_sentimientos



In [119]:
user_reviews_dataset.head()

Unnamed: 0,name,latitude,longitude,county,city,reviews,avg_rating,security,amenities,sentiment_analysis
0,Palms Inn,28.510625,-81.418326,Orange County,Orlando,"I'm new to the area and needed a place to live temporarily. I've been staying here for nearly 2 months so far and would reccomend this place. This place is huge! It is a former Ramada hotel that went out of business sometime last year and whoever bought it decided to turn it into a short term apartment building/week to week hotel. They are the cheapest short term/week to week motel that you're going to find in Orlando. I pay $220 a week for myself (all inclusive: including Tax, utilities, garbage, etc). More people cost extra. They have security at night so although it's located a half mile from the Orange County jail, you won't feel unsafe here. Front desk staff are friendly as well. My only complaint is that the internet doesn't work often in my room but my room might be too far away from the router/front desk. There are also bugs in my room daily as well, although they did spray the rooms recently. My room does have a balcony which is a huge plus and there's a pool open during the warmer months. Would recommend staying here overall!!",2.8,9,"[Multilingual Staff, Air Conditioning, Free Parking, Concierge Service, Luggage Storage]",0
1,Palms Inn,28.510625,-81.418326,Orange County,Orlando,"This was previously a Ramada but was sold & became the Palms Inn.\n\nOn 1st appearance this place seemed nice but the 3 cars with broken windows in the parking lot should hv told me something. 😒\n\nAs the attendant was taking me to my room she seemed surprised to find the room nxt to mine was kicked in and left open & disheveled. You could see several large kick marks on the bright white dr. Dare I say another RED flag.\n\nThe hotel has 2 sides - one half seemed to be remodeled with new paint & carpet in the halls and newly renovated rooms. But there are no shower curtains as we were told we had to provide our own as well as sheets & towels (this is now provided with the new ownership - the hotel was sold agn during my short stay)\n\nThere is a laundry but it has business hours (really?) And to use it you need to get change from the desk - only thing is they hv no change nor do they hv a change machine!!!\n\nThe new owners came in & immediately hiked up the rates - double the amount daily (from $40+ to $70+) for a rm with roaches - albeit remodeled but still with roaches.\n\nThere is a hole in the wall that is connected to the balcony - so the inside is outside at all times.\n\nThe other half of the hotel is uninhabitable & seems dangerous and the new weekly rates are $310 did i mention you get a flat scrn, cable, a sml fridge & a sml microwave for all of this 👍 Smh....\n\nThere is a pool & hot tub that used to be turquoise blue but is now bad shade of brown.\n\nAs the new owners show up in their luxury Maserati each day...",2.8,9,"[Room Service, Concierge Service, Business Center, Complimentary Breakfast, Conference Room]",0
2,Palms Inn,28.510625,-81.418326,Orange County,Orlando,"Just checked in and found out that it is under new ownership. Never stayed here before, but room was in decent shape upon moving in (definitely can tell it is a seasoned locattion), very clean and friendly staff. If you see a rate on Craigslist that is incorrect. Currently it is $270 per week plus deposit.",2.8,9,"[Business Center, Gym, Complimentary Breakfast, Luggage Storage, 24-Hour Reception]",1
3,Palms Inn,28.510625,-81.418326,Orange County,Orlando,"I like the place. The place is massive, and super interesting.",2.8,6,"[Airport Shuttle Service, Bar, Accessible Accommodations, Daily Housekeeping, Concierge Service]",0
4,Palms Inn,28.510625,-81.418326,Orange County,Orlando,Awesome staff place is nice and clean rooms are comfy,2.8,8,"[Children's Play Area, Bicycle Rental Service, Heating, Accessible Accommodations, 24-Hour Reception]",1


### Implementacion en la DB

### Revisión de 'sentiment_analysis' vs. 'review'

A continuación vemos los resultados que se obtuvieron del análisis de sentimiento y comparamos con la columna 'review'.

In [120]:
filas_neg = user_reviews_dataset[user_reviews_dataset['sentiment_analysis'] == 0]
filas_pos = user_reviews_dataset[user_reviews_dataset['sentiment_analysis'] == 1]

In [121]:
pd.set_option('display.max_colwidth', None)

- Comparamos las filas negativas con la review

In [122]:
filas_neg[['reviews','sentiment_analysis']].head()

Unnamed: 0,reviews,sentiment_analysis
0,"I'm new to the area and needed a place to live temporarily. I've been staying here for nearly 2 months so far and would reccomend this place. This place is huge! It is a former Ramada hotel that went out of business sometime last year and whoever bought it decided to turn it into a short term apartment building/week to week hotel. They are the cheapest short term/week to week motel that you're going to find in Orlando. I pay $220 a week for myself (all inclusive: including Tax, utilities, garbage, etc). More people cost extra. They have security at night so although it's located a half mile from the Orange County jail, you won't feel unsafe here. Front desk staff are friendly as well. My only complaint is that the internet doesn't work often in my room but my room might be too far away from the router/front desk. There are also bugs in my room daily as well, although they did spray the rooms recently. My room does have a balcony which is a huge plus and there's a pool open during the warmer months. Would recommend staying here overall!!",0
1,"This was previously a Ramada but was sold & became the Palms Inn.\n\nOn 1st appearance this place seemed nice but the 3 cars with broken windows in the parking lot should hv told me something. 😒\n\nAs the attendant was taking me to my room she seemed surprised to find the room nxt to mine was kicked in and left open & disheveled. You could see several large kick marks on the bright white dr. Dare I say another RED flag.\n\nThe hotel has 2 sides - one half seemed to be remodeled with new paint & carpet in the halls and newly renovated rooms. But there are no shower curtains as we were told we had to provide our own as well as sheets & towels (this is now provided with the new ownership - the hotel was sold agn during my short stay)\n\nThere is a laundry but it has business hours (really?) And to use it you need to get change from the desk - only thing is they hv no change nor do they hv a change machine!!!\n\nThe new owners came in & immediately hiked up the rates - double the amount daily (from $40+ to $70+) for a rm with roaches - albeit remodeled but still with roaches.\n\nThere is a hole in the wall that is connected to the balcony - so the inside is outside at all times.\n\nThe other half of the hotel is uninhabitable & seems dangerous and the new weekly rates are $310 did i mention you get a flat scrn, cable, a sml fridge & a sml microwave for all of this 👍 Smh....\n\nThere is a pool & hot tub that used to be turquoise blue but is now bad shade of brown.\n\nAs the new owners show up in their luxury Maserati each day...",0
3,"I like the place. The place is massive, and super interesting.",0
6,Alllllll Bad,0
7,Worthwhile deal,0


In [123]:
cantidad_ceros = (filas_neg['sentiment_analysis'] == 0).sum()

print("Cantidad de reviews negativas:", cantidad_ceros)

Cantidad de reviews negativas: 298


- Comparamos las filas positivas con la review

In [124]:
filas_pos[['reviews','sentiment_analysis']].head()

Unnamed: 0,reviews,sentiment_analysis
2,"Just checked in and found out that it is under new ownership. Never stayed here before, but room was in decent shape upon moving in (definitely can tell it is a seasoned locattion), very clean and friendly staff. If you see a rate on Craigslist that is incorrect. Currently it is $270 per week plus deposit.",1
4,Awesome staff place is nice and clean rooms are comfy,1
5,not worth what they charge!never cleanup the room.,1
10,"Very clean, modern and fully loaded with all of the items you need from day to day.",1
11,Relaxing clean good to go with the children and chill,1


In [125]:
cantidad_unos = (filas_pos['sentiment_analysis'] == 1).sum()

print("Cantidad de reviews positivas:", cantidad_unos)

Cantidad de reviews positivas: 141


### Reordenamiento del dataset

In [126]:
user_reviews_dataset_Final=user_reviews_dataset[['name','latitude','longitude','city','county','avg_rating','security','sentiment_analysis']]

In [127]:
user_reviews_dataset_Final.head()

Unnamed: 0,name,latitude,longitude,city,county,avg_rating,security,sentiment_analysis
0,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,0
1,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,0
2,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
3,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,6,0
4,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,8,1


In [128]:
# Agrupar por 'name' y calcular la suma de 'avg_rating' y 'sentiment_analysis', conservando las otras columnas
summarized_data = user_reviews_dataset_Final.groupby('name').agg({
    'latitude': 'first',
    'longitude':'first',
    'county': 'first',
    'city': 'first',
    'avg_rating': 'sum',
    'sentiment_analysis': 'sum',
    'security':'sum',

})
summarized_data = summarized_data.reset_index()
summarized_data.head()

Unnamed: 0,name,latitude,longitude,county,city,avg_rating,sentiment_analysis,security
0,17 John St Associates,40.71,-74.0087,New York County,New York,23.5,1,36
1,5 Star Island,25.775874,-80.151371,Miami-Dade County,Miami Beach,24.6,0,39
2,Best Western Plus Atlantik,25.813303,-80.122444,Miami-Dade County,Miami Beach,24.5,3,56
3,Best Western Plus Seaport Inn Downtown,40.708106,-74.001388,New York County,New York,64.0,5,135
4,Building 69,28.401189,-81.468729,Orange County,Orlando,24.0,2,35


In [129]:
# Guardar DataFrame en un archivo Parquet
summarized_data.to_parquet('Hoteles.NLP.03.TensorFlowBidireccional.parquet')

## Almacenamiento de Dataframe en Parquet

GUARDADO DE DATAFRAME EN GOOGLE DRIVE:

In [132]:
summarized_data.to_parquet('/content/drive/MyDrive/Colab Notebooks/HotelWiseML/Hoteles.NLP.03.TensorFlow.parquet')

GUARDADO DE DATAFRAME EN LOCAL:


In [131]:
summarized_data.to_parquet('Hoteles.NLP.03.TensorFlow.parquet')