In [27]:
!pip install stanza



In [28]:
import pandas as pd
import stanza
from tqdm import tqdm
from google.colab import drive

# NLP

## Análisis de Sentimiento con la biblioteca Stanza (ex StanfordNLP)

MONTAJE DE ARCHIVOS GOOGLE DRIVE:

In [29]:
# Attach the user's Google Drive under /content/drive (Colab only).
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
# Path of the unified hotel reviews parquet file on the mounted Google Drive.
user_reviews_content = '/content/drive/MyDrive/Colab Notebooks/HotelWiseML/hoteles_unificado.parquet'
# Read the parquet file into the DataFrame used by the rest of the notebook.
user_reviews_dataset = pd.read_parquet(path=user_reviews_content)

MONTAJE DE ARCHIVOS LOCAL:

In [None]:
# Local alternative: the parquet file is expected in the current working directory.
user_reviews_content = 'hoteles_unificado.parquet'
# Read the parquet file into the DataFrame used by the rest of the notebook.
user_reviews_dataset = pd.read_parquet(path=user_reviews_content)

### Configuraciones del Modelo

In [44]:
# Download the English language models.
stanza.download(lang='en')

# Build the processing pipeline: tokenization followed by sentiment scoring.
nlp = stanza.Pipeline(processors='tokenize,sentiment', lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| sentiment | sstplus  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: sentiment
INFO:stanza:Done loading processors!


### Función de Análisis de Sentimiento

In [45]:
# Score a single review with the Stanza pipeline and reduce it to a binary label.
def analyze_sentiment(review, threshold=0.5):
    """Classify one review as 1 or 0 from its mean sentence sentiment.

    Stanza's sentiment processor labels each sentence 0 (negative),
    1 (neutral) or 2 (positive). The sentence labels are averaged and the
    review is reported as 1 when the mean reaches ``threshold``, else 0.

    NOTE(review): with the default threshold of 0.5 a purely neutral review
    (mean 1.0) is labelled 1; only reviews averaging below 0.5 are labelled 0.
    Confirm this cut-off is the intended one.

    Parameters
    ----------
    review : str
        Raw review text. Missing values (``None``, ``NaN`` or any other
        non-string) and blank strings are scored as neutral instead of
        raising.
    threshold : float, default 0.5
        Minimum mean sentence score required for label 1.

    Returns
    -------
    int
        1 if the mean sentence score is >= ``threshold``, otherwise 0.
    """
    neutral_score = 1.0  # Stanza's label for a neutral sentence

    if not isinstance(review, str) or not review.strip():
        # Nothing to analyse: skip the pipeline and fall back to neutral.
        average_sentiment = neutral_score
    else:
        sentences = nlp(review).sentences
        if sentences:
            average_sentiment = sum(s.sentiment for s in sentences) / len(sentences)
        else:
            # The tokenizer produced no sentences; avoid dividing by zero.
            average_sentiment = neutral_score

    return 1 if average_sentiment >= threshold else 0

# Register tqdm's `progress_apply` on pandas objects so the run shows a progress bar.
tqdm.pandas()
# Label every entry of the 'reviews' column and store the result in 'sentiment_analysis'.
user_reviews_dataset['sentiment_analysis'] = (
    user_reviews_dataset['reviews'].progress_apply(analyze_sentiment)
)

100%|██████████| 439/439 [03:30<00:00,  2.09it/s]


## Revisión de 'sentiment_analysis' vs. 'reviews'

A continuación vemos los resultados que se obtuvieron del análisis de sentimiento y los comparamos con la columna 'reviews'.

In [46]:
# Split the dataset by label: rows labelled 0 and rows labelled 1.
filas_neg = user_reviews_dataset.loc[user_reviews_dataset['sentiment_analysis'].eq(0)]
filas_pos = user_reviews_dataset.loc[user_reviews_dataset['sentiment_analysis'].eq(1)]

In [35]:
# Show full cell contents (no truncation) so whole reviews can be read.
pd.options.display.max_colwidth = None

- Comparamos las filas negativas con la review

In [47]:
# Preview the first five rows labelled 0 next to their review text.
filas_neg.loc[:, ['reviews', 'sentiment_analysis']].head(5)

Unnamed: 0,reviews,sentiment_analysis
5,not worth what they charge!never cleanup the room.,0
13,Lol! 69!!!,0
21,Mandatory fees! You cant opt out of a 2 dollar safe fee and 8 dollar parking fee. Confronted desk worked politely and still would not remove. Bad cell phone reception and no wifi. We turned off the ac and were freezing all night. Something wrong there. Toliet lid was disgusting. With those ten dollars they stole they could afford to replace it. Next time I will go back to Runway Inn. Its cheaper if you consider the hidden fees!,0
28,"Nasty, worst customer service ever, toilet leaking, fridge making noise, found hair in the shower, they moved us to 4 diferentes rooms because the problems and everyone had a strong smoke smell, don't expect to see a manager, we try for 5 days, no luck",0
29,"It's very cheap, but it's not a clean hotel, and they charge you for parking, and a safe.",0


In [48]:
# Count the rows of the negative subset whose label is 0.
cantidad_ceros = filas_neg['sentiment_analysis'].eq(0).sum()

print("Cantidad de reviews negativas:", cantidad_ceros)

Cantidad de reviews negativas: 71


- Comparamos las filas positivas con la review

In [49]:
# Preview the first five rows labelled 1 next to their review text.
filas_pos.loc[:, ['reviews', 'sentiment_analysis']].head(5)

Unnamed: 0,reviews,sentiment_analysis
0,"I'm new to the area and needed a place to live temporarily. I've been staying here for nearly 2 months so far and would reccomend this place. This place is huge! It is a former Ramada hotel that went out of business sometime last year and whoever bought it decided to turn it into a short term apartment building/week to week hotel. They are the cheapest short term/week to week motel that you're going to find in Orlando. I pay $220 a week for myself (all inclusive: including Tax, utilities, garbage, etc). More people cost extra. They have security at night so although it's located a half mile from the Orange County jail, you won't feel unsafe here. Front desk staff are friendly as well. My only complaint is that the internet doesn't work often in my room but my room might be too far away from the router/front desk. There are also bugs in my room daily as well, although they did spray the rooms recently. My room does have a balcony which is a huge plus and there's a pool open during the warmer months. Would recommend staying here overall!!",1
1,"This was previously a Ramada but was sold & became the Palms Inn.\n\nOn 1st appearance this place seemed nice but the 3 cars with broken windows in the parking lot should hv told me something. 😒\n\nAs the attendant was taking me to my room she seemed surprised to find the room nxt to mine was kicked in and left open & disheveled. You could see several large kick marks on the bright white dr. Dare I say another RED flag.\n\nThe hotel has 2 sides - one half seemed to be remodeled with new paint & carpet in the halls and newly renovated rooms. But there are no shower curtains as we were told we had to provide our own as well as sheets & towels (this is now provided with the new ownership - the hotel was sold agn during my short stay)\n\nThere is a laundry but it has business hours (really?) And to use it you need to get change from the desk - only thing is they hv no change nor do they hv a change machine!!!\n\nThe new owners came in & immediately hiked up the rates - double the amount daily (from $40+ to $70+) for a rm with roaches - albeit remodeled but still with roaches.\n\nThere is a hole in the wall that is connected to the balcony - so the inside is outside at all times.\n\nThe other half of the hotel is uninhabitable & seems dangerous and the new weekly rates are $310 did i mention you get a flat scrn, cable, a sml fridge & a sml microwave for all of this 👍 Smh....\n\nThere is a pool & hot tub that used to be turquoise blue but is now bad shade of brown.\n\nAs the new owners show up in their luxury Maserati each day...",1
2,"Just checked in and found out that it is under new ownership. Never stayed here before, but room was in decent shape upon moving in (definitely can tell it is a seasoned locattion), very clean and friendly staff. If you see a rate on Craigslist that is incorrect. Currently it is $270 per week plus deposit.",1
3,"I like the place. The place is massive, and super interesting.",1
4,Awesome staff place is nice and clean rooms are comfy,1


In [50]:
# Count the rows of the positive subset whose label is 1.
cantidad_unos = filas_pos['sentiment_analysis'].eq(1).sum()

print("Cantidad de reviews positivas:", cantidad_unos)

Cantidad de reviews positivas: 368


### Reordenamiento del dataset

In [51]:
# Keep only the hotel descriptors plus the computed sentiment label; the raw
# review text is not carried forward.
user_reviews_dataset_Final = user_reviews_dataset.loc[
    :,
    [
        'name',
        'latitude',
        'longitude',
        'city',
        'county',
        'avg_rating',
        'security',
        'sentiment_analysis',
    ],
]

In [52]:
# Preview the first five rows of the reduced dataset.
user_reviews_dataset_Final.head(5)

Unnamed: 0,name,latitude,longitude,city,county,avg_rating,security,sentiment_analysis
0,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
1,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
2,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
3,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,6,1
4,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,8,1


In [53]:
# Collapse the review-level rows into one row per hotel name: location fields
# take the first value seen in each group, while 'avg_rating',
# 'sentiment_analysis' and 'security' are summed over the hotel's rows.
# NOTE(review): 'avg_rating' appears to be a per-hotel value repeated on every
# review row, so its sum grows with the number of reviews. Confirm that a sum
# (rather than 'first' or 'mean') is intended.
summarized_data = (
    user_reviews_dataset_Final
    .groupby('name')
    .agg({
        'latitude': 'first',
        'longitude': 'first',
        'county': 'first',
        'city': 'first',
        'avg_rating': 'sum',
        'sentiment_analysis': 'sum',
        'security': 'sum',
    })
    .reset_index()  # turn the 'name' group key back into a regular column
)
summarized_data.head()

Unnamed: 0,name,latitude,longitude,county,city,avg_rating,sentiment_analysis,security
0,17 John St Associates,40.71,-74.0087,New York County,New York,23.5,5,36
1,5 Star Island,25.775874,-80.151371,Miami-Dade County,Miami Beach,24.6,5,39
2,Best Western Plus Atlantik,25.813303,-80.122444,Miami-Dade County,Miami Beach,24.5,3,56
3,Best Western Plus Seaport Inn Downtown,40.708106,-74.001388,New York County,New York,64.0,15,135
4,Building 69,28.401189,-81.468729,Orange County,Orlando,24.0,4,35


## Almacenamiento de Dataframe en Parquet

GUARDADO DE DATAFRAME EN GOOGLE DRIVE:

In [58]:
# Persist the per-hotel summary to the project folder on the mounted Google Drive.
summarized_data.to_parquet(path='/content/drive/MyDrive/Colab Notebooks/HotelWiseML/Hoteles.NLP.02.Stanza.parquet')

GUARDADO DE DATAFRAME EN LOCAL:


In [59]:
# Persist the per-hotel summary to the current working directory.
summarized_data.to_parquet(path='Hoteles.NLP.02.Stanza.parquet')