In [2]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [5]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from contractions import fix
from google.colab import drive

# Descargar recursos (ejecutar una vez)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# NLP

## Análisis de Sentimiento con la Biblioteca NLTK y Diccionario Vader

MONTAJE DE ARCHIVOS GOOGLE DRIVE:

In [6]:
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
user_reviews_content = '/content/drive/MyDrive/Colab Notebooks/HotelWiseML/hoteles_unificado.parquet'
user_reviews_dataset = pd.read_parquet(user_reviews_content)

MONTAJE DE ARCHIVOS LOCAL:

In [None]:
user_reviews_content = 'hoteles_unificado.parquet'
user_reviews_dataset = pd.read_parquet(user_reviews_content)

### Tokenizado, Lematizado y Stopwords

In [8]:
def preprocess_text(text):
    if not text or pd.isnull(text):  # Verificar si el texto está vacío o es nulo
        return ''

    # Eliminar emojis y caracteres especiales
    # Se pueden tokenizar los emojis con metodos mas robustos
    # que ayuden a manejarlos, pero se elije eliminarlos
    text = re.sub(r'[^\w\s]', '', text)

    # Arreglo de contracciones
    try:
        text = fix(text)  # Intentar usar la función fix()
    except Exception as e:
        print(f"Error al aplicar fix(): {e}")

    # Tokenización
    tokens = word_tokenize(text)

    # Palabras clave adicionales, positivas y negativas para mejorar el análisis
    negative_hotel_words = ['dirty', 'uncomfortable', 'noisy', 'smelly', 'outdated',
    'small', 'unfriendly', 'expensive', 'overpriced', 'unhygienic',
    'unsafe', 'crowded', 'inefficient', 'unorganized', 'rude',
    'disappointing', 'terrible', 'unreliable', 'dull', 'unresponsive',
    'unpleasant', 'inattentive', 'unsanitary', 'uninviting', 'dilapidated',
    'neglected', 'inconvenient', 'unaccommodating', 'problematic'
]

    positive_hotel_words = ['clean', 'comfortable', 'quiet', 'pleasant', 'modern',
    'spacious', 'friendly', 'affordable', 'luxurious', 'inviting',
    'safe', 'relaxing', 'efficient', 'organized', 'welcoming',
    'satisfying', 'excellent', 'reliable', 'enjoyable', 'responsive',
    'beautiful', 'attentive', 'sanitary', 'inviting', 'well-maintained',
    'cared-for', 'convenient', 'accommodating', 'problem-free', 'stellar'
]


    # Obtener stopwords y agregar palabras con connotación positiva y negativa
    stop_words = set(stopwords.words('english'))

    for word in negative_hotel_words:
        stop_words.discard(word)
    for word in positive_hotel_words:
        stop_words.discard(word)

    # Eliminación de stopwords y puntuación no necesarios
    tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]

    # Lematización
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)
user_reviews_dataset['preprocess_text'] = user_reviews_dataset['reviews'].apply(preprocess_text)

### Función de Análisis de Sentimiento

In [9]:
# Función para analizar el sentimiento con VADER
def analizar_sentimiento(texto):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(texto)
    compound_score = scores['compound']
    if compound_score >= 0.05:
        return 1 # Review Positiva
    else:
        return 0 # Review Negativa

user_reviews_dataset['sentiment_analysis'] = user_reviews_dataset['preprocess_text'].apply(analizar_sentimiento)

## Revisión de 'sentiment_analysis' vs. 'review'

A continuación vemos los resultados que se obtuvieron del análisis de sentimiento y comparamos con la columna 'review'.

In [10]:
filas_neg = user_reviews_dataset[user_reviews_dataset['sentiment_analysis'] == 0]
filas_pos = user_reviews_dataset[user_reviews_dataset['sentiment_analysis'] == 1]

In [11]:
pd.set_option('display.max_colwidth', None)

- Comparamos las filas negativas con la review

In [12]:
filas_neg[['reviews','sentiment_analysis']].head()

Unnamed: 0,reviews,sentiment_analysis
1,"This was previously a Ramada but was sold & became the Palms Inn.\n\nOn 1st appearance this place seemed nice but the 3 cars with broken windows in the parking lot should hv told me something. 😒\n\nAs the attendant was taking me to my room she seemed surprised to find the room nxt to mine was kicked in and left open & disheveled. You could see several large kick marks on the bright white dr. Dare I say another RED flag.\n\nThe hotel has 2 sides - one half seemed to be remodeled with new paint & carpet in the halls and newly renovated rooms. But there are no shower curtains as we were told we had to provide our own as well as sheets & towels (this is now provided with the new ownership - the hotel was sold agn during my short stay)\n\nThere is a laundry but it has business hours (really?) And to use it you need to get change from the desk - only thing is they hv no change nor do they hv a change machine!!!\n\nThe new owners came in & immediately hiked up the rates - double the amount daily (from $40+ to $70+) for a rm with roaches - albeit remodeled but still with roaches.\n\nThere is a hole in the wall that is connected to the balcony - so the inside is outside at all times.\n\nThe other half of the hotel is uninhabitable & seems dangerous and the new weekly rates are $310 did i mention you get a flat scrn, cable, a sml fridge & a sml microwave for all of this 👍 Smh....\n\nThere is a pool & hot tub that used to be turquoise blue but is now bad shade of brown.\n\nAs the new owners show up in their luxury Maserati each day...",0
6,Alllllll Bad,0
8,Dump,0
9,Roach motel,0
21,Mandatory fees! You cant opt out of a 2 dollar safe fee and 8 dollar parking fee. Confronted desk worked politely and still would not remove. Bad cell phone reception and no wifi. We turned off the ac and were freezing all night. Something wrong there. Toliet lid was disgusting. With those ten dollars they stole they could afford to replace it. Next time I will go back to Runway Inn. Its cheaper if you consider the hidden fees!,0


In [13]:
cantidad_ceros = (filas_neg['sentiment_analysis'] == 0).sum()

print("Cantidad de reviews negativas:", cantidad_ceros)

Cantidad de reviews negativas: 117


- Comparamos las filas positivas con la review

In [14]:
filas_pos[['reviews','sentiment_analysis']].head()

Unnamed: 0,reviews,sentiment_analysis
0,"I'm new to the area and needed a place to live temporarily. I've been staying here for nearly 2 months so far and would reccomend this place. This place is huge! It is a former Ramada hotel that went out of business sometime last year and whoever bought it decided to turn it into a short term apartment building/week to week hotel. They are the cheapest short term/week to week motel that you're going to find in Orlando. I pay $220 a week for myself (all inclusive: including Tax, utilities, garbage, etc). More people cost extra. They have security at night so although it's located a half mile from the Orange County jail, you won't feel unsafe here. Front desk staff are friendly as well. My only complaint is that the internet doesn't work often in my room but my room might be too far away from the router/front desk. There are also bugs in my room daily as well, although they did spray the rooms recently. My room does have a balcony which is a huge plus and there's a pool open during the warmer months. Would recommend staying here overall!!",1
2,"Just checked in and found out that it is under new ownership. Never stayed here before, but room was in decent shape upon moving in (definitely can tell it is a seasoned locattion), very clean and friendly staff. If you see a rate on Craigslist that is incorrect. Currently it is $270 per week plus deposit.",1
3,"I like the place. The place is massive, and super interesting.",1
4,Awesome staff place is nice and clean rooms are comfy,1
5,not worth what they charge!never cleanup the room.,1


In [15]:
cantidad_unos = (filas_pos['sentiment_analysis'] == 1).sum()

print("Cantidad de reviews positivas:", cantidad_unos)

Cantidad de reviews positivas: 322


### Reordenamiento del dataset

In [16]:
user_reviews_dataset_Final=user_reviews_dataset[['name','latitude','longitude','city','county','avg_rating','security','sentiment_analysis']]

In [17]:
user_reviews_dataset_Final.head()

Unnamed: 0,name,latitude,longitude,city,county,avg_rating,security,sentiment_analysis
0,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
1,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,0
2,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,9,1
3,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,6,1
4,Palms Inn,28.510625,-81.418326,Orlando,Orange County,2.8,8,1


In [18]:
# Agrupar por 'name' y calcular la suma de 'avg_rating' y 'sentiment_analysis', conservando las otras columnas
summarized_data = user_reviews_dataset_Final.groupby('name').agg({
    'latitude': 'first',
    'longitude':'first',
    'county': 'first',
    'city': 'first',
    'avg_rating': 'sum',
    'sentiment_analysis': 'sum',
    'security':'sum',

})
summarized_data = summarized_data.reset_index()
summarized_data.head()

Unnamed: 0,name,latitude,longitude,county,city,avg_rating,sentiment_analysis,security
0,17 John St Associates,40.71,-74.0087,New York County,New York,23.5,5,36
1,5 Star Island,25.775874,-80.151371,Miami-Dade County,Miami Beach,24.6,3,39
2,Best Western Plus Atlantik,25.813303,-80.122444,Miami-Dade County,Miami Beach,24.5,4,56
3,Best Western Plus Seaport Inn Downtown,40.708106,-74.001388,New York County,New York,64.0,14,135
4,Building 69,28.401189,-81.468729,Orange County,Orlando,24.0,5,35


## Almacenamiento de Dataframe en Parquet

GUARDADO DE DATAFRAME EN GOOGLE DRIVE:

In [25]:
summarized_data.to_parquet('/content/drive/MyDrive/Colab Notebooks/HotelWiseML/Hoteles.NLP.01.NLTK.parquet')

GUARDADO DE DATAFRAME EN LOCAL:


In [26]:
summarized_data.to_parquet('Hoteles.NLP.01.NLTK.parquet')