In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, TextClassificationPipeline
import torch
import re
from typing import Union, List
import string

In [2]:
class CleanText():
    """ clearing text except digits () . , word character """ 

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        self.clean_pattern =clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:

        if isinstance(text, str):
            docs = [[text]]

        if isinstance(text, list):
            docs = text

        text = [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs]

        return text
    
def remove_emoji(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def tokenize(text):
    """ basic tokenize method with word character, non word character and digits """
    text = re.sub(r" +", " ", str(text))
    text = re.split(r"(\d+|[a-zA-ZğüşıöçĞÜŞİÖÇ]+|\W)", text)
    text = list(filter(lambda x: x != '' and x != ' ', text))
    sent_tokenized = ' '.join(text)
    return sent_tokenized

regex = re.compile('[%s]' % re.escape(string.punctuation))

def remove_punct(text):
    text = regex.sub(" ", text)
    return text

def remove_numbers(text):
    return re.sub(r'\d+', '', text)
clean = CleanText()

In [3]:
class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True
    return_attention_mask=True
    padding='max_length'
    do_lower_case=False
    return_tensors='pt'

config = Config()

In [4]:
# Cargar el tokenizador y el modelo directamente si el vocabulario está integrado
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model, do_lower_case=config.do_lower_case)
model = BertForSequenceClassification.from_pretrained(config.pretrained_model,
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.to(config.device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
model.load_state_dict(torch.load('C:/Users/Miguel Alejandro/Documents/3er Año/Segundo Período/SRI/Proyecto/_BERT_new_epoch_3.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [7]:
model.eval()  # Establecer el modelo en modo evaluación

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# Crear el pipeline de clasificación de texto
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=-1)  # Usa device=0 si tienes GPU disponible

In [9]:
import pandas as pd

# Cargar el CSV
df = pd.read_csv('C:/Users/Miguel Alejandro/Documents/3er Año/Segundo Período/SRI/Proyecto/datafiniti-hotel-reviews/Datafiniti_Hotel_Reviews_Jun19.csv')

# Define las columnas de interés
columnas_interes = ["address", "categories", "city", "country", "name", "province", "reviews.rating", "reviews.text"]

df = df[columnas_interes]


In [10]:
# clean text, lowercase and remove punk
df["reviews.text"] = df["reviews.text"].apply(lambda x: str(x).lower() if isinstance(x, str) else str(x))
df["reviews.text"] = df["reviews.text"].apply(lambda x: remove_numbers(remove_punct(clean(remove_emoji(x))[0][0])))

In [11]:
df.head()

Unnamed: 0,address,categories,city,country,name,province,reviews.rating,reviews.text
0,5620 Calle Real,"Hotels,Hotels and motels,Hotel and motel mgmt....",Goleta,US,Best Western Plus South Coast Inn,CA,3,this hotel was nice and quiet did not know t...
1,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,4,we stayed in the king suite with the separatio...
2,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,3,parking was horrible somebody ran into my ren...
3,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,5,not cheap but excellent location price is som...
4,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,2,if you get the room that they advertised on th...


In [24]:
# Mapeo de etiquetas
label_mapping = {
    "LABEL_0": 0.0,
    "LABEL_1": 1.0,
    "LABEL_2": 2.5
}

def process_text(text):
    print("Ya empecé uno")
    prediction = pipeline(text, truncation=True, max_length=512)
    label = prediction[0]['label']
    sentiment = label_mapping.get(label, label)  # Obtener la etiqueta mapeada o dejarla como está si no se encuentra
    confidence=prediction[0]['score']
    text_score=sentiment*confidence
    return sentiment, confidence, text_score

In [25]:
# Aplicar la función a cada fila en la columna 'text'
df[['sentiment', 'confidence','text_score']] = df['reviews.text'].apply(lambda x: pd.Series(process_text(x)))                                               

Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya empecé uno
Ya emp

In [26]:
df['final_score'] = df['reviews.rating'] * 1.5 + df['text_score']

In [28]:
# Agrupa por las columnas especificadas y calcula el promedio de final_score
grouped_df = df.groupby(['address', 'categories', 'city', 'country', 'name', 'province']).agg(
    average_final_score=('final_score', 'mean')
).reset_index()

# Ordena el DataFrame de mayor a menor por average_final_score
grouped_df = grouped_df.sort_values(by='average_final_score', ascending=False)

# Muestra el resultado
print(grouped_df['name'])

                     address  \
1388      901 Snelling Ave N   
411        1838 Decherd Blvd   
1259      730 Sw Columbia St   
673           2804 N Main St   
890   4000 Canyons Resort Dr   

                                             categories        city country  \
1388  Hotels,Hotels and motels,Motel,Lodging,Hotels ...  Saint Paul      US   
411   Hotels and motels,Hotel and motel reservations...     Decherd      US   
1259  Hotels,Hotels and motels,Corporate Lodging,Lod...        Bend      US   
673   Hotels,Hotels and motels,Lodging,Hotels Motels...       Altus      US   
890                                        Hotels,Hotel   Park City      US   

                         name province  average_final_score  
1388             Midway Motel       MN             9.997937  
411       Quality Inn-decherd       TN             9.997914  
1259  Hampton Inn Suites Bend       OR             9.997896  
673            Days Inn-Altus       OK             9.997873  
890        Grand Summi