In [1]:
import re
import string
from typing import Union, List

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TextClassificationPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
class CleanText():
    """Replace every character outside an allow-list with a single space.

    With the default ``clean_pattern`` the characters that survive are ASCII
    and Turkish letters, digits and the punctuation ``. , ' " ( )``. Anything
    else (including whitespace) is substituted by ``" "``.
    """

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        # Regex whose matches are the characters to be blanked out.
        self.clean_pattern = clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:
        """Clean one string or a collection of documents.

        Args:
            text: Either a single string, a list of strings (each string is
                treated as its own one-sentence document), or a list of
                documents where each document is a list of sentence strings.

        Returns:
            A list of documents, each a list of cleaned sentence strings.
            A single input string yields ``[[cleaned]]``.

        Raises:
            TypeError: If ``text`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(text, str):
            docs = [[text]]
        elif isinstance(text, list):
            # A bare string inside the list must be wrapped; otherwise the
            # inner loop would iterate over its individual characters.
            docs = [[doc] if isinstance(doc, str) else doc for doc in text]
        else:
            raise TypeError(
                "CleanText expects a str or a list, got %s" % type(text).__name__
            )

        return [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs]
    
def remove_emoji(data):
    """Return ``data`` with every run of characters in the emoji class deleted.

    NOTE(review): several ranges below (e.g. U+24C2-U+1F251 and
    U+10000-U+10FFFF) are very broad and also match non-emoji characters
    such as CJK ideographs; those are removed as well.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # same range listed twice in the pattern
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
    )
    emoji_re = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return emoji_re.sub('', data)

def tokenize(text):
    """Split text into digit runs, letter runs and single non-word characters.

    The input is converted with ``str()``, runs of spaces are collapsed, and
    the resulting pieces (empty strings and single spaces dropped) are
    returned joined by one space.
    """
    collapsed = re.sub(r" +", " ", str(text))
    pieces = re.split(r"(\d+|[a-zA-ZğüşıöçĞÜŞİÖÇ]+|\W)", collapsed)
    kept = [piece for piece in pieces if piece not in ('', ' ')]
    return ' '.join(kept)

# Character class matching any single character of string.punctuation.
regex = re.compile(f"[{re.escape(string.punctuation)}]")

def remove_punct(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space."""
    return regex.sub(" ", text)

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)
# Shared cleaner instance built with CleanText's default pattern; called as clean(text).
clean = CleanText()

In [25]:
class Config():
    """Namespace of notebook settings, read as plain class attributes."""

    # Settings referenced by the cells shown in this notebook.
    pretrained_model = 'bert-base-uncased'
    do_lower_case = False
    # First CUDA device when torch reports one, otherwise the CPU.
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

    # Remaining settings; not referenced by the cells shown here
    # (presumably used by training code elsewhere -- TODO confirm).
    seed_val = 17
    random_state = 42
    test_size = 0.15
    epochs = 5
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    add_special_tokens = True
    return_attention_mask = True
    padding = 'max_length'
    return_tensors = 'pt'


# Single settings object used by the rest of the notebook.
config = Config()

In [31]:
# Load the tokenizer and the model directly (works when the vocabulary ships with the checkpoint).
tokenizer = BertTokenizer.from_pretrained(
    config.pretrained_model,
    do_lower_case=config.do_lower_case,
)

# Three output labels; attention weights and hidden states are not returned.
model_kwargs = dict(
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained(config.pretrained_model, **model_kwargs)

Error while downloading from https://cdn-lfs.huggingface.co/bert-base-uncased/68d45e234eb4a928074dfd868cead0219ab85354cc53d20e772753c6bb9169d3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1725345753&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNTM0NTc1M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iZXJ0LWJhc2UtdW5jYXNlZC82OGQ0NWUyMzRlYjRhOTI4MDc0ZGZkODY4Y2VhZDAyMTlhYjg1MzU0Y2M1M2QyMGU3NzI3NTNjNmJiOTE2OWQzP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=es9DiAIUoY3nU-p4efybhjt-MT5htarnwXrbytC0H-1xzoHX6d7wAHihk2k8AJwpENiaV06aenPRIXr5qt9BAaXvSE1SIjB6jSohIxfKjRgfpio2Ry%7EL0F55PbneuBhjgV2X5%7E7XFoJM59UsuW6uvk2hLlRZcr7btNcngYJlEiFGQNo8R34syFEGQ3ITlkUo%7EuyRIPk-3eW3PLm%7E1y8SmmTa7-EW9yz6tF3Ph8aq%7EIIbferKmsjMsygUqpqmiV76JLj3Pj4al%7E2RrW%7EIYZH%7EPiqlLBndGRG%7EZGtImFW1kgBkBMeDDSOQW%7EpZpIesFWkLQeZBZU%7EUocvCz5ionWDQoA__&Key-Pair-Id=K3ESJI6D

KeyboardInterrupt: 

In [None]:
# Move the model to the device chosen in Config (cuda:0 when available, otherwise cpu).
model.to(config.device)

In [None]:
# Restore the weights stored in the checkpoint file, deserialised onto the CPU.
# NOTE(review): absolute, user-specific path -- this cell only runs on a machine with this file.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint_path = '/home/dario/Study/sri/_BERT_epoch_2.model'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

In [None]:
model.eval()  # Put the model in evaluation mode

In [None]:
# Create the text-classification pipeline on the same device as config.device.
# The pipeline takes an integer device id: a CUDA index, or -1 for CPU. A hard-coded -1
# can disagree with a model that was moved to CUDA, so derive the id from the config.
pipeline_device = config.device.index if config.device.type == "cuda" else -1
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipeline_device)

In [26]:
# Columns of interest; every other column in the CSV is ignored.
columnas_interes = ["address", "categories", "city", "country", "name", "province", "reviews.rating", "reviews.text"]

# Load only those columns from the CSV (usecols avoids parsing the rest of the file).
# usecols keeps the file's column order, so re-select with the list to get the order defined above.
df = pd.read_csv('Datafiniti_Hotel_Reviews_Jun19.csv', usecols=columnas_interes)[columnas_interes]


In [27]:
# Clean the review text: lowercase, strip emoji, blank out disallowed characters,
# replace punctuation with spaces and delete digits.
def _normalize_review(raw):
    """Apply the full cleaning chain to one cell of the reviews.text column."""
    # Non-string cells (e.g. missing values) are stringified but not lowercased.
    lowered = raw.lower() if isinstance(raw, str) else str(raw)
    # clean() returns a nested list; a single string comes back as [[cleaned]].
    cleaned = clean(remove_emoji(lowered))[0][0]
    return remove_numbers(remove_punct(cleaned))

df["reviews.text"] = df["reviews.text"].apply(_normalize_review)

In [28]:
# Preview the first rows of the frame after cleaning reviews.text.
df.head()

Unnamed: 0,address,categories,city,country,name,province,reviews.rating,reviews.text
0,5620 Calle Real,"Hotels,Hotels and motels,Hotel and motel mgmt....",Goleta,US,Best Western Plus South Coast Inn,CA,3,this hotel was nice and quiet did not know t...
1,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,4,we stayed in the king suite with the separatio...
2,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,3,parking was horrible somebody ran into my ren...
3,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,5,not cheap but excellent location price is som...
4,5th And San Carlos PO Box 3574,"Hotels,Lodging,Hotel",Carmel by the Sea,US,Best Western Carmel's Town House Lodge,CA,2,if you get the room that they advertised on th...


In [30]:
# Label mapping: numeric weight assigned to each label string returned by the classifier.
label_mapping = {
    "LABEL_0": 0.0,
    "LABEL_1": 1.0,
    "LABEL_2": 2.5
}

def process_text(text, classifier=None):
    """Classify one text and turn the prediction into numeric scores.

    Args:
        text: The text to classify.
        classifier: Optional callable used instead of the notebook-level
            ``pipeline``. It is called as ``classifier(text, truncation=True)``
            and must return either a dict with ``'label'`` and ``'score'``
            keys or a list whose first element is such a dict.

    Returns:
        A tuple ``(sentiment, confidence, text_score)`` where ``sentiment`` is
        the ``label_mapping`` value of the predicted label, ``confidence`` is
        the prediction's score and ``text_score`` is their product.

    Raises:
        ValueError: If the predicted label is not a key of ``label_mapping``.
    """
    if classifier is None:
        classifier = pipeline  # notebook-level pipeline, resolved at call time
    # truncation=True keeps over-long texts within the tokenizer's maximum length
    # instead of failing inside the model.
    prediction = classifier(text, truncation=True)
    # For a single string the pipeline wraps its result in a one-element list.
    if isinstance(prediction, (list, tuple)):
        prediction = prediction[0]
    label = prediction['label']
    if label not in label_mapping:
        # Falling back to the raw label string would only fail later at the multiplication.
        raise ValueError(
            f"Unexpected label {label!r}; expected one of {sorted(label_mapping)}"
        )
    sentiment = label_mapping[label]
    confidence = prediction['score']
    text_score = sentiment * confidence
    return sentiment, confidence, text_score

In [None]:
# Apply process_text to every review and attach the three resulting columns.
# The results are collected once into a single DataFrame rather than building a
# pd.Series per row, which is considerably slower on large frames.
score_columns = ['sentiment', 'confidence', 'text_score']
review_scores = [process_text(review) for review in df['reviews.text']]
df[score_columns] = pd.DataFrame(review_scores, index=df.index, columns=score_columns)

In [None]:
# Final score = rating scaled by a fixed weight, plus the text-derived score.
rating_weight = 1.5
weighted_rating = df['reviews.rating'] * rating_weight
df['final_score'] = weighted_rating + df['text_score']

In [None]:
# Group by the hotel-identifying columns, average final_score per group,
# then order the result from highest to lowest average.
hotel_keys = ['address', 'categories', 'city', 'country', 'name', 'province']
grouped_df = (
    df.groupby(hotel_keys)
    .agg(average_final_score=('final_score', 'mean'))
    .reset_index()
    .sort_values(by='average_final_score', ascending=False)
)

# Show the result.
print(grouped_df)