In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv


In [2]:
# Import the CSV file
df = pd.read_csv('/kaggle/input/515k-hotel-reviews-data-in-europe/Hotel_Reviews.csv')

In [3]:
# Create a new dataframe with only the Positive_Review and Negative_Review columns
new_df = df[["Positive_Review", "Negative_Review"]]

# Use the melt function to stack the Positive_Review and Negative_Review columns on top of each other
new_df = new_df.melt(var_name="Sentiment", value_name="Text")

# Map the Sentiment column to 1 for Positive_Review and 0 for Negative_Review
new_df["Sentiment"] = new_df["Sentiment"].map({"Positive_Review": 1, "Negative_Review": 0})

# Drop any rows with empty or missing Text
new_df = new_df.dropna(subset=["Text"])

# Reset the index of the dataframe
new_df = new_df.reset_index(drop=True)

# Preview the modified dataframe
print(new_df.head())

   Sentiment                                               Text
0          1   Only the park outside of the hotel was beauti...
1          1   No real complaints the hotel was great great ...
2          1   Location was good and staff were ok It is cut...
3          1   Great location in nice surroundings the bar a...
4          1    Amazing location and building Romantic setting 


In [4]:
new_df["ID"] = new_df.index
new_df

Unnamed: 0,Sentiment,Text,ID
0,1,Only the park outside of the hotel was beauti...,0
1,1,No real complaints the hotel was great great ...,1
2,1,Location was good and staff were ok It is cut...,2
3,1,Great location in nice surroundings the bar a...,3
4,1,Amazing location and building Romantic setting,4
...,...,...,...
1031471,0,no trolly or staff to help you take the lugga...,1031471
1031472,0,The hotel looks like 3 but surely not 4,1031472
1031473,0,The ac was useless It was a hot week in vienn...,1031473
1031474,0,No Negative,1031474


In [5]:
# Define a function to count the number of words in a string
def count_words(text):
    return len(text.split())

new_df = new_df[new_df["Text"].apply(lambda x: count_words(x) > 2)]

In [6]:
new_df

Unnamed: 0,Sentiment,Text,ID
0,1,Only the park outside of the hotel was beauti...,0
1,1,No real complaints the hotel was great great ...,1
2,1,Location was good and staff were ok It is cut...,2
3,1,Great location in nice surroundings the bar a...,3
4,1,Amazing location and building Romantic setting,4
...,...,...,...
1031469,0,No parking Public parking garage is 15 Euro p...,1031469
1031471,0,no trolly or staff to help you take the lugga...,1031471
1031472,0,The hotel looks like 3 but surely not 4,1031472
1031473,0,The ac was useless It was a hot week in vienn...,1031473


In [7]:
# Set the seed value
seed = 123

# Randomly select 2% of the rows with a seed
new_df= new_df.sample(frac=0.02, random_state=seed)

new_df

Unnamed: 0,Sentiment,Text,ID
614439,0,the bed was huge but the mattress was not the...,614439
270955,1,This was a nice older hotel in a residential ...,270955
485273,1,large and quiet rooms king size beds smoking ...,485273
567131,0,The water pressure was not good in the shower...,567131
150214,1,Clean friendly and easy access to the tube,150214
...,...,...,...
82181,1,Staff were fantastic Friendly and very helpful,82181
486507,1,Breakfast selection and quality was excellent,486507
407708,1,The staff were very helpful The roof terrace ...,407708
406197,1,Beautiful hotel in great location close to ce...,406197


In [8]:
# Split the data into training and validation sets
train_text, val_text, train_labels, val_labels = train_test_split(new_df['Text'], new_df['Sentiment'], test_size=0.2, random_state=42)

In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaTokenizerFast

# Load the pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Move the model to the GPU
model.cuda()

# Tokenize the text
train_encodings = tokenizer(list(train_text), truncation=True, padding=True)
val_encodings = tokenizer(list(val_text), truncation=True, padding=True)

print('Finished tokenizing the test')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Finished tokenizing the test


In [10]:
print('Convert the data to PyTorch tensors')
# Convert the data to PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),
                                               torch.tensor(train_encodings['attention_mask']),
                                               torch.tensor(train_labels.values))
val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),
                                             torch.tensor(val_encodings['attention_mask']),
                                             torch.tensor(val_labels.values))

# Train the model
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.cuda()

print("Ready to go through epoch")

Convert the data to PyTorch tensors
Ready to go through epoch


In [11]:
from sklearn.metrics import classification_report

for epoch in range(3):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch[0].cuda()
        attention_mask = batch[1].cuda()
        labels = batch[2].cuda()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = []
        for batch in val_loader:
            input_ids = batch[0].cuda()
            attention_mask = batch[1].cuda()
            labels = batch[2].cuda()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs[0]
            preds = torch.argmax(logits, dim=1)
            val_preds.append(preds)
    
    val_preds = torch.cat(val_preds)
    val_accuracy = accuracy_score(val_labels, val_preds.cpu())
    print(f"Epoch {epoch+1} validation accuracy: {val_accuracy:.2f}")
    print(classification_report(val_labels, val_preds.cpu()))

# Save the trained model
model.save_pretrained('roberta_3epoch')
tokenizer.save_pretrained('roberta_3epoch')

Epoch 1 validation accuracy: 0.95
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      1389
           1       0.94      0.96      0.95      1703

    accuracy                           0.95      3092
   macro avg       0.95      0.94      0.94      3092
weighted avg       0.95      0.95      0.94      3092

Epoch 2 validation accuracy: 0.96
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1389
           1       0.97      0.95      0.96      1703

    accuracy                           0.96      3092
   macro avg       0.96      0.96      0.96      3092
weighted avg       0.96      0.96      0.96      3092

Epoch 3 validation accuracy: 0.95
              precision    recall  f1-score   support

           0       0.91      0.98      0.95      1389
           1       0.99      0.92      0.95      1703

    accuracy                           0.95      3092
   macro avg       0.95   

('roberta_3epoch/tokenizer_config.json',
 'roberta_3epoch/special_tokens_map.json',
 'roberta_3epoch/vocab.json',
 'roberta_3epoch/merges.txt',
 'roberta_3epoch/added_tokens.json',
 'roberta_3epoch/tokenizer.json')

In [13]:
print(classification_report(val_labels, val_preds.cpu(), digits = 4))

              precision    recall  f1-score   support

           0     0.9095    0.9834    0.9450      1389
           1     0.9855    0.9201    0.9517      1703

    accuracy                         0.9486      3092
   macro avg     0.9475    0.9518    0.9484      3092
weighted avg     0.9514    0.9486    0.9487      3092



In [15]:
print(f"Epoch {epoch+1} validation accuracy: {val_accuracy:.4f}")

Epoch 3 validation accuracy: 0.9486
