In [1]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_

Importing required Libraries

In [2]:
import torch

import pandas as pd
from torch.utils.data import TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset, load_metric
from sklearn.preprocessing import LabelEncoder

In [3]:
torch.cuda.is_available()

True

In [45]:
# Load the Hotel review dataset for training and testing
dataset_train = pd.read_csv("/content/drive/MyDrive/Practice/tripadvisor_hotel_reviews.csv")
dataset_test = pd.read_csv("/content/drive/MyDrive/Practice/hotel_review.csv")

In [5]:
dataset_train.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [46]:
def convert_rating_to_label(rating):
    if rating in [1, 2]:
        return "negative"
    elif rating == 3:
        return "neutral"
    elif rating in [4, 5]:
        return "positive"

# Apply the conversion to the 'rating' column
dataset_train['label'] = dataset_train['Rating'].apply(convert_rating_to_label)

In [7]:
dataset_train.head()

Unnamed: 0,Review,Rating,label
0,nice hotel expensive parking got good deal sta...,4,positive
1,ok nothing special charge diamond member hilto...,2,negative
2,nice rooms not 4* experience hotel monaco seat...,3,neutral
3,"unique, great stay, wonderful time hotel monac...",5,positive
4,"great stay great stay, went seahawk game aweso...",5,positive


In [8]:
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
 2   label   20491 non-null  object
dtypes: int64(1), object(2)
memory usage: 480.4+ KB


In [9]:
dataset_train.describe()

Unnamed: 0,Rating
count,20491.0
mean,3.952223
std,1.23303
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [10]:
dataset_train.isnull()

Unnamed: 0,Review,Rating,label
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
20486,False,False,False
20487,False,False,False
20488,False,False,False
20489,False,False,False


In [11]:
dataset_train.isnull().sum()

Unnamed: 0,0
Review,0
Rating,0
label,0


In [12]:
dataset_test.head()

Unnamed: 0,Review,label
0,The check-in process was smooth and quick.,positive
1,The room was dirty and the staff were unhelpful.,negative
2,Great service! The staff went above and beyond...,positive
3,bad experience. Nothing special about the serv...,negative
4,The concierge was very knowledgeable and provi...,positive


In [13]:
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  39 non-null     object
 1   label   39 non-null     object
dtypes: object(2)
memory usage: 752.0+ bytes


In [14]:
dataset_test.describe()

Unnamed: 0,Review,label
count,39,39
unique,39,3
top,The check-in process was smooth and quick.,neutral
freq,1,19


In [15]:
dataset_test.isnull()

Unnamed: 0,Review,label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


In [16]:
dataset_test.isnull().sum()

Unnamed: 0,0
Review,0
label,0


In [47]:
# Initialize LabelEncoder and encode the labels
label_encoder = LabelEncoder()
dataset_train['label'] = label_encoder.fit_transform(dataset_train['label'])
dataset_test['label'] = label_encoder.transform(dataset_test['label'])

In [48]:
# Shuffle the datasets
dataset_train = dataset_train.sample(frac=1, random_state=42).reset_index(drop=True)
dataset_test = dataset_test.sample(frac=1, random_state=42).reset_index(drop=True)

In [49]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the datasets
def tokenize_function(text):
    return tokenizer(text, padding="max_length", truncation=True)

# Apply the tokenization function
tokenized_train = dataset_train['Review'].apply(tokenize_function)
tokenized_test = dataset_test['Review'].apply(tokenize_function)

# Convert the tokenized data to Hugging Face Datasets
train_dataset = Dataset.from_pandas(pd.DataFrame(tokenized_train.tolist()))
train_dataset = train_dataset.add_column("label", dataset_train['label'])

test_dataset = Dataset.from_pandas(pd.DataFrame(tokenized_test.tolist()))
test_dataset = test_dataset.add_column("label", dataset_test['label'])

In [50]:
# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Define the evaluation metrics
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": accuracy}

# Define the training arguments
training_args = TrainingArguments(
    output_dir="FineTuned-HotelReview-Sentiment-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
)


In [53]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [54]:
# Train the model
trainer.train()

Step,Training Loss
500,0.4366
1000,0.3318
1500,0.3045
2000,0.2589
2500,0.2562


TrainOutput(global_step=2562, training_loss=0.3168245623765003, metrics={'train_runtime': 2006.1218, 'train_samples_per_second': 20.428, 'train_steps_per_second': 1.277, 'total_flos': 5428875746285568.0, 'train_loss': 0.3168245623765003, 'epoch': 2.0})

In [55]:
import numpy as np

In [56]:
# Evaluate the model on the test dataset
trainer.evaluate()

{'eval_loss': 0.533417284488678,
 'eval_accuracy': 0.717948717948718,
 'eval_runtime': 0.734,
 'eval_samples_per_second': 53.134,
 'eval_steps_per_second': 4.087,
 'epoch': 2.0}

In [57]:
# Save the model
import os
path = "/content/drive/MyDrive/Practice"
if not os.path.exists(path):
    os.makedirs(path)

# Save the model with the full path including the filename
torch.save(model, os.path.join(path, "model.pth"))

In [58]:
from transformers import pipeline

In [59]:
# Creating a pipeline for testing the model
#model.to('cpu')
class_labels = ['Negative', 'Positive']
model.config.id2label = class_labels
sentiment_model = pipeline(task = 'sentiment-analysis', model = model, tokenizer = tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [61]:
# Testing the model on a sample text
sentiment_model("I felt toilets were not nice. Shower area was not relaxing enough")

[{'label': 'Negative', 'score': 0.8353247046470642}]

In [62]:
# Testing the model on a sample text
sentiment_model("hotel environment was okay to stay in it.")

[{'label': 'Positive', 'score': 0.6669718623161316}]