In [None]:
# Install third-party packages into the runtime before anything is imported.
# NOTE(review): no versions are pinned, so a later run may pick up different releases — consider pinning.
!pip install transformers datasets torch scikit-learn nltk wandb



In [None]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords
import wandb

# Download NLTK stopwords
# NOTE(review): `stopwords` is imported above but not referenced in the cells visible here — confirm it is still needed.
nltk.download('stopwords')

# Log in to W&B (only needs to be done once on your system)
# When no key is stored this prompts interactively for an API key, so the cell blocks until one is entered.
wandb.login()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
#preprocessing
# Read data
data = pd.read_csv('/content/Restaurant reviews.csv')
# Keep only the columns that are not listed here (the head() output shows Review and Rating).
data = data.drop(columns=["Restaurant", "Reviewer", "Metadata", "Time", "Pictures", "7514"])

# Display data before preprocessing
print("Data Before Preprocessing:")
print(data.head())

# Data cleaning
# Built as a single chain that ends in .copy(): `df` is then an independent
# frame, so the later column assignments (df['Review'], df['label']) operate
# on real data instead of a filtered view and do not raise pandas'
# SettingWithCopyWarning.
df = (
    data.dropna()
    .drop_duplicates()
    # Ratings that cannot be parsed as numbers become NaN ...
    .assign(Rating=lambda frame: pd.to_numeric(frame['Rating'], errors='coerce'))
    # ... and those rows are removed here.
    .dropna(subset=['Rating'])
    .copy()
)

# Text preprocessing for BERT
def clean_text(review):
    """Normalise a review to lowercase letters separated by single spaces.

    Apostrophes are dropped so contractions collapse into one token
    ("don't" -> "dont"). Every other run of non-letter characters
    (punctuation, digits, newlines, repeated spaces) is replaced by a single
    space, so words that were separated only by punctuation stay separate
    ("good.we" -> "good we") instead of being fused together.

    Args:
        review: Raw review text.

    Returns:
        The cleaned, lowercased text with no leading or trailing whitespace.
    """
    # Remove apostrophes first so a contraction is not split into two words.
    without_apostrophes = re.sub(r"['\u2019]", '', review)
    # Replace (rather than delete) the remaining non-letters to keep word boundaries.
    letters_only = re.sub(r'[^a-zA-Z]+', ' ', without_apostrophes)
    return letters_only.strip().lower()

# Overwrite the Review column with its cleaned form (one clean_text call per row).
df['Review'] = df['Review'].apply(clean_text)

# Convert ratings to sentiment labels
def label_sentiment(rating):
    """Map a numeric rating to a three-way sentiment class id.

    Ratings below 3 are negative, exactly 3 is neutral, anything else is
    positive. Comparing against the midpoint (instead of `<= 2`) keeps
    fractional ratings such as 2.5 out of the positive class; whole-number
    ratings map exactly as before.

    Args:
        rating: Numeric rating (int or float).

    Returns:
        0 for negative, 1 for neutral, 2 for positive.
    """
    if rating < 3:
        return 0  # Negative
    if rating == 3:
        return 1  # Neutral
    return 2  # Positive

# Derive the integer class id (0/1/2) for every row from its rating.
df['label'] = df['Rating'].apply(label_sentiment)

# Display data after preprocessing
print("\nData After Preprocessing:")
print(df.head())

# Split data into training and testing sets
# 80/20 split with a fixed seed so the partition is reproducible. The held-out
# 20% is used as the validation set during training.
# NOTE(review): the split is not stratified by label — confirm class balance is acceptable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Review'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)



Data Before Preprocessing:
                                              Review Rating
0  The ambience was good, food was quite good . h...      5
1  Ambience is too good for a pleasant evening. S...      5
2  A must try.. great food great ambience. Thnx f...      5
3  Soumen das and Arun was a great guy. Only beca...      5
4  Food is good.we ordered Kodi drumsticks and ba...      5

Data After Preprocessing:
                                              Review  Rating  label
0  the ambience was good food was quite good  had...     5.0      2
1  ambience is too good for a pleasant evening se...     5.0      2
2  a must try great food great ambience thnx for ...     5.0      2
3  soumen das and arun was a great guy only becau...     5.0      2
4  food is goodwe ordered kodi drumsticks and bas...     5.0      2


In [None]:
# Load BERT tokenizer
# Uses the same 'bert-base-uncased' checkpoint name as the model loaded later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(texts, max_length=128):
    """Tokenize a list of texts into fixed-length PyTorch tensors.

    Every sequence is truncated or padded to exactly `max_length` tokens, so
    the returned tensors all share the same second dimension.

    Args:
        texts: A string or list of strings to encode.
        max_length: Target sequence length in tokens. Defaults to 128, the
            value that was previously hard-coded.

    Returns:
        The tokenizer's encoding object holding PyTorch tensors.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode both splits up front; each call tokenizes the whole list in one go.
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Convert to torch Dataset
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-computed encodings with labels.

    `encodings` is a mapping whose values are indexable by example position;
    `labels` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry out of every encoding field ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # ... and attach the label as a tensor under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them per example.
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)




In [None]:
# Load pre-trained BERT model for sequence classification
# num_labels=3 matches the three classes produced by label_sentiment (0, 1, 2).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores whose last axis is the class dimension).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    # The highest-scoring class along the last axis is the predicted label.
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning whenever some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments
# Checkpoints are written once per epoch (matching the evaluation cadence) and
# the checkpoint with the lowest validation loss is reloaded when training
# ends. Without this, the model left in memory — and later saved — is always
# the last-epoch one, even when validation loss has been rising.
training_args = TrainingArguments(
    output_dir='./results',
    run_name='bert-sentiment-analysis',  # Set a custom name for the W&B run
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for best-model reload
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower validation loss is better
    report_to="wandb"  # Enable W&B logging
)

# Trainer setup
# Ties together the model, the training arguments, both datasets and the
# metric function; val_dataset is the one used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#train
# Fine-tunes the model using the settings in training_args.
# NOTE(review): in the recorded run, validation loss rises every epoch
# (0.41 -> 0.47 -> 0.64) while accuracy stays near 0.85 — looks like overfitting.

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkholoud[0m ([33mkholoudt[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2297,0.405263,0.849548,0.838582,0.832695,0.849548
2,0.1598,0.469984,0.850611,0.839969,0.834433,0.850611
3,0.2031,0.642371,0.847953,0.847893,0.84785,0.847953


TrainOutput(global_step=2823, training_loss=0.37716296969329344, metrics={'train_runtime': 653.1081, 'train_samples_per_second': 34.561, 'train_steps_per_second': 4.322, 'total_flos': 1484749016239104.0, 'train_loss': 0.37716296969329344, 'epoch': 3.0})

In [None]:
#evaluate
# Runs one evaluation pass on the Trainer's eval dataset and returns the metric dict.

trainer.evaluate()

{'eval_loss': 0.6423711180686951,
 'eval_accuracy': 0.847953216374269,
 'eval_f1': 0.8478930763134188,
 'eval_precision': 0.8478499035979278,
 'eval_recall': 0.847953216374269,
 'eval_runtime': 12.8466,
 'eval_samples_per_second': 146.42,
 'eval_steps_per_second': 18.371,
 'epoch': 3.0}

In [None]:
# Save the fine-tuned model and tokenizer
# Both are written to the same directory (relative to the current working
# directory) so they can be reloaded together with from_pretrained.

model_name = "anlp-sentiment-analysis-bert"
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

print(f"Model and tokenizer saved to '{model_name}' directory.")

Model and tokenizer saved to 'anlp-sentiment-analysis-bert' directory.


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the fine-tuned model and tokenizer from the directory written by the save cell.
model = BertForSequenceClassification.from_pretrained('anlp-sentiment-analysis-bert')
tokenizer = BertTokenizer.from_pretrained('anlp-sentiment-analysis-bert')

# Save the model and tokenizer
# NOTE(review): './anlp-sentiment-analysis-bert' is the same relative directory
# that was just loaded, so this rewrites the files in place — confirm it is needed.
model.save_pretrained('./anlp-sentiment-analysis-bert')
tokenizer.save_pretrained('./anlp-sentiment-analysis-bert')

('./anlp-sentiment-analysis-bert/tokenizer_config.json',
 './anlp-sentiment-analysis-bert/special_tokens_map.json',
 './anlp-sentiment-analysis-bert/vocab.txt',
 './anlp-sentiment-analysis-bert/added_tokens.json')

In [None]:
# Mount Google Drive at /content/drive (Colab-only; `google.colab` is not importable elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved model directory into the mounted Drive so it outlives the runtime.
!cp -r anlp-sentiment-analysis-bert /content/drive/MyDrive/

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the model and tokenizer from your saved directory
# This rebinds the notebook-level `model` and `tokenizer` used by the prediction cells below.
model_name = "anlp-sentiment-analysis-bert"
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Define a mapping from class numbers to sentiment labels
# Keys follow the ids assigned by label_sentiment: 0 negative, 1 neutral, 2 positive.
sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}

In [None]:
# Example sentence
# NOTE(review): training reviews were passed through clean_text; this sentence is fed to the tokenizer as-is.
sentence = "the soup was tastless"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Perform inference
# eval() disables dropout and no_grad() skips autograd bookkeeping, so the
# prediction is deterministic and uses less memory.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted class (e.g., 0 = negative, 1 = neutral, 2 = positive)
logits = outputs.logits
# Take the argmax over the class axis explicitly; the batch holds one sentence.
predicted_class = logits.argmax(dim=-1).item()

# Convert the class number to a label
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: negative


In [None]:
# Example sentence
# NOTE(review): training reviews were passed through clean_text; this sentence is fed to the tokenizer as-is.
sentence = "i totally recommend the olive oil pasta"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Perform inference
# eval() disables dropout and no_grad() skips autograd bookkeeping, so the
# prediction is deterministic and uses less memory.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted class (e.g., 0 = negative, 1 = neutral, 2 = positive)
logits = outputs.logits
# Take the argmax over the class axis explicitly; the batch holds one sentence.
predicted_class = logits.argmax(dim=-1).item()

# Convert the class number to a label
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: positive


In [None]:
# Example sentence
# NOTE(review): training reviews were passed through clean_text; this sentence is fed to the tokenizer as-is.
sentence = "the greek yogurt dip was okay but not as expected"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Perform inference
# eval() disables dropout and no_grad() skips autograd bookkeeping, so the
# prediction is deterministic and uses less memory.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted class (e.g., 0 = negative, 1 = neutral, 2 = positive)
logits = outputs.logits
# Take the argmax over the class axis explicitly; the batch holds one sentence.
predicted_class = logits.argmax(dim=-1).item()

# Convert the class number to a label
predicted_sentiment = sentiment_labels[predicted_class]

print(f"Predicted sentiment: {predicted_sentiment}")


Predicted sentiment: neutral
