In [None]:
from google.colab import drive

# mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import json
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader

 Preprocessing Yelp JSON data

In [None]:
json_file = open('/content/drive/MyDrive/yelp_academic_dataset_review.json', 'r', encoding='utf-8')
csv_file = "/content/drive/MyDrive/pre_processed_reviews.csv"

# preprocessing JSON data and saving to CSV
def pre_process_json():
    object_count = 0
    for line in json_file:
        try:
            if object_count == 10000:
                break

            data = json.loads(line)

            stars = int(data['stars'])
            useful = int(data['useful'])
            funny = int(data['funny'])
            cool = int(data['cool'])
            text = str(data['text'])

            neutral = funny + cool

            if useful == 0 or funny == 0 or cool == 0:
                continue
            else:
                sentiment_list = [useful, neutral]
                sentiment = sentiment_list.index(max(sentiment_list))
                if stars >= 3 and sentiment == 0:
                    reaction = 'satisfied considerable comment'
                elif stars >= 3 and sentiment == 1:
                    reaction = 'satisfied neutral comment'
                elif stars < 3 and sentiment == 0:
                    reaction = 'unsatisfied considerable comment'
                elif stars < 3 and sentiment == 1:
                    reaction = 'unsatisfied neutral comment'

            data = {'text': [text], 'Sentiment': [reaction]}
            df = pd.DataFrame(data)

            if object_count == 0:
                df.to_csv(csv_file, index=False)
            else:
                df.to_csv(csv_file, mode='a', header=False, index=False)

            object_count += 1

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

pre_process_json()
json_file.close()

processed_data = pd.read_csv(csv_file)
print(processed_data.head())

                                                text  \
0  I am a long term frequent customer of this est...   
1  HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...   
2  I thoroughly enjoyed the show.  Chill way to s...   
3  On a scale of one to things that are awesome, ...   
4  I've only had the cannolis here but they are a...   

                        Sentiment  
0     unsatisfied neutral comment  
1       satisfied neutral comment  
2       satisfied neutral comment  
3  satisfied considerable comment  
4       satisfied neutral comment  


Map sentiment labels to integers

In [None]:
sentiment_mapping = {
    "satisfied considerable comment": 0,
    "satisfied neutral comment": 1,
    "unsatisfied considerable comment": 2,
    "unsatisfied neutral comment": 3
}
processed_data['label'] = processed_data['Sentiment'].map(sentiment_mapping)

In [None]:
label_counts = processed_data['label'].value_counts()

sentiment_names = {v: k for k, v in sentiment_mapping.items()}
label_counts.index = label_counts.index.map(sentiment_names)

print("Sentiment Balance:")
print(label_counts)

Sentiment Balance:
label
satisfied neutral comment           5291
satisfied considerable comment      2909
unsatisfied considerable comment     921
unsatisfied neutral comment          879
Name: count, dtype: int64


In [None]:
from sklearn.utils import resample

df_majority_1 = processed_data[processed_data['label'] == 0]
df_majority_2 = processed_data[processed_data['label'] == 1]
df_minority_1 = processed_data[processed_data['label'] == 2]
df_minority_2 = processed_data[processed_data['label'] == 3]

# Oversample the minority classes
df_minority_1_oversampled = resample(df_minority_1,
                                     replace=True,      
                                     n_samples=2909,    
                                     random_state=42)   

df_minority_2_oversampled = resample(df_minority_2,
                                     replace=True,
                                     n_samples=2909,
                                     random_state=42)


processed_data_balanced = pd.concat([df_majority_1, df_majority_2, df_minority_1_oversampled, df_minority_2_oversampled])


processed_data = processed_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)


print("Balanced Sentiment Counts:")
print(processed_data['label'].value_counts())

Balanced Sentiment Counts:
label
1    5291
0    2909
3    2909
2    2909
Name: count, dtype: int64


Split into train and validation sets

In [None]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    processed_data['text'], processed_data['label'], test_size=0.3, random_state=42, stratify=processed_data['label']
)

In [None]:
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

Tokenize using DeBERTa tokenizer

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)


Prepare DataLoader

In [None]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
train_dataset = SentimentDataset(train_encodings, train_labels.to_list())
val_dataset = SentimentDataset(val_encodings, val_labels.to_list())
test_dataset = SentimentDataset(test_encodings, test_labels.to_list())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

Fine-Tuning DistilBert

In [None]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_training_steps = len(train_loader) * 10  # 10 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [None]:
early_stop_counter = 0
best_val_loss = float('inf')

for epoch in range(10):  
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)

    # validation
    model.eval()
    val_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = accuracy_score(all_labels, all_predictions)

    print(f"Epoch {epoch + 1}:")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")

Epoch 1:
  Training Loss: 0.9855
  Validation Loss: 0.8917
  Validation Accuracy: 0.5583
Epoch 2:
  Training Loss: 0.7031
  Validation Loss: 0.7567
  Validation Accuracy: 0.6890
Epoch 3:
  Training Loss: 0.4902
  Validation Loss: 0.6840
  Validation Accuracy: 0.7266
Epoch 4:
  Training Loss: 0.3874
  Validation Loss: 0.6939
  Validation Accuracy: 0.7033
Epoch 5:
  Training Loss: 0.2951
  Validation Loss: 0.7839
  Validation Accuracy: 0.7261
Epoch 6:
  Training Loss: 0.1931
  Validation Loss: 0.8539
  Validation Accuracy: 0.7142
Epoch 7:
  Training Loss: 0.1052
  Validation Loss: 1.0533
  Validation Accuracy: 0.7190
Epoch 8:
  Training Loss: 0.0596
  Validation Loss: 1.1337
  Validation Accuracy: 0.7109
Epoch 9:
  Training Loss: 0.0344
  Validation Loss: 1.2455
  Validation Accuracy: 0.7061
Epoch 10:
  Training Loss: 0.0272
  Validation Loss: 1.2622
  Validation Accuracy: 0.7047


Evaluation

In [None]:
model.eval()
all_test_predictions = []
all_test_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        all_test_predictions.extend(predictions.cpu().numpy())
        all_test_labels.extend(batch["labels"].cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_predictions)
test_classification_report = classification_report(all_test_labels, all_test_predictions, target_names=list(sentiment_mapping.keys()))

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:")
print(test_classification_report)

Test Accuracy: 0.7152
Test Classification Report:
                                  precision    recall  f1-score   support

  satisfied considerable comment       0.41      0.39      0.40       437
       satisfied neutral comment       0.68      0.65      0.67       793
unsatisfied considerable comment       0.90      0.93      0.91       437
     unsatisfied neutral comment       0.86      0.94      0.90       436

                        accuracy                           0.72      2103
                       macro avg       0.71      0.73      0.72      2103
                    weighted avg       0.71      0.72      0.71      2103

