In [1]:
!pip3 install --quiet torch
!pip3 install --quiet -q transformers datasets
!pip3 install --quiet --upgrade scikit-learn==1.0.2
!pip3 install --quiet matplotlib
!pip3 install --quiet accelerate -U
!pip install --quiet datasets -q
!pip install --quiet wordcloud -q
!pip install --quiet sentence-transformers -q
!pip install --quiet nltk 

In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import  matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from wordcloud import WordCloud
from datasets import load_dataset
from wordcloud import WordCloud


[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Select the compute device. The notebook refuses to continue without CUDA:
# on a CPU-only machine `device` is still set, then the cell aborts.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    raise SystemError('GPU device not found')

In [4]:
# Load the Yelp review dataset and keep its train / test splits.
dataset = load_dataset("yelp_review_full")
train_dataset = dataset['train']
test_dataset = dataset['test']
del dataset

# Read each column in a single columnar access. Indexing the dataset row by
# row (`train_dataset[i]['text']`) decodes a whole record per access and walked
# every split twice; column access returns the same values in the same order.
train_text = list(train_dataset['text'])
train_label = list(train_dataset['label'])

test_text = list(test_dataset['text'])
test_label = list(test_dataset['label'])

# Only the plain Python lists are used from here on.
del train_dataset
del test_dataset

Removing emoticons from the text:

In [5]:
# Pattern for emoticons (e.g. ":)" or ":smile:") that are directly followed by
# whitespace, "!", ".", "?" or the end of the text.
# NOTE(review): this literal is not a raw string, so Python collapses every
# `\\` into a single backslash before `re` sees it. In `[\/\\]?3|[...` that
# leaves `\]`, which appears to escape the closing bracket and fuse two
# character classes. Confirm the intended pattern before switching to a raw
# string -- doing so changes what gets matched.
emoticon_regex = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'

# Compile once and reuse the bound `sub` for both splits.
_emoticon_sub = re.compile(emoticon_regex).sub
train_text_noemot = [_emoticon_sub('', review) for review in tqdm(train_text)]
test_text_noemot = [_emoticon_sub('', review) for review in tqdm(test_text)]

100%|██████████| 650000/650000 [00:12<00:00, 51752.46it/s]
100%|██████████| 50000/50000 [00:00<00:00, 53299.96it/s]


Model training:

In [6]:
# Tokenizer and classifier are both taken from the same checkpoint.
model_name = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output class per distinct label found in the training data; the model is
# moved to the selected device as soon as it is created.
bert = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(set(train_label)),
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def randomSampling(X, Y, p):
    """Draw a class-stratified random subsample of a labelled dataset.

    For every distinct label in ``Y``, ``int(count * p)`` of the examples
    carrying that label are picked at random (i.e. the fraction ``p``, rounded
    down). The picks of all classes are then returned in one shuffled order.

    Randomness comes from NumPy's global generator, so call ``np.random.seed``
    beforehand if a reproducible draw is needed.

    Args:
        X: Sequence of samples, aligned with ``Y``.
        Y: Sequence of class labels, same length as ``X``.
        p: Fraction of each class to keep.

    Returns:
        ``(sampled_X, sampled_Y)`` as two aligned lists. Empty input yields
        ``([], [])``. If any class would contribute no sample, a message is
        printed and ``-1`` is returned instead of a tuple.
    """
    # Convert the labels once, not once per class inside the loop.
    labels = np.asarray(Y)

    picked = []
    for label in np.unique(labels):
        # Positions of the current class, in random order.
        indices = np.where(labels == label)[0]
        np.random.shuffle(indices)
        num_samples = int(len(indices) * p)
        if num_samples < 1:
            print(f"Not Enough samples for class {label}")
            return -1
        picked.append(indices[:num_samples])

    # Nothing to sample from (empty input).
    if not picked:
        return [], []

    # Shuffling the selected positions mixes the classes while keeping every
    # sample paired with its own label.
    order = np.concatenate(picked)
    np.random.shuffle(order)

    # Samples are gathered by plain indexing. Turning a list of texts into a
    # NumPy array would allocate a fixed-width block sized by the longest
    # string and would coerce or reject non-uniform samples.
    if isinstance(X, np.ndarray):
        sampled_X = X[order].tolist()
    else:
        sampled_X = [X[i] for i in order.tolist()]
    sampled_Y = labels[order].tolist()

    return sampled_X, sampled_Y

Preparing data for training:

In [8]:
# Hold out 30% of the cleaned training reviews for validation.
X_train, X_val, y_train, y_val = train_test_split(train_text_noemot, train_label, test_size=0.3, random_state=42)

# randomSampling draws from NumPy's global RNG. Seed it so the subsamples, and
# every metric computed from them, are the same on each re-run of the notebook.
np.random.seed(42)

# Keep a class-stratified 10% of the training and validation splits.
X_train, y_train = randomSampling(X_train, y_train, 0.1)
X_val, y_val = randomSampling(X_val, y_val, 0.1)

train_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(X_train, y_train)]
validation_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(X_val, y_val)]
# The test split is used in full.
test_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(test_text_noemot, test_label)]

# Convert to the Hugging Face dataset API.
train_data = Dataset.from_list(train_data)
validation_data = Dataset.from_list(validation_data)
test_data = Dataset.from_list(test_data)

# Bundle the three splits under the names used by the later cells.
data = DatasetDict({
    'train': train_data,
    'validation': validation_data,
    'test': test_data,
})

In [9]:
# (rows, columns) of each split, in train / validation / test order.
tuple(data[split].shape for split in ('train', 'validation', 'test'))

((45499, 2), (19497, 2), (50000, 2))

Convert to Tokens:

In [10]:
def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch passed in by ``Dataset.map(batched=True)``; its
            ``"text"`` entry is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated and padded so that every
        row has the same length.
    """
    # padding=True pads only to the longest sequence of the current map batch,
    # so rows from different batches can end up with different lengths and
    # then fail to stack when the Trainer's default collator builds a training
    # batch. "max_length" pads every row to the model's maximum input length.
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenizer to all three splits.
tokenized_data = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/45499 [00:00<?, ? examples/s]

Map:   0%|          | 0/19497 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [12]:
# Training configuration; "Bert_V1_Training" is the output directory.
# fp16 is enabled for both training and evaluation.
# `half_precision_backend` and `fp16_opt_level` are string-valued options
# (e.g. "auto", "O1"); they were previously passed `True`, which is not a valid
# value for either. They are left out so the library defaults apply.
training_args = TrainingArguments(
    "Bert_V1_Training",
    #per_device_train_batch_size=16,
    fp16=True,
    fp16_full_eval=True,
    do_eval=True
)

In [13]:
# Wire the model and training configuration to the tokenized splits.
# The validation split is what `trainer.evaluate()` scores by default.
trainer = Trainer(
    bert,
    training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
)

In [14]:
# Fine-tune the model on the training split; the returned TrainOutput is shown
# as the cell result.
trainer.train()

Step,Training Loss
500,1.2198
1000,1.0411
1500,0.9915
2000,0.9735
2500,0.949
3000,0.8809
3500,0.7978
4000,0.8094
4500,0.7868
5000,0.7888


TrainOutput(global_step=8532, training_loss=0.8025369431790309, metrics={'train_runtime': 4902.5603, 'train_samples_per_second': 27.842, 'train_steps_per_second': 1.74, 'total_flos': 3.591483709190861e+16, 'train_loss': 0.8025369431790309, 'epoch': 3.0})

In [15]:
# Score the model on the eval_dataset given to the Trainer (the validation
# split). No compute_metrics function was supplied, so the result holds the
# loss and timing figures only.
trainer.evaluate()

{'eval_loss': 0.9240946769714355,
 'eval_runtime': 202.7658,
 'eval_samples_per_second': 96.155,
 'eval_steps_per_second': 12.024,
 'epoch': 3.0}

In [18]:
# Run the fine-tuned model over the full test split.
preds = trainer.predict(tokenized_data['test'])

# The predicted class is the position of the largest logit in each row.
y_pred = torch.tensor(preds.predictions).argmax(dim=1).numpy()

# Per-class precision / recall / F1 against the true test labels.
test_report = classification_report(tokenized_data['test']['label'], y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.76      0.75      0.76     10000
           1       0.58      0.59      0.59     10000
           2       0.57      0.58      0.57     10000
           3       0.55      0.56      0.56     10000
           4       0.73      0.71      0.72     10000

    accuracy                           0.64     50000
   macro avg       0.64      0.64      0.64     50000
weighted avg       0.64      0.64      0.64     50000

