In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting wandb>=0.10.32 (from simpletransformers)


In [12]:
import pandas as pd
import torch
import numpy as np

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('CUDA is available! Running on GPU.')
else:
    device = torch.device('cpu')
    print('CUDA is not available. Running on CPU.')

# Set a random seed for reproducibility
SEED = 2137
torch.manual_seed(SEED)
CLASSES = 2
# Clear any cached memory
torch.cuda.empty_cache()

# Check if CUDA is available
print('Is CUDA available?', torch.cuda.is_available())

CUDA is available! Running on GPU.
Is CUDA available? True


In [13]:
!nvidia-smi

Sat Apr 20 17:39:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [14]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [15]:
data = pd.read_csv("/content/drive/MyDrive/data_processed_pos_neg_130k.csv", header = 0)
data.head()


Unnamed: 0,text,rating
0,We went here with our kids for Xmas holiday an...,1
1,We have spent in this hotel our summer holiday...,1
2,I visited Hotel Baltic with my husband for som...,1
3,I've travelled quite a numbers of hotels but t...,1
4,We decided for this family holiday destination...,0


In [16]:
data['rating'].value_counts()

rating
1    65000
0    65000
Name: count, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
# Create empty DataFrames for train and validation sets
train_data = pd.DataFrame(columns=['text', 'rating'])
valid_data = pd.DataFrame(columns=['text', 'rating'])

# Split the data evenly
for class_label in range(CLASSES):
    # Extract examples of current class
    class_df = data[data['rating'] == class_label]

    # Split examples of current class into train and validation sets
    class_train, class_valid = train_test_split(class_df, test_size=0.08, random_state=42)

    # Append train examples to train_df
    train_data = pd.concat([train_data, class_train])

    # Append validation examples to valid_df
    valid_data = pd.concat([valid_data, class_valid])

  # Reset index for both DataFrames
train_data.reset_index(drop=True, inplace=True)
valid_data.reset_index(drop=True, inplace=True)

# Ensure balance across classes in train and validation sets
train_class_counts = train_data['rating'].value_counts()
valid_class_counts = valid_data['rating'].value_counts()

print("Train class counts: \n")
print(train_class_counts)

print("Validation class counts:")
print(valid_class_counts)

Train class counts: 

rating
0    59800
1    59800
Name: count, dtype: int64
Validation class counts:
rating
0    5200
1    5200
Name: count, dtype: int64


In [18]:
# shaffle the data

train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

valid_data = valid_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
print(train_data.head(10))
train_data['rating'].value_counts()

                                                text rating
0  We stayed here in July 2012 and we booked for ...      0
1  We stayed here three nights even though we had...      0
2  We really enjoyed this place. The staff were s...      1
3  The hotel is located directly on the main stri...      0
4  Food overpriced for standard delivered & prese...      0
5  From the moment of arrival until the drop off ...      1
6  My room was dirty and had an awful smell. Serv...      0
7  Awesome place to spend your holidays at,hospit...      1
8  I've always like the Hyatt House / Place feel,...      0
9  Stayed here last week for second time.nice sma...      0


rating
0    59800
1    59800
Name: count, dtype: int64

In [20]:
print(valid_data.head(10))
valid_data['rating'].value_counts()

                                                text rating
0  Lovely hotel, staff really friendly, rooftop r...      1
1  We really enjoyed staying at the Golden Banana...      1
2  We got upgraded to a luxury from a superior ro...      0
3  this hotel can offer you a great location espe...      0
4  This was by far the best little place my husba...      1
5  This hotel is easy on the wallet and is about ...      0
6  My husband and I spent 12 days (from January 7...      1
7  After many months of planning, my wife and I d...      1
8  Normally we don’t have negative opinions about...      0
9  Certainly would not stay here again. Our room ...      0


rating
1    5200
0    5200
Name: count, dtype: int64

In [21]:
from simpletransformers.classification import ClassificationModel
CLASSES = 2
# Create a ClassificationModel
model = ClassificationModel('roberta', 'roberta-base', num_labels=CLASSES, use_cuda = True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
model.train_model(train_data)

  self.pid = os.fork()


  0%|          | 0/239 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/14950 [00:00<?, ?it/s]

(14950, 0.46304665763242586)

In [23]:
predictions, raw_outputs = model.predict(["I would love to burn this place to the ground! The food was uneatable, please close this place.", "Best place to ever be in, 10 outta ten my man!"])
print(predictions, raw_outputs)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[0 1] [[ 1.83300781 -2.00585938]
 [-1.59082031  2.04101562]]


In [24]:
# Make predictions on the validation data
predictions, raw_outputs = model.predict(valid_data['text'].tolist())


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

In [25]:
# Evaluate the predictions
# Assuming 'labels' are the actual labels in your validation data
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print()

print(confusion_matrix(valid_data['rating'].tolist(), predictions))
print(classification_report(valid_data['rating'].tolist(), predictions))


[[4265  935]
 [ 962 4238]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      5200
           1       0.82      0.81      0.82      5200

    accuracy                           0.82     10400
   macro avg       0.82      0.82      0.82     10400
weighted avg       0.82      0.82      0.82     10400



In [34]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report



# Compute metrics
accuracy = accuracy_score(valid_data['rating'].tolist(), predictions)
precision = precision_score(valid_data['rating'].tolist(), predictions, average='weighted')
recall = recall_score(valid_data['rating'].tolist(), predictions, average='weighted')
f1 = f1_score(valid_data['rating'].tolist(), predictions, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

# Classification report
print(classification_report(valid_data['rating'].tolist(), predictions))

Accuracy: 0.8175961538461538
Precision: 0.8176047164881034
Recall: 0.8175961538461538
F1-score: 0.8175949244332462
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      5200
           1       0.82      0.81      0.82      5200

    accuracy                           0.82     10400
   macro avg       0.82      0.82      0.82     10400
weighted avg       0.82      0.82      0.82     10400



In [35]:
# SAVE MODEL

output_dir = "outputs/checkpoint-22500-epoch-1"
model.save_model(output_dir)

import os
import shutil

In [37]:
## EXPORT
output_dir = "outputs/checkpoint-14950-epoch-1"
save_dir = "model_roberta_positive_negative"

shutil.copytree(output_dir, "/content/drive/My Drive/" + save_dir)

'/content/drive/My Drive/model_roberta_positive_negative'

In [None]:
## IMPORT

# Copy the model directory from Google Drive to Colab

imported_dir = "imported_model_the_one"

shutil.copytree("/content/drive/My Drive/" + save_dir, imported_dir)

from simpletransformers.classification import ClassificationModel

# Load the model
model = ClassificationModel('roberta', imported_dir, use_cuda=True)

OSError: imported_model_the_one does not appear to have a file named config.json. Checkout 'https://huggingface.co/imported_model_the_one/main' for available files.