In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from a path under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): unpinned install into a live runtime. The "partially initialized
# module 'torch'" error shown in later cell outputs may stem from this — TODO confirm.
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import numpy as np


# Seed the NumPy and PyTorch random number generators with one fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Print GPU status, then select CUDA when it is available and fall back to CPU otherwise.
!nvidia-smi
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)

In [None]:
import pandas as pd
# Column names are supplied explicitly through `names=`.
# NOTE(review): presumably the CSV has no header row — confirm against the file.
column_names = ['ID', 'Name','Label', 'Content']
df=pd.read_csv('/content/drive/MyDrive/twitter_training.csv',names=column_names)
df.tail()

Unnamed: 0,ID,Name,Label,Content
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [None]:
def sentiment_to_num(label):
  """Encode a sentiment label as an integer class id.

  "Negative" maps to 0 and 'Positive' maps to 1. Every other value,
  including 'Neutral', maps to 2.
  """
  for name, class_id in (("Negative", 0), ('Positive', 1)):
    if label == name:
      return class_id
  # 'Neutral' and any unrecognised label share class 2.
  return 2

# Overwrite the string labels with their integer codes.
# NOTE(review): this mutates df in place; re-running the cell on already-encoded
# labels sends every value through the fallback branch and maps it to 2.
df['Label'] = df.Label.apply(sentiment_to_num)
df.tail()

Unnamed: 0,ID,Name,Label,Content
74677,9200,Nvidia,1,Just realized that the Windows partition of my...
74678,9200,Nvidia,1,Just realized that my Mac window partition is ...
74679,9200,Nvidia,1,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,1,Just realized between the windows partition of...
74681,9200,Nvidia,1,Just like the windows partition of my Mac is l...


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Display names indexed by the integer codes from sentiment_to_num
# (0 -> negative, 1 -> positive, 2 -> neutral).
class_names = ['negative', 'positive', 'neutral']
# Class-balance plot, currently disabled:
#ax = sns.countplot(df.Label)
#plt.xlabel('Content Sentiment')
#ax.set_xticklabels(class_names)

In [None]:
from nltk.corpus import stopwords
import nltk
import re

# Fetch the NLTK stop-word corpus that clean_text reads via stopwords.words('english').
nltk.download('stopwords')

# Cache for stop-word sets, so the corpus file is read once rather than once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise one piece of text for tokenisation.

    For ``str`` input, the steps are applied in this order:
      1. remove every run of digits;
      2. remove every character that is neither a word character nor
         whitespace (underscores survive because ``\\w`` matches them);
      3. lowercase, split on whitespace and drop English stop words;
      4. re-join the remaining tokens with single spaces.

    Args:
        text: The value to clean. Non-string values (e.g. NaN from pandas)
            are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    # Looked up once per call from the cache instead of re-reading the corpus.
    stop_words = _english_stop_words()

    without_digits = re.sub(r'\d+', '', text)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    tokens = without_punct.lower().split()
    return ' '.join(word for word in tokens if word not in stop_words)
# Clean every tweet in place; the original text in df["Content"] is overwritten.
df["Content"] = df["Content"].apply(clean_text)
df.tail()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,ID,Name,Label,Content
74677,9200,Nvidia,1,realized windows partition mac like years behi...
74678,9200,Nvidia,1,realized mac window partition years behind nvi...
74679,9200,Nvidia,1,realized windows partition mac years behind nv...
74680,9200,Nvidia,1,realized windows partition mac like years behi...
74681,9200,Nvidia,1,like windows partition mac like years behind d...


Plotting token lengths for choosing BERT sequence length.

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Token count per tweet, capped at BERT's 512-token limit. truncation=True makes
# the cap explicit instead of relying on the tokenizer's implicit fallback.
# str() turns non-string cells (e.g. NaN) into text so encode() accepts them.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in df.Content
]

# histplot replaces the deprecated distplot; kde + density keeps the same kind of plot.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlim([0, 256]);
plt.xlabel('Token count');



KeyboardInterrupt: 

We'll set the maximum sequence length (`MAX_LEN`) to 100 tokens, just to be safe.

In [None]:
# Token length handed to the tokenizer as max_length when examples are encoded.
MAX_LEN = 100

In [None]:
from torch.utils.data import Dataset

class XDataset(Dataset):
  """Map-style dataset that tokenises one text per item.

  Args:
    content: Indexable collection of texts (indexed positionally).
    targets: Indexable collection of integer class ids, aligned with content.
    tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, content, targets, tokenizer, max_len):
    self.content = content
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts."""
    return len(self.content)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its target for one index.

    Returns:
      dict with keys 'content' (str), 'input_ids' and 'attention_mask'
      (flattened tensors from the tokenizer) and 'targets' (long tensor).
    """
    # str() guards against non-string cells such as NaN.
    content = str(self.content[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      content,
      add_special_tokens=True,
      max_length=self.max_len,
      # Explicit padding/truncation replaces the deprecated pad_to_max_length
      # flag and the tokenizer's implicit truncation fallback.
      padding='max_length',
      truncation=True,
      return_token_type_ids=False,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'content': content,
      # return_tensors='pt' adds a leading batch dim; flatten() drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows, then split that hold-out in half into validation and test.
# NOTE(review): the split is row-level and random; the rows displayed earlier share
# an ID and look like variants of one tweet, so near-duplicates may land on both
# sides of the split — TODO confirm whether a grouped split is needed.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape


((67213, 4), (3734, 4), (3735, 4))

In [None]:
from torch.utils.data import DataLoader

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame's Content/Label columns in a batched DataLoader.

  Args:
    df: DataFrame with a ``Content`` (text) and a ``Label`` (class id) column.
    tokenizer: Tokenizer forwarded to XDataset.
    max_len: Sequence length forwarded to XDataset.
    batch_size: Number of examples per batch.
    shuffle: Reshuffle the examples every epoch when True. Defaults to False,
      which keeps the original fixed iteration order.
    num_workers: Number of worker processes used for loading. Defaults to 4,
      the value that used to be hard-coded; pass 0 to load in the main process.

  Returns:
    A torch DataLoader over an XDataset built from ``df``.
  """
  ds = XDataset(
    # to_numpy() drops the pandas index so items are addressed positionally.
    content=df.Content.to_numpy(),
    targets=df.Label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Number of examples per batch, shared by all three loaders.
BATCH_SIZE = 16

# NOTE(review): no shuffling is requested here, so the training loader iterates
# in the same order every epoch.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Pull a single batch from the training loader to inspect the keys it provides.
data = next(iter(train_data_loader))
data.keys()

  self.pid = os.fork()


dict_keys(['content', 'input_ids', 'attention_mask', 'targets'])

In [None]:
from torch import nn, optim
from transformers import BertModel


class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: Number of output classes (size of the logits vector).
    pretrained_model_name: Checkpoint name passed to BertModel.from_pretrained.
      Defaults to 'bert-base-cased', the value that used to be hard-coded.
    dropout: Dropout probability applied to the pooled output. Defaults to 0.3.
  """

  def __init__(self, n_classes, pretrained_model_name='bert-base-cased', dropout=0.3):
    super().__init__()
    self.bert = BertModel.from_pretrained(pretrained_model_name)
    self.drop = nn.Dropout(p=dropout)
    # The head's input width follows the encoder's hidden size.
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the classification logits for a batch of encoded inputs."""
    # return_dict=False yields a tuple; only its second element (the pooled
    # output) is used.
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)

AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)

In [None]:


# One output logit per entry in class_names; move the model to the selected device.
model = SentimentClassifier(len(class_names))
model = model.to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


# Number of full passes over the training set.
EPOCHS = 10

# PyTorch's AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the old transformers default (PyTorch defaults to 0.01).
# NOTE(review): torch.optim.AdamW has no `correct_bias` switch and always applies
# bias correction, so early-step updates differ slightly from correct_bias=False.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
# The scheduler is stepped once per batch, so it spans batches * epochs steps.
total_steps = len(train_data_loader) * EPOCHS

# Linear decay of the learning rate to zero over the whole run, with no warm-up.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)



In [None]:
from tqdm import tqdm  # Import tqdm for progress bars


def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train the model for one pass over ``data_loader``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches with 'input_ids', 'attention_mask' and
      'targets' tensors.
    loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
    optimizer: Optimizer stepped once per batch.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler stepped once per batch.
    n_examples: Denominator for the accuracy (number of examples in the loader).

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy is a double tensor equal to
    correct predictions / n_examples; mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  model = model.train()

  losses = []
  # A tensor accumulator keeps .double() valid even if the loader is empty.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in tqdm(data_loader, desc='Data Loader'):  # tqdm for data loader progress
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    # Clear gradients before the forward/backward pass so that leftovers
    # (e.g. from a run interrupted after backward()) never leak into this step.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    # Cap the gradient norm at 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss



In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate the model over ``data_loader`` without updating weights.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches with 'input_ids', 'attention_mask' and
      'targets' tensors.
    loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
    device: Device the batch tensors are moved to.
    n_examples: Denominator for the accuracy (number of examples in the loader).

  Returns:
    Tuple ``(accuracy, mean_loss)``: accuracy is a double tensor equal to
    correct predictions / n_examples; mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  # eval() switches off dropout; the model stays in eval mode afterwards.
  model = model.eval()

  losses = []
  # A tensor accumulator keeps .double() valid even if the loader is empty.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      # The index of the largest logit is the predicted class.
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss


In [None]:
from collections import defaultdict
from google.colab import files




# Per-epoch metrics, stored under 'train_acc', 'train_loss', 'val_acc' and 'val_loss'.
history = defaultdict(list)
# Best validation accuracy seen so far; decides when a checkpoint is written.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One full pass over the training loader (steps the optimizer and the scheduler).
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the current weights on the validation loader.
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Save a checkpoint only when validation accuracy strictly improves.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc
# Runs once, after every epoch has finished.
# NOTE(review): if the loop is interrupted, this download call is never reached.
files.download('best_model_state.bin')

Epoch 1/10
----------


  self.pid = os.fork()
Data Loader: 100%|██████████| 4201/4201 [18:33<00:00,  3.77it/s]

Train loss 0.645034938910566 accuracy 0.7240266019966376





Val   loss 0.36538697491025823 accuracy 0.8682378146759507

Epoch 2/10
----------


Data Loader: 100%|██████████| 4201/4201 [18:48<00:00,  3.72it/s]

Train loss 0.2748992471262157 accuracy 0.900450805647717





Val   loss 0.34508382386237413 accuracy 0.8982324584895555

Epoch 3/10
----------


Data Loader: 100%|██████████| 4201/4201 [18:48<00:00,  3.72it/s]

Train loss 0.181182580909123 accuracy 0.9391635546694835





Val   loss 0.36137328956661247 accuracy 0.9100160685591858

Epoch 4/10
----------


Data Loader: 100%|██████████| 4201/4201 [18:48<00:00,  3.72it/s]

Train loss 0.14022906672994662 accuracy 0.9528960171395414





Val   loss 0.3368534200848926 accuracy 0.9204606320299946

Epoch 5/10
----------


Data Loader: 100%|██████████| 4201/4201 [18:48<00:00,  3.72it/s]

Train loss 0.11236216463916406 accuracy 0.9611087140880484





Val   loss 0.38447952927135004 accuracy 0.9204606320299946

Epoch 6/10
----------


Data Loader: 100%|██████████| 4201/4201 [18:47<00:00,  3.72it/s]

Train loss 0.09380052407999985 accuracy 0.9649026230044783





Val   loss 0.37013748083274206 accuracy 0.9258168184252812

Epoch 7/10
----------


Data Loader:  28%|██▊       | 1169/4201 [05:14<13:35,  3.72it/s]

In [None]:
# Rebuild the architecture, then restore the weights from the saved checkpoint.
model = SentimentClassifier(len(class_names))
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

NameError: name 'SentimentClassifier' is not defined

In [None]:
# Accuracy on the test loader; the returned loss is discarded.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Unwrap the accuracy tensor into a plain Python number for display.
test_acc.item()



0.9092369477911647

In [None]:
# Example sentence that is encoded and classified in the following cells.
sample_text = "The food was not good"


In [None]:
# Encode the sample with the same max_length setup used for the dataset items.
encoded_sample = tokenizer.encode_plus(
  sample_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  # Explicit padding/truncation replaces the deprecated pad_to_max_length flag.
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

In [None]:
input_ids = encoded_sample['input_ids'].to(device)
attention_mask = encoded_sample['attention_mask'].to(device)

# Inference only: switch off dropout and skip autograd bookkeeping, so the
# prediction does not depend on whether an evaluation cell ran first.
model.eval()
with torch.no_grad():
  output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Sample text: {sample_text}')
# .item() converts the one-element tensor into a plain int list index.
print(f'Sentiment  : {class_names[prediction.item()]}')

Sample text: The food was not good
Sentiment  : negative
