In [45]:
from transformers import ElectraTokenizer, ElectraModel
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.optim import AdamW
import transformers

In [46]:
import pandas as pd
import numpy as np

In [47]:
df =  pd.read_csv("data.csv")

In [48]:
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [49]:
RANDOM_SEED=1

In [50]:
class FakeNewsDataset(Dataset):
  def __init__(self, tokenizer, titles,content, labels):
    self.titles = titles
    self.content = content
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len =160

  def __len__(self):
    return len(self.titles)


  def __getitem__(self, id):
    title = str(self.titles[id])
    content = str(self.content[id])
    label=self.labels[id]
    encoding = self.tokenizer.encode_plus(
      title+content,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
      add_special_tokens=True,
      max_length=self.max_len
    )
    return {
      'text': title+content,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label, dtype=torch.long)
    }

In [51]:
df_train, df_test = train_test_split(
  df,
  test_size=0.1, random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
  df_test,
  test_size=0.5, random_state=RANDOM_SEED
)

In [52]:
def get_loader(df, tokenizer, batch_size):
  ds = FakeNewsDataset(
    titles=df.title.to_numpy(),
    content=df.content.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer
  )
  return DataLoader(
    ds,
    batch_size=batch_size
  )

In [30]:
BATCH_SIZE = 16
train_data_loader = get_loader(df_train, tokenizer, BATCH_SIZE)
val_data_loader = get_loader(df_val, tokenizer, BATCH_SIZE)
test_data_loader = get_loader(df_test, tokenizer, BATCH_SIZE)

In [31]:
class SentimentClassifier(nn.Module):
  def __init__(self, class_size):
    super(SentimentClassifier, self).__init__()
    self.electra = ElectraModel.from_pretrained('google/electra-small-discriminator', return_dict=True)
    self.drop = nn.Dropout(p=0.5)
    self.out = nn.Linear(self.electra.config.hidden_size, class_size)

  def forward(self, input_ids, attention_mask):
    pooled_output = self.electra(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    output = self.drop(pooled_output[0])

        
    return self.out(torch.mean(output,1))

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [33]:
device

device(type='cpu')

In [34]:
model = SentimentClassifier(2).to(device)

In [35]:
data = next(iter(train_data_loader))
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [36]:
input_ids

tensor([[  101,  2129,  2308,  ...,  2515,  2025,   102],
        [  101,  2175,  2361,  ...,  2007,  1996,   102],
        [  101,  3392,  5044,  ...,     0,     0,     0],
        ...,
        [  101,  2034,  2202,  ...,  1012,  1000,   102],
        [  101,  2119,  4243,  ...,  2517,  1037,   102],
        [  101,  1996, 22274,  ...,  1521,  1055,   102]])

In [37]:
torch.nn.functional.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.4957, 0.5043],
        [0.5479, 0.4521],
        [0.3147, 0.6853],
        [0.4688, 0.5312],
        [0.5157, 0.4843],
        [0.3839, 0.6161],
        [0.5150, 0.4850],
        [0.4816, 0.5184],
        [0.4984, 0.5016],
        [0.4792, 0.5208],
        [0.4807, 0.5193],
        [0.5116, 0.4884],
        [0.5222, 0.4778],
        [0.5199, 0.4801],
        [0.5374, 0.4626],
        [0.5156, 0.4844]], grad_fn=<SoftmaxBackward>)

In [43]:
EPOCHS=3

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS
scheduler = transformers.get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [39]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()
  losses = []
  correct_predictions = 0
  for d in data_loader:
    print("1")
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    labels = d["labels"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _,preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double() / n_examples, np.mean(losses)

In [40]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      
      labels = d["labels"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _,preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
  return correct_predictions.double() / n_examples, np.mean(losses)

In [41]:
from collections import defaultdict

In [44]:
%%time
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/3
----------
1
1
1
1
1
1
Train loss 0.691565732161204 accuracy 0.5888888888888889
tensor([1, 1, 1, 1, 1])
tensor([0, 1, 1, 1, 0])


Val   loss 0.6450514793395996 accuracy 0.6

Epoch 2/3
----------
1
1
1
1
1
1
Train loss 0.6659543514251709 accuracy 0.5888888888888889
tensor([1, 1, 1, 1, 1])
tensor([0, 1, 1, 1, 0])


Val   loss 0.6346697807312012 accuracy 0.6

Epoch 3/3
----------
1
1
1
1
1
1
Train loss 0.6556258201599121 accuracy 0.5888888888888889
tensor([1, 1, 1, 1, 1])
tensor([0, 1, 1, 1, 0])


Val   loss 0.6299880743026733 accuracy 0.6

Wall time: 8min 7s
