<a href="https://www.kaggle.com/code/kacperkodo/sarcasm-detection-using-the-ivy-library?scriptVersionId=171030039" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# DEPENDENCIES AND SETUP

Installing kaggle and uploading the API key necessary to use it.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the read-only input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q kaggle
# from google.colab import files
# from google.colab import userdata
import os
# files.upload(); #Upload kaggle.json - you can get from the kaggle account settings, from the API section.

Installing packages necessary to use torch's transformers.

In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses botocore>=1.34.79

To use the API, credentials need to be copied into the kaggle folder. If everything works, the output will show the list of available datasets.

In [None]:
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets list

Preparing the ivy library.

In [None]:
#Insert the correct user when cloning the repos. Make sure that they are up-to-date.

!git clone "https://github.com/Kacper-W-Kozdon/demos.git"
# !git clone "https://github.com/Kacper-W-Kozdon/ivy.git"
!pip install -U -q paddlepaddle-gpu ivy accelerate>=0.21.0 mlflow datasets>=2.14.5 nlp 2>/dev/null # If ran in a notebook with only cpu enabled, edit "paddlepaddle-gpu" to "paddlepaddle"

Next: import the ivy library and get the dataset.

In [None]:
import ivy

In [None]:
# !kaggle datasets download -d danofer/sarcasm
# !cp -f sarcasm.zip '/kaggle/working/demos/Contributor_demos/Sarcasm Detection/'
# !unzip '/kaggle/working/demos/Contributor_demos/Sarcasm Detection/sarcasm.zip' -d '/kaggle/working/demos/Contributor_demos/Sarcasm Detection/'

Import the libraries suggested in the model which is to be transpiled.

In [None]:
import paddle

In [None]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import gc  # For garbage collection to manage memory
import re  # For regular expressions
import numpy as np  # For numerical operations and arrays
import tensorflow as tf
import torch  # PyTorch library for deep learning

import ivy.functional.frontends.paddle as paddle_frontend

# Libraries to accompany torch's transformers
import tqdm
import boto3
import requests
import regex
import sentencepiece
import sacremoses

import warnings  # For handling warnings
warnings.filterwarnings("ignore")  # Ignore warning messages

from transformers import AutoModel, AutoTokenizer  # Transformers library for natural language processing
# from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
# pipeline, Trainer, TrainingArguments, DataCollatorWithPadding  # Transformers components for text processing
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, \
pipeline, TrainingArguments, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification  # Transformer model for sequence classification

import accelerate

# from nlp import Dataset  # Import custom 'Dataset' class for natural language processing tasks
from imblearn.over_sampling import RandomOverSampler  # For oversampling to handle class imbalance
# import datasets  # Import datasets library
# from datasets import Dataset, Image, ClassLabel  # Import custom 'Dataset', 'ClassLabel', and 'Image' classes
from transformers import pipeline  # Transformers library for pipelines
from bs4 import BeautifulSoup  # For parsing HTML content

import matplotlib.pyplot as plt  # For data visualization
import itertools  # For working with iterators
from sklearn.metrics import (  # Import various metrics from scikit-learn
    accuracy_score,  # For calculating accuracy
    roc_auc_score,  # For ROC AUC score
    confusion_matrix,  # For confusion matrix
    classification_report,  # For classification report
    f1_score  # For F1 score
)

# from datasets import load_metric  # Import load_metric function to load evaluation metrics

from tqdm import tqdm  # For displaying progress bars

tqdm.pandas()  # Enable progress bars for pandas operations

In [None]:
# Use the first GPU when paddle reports at least one CUDA device, otherwise fall back to the CPU.
device = "gpu:0" if paddle.device.cuda.device_count() else "cpu" # Either "gpu" or "gpu:0".
ivy.set_default_device(device)
# NOTE(review): soft device mode presumably lets ivy move arrays onto the default device automatically — confirm against the ivy docs.
ivy.set_soft_device_mode(True)


In [None]:
print(ivy.default_device())
print(ivy.num_gpus())


Set the seeds.

In [None]:
# Seed each framework used in the notebook with the same fixed value.
tf.keras.utils.set_random_seed(0)
torch.manual_seed(0)
paddle.seed(0)

Get the API key for ivy transpiler from your account and upload it to the project. Move it to the correct directory.

In [None]:
pwd

In [None]:
# files.upload(); #Upload key.pem - you can get from the kaggle account settings, from the API section.

# Read the transpiler key from the Kaggle secret stored under the label "key.pem".
from kaggle_secrets import UserSecretsClient
secret_label = "key.pem"
secret_value = UserSecretsClient().get_secret(secret_label)

# NOTE(review): /kaggle/working is preserved as notebook output when a version is saved,
# so this plain-text copy of the key may end up in the saved output — consider removing
# the file once it has been copied into place.
with open('/kaggle/working/key.pem','w+') as ivy_api_key:
    ivy_api_key.write(secret_value)



In [None]:
!mkdir ~/.ivy #It might be necessary to change ".ivy" to "ivy".
!cp key.pem /kaggle/working/.ivy

In [None]:
!cp key.pem /kaggle/working/.ivy

First we're loading the tokenizer and the model from torch. All of the basic set-up instructions can be found here: https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/huggingface_pytorch-transformers.ipynb#scrollTo=72d8f2de

In [None]:
# Load the 'bert-base-cased' tokenizer and base model through the huggingface torch.hub entry points.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased')
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-cased')

# Same checkpoint name, loaded through the 'modelForSequenceClassification' entry point.
sequence_classifier = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased')

In [None]:
from ivy.stateful.module import Module
from ivy.stateful.sequential import Sequential
from ivy.stateful.layers import *
from ivy.stateful.losses import *
from ivy.stateful.optimizers import *
from ivy.stateful.activations import *
from ivy.stateful.initializers import *
from ivy.stateful.norms import *


# DATASET AND MODEL OVERVIEW

In [None]:
!echo -n API_KEY > .ivy/key.pem

In [None]:
ivy.set_backend("tensorflow")
# Earlier attempts, kept for reference (tokenizer transpilation and the manual ivy-module route):
#tokenizer_tf = ivy.transpile(tokenizer, source="torch", to="tensorflow")
#model_pd = ivy.to_ivy_module(model)
#model_pd = model_pd.trace_graph()
#model_pd = model_pd.set_backend("tensorflow")
# Transpile both torch models to TensorFlow; the tokenizer itself is not transpiled.
model_tf = ivy.transpile(model, source="torch", to="tensorflow")
sequence_classifier_tf = ivy.transpile(sequence_classifier, source="torch", to="tensorflow")

In [None]:
# df = pd.read_csv("/content/demos/Contributor_demos/Sarcasm Detection/train-balanced-sarcasm.csv")
# Load the corpus, drop duplicate rows, expose the `comment` column as `title`, and keep
# only the (label, title) pairs in which neither value is missing.
df = (
    pd.read_csv("/kaggle/input/sarcasm/train-balanced-sarcasm.csv")
    .drop_duplicates()
    .rename(columns={'comment': 'title'})
    .loc[:, ['label', 'title']]
    .dropna(subset=['label', 'title'])
)
df.sample(5)

In [None]:
def count_words(text: str) -> int:
  """Return the number of whitespace-separated tokens in ``text``."""
  return len(text.split())

def count_symbols(text: str) -> int:
  """Return the number of non-whitespace characters in ``text``."""
  return len("".join(text.split()))

def symbol_to_word_ratio(text: str) -> float:
  """Return the average number of non-whitespace characters per word.

  Empty or whitespace-only text contains no words; it yields ``0.0`` rather
  than raising ``ZeroDivisionError``.
  """
  words = count_words(text)
  if words == 0:
    return 0.0
  return count_symbols(text) / words

def upper_lower_ratio(text: str) -> float:
  """Return (#uppercase characters) / max(#lowercase characters, 1) for ``text``."""
  compact = "".join(text.split())
  upper_total = 0
  lower_total = 0
  for char in compact:
    if char.isupper():
      upper_total += 1
    if char.islower():
      lower_total += 1
  # Clamp the denominator so text without lowercase letters does not divide by zero.
  return upper_total / max(lower_total, 1)

# One derived numeric column per helper; columns are added in the order listed here.
title_features = {
    'word_count': count_words,
    'symbol_count': count_symbols,
    "upper_lower_ratio": upper_lower_ratio,
    "symbol_to_word_ratio": symbol_to_word_ratio,
}
for feature_name, feature_fn in title_features.items():
  df[feature_name] = df["title"].apply(feature_fn)
df.sample(5)

A few plots to see some characteristics of the data.

In [None]:
df_no_sarc = df.where(df["label"] == 0)
df_no_sarc = df_no_sarc.where(df_no_sarc["word_count"] <= 51)
df_sarc = df.where(df["label"] == 1)
df_sarc = df_sarc.where(df_sarc["word_count"] <= 51)
df_no_sarc = df_no_sarc[np.isfinite(df_no_sarc["word_count"])]
df_sarc = df_sarc[np.isfinite(df_sarc["word_count"])]
plt.style.use('_mpl-gallery-nogrid')

hist_df_no_sarc, bin_edges_no = np.histogram(df_no_sarc["word_count"].values, density=True)
hist_df_sarc, bin_edges = np.histogram(df_sarc["word_count"].values, density=True)
# plot:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

bin_mids_no = [(bin_edges_no[i+1] + bin_edges_no[i])/2 for i in range(len(bin_edges_no) - 1)]
bin_mids = [(bin_edges[i+1] + bin_edges[i])/2 for i in range(len(bin_edges) - 1)]
ax1.bar(bin_mids_no, hist_df_no_sarc, width=bin_edges_no[1] - bin_edges_no[0])
ax2.bar(bin_mids, hist_df_sarc, width=bin_edges[1] - bin_edges[0])
ax1.set_title("Hist no sarcasm")
ax1.set_ylabel("density")
ax1.set_xlabel("word count")
ax1.set_xticks(bin_edges_no)
ax1.grid(True)
ax2.set_title("Hist sarcasm")
ax2.set_xlabel("word count")
ax2.set_xticks(bin_edges)
ax2.grid(True)
plt.show()

In [None]:
df_no_sarc = df.where(df["label"] == 0)
df_no_sarc = df_no_sarc.where(df_no_sarc["symbol_count"] <= 201)
df_sarc = df.where(df["label"] == 1)
df_sarc = df_sarc.where(df_sarc["symbol_count"] <= 201)
df_no_sarc = df_no_sarc[np.isfinite(df_no_sarc["symbol_count"])]
df_sarc = df_sarc[np.isfinite(df_sarc["symbol_count"])]
plt.style.use('_mpl-gallery-nogrid')

hist_df_no_sarc, bin_edges_no = np.histogram(df_no_sarc["symbol_count"].values, density=True)
hist_df_sarc, bin_edges = np.histogram(df_sarc["symbol_count"].values, density=True)
# plot:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

bin_mids_no = [(bin_edges_no[i+1] + bin_edges_no[i])/2 for i in range(len(bin_edges_no) - 1)]
bin_mids = [(bin_edges[i+1] + bin_edges[i])/2 for i in range(len(bin_edges) - 1)]
ax1.bar(bin_mids_no, hist_df_no_sarc, width=bin_edges_no[1] - bin_edges_no[0])
ax2.bar(bin_mids, hist_df_sarc, width=bin_edges[1] - bin_edges[0])
ax1.set_title("Hist no sarcasm")
ax1.set_ylabel("density")
ax1.set_xlabel("symbol count")
ax1.set_xticks(bin_edges_no)
ax1.grid(True)
ax2.set_title("Hist sarcasm")
ax2.set_xlabel("symbol count")
ax2.set_xticks(bin_edges)
ax2.grid(True)
plt.show()

In [None]:
df_no_sarc = df.where(df["label"] == 0)
df_no_sarc = df_no_sarc.where(df_no_sarc["upper_lower_ratio"] <= 0.3)
df_sarc = df.where(df["label"] == 1)
df_sarc = df_sarc.where(df_sarc["upper_lower_ratio"] <= 0.3)
df_no_sarc = df_no_sarc[np.isfinite(df_no_sarc["upper_lower_ratio"])]
df_sarc = df_sarc[np.isfinite(df_sarc["upper_lower_ratio"])]
plt.style.use('_mpl-gallery-nogrid')

hist_df_no_sarc, bin_edges_no = np.histogram(df_no_sarc["upper_lower_ratio"].values, density=True)
hist_df_sarc, bin_edges = np.histogram(df_sarc["upper_lower_ratio"].values, density=True)
# plot:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

bin_mids_no = [(bin_edges_no[i+1] + bin_edges_no[i])/2 for i in range(len(bin_edges_no) - 1)]
bin_mids = [(bin_edges[i+1] + bin_edges[i])/2 for i in range(len(bin_edges) - 1)]
ax1.bar(bin_mids_no, hist_df_no_sarc, width=bin_edges_no[1] - bin_edges_no[0])
ax2.bar(bin_mids, hist_df_sarc, width=bin_edges[1] - bin_edges[0])
ax1.set_title("Hist no sarcasm")
ax1.set_ylabel("density")
ax1.set_xlabel("upper/lower ratio")
ax1.set_xticks(bin_edges_no)
ax1.grid(True)
ax2.set_title("Hist sarcasm")
ax2.set_xlabel("upper/lower ratio")
ax2.set_xticks(bin_edges)
ax2.grid(True)
plt.show()

In [None]:
df_no_sarc = df.where(df["label"] == 0)
df_no_sarc = df_no_sarc.where(df_no_sarc["symbol_to_word_ratio"] <= 11)
df_sarc = df.where(df["label"] == 1)
df_sarc = df_sarc.where(df_sarc["symbol_to_word_ratio"] <= 11)
df_no_sarc = df_no_sarc[np.isfinite(df_no_sarc["symbol_to_word_ratio"])]
df_sarc = df_sarc[np.isfinite(df_sarc["symbol_to_word_ratio"])]
plt.style.use('_mpl-gallery-nogrid')

hist_df_no_sarc, bin_edges_no = np.histogram(df_no_sarc["symbol_to_word_ratio"].values, density=True)
hist_df_sarc, bin_edges = np.histogram(df_sarc["symbol_to_word_ratio"].values, density=True)
# plot:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

bin_mids_no = [(bin_edges_no[i+1] + bin_edges_no[i])/2 for i in range(len(bin_edges_no) - 1)]
bin_mids = [(bin_edges[i+1] + bin_edges[i])/2 for i in range(len(bin_edges) - 1)]
ax1.bar(bin_mids_no, hist_df_no_sarc, width=bin_edges_no[1] - bin_edges_no[0])
ax2.bar(bin_mids, hist_df_sarc, width=bin_edges[1] - bin_edges[0])
ax1.set_title("Hist no sarcasm")
ax1.set_ylabel("density")
ax1.set_xlabel("symbols/words ratio")
ax1.set_xticks(bin_edges_no)
ax1.grid(True)
ax2.set_title("Hist sarcasm")
ax2.set_xlabel("symbols/words ratio")
ax2.set_xticks(bin_edges)
ax2.grid(True)
plt.show()

Checking if the tokenizer, encoder/decoder and classifier work.

In [None]:
# A dedicated name is used instead of `input`, which would shadow the Python builtin.
raw_text = df["title"][1]
print(f"The raw input: \n{raw_text}\n")
# Tokenize to torch tensors and run the encoder without tracking gradients.
token = tokenizer(raw_text, return_tensors="pt", add_special_tokens=True)
print(f"The token: \n{token}\n")
with torch.no_grad():
  encoded_token = model(**token)
print(f"The encoded token: \n{encoded_token}\n")

Checking if the transpiled tokenizer, encoder/decoder and classifier work.

In [None]:
# A dedicated name is used instead of `input`, which would shadow the Python builtin.
raw_text = df["title"][1]
print(f"The raw input: \n{raw_text}\n")
# Encode straight to TensorFlow token ids and feed them to the transpiled model.
token = tokenizer.encode(raw_text, return_tensors="tf", add_special_tokens=True).cpu()
print(f"The token: \n{token}\n")
#input_ids, token_type_ids, attention_mask = token["input_ids"], token["token_type_ids"], token["attention_mask"]
encoded_token = model_tf(token)
print(f"The encoded token: \n{encoded_token}\n")

# MODEL TRANSPILATION

A quick check whether transpiling to paddle works as intended.

In [None]:
class Network(torch.nn.Module):
    """Minimal torch module wrapping a single 3-in / 3-out linear layer."""

    def __init__(self):
        super().__init__()
        self._linear = torch.nn.Linear(in_features=3, out_features=3)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        projected = self._linear(x)
        return projected

x = torch.tensor([1., 2., 3.])
net = Network()
net(x)

In [None]:
ivy.set_backend("paddle")
# Transpile the toy torch network and call the result on a paddle tensor holding the same values as `x`.
net_pd = ivy.transpile(net, source="torch", to="paddle")
x_pd = paddle.to_tensor([1., 2., 3.]).cpu()
net_pd(x_pd)

Setting up the classifier based on BERT's sequence classifier model.

In [None]:
class Classifier(torch.nn.Module):
    """Torch pipeline: tokenizer -> sequence classifier -> softmax over the logits.

    ``num_classes`` is accepted but not used by this class.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        # Both objects are the module-level instances created earlier in the notebook.
        self.tokenizer = tokenizer
        self.model = sequence_classifier
        self.softmax = torch.nn.functional.softmax

    def forward(self, x):
        """Return the softmax (over dim 1) of the classifier logits for ``x``."""
        # print(f"The input: {x}")
        encoded = self.tokenizer(x, return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)
        logits = self.model(**encoded)["logits"]
        return self.softmax(logits, dim=1)


ivy.set_backend("paddle")
classifier = Classifier()
# NOTE(review): `classifier_paddle` is rebound later in the notebook to a hand-written
# paddle.nn.Layer wrapper, which replaces the object produced here.
classifier_paddle = ivy.transpile(classifier, source="torch", to="paddle")

In [None]:
print(dir(classifier))
# print(list(classifier.named_parameters()))

Testing the transpilation of the BERT part of the model.

In [None]:
# ivy.set_backend("torch")
# sequence_classifier_paddle = ivy.transpile(sequence_classifier, source="torch", to="paddle")

In [None]:
ivy.set_backend("paddle")
# Tokenize two sample sentences (padded to a common length) to use as example inputs.
tokens = tokenizer(["This is it", "This is not it."], add_special_tokens=True, padding=True, truncation=True)
# input_ids and token_type_ids become ivy arrays; attention_mask becomes a paddle tensor.
input_ids, token_type_ids, attention_mask = ivy.array(tokens["input_ids"]), ivy.array(tokens["token_type_ids"]), paddle.to_tensor(tokens["attention_mask"])
tokens = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
# Only input_ids is handed to the transpiler as an example keyword argument.
kwargs = {"input_ids": input_ids.cpu()}
sequence_classifier_paddle = ivy.transpile(sequence_classifier, kwargs=kwargs, source="torch", to="paddle")

In [None]:
ivy.set_backend("paddle")
tokens = tokenizer(["This is it", "This is not it."], add_special_tokens=True, padding=True, truncation=True)
print(tokens)
# Convert the tokenizer output straight to paddle tensors (the former intermediate
# unpacking of the raw lists was overwritten immediately and has been dropped).
input_ids, token_type_ids, attention_mask = (
    paddle.to_tensor(tokens[key]) for key in ("input_ids", "token_type_ids", "attention_mask")
)
# input_ids, token_type_ids, attention_mask = input_ids.to_native(), token_type_ids.to_native(), attention_mask.to_native()
tokens = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
print(tokens)
# Call once with input_ids only and once with the full set of inputs.
print(sequence_classifier_paddle(input_ids=tokens["input_ids"]))
sequence_classifier_paddle(**tokens)

Testing the model put together (tokenizer, transpiled BERT, softmax)

In [None]:
# ivy.set_backend("paddle")
# tokens = tokenizer(["This is it", "This is not it."], add_special_tokens=True, padding=True, truncation=True)
# input_ids, token_type_ids, attention_mask = ivy.array(tokens["input_ids"]), ivy.array(tokens["token_type_ids"]), ivy.array(tokens["attention_mask"])
# tokens = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
# kwargs = {"input_ids": input_ids.cpu()}

# sequence_classifier_paddle = ivy.transpile(sequence_classifier, kwargs=kwargs, source="torch", to="paddle")



In [None]:
class Classifier_paddle(paddle.nn.Layer):
    """Paddle pipeline: tokenizer -> transpiled sequence classifier -> softmax.

    ``num_classes`` is accepted but not used by this class.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        # Module-level tokenizer and transpiled model created earlier in the notebook.
        self.tokenizer = tokenizer
        self.model = sequence_classifier_paddle
        self.softmax = paddle.nn.functional.softmax

    def forward(self, x):
        """Return the softmax (over axis 1) of the model logits for the texts in ``x``."""
        # print(f"The input: {x}")
        encoded = self.tokenizer(x, add_special_tokens=True, padding=True, truncation=True)
        # The tokenizer output is converted field by field into paddle tensors.
        model_inputs = {
            field: paddle.to_tensor(encoded[field])
            for field in ("input_ids", "token_type_ids", "attention_mask")
        }
        logits = self.model(**model_inputs)["logits"]
        return self.softmax(logits, axis=1)

classifier_paddle = Classifier_paddle()

In [None]:
print(list(dir(classifier_paddle)))


In [None]:
print(dir(classifier_paddle.model))
# print(classifier_paddle.model.__getattr__('bert/embeddings/LayerNorm/weight'))
print(list(classifier_paddle.model.named_parameters())[-2:])


We are freezing the layers responsible for embedding, pooling, etc.

In [None]:
# Mark the last two parameters of the transpiled model as trainable.
for layer in list(classifier_paddle.model.parameters())[-2:]:
  layer.trainable = True

# Show those two parameters, first with their names and then without.
for layer in list(classifier_paddle.model.named_parameters())[-2:]:
  print(layer)
for layer in list(classifier_paddle.model.parameters())[-2:]:
  print(layer)

# Number of trainable parameters in the whole wrapper. This runs before the freeze
# below, so the printed count does not reflect it yet.
print(sum([1 if layer.trainable else 0 for layer in list(classifier_paddle.parameters())[:]]))

# Freeze every parameter except the last two.
for layer in list(classifier_paddle.model.parameters())[:-2]:
  layer.trainable = False

In [None]:
gc.collect()

In [None]:
# Three sample comments picked by index label.
# NOTE(review): `input` shadows the Python builtin; the following cell reads these
# names, so they are left unchanged here.
input = df["title"][1]
input2 = df["title"][5]
input3 = df["title"][2]

# Run the torch classifier on a single text and on a batch of three.
print(input)
print(classifier([input]))
print(classifier([input, input2, input3]))

In [None]:
ivy.set_backend("paddle")
print(classifier_paddle([input]))
print(classifier_paddle([input2]))
print(classifier_paddle([input, input2, input3]))
print(f"Layers: {len(classifier_paddle.parameters())}")

I'd like the cell below to run as is, but there are still some issues with how the arguments are passed to finish the transpilation in lazy mode.

In [None]:
# tokens = tokenizer(["Just one sentence"], add_special_tokens=True, padding=True, truncation=True)
# print(tokens)
# input_ids, token_type_ids, attention_mask = ivy.array(tokens["input_ids"]), ivy.array(tokens["token_type_ids"]), ivy.array(tokens["attention_mask"])
# tokens = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
# print(type(input_ids), input_ids, token_type_ids, attention_mask)
# print(sequence_classifier_paddle(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask))
# sequence_classifier_paddle(**tokens)

In [None]:
print(type(sequence_classifier_paddle))

In [None]:
# Single-sentence check: tokenize, convert each field to a paddle tensor, then call the
# transpiled model with explicit keyword arguments and with the unpacked dict.
tokens = tokenizer(["Just one sentence"], add_special_tokens=True, padding=True, truncation=True)
print(tokens)
input_ids, token_type_ids, attention_mask = paddle.to_tensor(tokens["input_ids"]), paddle.to_tensor(tokens["token_type_ids"]), paddle.to_tensor(tokens["attention_mask"])
tokens = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
print(type(input_ids), input_ids, token_type_ids, attention_mask)
print(sequence_classifier_paddle(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask))
sequence_classifier_paddle(**tokens)

Setting up the training and training the model.

In [None]:
def one_hot(input):
  """Convert ``input`` to a paddle tensor and one-hot encode it over two classes."""
  as_tensor = paddle.to_tensor(input)
  encoded = paddle.nn.functional.one_hot(as_tensor, num_classes=2)
  return encoded

# Try the encoder on five random rows.
sample = df.sample(5)
# Series.apply has no axis parameter; the stray positional "columns" was being bound to
# an unrelated parameter, so only the function is passed.
sample['label'] = sample['label'].apply(one_hot)

for label in sample["label"]:
  print(label)


In [None]:
# Remove the exploratory feature columns before training. errors="ignore" keeps the cell
# re-runnable once the columns are already gone.
df.drop(columns=['word_count', 'symbol_count', 'upper_lower_ratio', 'symbol_to_word_ratio'], inplace=True, errors="ignore")
gc.collect()

In [None]:
import paddle.distributed as dist
def one_hot(input):
  """Convert ``input`` to a paddle tensor and one-hot encode it over two classes.

  NOTE(review): an identical ``one_hot`` is already defined in an earlier cell;
  this definition simply rebinds the name.
  """
  input = paddle.to_tensor(input)
  return paddle.nn.functional.one_hot(input, num_classes=2)



# if type(df['label'][1]) is np.int64:
#   df['label_one_hot'] = df['label'].apply(one_hot, "columns")

# train_dataset = df[['title', 'label_one_hot']]
# test_dataset = df[['title', 'label_one_hot']]
# NOTE(review): both names select the same columns of the same frame, so the "test"
# set is not held out from the training data.
train_dataset = df[['title', 'label']]
test_dataset = df[['title', 'label']]

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which only added a stray "None" line).
train_dataset.info()
train_dataset_sample = train_dataset.sample(3200)
train_dataset_sample.info()

In [None]:
# sample_train = train_dataset.sample(5)
# for entry, label in zip(sample_train["title"], sample_train["label_one_hot"]):
#   print(entry, label)

Switching from pandas.DataFrame to paddle.io.Dataset for convenience and compatibility.

In [None]:
gc.collect()

In [None]:
import numpy as np
from paddle.io import Dataset

batch_size = 8

# Dataset that serves the (title, label) rows of a DataFrame by position.
class pd_Dataset(Dataset):
    """Expose the ``title`` and ``label`` columns of ``df`` as indexable samples."""

    def __init__(self, df):
        super().__init__()
        self.num_samples = df['title'].size
        # Materialise the rows once as [title, label] pairs.
        self.data = [[title, label] for title, label in zip(df["title"], df["label"])]

    def __getitem__(self, idx):
        """Return the ``(title, label)`` pair stored at position ``idx``."""
        title, label = self.data[idx]
        return title, label

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.num_samples

train_dataset_pd = pd_Dataset(train_dataset)

train_loader = paddle.io.DataLoader(train_dataset_pd, batch_size=batch_size, shuffle=True)


In [None]:
# Peek at the first two batches (ids 0 and 1); per the dataset above, data[0] holds the
# titles and data[1] the labels.
for batch_id, data in enumerate(train_loader()):
  if batch_id > 1:
    break
  print(data[0])
  print(data[1])

In [None]:

def train(model, epochs=2, learning_rate=3e-5, save_path='/content/demos/Contributor_demos/Sarcasm Detection/model.pdparams'):
  """Train ``model`` on the module-level ``train_loader`` using paddle AMP (level O2).

  Args:
    model: paddle layer called on a batch of inputs from ``train_loader``; its
      output is passed to the loss and accuracy metric together with the labels.
    epochs: number of passes over ``train_loader``.
    learning_rate: learning rate handed to the SGD optimizer.
    save_path: file the model/optimizer state dicts are saved to at the end.

  Returns:
    ``(logs, model)``: ``logs`` receives an ``[epoch, batch_id, loss, acc]``
    entry every 100 batches; ``model`` is the AMP-decorated model.
  """
  logs = []
  # Count the trainable parameters of the model being trained, not of a module-level global.
  print(f"Trainable parameters: {sum(1 for param in model.parameters() if param.trainable)}")
  opt = paddle.optimizer.SGD(learning_rate=learning_rate, parameters=model.parameters())
  # use_softmax=False: the loss does not apply softmax to the predictions itself.
  loss_fn = paddle.nn.CrossEntropyLoss(use_softmax=False)
  metric = paddle.metric.Accuracy()
  last_loss = 0
  acc = 0
  # NOTE(review): the model is wrapped in DataParallel before amp.decorate rebinds
  # `model`; confirm this ordering against the paddle AMP documentation.
  classifier = paddle.DataParallel(model)
  scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
  model, opt = paddle.amp.decorate(models=model, optimizers=opt, level='O2', master_weight=None, save_dtype=None)

  for epoch in range(epochs):

    progress = tqdm(enumerate(train_loader()), postfix={"epoch": epoch, "loss": last_loss, "acc": acc})
    for batch_id, data in progress:

      x_data = data[0]
      y_data = data[1]

      with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
        predicts = classifier(x_data) # The transpiled model seems to have problems with inputs, so instead of feeding it a container, we map onto one.
        loss = loss_fn(predicts, y_data)

      correct = metric.compute(predicts, y_data)
      metric.update(correct)
      acc = metric.accumulate()
      # acc = paddle.metric.accuracy(predicts, y_data) # This needs to be corrected.
      scaled = scaler.scale(loss)
      scaled.backward()

      # update parameters
      scaler.minimize(opt, scaled)

      if batch_id % 100 == 0:
          # print("\nepoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc))
          logs.append([epoch, batch_id, loss.numpy(), acc])
          # Refresh the progress-bar postfix; the dict given to tqdm() is only read once.
          last_loss = float(loss)
          progress.set_postfix(epoch=epoch, loss=last_loss, acc=acc)

      opt.clear_grad()
    gc.collect()


  obj = {'model': model.state_dict(), 'opt': opt.state_dict(), 'epoch': epochs}
  paddle.save(obj, save_path)

  return logs, model


In [None]:
# for batch_id, data in enumerate(train_loader()):
#   print(data[0], data[1])
#   print(classifier_paddle(data[0]))
#   if batch_id >= 1:
#     break

In [None]:
print(classifier_paddle)
print(sequence_classifier_paddle)
print(classifier_paddle(["Sarcastic sentence one.", "Sarcastic sentence two"]))

Checking if the accuracy metric works.

In [None]:
# Hand-made probabilities: in every row the first entry is the larger one, and five of
# the eight labels below are 0 — so both metric APIs should presumably report 5/8 = 0.625.
predicts = paddle.to_tensor(np.array([[0.60146040, 0.39853954],
        [0.63550186, 0.36449814],
        [0.67369622, 0.32630381],
        [0.61961246, 0.38038763],
        [0.64124215, 0.35875788],
        [0.72334731, 0.27665269],
        [0.60772324, 0.39227673],
        [0.68578976, 0.31421021],
        ]))
y_data = paddle.to_tensor(np.array([[0], [0], [0], [0], [1], [1], [0], [1]]))
# Functional API.
accuracy = paddle.metric.accuracy(predicts, y_data)
# Stateful API: compute -> update -> accumulate, the same sequence the training loop uses.
m = paddle.metric.Accuracy()
correct = m.compute(predicts, y_data)
m.update(correct)
res = m.accumulate()
print(res, accuracy)

In [None]:
logs, trained_classifier_paddle = train(classifier_paddle)

# BUILDING LSTM ON CORE IVY

The training of the BERT model is computationally fairly expensive. It might be better to prepare your own model, using core Ivy on torch or jax backend.

In [None]:
# dir(tokenizer)

In [None]:
print(tokenizer.vocab_size)
print(tokenizer.all_special_tokens_extended)
print(tokenizer.all_special_ids)
print(tokenizer.pad_token_id)

In [None]:
sample = list(df.sample(8)["title"])
print(sample)
tokenizer(sample, add_special_tokens=True, padding=True, truncation=True)

In [None]:
ivy.set_backend("torch")

# Vocabulary / embedding
num_embeddings = tokenizer.vocab_size
pad_token_id = tokenizer.pad_token_id
embedding_dim = 5

# LSTM
input_channels = embedding_dim
output_channels = 1
num_layers = 1

# Tokenisation and batching
max_length = 29
tokenizer.model_max_length = max_length
batch_size = 8
num_classes = 2
eps = 1e-05
testing_input = list(df.sample(8)["title"])

# Linear head: it consumes the flattened output of a whole batch at once.
linear_input_channels = (tokenizer.model_max_length + 3) * batch_size # 3 comes from the hidden states of the LSTM
linear_output_channels = num_classes * batch_size
normalized_shape = (num_classes)  # parenthesised int, not a tuple

class LSTM_postproc(Module):
  """Merge an LSTM's sequence output and its returned states into one flat vector.

  Reads the module-level ``batch_size`` when reshaping the states.
  """

  def __init__(self):
    super(LSTM_postproc, self).__init__()

  def __call__(self, args):
    """Return the flattened concatenation of the LSTM output and its states.

    ``args`` is unpacked as ``(lstm_output, (state_latest, state_hidden))``.
    """

    lstm_output, lstm_state = args
    lstm_state_latest, lstm_state_hidden = lstm_state
    lstm_state_latest = ivy.array(lstm_state_latest)
    # print(lstm_state_hidden, lstm_state_latest)
    # Only the first element of the second state collection is kept.
    lstm_state_hidden = ivy.array([state for state in lstm_state_hidden][0])

    # Stack both states and reshape them to (batch_size, -1, 1).
    # NOTE(review): assumes lstm_output is (batch_size, steps, 1) so the axis-1 concat below lines up — confirm.
    lstm_state = ivy.concat((lstm_state_latest, lstm_state_hidden), axis=0).reshape((batch_size, -1, 1))
    # print(lstm_output.shape, lstm_state.shape)
    out = ivy.concat([lstm_output, lstm_state], axis=1)
    out = out.flatten()
    return out

class Tokenizer(Module):
  """Module wrapper that turns a batch of texts into fixed-length token-id lists."""

  def __init__(self, tokenizer):
    super().__init__()
    self.tokenizer = tokenizer

  def __call__(self, args):
    """Tokenize ``args``, padding/truncating to the module-level ``max_length``, and return the ``input_ids``."""
    texts = list(args)
    encoded = self.tokenizer(texts, add_special_tokens=True, max_length=max_length, padding="max_length", truncation=True)
    return encoded["input_ids"]

class Reshaper(Module):
  """Reshape the incoming array to ``(batch_size, num_classes)`` (module-level values)."""

  def __init__(self):
    super().__init__()

  def __call__(self, args):
    target_shape = (batch_size, num_classes)
    return args.reshape(target_shape)

# Text-to-probabilities stack; each stage feeds the next.
ivy_LSTM = Sequential(
    Tokenizer(tokenizer),  # texts -> padded token ids
    Embedding(num_embeddings, embedding_dim, pad_token_id),
    LSTM(input_channels, output_channels, num_layers=1, return_sequence=True, return_state=True, device=None, v=None, dtype=None),
    LSTM_postproc(),  # sequence output + states -> one flat vector for the whole batch
    Linear(linear_input_channels, linear_output_channels, with_bias=True),
    Reshaper(),  # flat vector -> (batch_size, num_classes)
    Sigmoid(),
    Softmax(),
)

In [None]:
print(dir)

In [None]:
def ivy_train_loader(dataset = None, batch_size = 8):
  """Return a generator of consecutive ``(titles, labels)`` batches.

  Args:
    dataset: DataFrame with ``title`` and ``label`` columns. ``None`` (the
      default) uses the module-level ``df`` as it is at call time, rather than
      whichever object was bound when this function was defined.
    batch_size: Number of rows per batch; a trailing partial batch is dropped.

  Returns:
    A one-shot generator yielding ``(title_series, label_series)`` pairs.
  """
  if dataset is None:
    dataset = df
  titles, labels = dataset["title"], dataset["label"]
  num_batches = len(dataset) // batch_size
  # .iloc keeps the slices positional regardless of the frame's index labels.
  starts = (batch_idx * batch_size for batch_idx in range(num_batches))
  return ((titles.iloc[start : start + batch_size], labels.iloc[start : start + batch_size]) for start in starts)

# Smoke test: push the first five batches (ids 0-4) through the untrained model.
loader = ivy_train_loader()
for batch_id, data in tqdm(enumerate(loader)):
    x_data = data[0]
    y_data = data[1]
    ivy_LSTM_test_out = ivy_LSTM(x_data)
    # print()
    # print(ivy.sum(ivy_LSTM_test_out, axis=1))
    if batch_id == 4:
      break

In [None]:
def one_hot(args, num_clases = 2):
  """One-hot encode integer class ids as plain nested lists.

  Args:
    args: Iterable of integer class ids.
    num_clases: Width of each encoded row (spelling kept so existing keyword
      callers keep working). It was previously ignored in favour of a
      hard-coded 2.

  Returns:
    A list with one ``num_clases``-long 0/1 list per element of ``args``.
  """
  out = [[1 if idx == elem else 0 for idx in range(num_clases)] for elem in args]
  return out

def argmax(args):
  """Return a list with ``ivy.argmax`` applied to each element of ``args``."""
  return list(map(ivy.argmax, args))

print(one_hot([0, 0, 1, 0]))
print(argmax(ivy.array([[0.49967843, 0.50032151],
       [0.49986687, 0.50013322],
       [0.49912587, 0.50087422],
       [0.50080854, 0.4991914 ],
       [0.50049627, 0.4995037 ],
       [0.4998956 , 0.50010443],
       [0.50008798, 0.49991205],
       [0.50053447, 0.49946556]])))

In [None]:
# The hyper-parameters and the Tokenizer / LSTM_postproc / Reshaper modules are the ones
# defined together with the first ivy_LSTM earlier in the notebook; they are reused here
# instead of being declared a second time. Only the new Argmax module is defined.
ivy.set_backend("torch")

class Argmax(Module):
  """Reduce the incoming array to the index of its maximum along the last axis."""

  def __init__(self):
    super(Argmax, self).__init__()

  def __call__(self, args):
    return ivy.argmax(args, axis=-1)



# Same stack as the first ivy_LSTM, with Argmax appended as the final stage.
ivy_LSTM = Sequential(
    Tokenizer(tokenizer),
    Embedding(num_embeddings, embedding_dim, pad_token_id),
    LSTM(input_channels, output_channels, num_layers=1, return_sequence=True, return_state=True, device=None, v=None, dtype=None),
    LSTM_postproc(),
    Linear(linear_input_channels, linear_output_channels, with_bias=True),
    Reshaper(),
    Sigmoid(),
    Softmax(),
    Argmax(),
)

In [None]:
def train_ivy(model):
  """Run the core-Ivy training loop over the module-level ``df``.

  Args:
    model: ivy module called on a batch of titles; its ``v`` container is
      handed to the optimizer.

  Returns:
    ``(logs, model)``; ``logs`` receives an ``[epoch, batch_id, loss]`` entry
    every 100 batches.
  """
  logs = []
  learning_rate = 3e-5
  opt = SGD(lr=learning_rate, inplace=True, stop_gradients=True, trace_on_next_step=False)
  loss_fn = CrossEntropyLoss(axis=-1, epsilon=1e-07, reduction='sum')
  epochs = 2
  # NOTE(review): `grads` is created as zeros and never reassigned in this function, so
  # every opt.step call below receives all-zero gradients — confirm how the result of
  # loss.backward() is meant to reach the optimizer.
  grads = ivy.zeros_like(model.v)
  classifier = model

  for epoch in range(epochs):

    # ivy_train_loader returns a one-shot generator, so a fresh one is built per epoch;
    # a single loader created before the loop is exhausted after the first epoch.
    train_loader = ivy_train_loader(df, batch_size)

    for batch_id, data in tqdm(enumerate(train_loader)):

      x_data = data[0]
      y_data = list(data[1])
      # print(y_data)
      # The transpiled model seems to have problems with inputs, so instead of feeding it a container, we map onto one.
      predictions = classifier(x_data)

      loss = loss_fn(predictions, y_data).float()
      # NOTE(review): requires_grad is switched on only after the loss has been computed.
      loss.requires_grad = True
      # print(f"LOSS: {loss}")

      # acc = paddle.metric.accuracy(predicts, y_data) # This needs to be corrected.
      loss.backward()

      # update parameters (the return value of step is discarded)
      opt.step(model.v, grads)

      if batch_id % 100 == 0:
          # print("\nepoch: {}, batch_id: {}, loss is: {}, acc is: {}".format(epoch, batch_id, loss.numpy(), acc))
          logs.append([epoch, batch_id, loss])

      # opt.clear_grad()
    gc.collect()


  # NOTE(review): this is the same file path the paddle `train` function saves to, and
  # the state is written with paddle.save although the backend is torch — confirm.
  obj = {'model': model.state_dict(), 'opt': opt.state_dict(), 'epoch': epochs}
  path = '/content/demos/Contributor_demos/Sarcasm Detection/model.pdparams'
  paddle.save(obj, path)

  return logs, model

In [None]:
logs, model = train_ivy(ivy_LSTM)