In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Import lib

In [2]:
!pip install transformers
!pip install datasets
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m122.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding,\
                        TrainingArguments, Trainer,TextClassificationPipeline
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import random

#Define function

In [5]:
def tokenize_function(examples):
  return tokenizer(examples["headline"], padding="max_length", truncation=True)

def get_tokenized_datasets(data_files, remove_columns):
  dataset = load_dataset("csv", data_files=data_files)
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
  tokenized_datasets = tokenized_datasets.remove_columns(remove_columns)
  tokenized_datasets = tokenized_datasets.with_format('torch')
  print(tokenized_datasets)
  return tokenized_datasets

def compute_metrics(pred):
  labels = pred[1]
  preds = np.argmax(pred[0], axis = -1)
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average = "macro")
  acc = accuracy_score(labels, preds)
  return {
    "accuracy": acc,
    "f1": f1,
    "precision": precision,
    "recall": recall
  }

def convert_file_csv(input, labels, name_file):
  df = pd.DataFrame([input, labels]).transpose()
  df.columns = ['headline', 'labels']
  df.to_csv(name_file, index = False)
  print(len(input))
  print(len(labels))

#Get data

In [None]:
df = pd.read_json('/content/drive/MyDrive/CS114/data.json')
df = df.drop(columns=["article_link"])
df = df.dropna()
df = df.drop_duplicates()
df = df.astype({'is_sarcastic':'int'})
df.value_counts("is_sarcastic")

is_sarcastic
0.0    2947
1.0    1337
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4284 entries, 0 to 5308
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   headline      4284 non-null   object 
 1   is_sarcastic  4284 non-null   float64
dtypes: float64(1), object(1)
memory usage: 100.4+ KB


In [None]:
df_is_sarcatic = df.loc[df["is_sarcastic"] == 1]
df_is_non_sarcatic = df.loc[df["is_sarcastic"] == 0]

In [None]:
# input = df["headline"].tolist()
# labels = df["is_sarcastic"].tolist()
# convert_file_csv(input, labels, "/content/drive/MyDrive/CS114/dataset.csv")labels

#Load model & tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("helinivan/english-sarcasm-detector", use_fast=False)
# model = AutoModelForSequenceClassification.from_pretrained("helinivan/english-sarcasm-detector")
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/CS114/model")

Downloading (…)okenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

#Few-Shot

In [6]:
df_is_sarcatic = pd.read_csv("/content/drive/MyDrive/CS114/dataset_is_sarcatic.csv")
df_is_non_sarcatic = pd.read_csv("/content/drive/MyDrive/CS114/dataset_is_non_sarcatic.csv")

input_is_sarcatic = df_is_sarcatic["headline"].tolist()
labels_is_sarcatic = df_is_sarcatic["labels"].tolist()
input_is_non_sarcatic = df_is_non_sarcatic["headline"].tolist()
labels_is_non_sarcatic = df_is_non_sarcatic["labels"].tolist()

In [23]:
samples = 650
#MAX PHASE = 2
phase = 2
# input = random.sample(input_is_non_sarcatic, samples) + input_is_sarcatic[phase*samples:samples*(phase+1)]
input = random.sample(input_is_non_sarcatic, samples) + random.sample(input_is_sarcatic, samples)
labels = [0]*samples + [1]*samples

In [24]:
input_train_val, input_test, labels_train_val, labels_test = train_test_split(input, labels, test_size = 0.3)
input_train, input_val, labels_train, labels_val = train_test_split(input_train_val, labels_train_val, test_size = 0.142857)

In [25]:
train_data_path = "/content/drive/MyDrive/CS114/train_data.csv"
val_data_path = "/content/drive/MyDrive/CS114/val_data.csv"
test_data_path = "/content/drive/MyDrive/CS114/test_data.csv"
convert_file_csv(input_train, labels_train, train_data_path)
convert_file_csv(input_val, labels_val, val_data_path)
convert_file_csv(input_test, labels_test, test_data_path)

780
780
130
130
390
390


In [26]:
train_data_files = {"train": train_data_path, "test": test_data_path, "validation": val_data_path}
train_remove_columns = ["headline"]
tokenized_datasets = get_tokenized_datasets(train_data_files, train_remove_columns)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-8e2760fdf0244605/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-8e2760fdf0244605/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/780 [00:00<?, ? examples/s]

Map:   0%|          | 0/390 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 780
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 390
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 130
    })
})


#Config trainer

In [27]:
data_collator = DataCollatorWithPadding(tokenizer)
training_args = TrainingArguments(output_dir="train_test",
                                  num_train_epochs = 15,
                                  learning_rate = 2e-5,
                                  weight_decay = 0.01,
                                  evaluation_strategy = 'epoch',
                                  )
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
    data_collator = data_collator,
    compute_metrics=compute_metrics,
)

#Training

In [28]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.998454,0.769231,0.769012,0.78861,0.794872
2,No log,1.06767,0.853846,0.847257,0.848474,0.846154
3,No log,0.893117,0.861538,0.855769,0.855769,0.855769
4,No log,1.003626,0.876923,0.87,0.876524,0.865385
5,No log,0.853073,0.892308,0.883363,0.908009,0.871795
6,0.052100,0.867244,0.892308,0.8844,0.901956,0.875
7,0.052100,0.999301,0.876923,0.870936,0.87375,0.86859
8,0.052100,1.143213,0.846154,0.841618,0.838803,0.846154
9,0.052100,1.008861,0.892308,0.8844,0.901956,0.875
10,0.052100,1.339257,0.830769,0.82578,0.823118,0.830128


TrainOutput(global_step=1470, training_loss=0.02388989296900172, metrics={'train_runtime': 1129.8143, 'train_samples_per_second': 10.356, 'train_steps_per_second': 1.301, 'total_flos': 3078399347712000.0, 'train_loss': 0.02388989296900172, 'epoch': 15.0})

In [29]:
trainer.evaluate(tokenized_datasets['test'])

{'eval_loss': 1.2287980318069458,
 'eval_accuracy': 0.8692307692307693,
 'eval_f1': 0.8687743697673037,
 'eval_precision': 0.8691620879120879,
 'eval_recall': 0.8684904416611734,
 'eval_runtime': 12.6209,
 'eval_samples_per_second': 30.901,
 'eval_steps_per_second': 3.882,
 'epoch': 15.0}

#Save model

In [30]:
!rm /content/drive/MyDrive/CS114/model/config.json
!rm /content/drive/MyDrive/CS114/model/pytorch_model.bin

In [31]:
model.save_pretrained("/content/drive/MyDrive/CS114/model", from_pt=True)

#Predict

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
tokenizer = AutoTokenizer.from_pretrained("helinivan/english-sarcasm-detector", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/CS114/model")
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [4]:
sentence = "Đi Phú Yên trong ngày khởi hành từ Quy Nhơn"
res = pipe(sentence)[0]
print("Sarcatic score:", res[0]['score'])
print("Non-sarcatic score:", res[1]['score'])

Sarcatic score: 0.9999912977218628
Non-sarcatic score: 8.687332410772797e-06
