In [2]:
pip install -q transformers datasets rouge_score

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Load the train/test splits from CSV; the "text" and "label" columns are used below.
train_df = pd.read_csv("sent_clf_train_data.csv")
test_df = pd.read_csv("sent_clf_test_data.csv")

print("size of train:", train_df.shape)
print("size of test:", test_df.shape)

# Keep only the two columns the model needs, as plain Python lists.
# Series.tolist() replaces the element-by-element append loops.
train_dict = {
    "text": train_df["text"].tolist(),
    "label": train_df["label"].tolist(),
}
# test_dict is reused later in the notebook when collecting predictions.
test_dict = {
    "text": test_df["text"].tolist(),
    "label": test_df["label"].tolist(),
}

dataset_train = Dataset.from_dict(train_dict)
dataset_test = Dataset.from_dict(test_dict)

dataset = DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
})

# Last expression of the cell: display the assembled DatasetDict.
dataset

size of train: (10423, 5)
size of test: (6815, 5)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10423
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 6815
    })
})

In [4]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused further down when the model is loaded.
model_name = "distilroberta-base"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [5]:
def preprocess_function(examples):
    """Tokenize the "text" entry of ``examples`` with truncation enabled.

    ``examples`` is indexed with the key "text"; whatever the tokenizer
    call returns is passed back to the caller unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [6]:
# Apply preprocess_function to every split of the DatasetDict in batched mode.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)



  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import torch

# Sequence-classification model with a two-label head, loaded from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Pick the GPU only when one is actually available; a hard-coded 'cuda'
# raises on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print("Model initialized")

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

Model initialized


In [8]:
from transformers import DataCollatorWithPadding

# Build the padding data collator around the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [None]:
# Fine-tuning hyperparameters, collected in one mapping so they are easy to scan and tweak.
hyperparameters = dict(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)
training_args = TrainingArguments(**hyperparameters)

# Wire the model, the tokenized splits, the tokenizer and the collator into a Trainer.
trainer_components = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_components)

In [12]:
# Run fine-tuning. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10423
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3260


Step,Training Loss
500,0.1738
1000,0.1404
1500,0.1219
2000,0.1011
2500,0.0824
3000,0.0605


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

TrainOutput(global_step=3260, training_loss=0.10938913953816233, metrics={'train_runtime': 384.5971, 'train_samples_per_second': 135.505, 'train_steps_per_second': 8.476, 'total_flos': 817264860319056.0, 'train_loss': 0.10938913953816233, 'epoch': 5.0})

In [13]:
# Persist the fine-tuned model through the Trainer. No path is passed here;
# the logged output shows the files being written under ./results.
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin
tokenizer config file saved in ./results/tokenizer_config.json
Special tokens file saved in ./results/special_tokens_map.json


In [None]:
from transformers import pipeline

# Directory holding both the saved model and the saved tokenizer files.
saved_model_dir = "results"

# Rebuild an inference pipeline from the artifacts on disk.
classifier = pipeline(
    "text-classification",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
)

print("Model loaded")

In [15]:
from tqdm.notebook import tqdm

# Translate the pipeline's label strings (e.g. "LABEL_1") into integer class ids
# through the model config instead of parsing the last character of the string,
# which breaks as soon as there are ten or more labels or custom label names.
label2id = classifier.model.config.label2id

prediction = []
for text in tqdm(test_dict['text']):
    # truncation=True mirrors the preprocessing used for training, so over-long
    # inputs are cut instead of exceeding the model's maximum sequence length.
    top_result = classifier(text, truncation=True)[0]
    prediction.append(int(label2id[top_result['label']]))

# Gold labels in the same order as the predictions.
gold = list(test_dict['label'])

  0%|          | 0/6815 [00:00<?, ?it/s]

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 of the predictions against the gold labels.
report = classification_report(gold, prediction)
print("Classification report: \n\n", report)

Classification report: 

               precision    recall  f1-score   support

           0       0.98      0.98      0.98      6501
           1       0.57      0.49      0.53       314

    accuracy                           0.96      6815
   macro avg       0.77      0.74      0.75      6815
weighted avg       0.96      0.96      0.96      6815



In [18]:
# Confusion matrix computed from the gold labels and the predictions.
matrix = confusion_matrix(gold, prediction)
print(matrix)

[[6384  117]
 [ 160  154]]


In [1]:
# Optional: archive the fine-tuned model before it is deleted.
# !zip -r results.zip results/

# Remove the output directory with the standard library instead of a shell `rm`:
# this is portable across platforms and is a no-op when the directory is already gone.
import shutil
from pathlib import Path

results_dir = Path("results")
if results_dir.is_dir():
    shutil.rmtree(results_dir)