In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import pandas as pd
from datasets import load_dataset
from datasets import Dataset


In [None]:
!pip install evaluate

In [13]:
import evaluate

In [14]:
df = pd.read_csv("/kaggle/input/huffdata-undersampled4k/test_set.csv")

In [15]:
df['labels'] = df['labels'].replace(["BUSINESS", "ENTERTAINMENT", "SPORTS", "OTHER"] ,[0, 1, 2, 3])

In [16]:
df['labels'].value_counts()

labels
3    2275
0    1228
2    1021
1     925
Name: count, dtype: int64

In [17]:
raw_data = Dataset.from_pandas(df)

In [18]:
split_data = raw_data.train_test_split(test_size=0.1)

In [19]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [20]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [31]:
tokenized_huff = split_data.map(preprocess_function, batched = True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
from transformers import DataCollatorWithPadding

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
import numpy as np

In [25]:
accuracy = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [27]:
id2label = {0: "BUSINESS", 1: "ENTERTAINMENT", 2: "SPORTS", 3: "OTHER"}
label2id = {"BUSINESS": 0, "ENTERTAINMENT" : 1, "SPORTS" : 2, "OTHER" : 3}


In [30]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base",  num_labels=4, id2label=id2label, label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(
    report_to="wandb",
    output_dir="DistilroBERTa-merge-df",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_huff['train'],
    eval_dataset=tokenized_huff['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3368,0.250214,0.921101
2,0.1985,0.230796,0.92844
3,0.1611,0.28464,0.92844
4,0.1604,0.295567,0.933945


TrainOutput(global_step=2452, training_loss=0.19678815309608547, metrics={'train_runtime': 648.1874, 'train_samples_per_second': 30.263, 'train_steps_per_second': 3.783, 'total_flos': 2596826196330240.0, 'train_loss': 0.19678815309608547, 'epoch': 4.0})

In [1]:
from transformers import pipeline

In [2]:
import pandas as pd

In [3]:
classifier = pipeline('text-classification', model="/kaggle/working/DistilroBERTa-merge-df/checkpoint-2452")

In [14]:
test_df = pd.read_csv("/kaggle/input/huffdata-undersampled4k/huffdata_under18k.csv")

In [15]:
raw_data = Dataset.from_pandas(test_df)

In [23]:
split_data = raw_data.train_test_split(test_size=0.05)

In [25]:
split_data['test']

Dataset({
    features: ['text', 'labels'],
    num_rows: 926
})

In [17]:
label2id = {"BUSINESS": 0, "ENTERTAINMENT" : 1, "SPORTS" : 2, "OTHER" : 3}

In [18]:
def test_pipeline(df):
    X = list(df['text'])
    y_act = list(df['labels'])
    labels = list(label2id.keys())
    return X, y_act, labels

In [26]:
X, y_act, labels = test_pipeline(split_data['test'])

In [27]:
len(X)

926

In [28]:
y_pred = [result["label"] for result in classifier(X, padding=True, truncation=True)]

In [29]:
from sklearn.metrics import classification_report

In [30]:
print(classification_report(y_pred, y_act, labels=labels))

               precision    recall  f1-score   support

     BUSINESS       0.63      0.85      0.72       196
ENTERTAINMENT       0.94      0.73      0.82       300
       SPORTS       0.62      0.95      0.75       151
        OTHER       0.55      0.39      0.45       279

     accuracy                           0.69       926
    macro avg       0.69      0.73      0.69       926
 weighted avg       0.70      0.69      0.68       926

