In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import pandas as pd
from datasets import load_dataset
from datasets import Dataset




In [2]:
!pip install evaluate

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [3]:
import evaluate

In [4]:
df = pd.read_csv("/kaggle/input/huffdata-undersampled4k/huffdata_under18k.csv")

In [5]:
df['labels'] = df['labels'].replace(["BUSINESS", "ENTERTAINMENT", "SPORTS", "OTHER"] ,[0, 1, 2, 3])

In [6]:
df['labels'].value_counts()

labels
0    5113
1    5000
2    4391
3    4006
Name: count, dtype: int64

In [7]:
raw_data = Dataset.from_pandas(df)

In [8]:
split_data = raw_data.train_test_split(test_size=0.1)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [11]:
tokenized_huff = split_data.map(preprocess_function, batched = True)

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [12]:
from transformers import DataCollatorWithPadding

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
import numpy as np

In [15]:
accuracy = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [17]:
id2label = {0: "BUSINESS", 1: "ENTERTAINMENT", 2: "SPORTS", 3: "OTHER"}
label2id = {"BUSINESS": 0, "ENTERTAINMENT" : 1, "SPORTS" : 2, "OTHER" : 3}


In [18]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",  num_labels=4, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="DistilBERT",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_huff['train'],
    eval_dataset=tokenized_huff['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4258,0.388695,0.883306
2,0.3407,0.396263,0.887628
3,0.2626,0.445465,0.89141
4,0.2372,0.470516,0.893571


TrainOutput(global_step=8332, training_loss=0.3258708673661223, metrics={'train_runtime': 2027.1004, 'train_samples_per_second': 32.873, 'train_steps_per_second': 4.11, 'total_flos': 8822141731486776.0, 'train_loss': 0.3258708673661223, 'epoch': 4.0})

In [None]:
text = 'ANAHEIM, Calif. — Kody Clemens will never catch his father in career strikeouts. When it comes to their first one, though, the kid has the upper hand.The Detroit Tigers’ rookie utility player — and son of seven-time Cy Young Award winner Roger Clemens — caught Shohei Ohtani looking late in Monday night’s 10-0 loss to the Los Angeles Angels.Mopping up in the eighth inning, Clemens froze last year’s AL MVP with a 68 mph pitch on the outside corner for his first major league strikeout and gave an exuberant strike three call on the mound.Ohtani hit two home runs earlier — his sixth multi-homer game of the season.“I’m just trying to get outs, and for it to come like that is obviously super cool,” Clemens said. “He’s the best player in baseball. So it’s a pretty cool moment for me.”Grinning ear to ear, the 26-year-old Clemens tossed the souvenir ball into the dugout for safe keeping and is hoping to get it signed by the Angels’ two-way star during the final two days of the series.“It was a wonderful pitch,” Ohtani said through his interpreter.By far the hardest one of their encounter, too.Ohtani fouled off a 54 mph delivery from Clemens, took a 57 mph pitch for a ball and then fouled off a 56 mph offering.“I mean, obviously can’t do it slow enough for Ohtani. Maybe you can throw it slow and slow and slow and then maybe surprise him with a fastball,” Detroit manager A.J. Hinch said. “Big smile on Kody’s face, and Shohei tipped his cap.”Those were four of the seven slowest pitches Ohtani has faced this season, according to MLB Statcast. The other three were by Tampa Bay outfielder Brett Phillips, including a 53 mph “fastball” during the Angels’ 12-0 victory on May 12, when Reid Detmers threw a no-hitter for Los Angeles.Clemens was pitching for the third time in the past seven days to save Detroit’s bullpen with the game out of reach. He worked one inning and allowed one run on three hits.He has pitched six times this season and given up three runs on 10 hits in six innings.Roger Clemens had 4,672 strikeouts during his 24-year career, third on the all-time list behind Nolan Ryan and Randy Johnson. The Rocket’s first one came against Cleveland’s Mike Hargrove on May 15, 1984.“I don’t know how (Kody’s) father would have done against (Ohtani), but he can always say he punched him out,” Hinch said.'

In [1]:
from transformers import pipeline

In [2]:
import pandas as pd

In [3]:
classifier = pipeline('text-classification', model="/kaggle/working/DistilBERT/checkpoint-8332")

In [4]:
test_df = pd.read_csv("/kaggle/input/huffdata-undersampled4k/test_set.csv")

In [5]:
label2id = {"BUSINESS": 0, "ENTERTAINMENT" : 1, "SPORTS" : 2, "OTHER" : 3}

In [6]:
X = list(test_df["text"])
y_act = list(test_df["labels"])
labels = list(label2id.keys())

In [7]:
y_pred = [result["label"] for result in classifier(X, padding=True, truncation=True)]

In [8]:
from sklearn.metrics import classification_report

In [9]:
print(classification_report(y_pred, y_act, labels=labels))

               precision    recall  f1-score   support

     BUSINESS       0.82      0.60      0.69      1688
ENTERTAINMENT       0.85      0.82      0.84       967
       SPORTS       1.00      0.91      0.95      1120
        OTHER       0.61      0.83      0.70      1674

     accuracy                           0.77      5449
    macro avg       0.82      0.79      0.80      5449
 weighted avg       0.80      0.77      0.77      5449

