In [1]:
# !pip install spark-nlp-display
# Install PySpark and Spark NLP
# ! pip install -q pyspark==3.1.2 spark-nlp==4.2.8
import pandas as pd
import numpy as np
import json

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Opening JSON file
# f = open('/content/drive/MyDrive/final_df_100.json')

# returns JSON object as
# a dictionary
# data = json.load(f)

data = []
with open('/content/drive/MyDrive/final_df_1000.json') as f:
    for line in f:
        data.append(json.loads(line))

In [3]:
!pip install transformers[torch]
!pip install datasets
!pip install accelerate



In [4]:
import datasets
import pandas as pd

In [53]:
dataset = pd.DataFrame(data)

labels = list(set(dataset['section']))
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
dataset['section']= dataset['section'].map(label2id)
dataset = dataset[['text', 'section']]
dataset = datasets.Dataset.from_pandas(dataset)
dataset = dataset.rename_column('section','label')
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [62]:
labels

['Science',
 'Politics',
 'showtime',
 'Companies',
 'latest-news',
 'magazine',
 'Crime',
 'NEWS',
 'U.S.',
 'NEON',
 'Politics, Plan and Policy',
 'opinion',
 'world',
 'Markets',
 'World',
 'Health',
 'Nation',
 'In Other News',
 'Economy',
 'MoneyWatch']

In [55]:
from transformers import AutoTokenizer, BertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_function(examples):
  return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)


tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [56]:
from transformers import DataCollatorWithPadding

data_collector = DataCollatorWithPadding(tokenizer = tokenizer )

In [8]:
!pip install evaluate



In [9]:
import evaluate
accuracy = evaluate.load('accuracy')

In [59]:
import numpy as np

def compute_metric(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  return accuracy.compute(predictions = predictions, references = labels)


In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size = 0.1)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [13]:
!pip install accelerate>=0.20.1

'U.S.'

In [60]:
from transformers import TrainingArguments, Trainer
import torch
print(torch.cuda.is_available())
device = torch.device('cuda')
model.to(device)
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collector,
    compute_metrics=compute_metric,
)

trainer.train()

True


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.032358,0.73
2,No log,0.926649,0.73


TrainOutput(global_step=226, training_loss=1.087593146130047, metrics={'train_runtime': 34.6761, 'train_samples_per_second': 51.909, 'train_steps_per_second': 6.517, 'total_flos': 59629464576000.0, 'train_loss': 1.087593146130047, 'epoch': 2.0})

In [61]:
trainer.predict(tokenized_dataset["test"])

PredictionOutput(predictions=array([[-1.0518942 , -0.2680969 , -0.6362022 , ..., -1.1226237 ,
        -1.1738908 , -1.0356212 ],
       [-1.0834268 , -0.4502835 , -0.38026085, ..., -0.9450165 ,
        -0.7926061 , -1.4387949 ],
       [-0.64243585,  0.41861585, -0.49002892, ..., -1.1765279 ,
        -0.63304347, -0.50780463],
       ...,
       [-0.40958855,  0.85681903, -0.20963767, ..., -0.7878127 ,
        -0.22873043, -0.21244939],
       [-1.0083853 , -0.38021863, -0.68114007, ..., -1.1260831 ,
        -1.2241532 , -0.9820643 ],
       [-0.9611488 , -0.5666479 , -0.6711311 , ..., -1.1576688 ,
        -1.1546293 , -0.9411543 ]], dtype=float32), label_ids=array([12,  7, 12, 12,  8, 10, 12,  8, 14,  2,  7, 12, 12,  1, 12, 14, 12,
        1,  7, 12, 13, 12, 12, 12,  4,  7, 12,  1,  4, 12,  7, 14, 12,  8,
        4, 12, 12, 12,  1, 12, 12,  4,  7,  7,  8, 12, 12, 12, 12,  7,  7,
        7, 12, 12,  7, 12,  4,  7,  7, 12,  7,  4, 12,  7, 12,  4, 12,  8,
       13,  7, 12, 12, 13, 12,  