In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

# Read the training split from a Parquet file in the working directory.
# Reading a local file needs no authentication; `huggingface-cli login` only
# matters if the file is fetched from the Hugging Face Hub instead.
df = pd.read_parquet(path="train-00000-of-00001.parquet")
df.head(n=5)

Unnamed: 0,text,label,label_text
0,"Explosion Rocks Baghdad Neighborhood BAGHDAD, ...",0,World
1,BBC reporters' log BBC correspondents record e...,0,World
2,Israel welcomes Rice nomination; Palestinians ...,0,World
3,Medical Journal Calls for a New Drug Watchdog ...,0,World
4,Militants Kidnap Relatives of Iraqi Minister-T...,0,World


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['text', 'label', 'label_text'], dtype='object')

In [4]:
# Print per-column dtype and non-null count plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48000 entries, 0 to 47999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        48000 non-null  object
 1   label       48000 non-null  int64 
 2   label_text  48000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [5]:
# Summary statistics for the numeric columns (only `label` here).
# NOTE(review): the evenly spaced quartiles suggest four balanced classes (0-3);
# confirm with `df["label"].value_counts()` if class balance matters.
df.describe()

Unnamed: 0,label
count,48000.0
mean,1.5
std,1.118046
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [6]:
# Missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0,0
text,0
label,0
label_text,0


In [7]:
# Length of every document in characters (not tokens).
df.loc[:, "text"].str.len()

Unnamed: 0,text
0,246
1,157
2,268
3,178
4,224
...,...
47995,185
47996,199
47997,377
47998,143


In [8]:
# Text preprocessing: lowercase every document, overwriting the `text` column.
# NOTE(review): the tokenizer loaded later is the "uncased" BERT variant, which
# presumably lowercases on its own -- confirm whether this step is still needed.
lowercased_text = df["text"].str.lower()
df["text"] = lowercased_text

In [9]:
# Wrap the text column in a Hugging Face `Dataset` so it can be tokenized and
# fed to the BERT training loop. Labels are left out: only the raw text is used.
from datasets import Dataset

text_frame = df[["text"]]  # double brackets keep a one-column DataFrame
ds = Dataset.from_pandas(text_frame)

In [10]:
# tokenization
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


#tokenizer function
def tokenizer_function(example):
    """Tokenize a batch of rows for masked-language-model training.

    Parameters
    ----------
    example : mapping
        A batch produced by ``Dataset.map(..., batched=True)``; its ``"text"``
        entry is the list of strings to encode.

    Returns
    -------
    The tokenizer output for the batch, truncated to at most 512 tokens per
    text. Sequences are deliberately NOT padded here.
    """
    # Padding is left to the data collator, which pads each batch only up to
    # its longest member. Padding every row to 512 tokens up front would store
    # and process mostly [PAD] positions: the character counts displayed
    # earlier are only a few hundred per document.
    return tokenizer(example["text"], truncation=True, max_length=512)


tokenized_dataset = ds.map(tokenizer_function, batched=True)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

In [11]:
# Batch collator for masked language modelling: when a batch is assembled it
# selects tokens for masking and builds the matching labels.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,              # masked-LM objective (as opposed to causal LM)
    mlm_probability=0.15,  # fraction of tokens selected for masking
)


In [None]:
# Continue masked-language-model training of BERT on the tokenized text.
from transformers import BertForMaskedLM, Trainer, TrainingArguments

# Start from the public checkpoint, loaded with its masked-LM head.
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir="./bert-agnews-mlm",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,    # write a checkpoint every 10,000 steps
    save_total_limit=2,   # keep at most two checkpoints on disk
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[34m[1mwandb[0m: Currently logged in as: [33mdaudimujabi[0m ([33mdaudimujabi-dedan-kimathi-university-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# model evaluation
# Requires the training cell to have run in this kernel session: `trainer` and
# `tokenized_dataset` only exist after the cells above have been executed.
#
# `Trainer` exposes `evaluate()` (there is no `eval()` method). It returns a
# dict of metrics; perplexity is exp of the mean cross-entropy in "eval_loss".
eval_dataset = trainer.eval_dataset
if eval_dataset is None:
    # The Trainer was built without a held-out split, so fall back to a fixed
    # random sample of the training data to keep the cell runnable.
    # NOTE: perplexity measured on training data is optimistic -- pass a real
    # validation split here for a number you intend to report.
    sample_size = min(1_000, len(tokenized_dataset))
    eval_dataset = tokenized_dataset.shuffle(seed=42).select(range(sample_size))

eval_metrics = trainer.evaluate(eval_dataset=eval_dataset)

# Tokens are masked at random by the collator on every pass, so the value
# varies slightly from run to run. `np` is imported in the first cell.
perplexity = float(np.exp(eval_metrics["eval_loss"]))
print(f"The Perplexity of our fine tuned model id: \n{perplexity:.2f}")


NameError: name 'trainer' is not defined