In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install bert

In [None]:
!pip install transformers

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

In [None]:
from transformers import AutoTokenizer, BertModel 
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [6]:
df = pd.read_csv ('/content/drive/MyDrive/Quize/news.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,title,category
0,Wall St. Bears Claw Back Into the Black (Reuters),Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters),Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,Business
4,"Oil prices soar to all-time record, posing new...",Business


In [27]:
df.category.unique()

array(['Business', 'Sci/Tech', 'Sports', 'World'], dtype=object)

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['label']=encoder.fit_transform(df['category'])


In [8]:
X=df.title
Y=df.label
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

In [9]:
X_train=X_train.tolist()
y_train=y_train.tolist()
X_test=X_test.tolist()
y_test=y_test.tolist()

In [10]:
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=70)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=70)

In [11]:
class DATALOADER(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = DATALOADER(train_encodings, y_train)
valid_dataset = DATALOADER(valid_encodings, y_test)

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=4)

In [13]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [15]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=1,            
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,  
    warmup_steps=500,              
    weight_decay=0.01,               
    logging_dir='./logs',           
    load_best_model_at_end=True,     
    logging_steps=400,              
    save_steps=400,
    evaluation_strategy="steps",    
)

In [16]:
from transformers import  Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=train_dataset,        
    eval_dataset=valid_dataset,         
    compute_metrics=compute_metrics,    
)

In [17]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
400,0.9587,0.608322,0.802146
800,0.6342,0.609283,0.799217
1200,0.6113,0.533668,0.831439
1600,0.5603,0.54307,0.821869
2000,0.5532,0.592561,0.828106
2400,0.5383,0.51782,0.842172
2800,0.5338,0.48808,0.834823
3200,0.4805,0.540743,0.844343
3600,0.5104,0.502918,0.844874
4000,0.4808,0.470768,0.851313


TrainOutput(global_step=10050, training_loss=0.4908882909746312, metrics={'train_runtime': 5070.0154, 'train_samples_per_second': 15.858, 'train_steps_per_second': 1.982, 'total_flos': 2768265789494400.0, 'train_loss': 0.4908882909746312, 'epoch': 1.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.38112711906433105,
 'eval_accuracy': 0.8838636363636364,
 'eval_runtime': 149.3595,
 'eval_samples_per_second': 265.132,
 'eval_steps_per_second': 13.257,
 'epoch': 1.0}

In [19]:
# saving the fine tuned model & tokenizer
model_path = "bert-base-cased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('bert-base-cased/tokenizer_config.json',
 'bert-base-cased/special_tokens_map.json',
 'bert-base-cased/vocab.txt',
 'bert-base-cased/added_tokens.json',
 'bert-base-cased/tokenizer.json')

In [28]:
def get_prediction(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=70 ,return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    target_names=['Business', 'Sci/Tech', 'Sports', 'World']
    return target_names[probs.argmax()]

In [30]:
get_prediction('The price of oil is not stable')

'Business'