<h3> Imports

In [12]:
#Pytorch
import torch
import torch.nn.functional as F 

#Transformers
from transformers import pipeline

#Maths
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Sckit 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

#NLP
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Others
import seaborn as sns
import time
import string
import os
import random

In [13]:
#Torchtext
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

<h3>Loading News Data

This is the data preprocessed in the TFIDF notebook.

In [14]:
data = pd.read_csv('datasets/news-article-categories-clean.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  6871 non-null   object
 1   title     6871 non-null   object
 2   body      6871 non-null   object
dtypes: object(3)
memory usage: 161.2+ KB


<h3> Pipeline

In [21]:
### TASK ###
task = "zero-shot-classification"
#task = "text-classification"

### MODEL ###
model = "facebook/bart-large-mnli"

pipe = pipeline(task, model)

<h4> Pipeline Test

In [65]:
sequence = ['Trump is saying nonsense again', "Shops are going under"]
labels = ["energy", "retail", "politics", "economy"]
res = pipe(sequence, labels)

In [71]:
sequence[0]

'Trump is saying nonsense again'

In [67]:
res[0]

{'sequence': 'Trump is saying nonsense again',
 'labels': ['politics', 'retail', 'economy', 'energy'],
 'scores': [0.8485413789749146,
  0.06577206403017044,
  0.049382347613573074,
  0.03630419820547104]}

<h2> BART

In [89]:
labels = data['category'].unique()

<h4> Target Vector

In [92]:
title = data.iloc[:, 1].values
X = data.iloc[:, 2].values
y = data.iloc[:, 0].values

<h4> TF-IDF Matrix

In [93]:
# Building a TF IDF matrix out of the corpus of reviews
td = TfidfVectorizer(max_features = 4500)
X_vect = td.fit_transform(X).toarray()
title_vect = td.fit_transform(title).toarray()

<h4> Train / Test Split

In [94]:
# Splitting into training & test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size = 0.3,random_state = 0)

<h4> Results

In [104]:
title_test = title[0:10]
body_test = X[0:2]

In [101]:
results = pipe(title_test.tolist(),labels)

In [105]:
results2 = pipe(body_test.tolist(),labels)

In [109]:
y_pred = []
for dic in results:
    y_pred.append(dic['labels'][0])

In [110]:
y_pred2 = []
for dic in results2:
    y_pred2.append(dic['labels'][0])

In [112]:
print('____________ PREDICTED CATEGORIES FROM TITLE ____________')
for cat in y_pred:
    print(cat)
print('____________ PREDICTED CATEGORIES FROM BODY ____________')
for cat in y_pred2:
    print(cat)
print('____________ REAL CATEGORIES ____________')
for cat in y[0:10]:
    print(cat)

____________ PREDICTED CATEGORIES FROM TITLE ____________
CRIME
ENTERTAINMENT
MEDIA
ENTERTAINMENT
COMEDY
MEDIA
ENTERTAINMENT
ARTS & CULTURE
BUSINESS
ENTERTAINMENT
____________ PREDICTED CATEGORIES FROM BODY ____________
TECH
ENTERTAINMENT
____________ REAL CATEGORIES ____________
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE


<h2> BERT

<h3> Tokenization

In [106]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [129]:
test = ["This is a test", "Does it work with lists?"]

In [130]:
encoding = tokenizer.encode_plus(test, add_special_tokens = True,    
                                    truncation = True, 
                                    padding = "max_length", 
                                    return_attention_mask = True, 
                                    return_tensors = "pt")

TO COMPLETE

<h2> DistilBERT

<h3> Tokenization

In [126]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [141]:
def preprocess_function(text_df):
    return tokenizer(text_df, truncation=True)

In [145]:
tokenized_body = tokenizer(X.tolist(), truncation = True)

<h3> Batch

In [146]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

<h3> Evaluation

In [147]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [148]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

<h3>Train

In [151]:
labels

array(['ARTS & CULTURE', 'BUSINESS', 'COMEDY', 'CRIME', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'MEDIA', 'POLITICS', 'RELIGION',
       'SCIENCE', 'SPORTS', 'TECH', 'WOMEN'], dtype=object)

In [152]:
id2label = {0: "ARTS & CULTURE", 1: "BUSINESS",2: "COMEDY", 3: "CRIME",4: "EDUCATION", 5: "ENTERTAINMENT",6: "ENVIRONMENT", 7: "MEDIA",8: "POLITICS", 9: "RELIGION", 10: "SCIENCE", 11: "SPORTS",12: "TECH", 13: "WOMEN"}
label2id = {"ARTS & CULTURE" : 0,"BUSINESS" : 1 ,"COMEDY" : 2,"CRIME" : 3, "EDUCATION" : 4,"ENTERTAINMENT" : 5, "ENVIRONMENT" : 6, "MEDIA" : 7 ,"POLITICS" : 8, "RELIGION" : 9, "SCIENCE" : 10, "SPORTS" : 11 ,"TECH" : 12 , "WOMEN" : 13}

In [153]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=14, id2label=id2label, label2id=label2id
)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [154]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_body["train"],
    eval_dataset=tokenized_body["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

KeyError: 'train'

<h3> Test