<h3> Imports

In [12]:
#Pytorch
import torch
import torch.nn.functional as F 

#Transformers
from transformers import pipeline

#Maths
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Sckit 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

#NLP
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Others
import seaborn as sns
import time
import string
import os
import random

In [13]:
#Torchtext
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

<h3>Loading News Data

This is the data preprocessed in the TFIDF notebook.

In [14]:
data = pd.read_csv('datasets/news-article-categories-clean.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  6871 non-null   object
 1   title     6871 non-null   object
 2   body      6871 non-null   object
dtypes: object(3)
memory usage: 161.2+ KB


<h3> Pipeline

In [21]:
### TASK ###
task = "zero-shot-classification"
#task = "text-classification"

### MODEL ###
model = "facebook/bart-large-mnli"

pipe = pipeline(task, model)

<h4> Pipeline Test

In [65]:
sequence = ['Trump is saying nonsense again', "Shops are going under"]
labels = ["energy", "retail", "politics", "economy"]
res = pipe(sequence, labels)

In [71]:
sequence[0]

'Trump is saying nonsense again'

In [67]:
res[0]

{'sequence': 'Trump is saying nonsense again',
 'labels': ['politics', 'retail', 'economy', 'energy'],
 'scores': [0.8485413789749146,
  0.06577206403017044,
  0.049382347613573074,
  0.03630419820547104]}

<h2> BART

In [89]:
labels = data['category'].unique()

<h4> Target Vector

In [92]:
title = data.iloc[:, 1].values
X = data.iloc[:, 2].values
y = data.iloc[:, 0].values

<h4> TF-IDF Matrix

In [93]:
# Building a TF IDF matrix out of the corpus of reviews
td = TfidfVectorizer(max_features = 4500)
X_vect = td.fit_transform(X).toarray()
title_vect = td.fit_transform(title).toarray()

<h4> Train / Test Split

In [94]:
# Splitting into training & test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size = 0.3,random_state = 0)

<h4> Results

In [104]:
title_test = title[0:10]
body_test = X[0:2]

In [101]:
results = pipe(title_test.tolist(),labels)

In [105]:
results2 = pipe(body_test.tolist(),labels)

In [109]:
y_pred = []
for dic in results:
    y_pred.append(dic['labels'][0])

In [110]:
y_pred2 = []
for dic in results2:
    y_pred2.append(dic['labels'][0])

In [112]:
print('____________ PREDICTED CATEGORIES FROM TITLE ____________')
for cat in y_pred:
    print(cat)
print('____________ PREDICTED CATEGORIES FROM BODY ____________')
for cat in y_pred2:
    print(cat)
print('____________ REAL CATEGORIES ____________')
for cat in y[0:10]:
    print(cat)

____________ PREDICTED CATEGORIES FROM TITLE ____________
CRIME
ENTERTAINMENT
MEDIA
ENTERTAINMENT
COMEDY
MEDIA
ENTERTAINMENT
ARTS & CULTURE
BUSINESS
ENTERTAINMENT
____________ PREDICTED CATEGORIES FROM BODY ____________
TECH
ENTERTAINMENT
____________ REAL CATEGORIES ____________
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE
ARTS & CULTURE


<h2> BERT

<h3> Tokenization

In [106]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [119]:
text = ["This is a test", "Does it work with lists?"]

In [124]:
encoding = tokenizer.encode_plus(text, add_special_tokens = True,    
                                    truncation = True, 
                                    padding = "max_length", 
                                    return_attention_mask = True, 
                                    return_tensors = "pt")

In [125]:
encoding

{'input_ids': tensor([[101, 100, 100, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,