In [1]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marharyta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/marharyta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/marharyta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# create dataset using sklearn
# 0 is business, 1 is entertainment, 2 is politics, 3 is sport, 4 is tech
labels=["business","entertainment","politics","sport","tech"]
bbc_data = load_files(r"/Users/marharyta/Desktop/Data_science/IT_Academy/Data_science_course/Module_6_NLP/Homework/bbc")
X, y = bbc_data.data, bbc_data.target

In [3]:
#convert dataset to pandas df
NUM_LABELS= len(labels)
id2label={i:l for i,l in enumerate(labels)}
label2id={l:i for i,l in enumerate(labels)}

df = pd.DataFrame(data = X, columns = ['text'])
df['target'] = y
df["category"]=df.target.map(lambda x: id2label[x])
df.head()

Unnamed: 0,text,target,category
0,b'Tate & Lyle boss bags top award\n\nTate & Ly...,0,business
1,"b""Halo 2 sells five million copies\n\nMicrosof...",4,tech
2,b'MSPs hear renewed climate warning\n\nClimate...,2,politics
3,b'Pavey focuses on indoor success\n\nJo Pavey ...,3,sport
4,b'Tories reject rethink on axed MP\n\nSacked M...,2,politics


In [4]:
#use regex to clean the text
# text preprocessing, remove all the special characters using regex
from nltk.stem import WordNetLemmatizer
from pymystem3 import Mystem
stemmer = Mystem()
def regex_preprocessing(X):
    documents = []

    for sen in range(len(X)):
        # Remove all the special characters, only letters are left
        document = str(X[sen])
        document = document.rstrip()
        document = re.sub(r'\W', ' ', document)

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()
        # Lemmatization
        #document = document.split()
        document = "".join(stemmer.lemmatize(document)).strip()


        documents.append(document)

    return documents

In [5]:
df['preprocessed_text'] = regex_preprocessing(X)

In [6]:
df.head()

Unnamed: 0,text,target,category,preprocessed_text
0,b'Tate & Lyle boss bags top award\n\nTate & Ly...,0,business,tate lyle boss bags top award ntate lyle chief...
1,"b""Halo 2 sells five million copies\n\nMicrosof...",4,tech,halo 2 sells five million copies nmicrosoft is...
2,b'MSPs hear renewed climate warning\n\nClimate...,2,politics,msps hear renewed climate warning nclimate cha...
3,b'Pavey focuses on indoor success\n\nJo Pavey ...,3,sport,pavey focuses on indoor success njo pavey will...
4,b'Tories reject rethink on axed MP\n\nSacked M...,2,politics,tories reject rethink on axed mp nsacked mp ho...


In [7]:
# now use TFIDF to convert words to vectors
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(regex_preprocessing(X)).toarray()

In [8]:
# now we are ready for classical machine learning
# test-train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [9]:
# quality report
def quality_report(prediction, actual):
    print("Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\nf1_score: {:.3f}".format(
        accuracy_score(actual, prediction),
        precision_score(actual, prediction, average='micro'),
        recall_score(actual, prediction, average='micro'),
        f1_score(actual, prediction, average='micro')
    ))

In [10]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

print("Train quality:")
quality_report(log_reg.predict(X_train), y_train)

print("\nTest quality:")
quality_report(y_pred, y_test)

Train quality:
Accuracy: 0.996
Precision: 0.996
Recall: 0.996
f1_score: 0.996

Test quality:
Accuracy: 0.971
Precision: 0.971
Recall: 0.971
f1_score: 0.971


In [11]:
# inference
text = ["One year ago, on the same day Lewis Hamilton was being crowned BBC Sports Personality of the Year for 2020, Emma Raducanu swept aside Grace Piper to win the BT Masters title at the National Tennis Centre in Roehampton. Afterwards Raducanu, then ranked 345th in the world and yet to make her main WTA tour debut, provided a glimpse of the steely resolve that, 12 months on, would see her succeed Hamilton to put her own name on the prestigious trophy."]
text_preproc = regex_preprocessing(text)
text_transformed = tfidfconverter.transform(text_preproc).toarray()
my_prediction = log_reg.predict(text_transformed)


In [12]:
# make class prediction
x = int(my_prediction)
label = id2label[x]
print(label)

sport


In [13]:
# random forest
from sklearn.ensemble import RandomForestClassifier
random_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
random_classifier.fit(X_train, y_train)
y_pred = random_classifier.predict(X_test)

print("Train quality:")
quality_report(random_classifier.predict(X_train), y_train)

print("\nTest quality:")
quality_report(random_classifier.predict(X_test), y_test)

Train quality:
Accuracy: 1.000
Precision: 1.000
Recall: 1.000
f1_score: 1.000

Test quality:
Accuracy: 0.953
Precision: 0.953
Recall: 0.953
f1_score: 0.953


In [14]:
# using DistilBERT
from transformers import DistilBertForSequenceClassification
#from transformers import BertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [16]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


In [17]:
# train test split
SIZE= df.shape[0]

train_texts= list(df.preprocessed_text[:SIZE//2])
val_texts=   list(df.preprocessed_text[SIZE//2:(3*SIZE)//4 ])
test_texts=  list(df.preprocessed_text[(3*SIZE)//4:])

train_labels= list(df.target[:SIZE//2])
val_labels=   list(df.target[SIZE//2:(3*SIZE)//4])
test_labels=  list(df.target[(3*SIZE)//4:])

In [18]:
#check if everything is alright
print(train_texts[0])
print(train_labels[0])

tate lyle boss bags top award ntate lyle chief executive has been named european businessman of the year by leading business magazine niain ferguson was awarded the title by us publication forbes for returning one of the uk venerable manufacturers to the country top 100 companies the sugar group had been absent from the ftse 100 for seven years until mr ferguson helped it return to growth tate shares have leapt 55 this year boosted by firming sugar prices and sales of its artificial sweeteners n after years of sagging stock price and seven year hiatus from the ftse 100 one of britain venerable manufacturers has returned to the vaunted index forbes said mr ferguson took the helm at the company in 2003 after spending most of his career at consumer goods giant unilever tate lyle which was an original member of the historic ft 30 index in 1935 operates more than 41 factories and 20 more additional production facilities in 28 countries previous winners of the forbes award include royal bank

In [19]:
len(train_texts), len(val_texts), len(test_texts)

(1112, 556, 557)

In [20]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings  = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [21]:
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [22]:
train_dataset = MyDataset(train_encodings, train_labels)
val_dataset = MyDataset(val_encodings, val_labels)
test_dataset = MyDataset(test_encodings, test_labels)

In [23]:
train_dataset [1]

{'input_ids': tensor([  101, 17201,  1016, 15187,  2274,  2454,  4809, 22484, 26775, 19137,
          6199,  2003, 12964, 21519,  4341,  1997,  2049, 12202, 16596, 10882,
         13108, 17201,  1016, 23961,  5369,  2208,  2038,  2853,  2062,  2084,
          2274,  2454,  4809,  4969,  2144,  2009,  2253,  2006,  5096,  1999,
          3054,  2281,  1996,  2194,  2056, 17201,  1016,  2038,  4928,  2759,
          3784,  2007, 27911,  2015, 18624,  2075,  2039,  2501,  2654,  2454,
          2847,  2652,  1996,  2208,  2006, 12202,  2444,  2429,  2000,  7513,
          3157,  2041,  1997,  2184, 12202,  2444,  2372,  2031,  2209,  1996,
          2208,  2005,  2019,  2779,  1997,  6205,  2781,  2566,  5219, 23961,
          5369,  8297,  2000,  1996,  2190,  4855,  2342,  2005,  3177,  5230,
          2038, 25330,  3805,  1997,  1996,  2971,  2000,  2202,  1996,  2327,
         10453,  1999,  1996,  2880,  2866,  2399,  6093,  1996,  3868,  2208,
          2333,  2039,  2028,  3962,  2

In [24]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [25]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./../outputs/TBERT/',
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory
    logging_dir='./../outputs/multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    # save_strategy="epoch",
    #fp16=True,
    load_best_model_at_end=True
)

In [26]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
     # training arguments that we defined above
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 1112
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 210


Step,Training Loss,Validation Loss


In [None]:
q=[trainer.evaluate(eval_dataset=data) for data in [train_dataset, val_dataset, test_dataset]]
pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

In [None]:
# inference
def predict(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs, probs.argmax(),model.config.id2label[probs.argmax().item()]

In [None]:
text = "liam gallagher enjoying the champions league final saturday sporting social"
predict(text)