# Topic Classification Workshop
Topic classification is a fundamental task in Natural Language Processing (NLP) that involves categorizing text into predefined categories based on its content. This process is crucial for various applications, including content organization, sentiment analysis, and information retrieval. In this notebook, we will develop a topic classification model to automatically classify text documents into specific categories using machine learning techniques.

## Objective
The objective of this notebook is to build an effective topic classification model that can accurately classify text into their respective topics. We will preprocess the text data, apply feature extraction techniques, and train a machine learning model to achieve high classification performance.

## Data
The dataset that should be used for this task consists of a text field and a label following these topics:
- Medical
- Finance
- Sports
- Politics
- Culture
- Tech
- Religion
- ...

## Collect Data from any source that satisfies your need
Use any data collection method for this task. Collect at least 1,000 data points per topic.


In [None]:
!pip -q install datasets matplotlib seaborn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m28.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip -q install evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns


import torch

In [None]:
# Check whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


In [None]:
# Load only the first 1% of the train split of the Hugging Face dataset.
dataset = load_dataset('jonaskoenig/topic_classification', split='train[:1%]')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/869M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/869M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/124M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/124M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13054978 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1865000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3729994 [00:00<?, ? examples/s]

In [None]:
# Show the dataset's column names and row count.
print(dataset)

Dataset({
    features: ['work', 'news', 'sports', 'music', 'movies', 'politics', 'phones', 'self-driving cars', 'family', 'cars', 'climate change', 'languages', 'business', 'health', 'science', 'style', 'opinion', 'economy', 'history', 'technology', 'affair', 'development', 'mobility', 'text'],
    num_rows: 130550
})


In [None]:
def create_label(dataset):
  """Derive a single-class label for one example from its per-topic indicator columns.

  Despite the parameter name, `dataset` is a single row: a mapping from column
  name to value, as passed by a non-batched `Dataset.map` call.

  The label is the first column, in column order, whose value equals 1. Its
  name is stored under 'label_text' and its position among the row's keys under
  'label'. A row may have several indicators set; only the first one is kept.

  Rows with no indicator set get 'label_text' = None and 'label' = -1, so every
  returned row carries the same keys. Filter those rows out before training.

  Returns:
    The same mapping, updated in place with 'label_text' and 'label'.
  """
  label_text, label = None, -1
  # Scan once, in column order, and stop at the first indicator that is set.
  for position, (key, value) in enumerate(dataset.items()):
    if value == 1:
      label_text, label = key, position
      break
  # Always write both keys so mapped rows have a consistent set of columns.
  dataset['label_text'] = label_text
  dataset['label'] = label
  return dataset

In [None]:
# Run create_label over each example to add the 'label_text' / 'label' columns.
dataset = dataset.map(create_label)

Map:   0%|          | 0/130550 [00:00<?, ? examples/s]

In [None]:
# Inspect the first example, including the derived label columns.
dataset[0]

{'work': 0,
 'news': 0,
 'sports': 0,
 'music': 0,
 'movies': 0,
 'politics': 0,
 'phones': 0,
 'self-driving cars': 0,
 'family': 1,
 'cars': 0,
 'climate change': 0,
 'languages': 0,
 'business': 0,
 'health': 0,
 'science': 1,
 'style': 0,
 'opinion': 0,
 'economy': 0,
 'history': 0,
 'technology': 0,
 'affair': 0,
 'development': 0,
 'mobility': 1,
 'text': 'And this year, the number will be over 150,000 ventilators.',
 'label_text': 'family',
 'label': 8}

Medical
Finance
Sports
Politics
Culture
Tech
Religion

In [None]:
# Keep only the raw text and the two derived label columns;
# every per-topic indicator column is dropped.
columns_to_keep = ['text', 'label', 'label_text']

# Collect the names of the columns that are not in the keep-list, then drop them in one call.
columns_to_drop = []
for col in dataset.column_names:
    if col not in columns_to_keep:
        columns_to_drop.append(col)
dataset = dataset.remove_columns(columns_to_drop)

print(dataset.column_names)


['text', 'label_text', 'label']


In [None]:
# Confirm the first example now holds only the kept columns.
dataset[0]

{'text': 'And this year, the number will be over 150,000 ventilators.',
 'label_text': 'family',
 'label': 8}

## Processing the data to be aligned with the data structure needed for the task

In [None]:
import re
def remove_punctuation_and_extra_spaces(text):
    """Normalize punctuation and whitespace in a string.

    Applied in order:
      1. runs of '!' collapse to a single '!';
      2. runs of '?' collapse to a single '?';
      3. every character that is not a word character, whitespace, '!' or '?'
         is removed;
      4. runs of whitespace collapse to a single space.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'!+', '!'),
        (r'\?+', '?'),
        (r'[^\w\s!?]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Class names in id order: the position in this list is the class id.
# NOTE(review): several names differ from the dataset's column names at the same
# position (e.g. position 13 is 'health' in the dataset but "Medical" here, and
# position 16 is 'opinion' but "Religion" here) — confirm the renaming is intended.
topic_names = [
    "work", "news", "Sports", "music", "movies", "Politics", "phones",
    "self-driving cars", "family", "cars", "climate change", "languages",
    "business", "Medical", "science", "style", "Religion", "Finance",
    "history", "Tech", "affair", "Culture", "mobility",
]
label2id = {name: idx for idx, name in enumerate(topic_names)}
# Inverse mapping, from class id back to class name.
id2label = dict(enumerate(topic_names))

In [None]:
# Display the id-to-name mapping.
id2label

{0: 'work',
 1: 'news',
 2: 'Sports',
 3: 'music',
 4: 'movies',
 5: 'Politics',
 6: 'phones',
 7: 'self-driving cars',
 8: 'family',
 9: 'cars',
 10: 'climate change',
 11: 'languages',
 12: 'business',
 13: 'Medical',
 14: 'science',
 15: 'style',
 16: 'Religion',
 17: 'Finance',
 18: 'history',
 19: 'Tech',
 20: 'affair',
 21: 'Culture',
 22: 'mobility'}

In [None]:
def preprocess_function(examples):
    """Clean and tokenize one batch of examples.

    The batch's 'text' entries are passed through
    remove_punctuation_and_extra_spaces and written back onto the batch, then
    tokenized with truncation and padding to a fixed length of 512 tokens.

    Returns the tokenizer output for the cleaned texts.
    """
    cleaned_texts = [remove_punctuation_and_extra_spaces(raw) for raw in examples['text']]
    # Overwrite the batch's text with the cleaned version.
    examples['text'] = cleaned_texts
    return tokenizer(
        cleaned_texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )

# Apply the preprocessing to the dataset in batches; the tokenizer outputs are added as new columns.
preprocessed_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/130550 [00:00<?, ? examples/s]

In [None]:
# Draw a random subset across all categories (not stratified by label) and split
# it into disjoint train and eval parts.
import random

SAMPLE_SEED = 42   # fixed seed so the same rows are drawn on every run
TRAIN_SIZE = 1000
EVAL_SIZE = 1000

# A dedicated seeded generator keeps the draw reproducible without touching global random state.
indices = random.Random(SAMPLE_SEED).sample(range(len(preprocessed_dataset)), TRAIN_SIZE + EVAL_SIZE)
small_train_dataset = preprocessed_dataset.select(indices[:TRAIN_SIZE]).shuffle(seed=42)
small_eval_dataset = preprocessed_dataset.select(indices[TRAIN_SIZE:]).shuffle(seed=42)

In [None]:
# Show the training subset's columns and size.
small_train_dataset

Dataset({
    features: ['text', 'label_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
# Show the evaluation subset's columns and size.
small_eval_dataset

Dataset({
    features: ['text', 'label_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

## Training: Wrap the training process in a class with all the functions needed to train and evaluate

In [None]:
from transformers import AutoModelForSequenceClassification, BertConfig, BertForSequenceClassification

model_name = 'bert-base-uncased'
# Size the classification head from the label mapping so the two cannot drift apart.
num_labels = len(id2label)
config = BertConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Load the pretrained weights with the classification config built above.
model = BertForSequenceClassification.from_pretrained(model_name, config=config)
print(model)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Move the model to the GPU when CUDA is available.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

# Evaluate and log once per epoch; checkpoints go to `trainer_output`, logs to `./logs`.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="trainer_output",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_strategy='epoch',
)

# F1 metric from the `evaluate` library, used by compute_metrics.
metric = evaluate.load("f1")

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')



def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for an evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class for each
    example is the index of its largest logit along the last axis.

    NOTE(review): an earlier comment here suggested switching the averaging to
    'weighted'; the code uses 'micro' — confirm which one is intended.

    Returns the result of `metric.compute` for those predictions.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    scores = metric.compute(
        predictions=predicted_ids,
        references=references,
        average='micro',
    )
    return scores

# Place the model on the selected device before handing it to the Trainer.
model.to(device)

# Train on the small training subset; the small eval subset is scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,2.5484,2.450263,0.263


Epoch,Training Loss,Validation Loss,F1
1,2.5484,2.450263,0.263
2,2.1177,2.18505,0.35
3,1.4534,2.148076,0.395


## Evaluate your model
Use F1-score as the metric for evaluation and testing

In [None]:
import evaluate

# Evaluate on the eval_dataset given to the Trainer, using compute_metrics.
trainer.evaluate()

## Test the model on the testing dataset provided

In [None]:
from transformers import pipeline

# Use the fine-tuned model for inference. Without an explicit `model`, `pipeline`
# loads the library's default checkpoint for the task rather than the model trained
# in this notebook, so the in-memory model and its tokenizer are passed explicitly.
text_classification = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    # Run on the first GPU when available, otherwise on the CPU.
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Classify a single example sentence.
# NOTE(review): training text went through remove_punctuation_and_extra_spaces in
# preprocess_function, but this input is passed raw — confirm whether inference should match.
text_classification("I dont know if the rest of you noticed it, but just last month alone the new jobs report came out Friday 313,000 new jobs created.")