In [1]:
import pandas as pd
import numpy as np
import random
import sklearn
import matplotlib.pyplot as plt
import torch

In [2]:
from torch import nn, optim

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [4]:
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertModel,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

## Load and Preprocess Data

In [5]:
# Read the raw tab-separated file. header=None tells pandas not to treat the
# first row as column names, so the columns get integer labels.
data = pd.read_csv("data.tsv", delimiter="\t", header=None)

In [6]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,0,1,2,3
0,0,28,a,"Meat, fish, poultry and game; Meat extracts; P..."
1,1,34,a,Business management and organization consultan...
2,2,29,a,"Vinegar; Salts, seasonings, flavourings and co..."
3,3,41,a,Conducting technical project studies; Scientif...
4,4,35,a,Brokerage; Brokerage house services; Agencies ...


In [7]:
# Discard the columns at positions 0 and 2 (e.g. the index column), which
# carry no useful data; the category and description columns remain.
data = data.drop(columns=data.columns[[0, 2]])

In [8]:
# Give the two remaining columns descriptive names ...
data.columns = ["NICE Category", "Product Description"]
# ... and put the text column first, followed by its label column.
data = data.loc[:, ["Product Description", "NICE Category"]]

In [9]:
# Preview the first five rows after renaming and reordering the columns.
data.head(n=5)

Unnamed: 0,Product Description,NICE Category
0,"Meat, fish, poultry and game; Meat extracts; P...",28
1,Business management and organization consultan...,34
2,"Vinegar; Salts, seasonings, flavourings and co...",29
3,Conducting technical project studies; Scientif...,41
4,Brokerage; Brokerage house services; Agencies ...,35


In [10]:
# Persist the cleaned two-column frame.
# NOTE(review): to_csv writes the DataFrame index as an extra leading column by
# default; pass index=False if downstream readers should not see it.
data.to_csv(path_or_buf="processed_textual_data.csv")

In [11]:
# Plain Python list of the description strings, in row order.
description_text = data.loc[:, "Product Description"].to_list()

In [12]:
# Plain Python list of the category labels, aligned with description_text.
nice_labels = data.loc[:, "NICE Category"].to_list()

In [13]:
# Three-way split: 70% train, then the remaining 30% is halved into
# validation (15%) and test (15%).
# A fixed random_state makes the partition reproducible across kernel restarts,
# so a test set built in a later session cannot silently contain examples that
# a previously saved model was trained on.
SPLIT_SEED = 42
train_text, holdout_text, train_labels, holdout_labels = train_test_split(
    description_text, nice_labels, test_size=0.3, random_state=SPLIT_SEED
)
val_text, test_text, val_labels, test_labels = train_test_split(
    holdout_text, holdout_labels, test_size=0.5, random_state=SPLIT_SEED
)

## Tokenization

In [14]:
# Load the pretrained tokenizer for the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [15]:
# Tokenize the training texts with padding and truncation enabled.
train_encodings = tokenizer(train_text, padding=True, truncation=True)

In [16]:
# Tokenize the validation texts with the same settings as the training texts.
val_encodings = tokenizer(val_text, padding=True, truncation=True)

In [17]:
# Tokenize the test texts with the same settings as the training texts.
test_encodings = tokenizer(test_text, padding=True, truncation=True)

## Creating the Dataset

In [18]:
class productDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to a per-example indexable collection.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        The dict holds every encoding field at position ``idx`` plus a
        ``"labels"`` entry with the matching label.
        """
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap each split's encodings and labels in a productDataset.
train_dataset, val_dataset, test_dataset = (
    productDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Training the Classifier

In [None]:
# Hyperparameters for fine-tuning; checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=600,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=25,
)

# nice_labels holds the raw category numbers (not re-indexed from zero) and the
# dataset feeds them to the model unchanged as class indices, so the
# classification head needs max(label) + 1 outputs to cover every index.
num_labels = max(nice_labels) + 1

# Use the sequence-classification variant: the bare DistilBertModel encoder has
# no classification head, does not accept a `labels` argument and returns no
# loss, so Trainer cannot train it.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

In [None]:
# Write the trained model to the "distilBert_model" directory.
trainer.save_model(output_dir="distilBert_model")