In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# The unused sub-directory list is bound to `_subdirs` instead of `_`: in IPython `_`
# holds the previous cell's output, and assigning to it stops that shortcut from updating.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import random
import sklearn
import matplotlib.pyplot as plt
import torch

In [None]:
from torch import nn, optim

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [None]:
from datasets import load_metric

## Load and Preprocess Data

In [None]:
# Read the tab-separated dataset. header=None makes pandas treat the first line as
# data and label the columns with integers (0, 1, 2, ...).
data=pd.read_csv("../input/capstone-textual-dataset/data.tsv", sep='\t', header=None)

In [None]:
# Preview the first rows of the frame as loaded, before any columns are dropped or renamed.
data.head()

In [None]:
# Discard the columns at positions 0 and 2 (e.g. the index column); they carry no useful data.
data = data.drop(columns=data.columns[[0, 2]])

In [None]:
# Name the two remaining columns, then put the description first and the label second.
data.columns = ["NICE Category", "Product Description"]
data = data.loc[:, ["Product Description", "NICE Category"]]

In [None]:
# Preview the frame again to check the result of the column cleanup.
data.head()

In [None]:
# Extract the description column as a plain Python list; it is the text input to the split below.
description_text=data["Product Description"].tolist()

In [None]:
# Extract the label column as a plain Python list, aligned by position with description_text.
# NOTE(review): the classifier is later created with num_labels=45, which presumably
# requires integer class ids in 0..44 — confirm the values in this column are in that
# range (remap them if they are 1-based or stored as strings).
nice_labels=data["NICE Category"].tolist()

In [None]:
# Split the data into train (70%), validation (15%) and test (15%) sets.
# A fixed random_state makes the partition reproducible across kernel restarts, so
# metrics from different runs are computed on the same examples.
SPLIT_SEED=42
train_text, holdout_text, train_labels, holdout_labels=train_test_split(
    description_text, nice_labels, test_size=0.3, random_state=SPLIT_SEED
)
# Halve the 30% hold-out into equally sized validation and test sets.
# NOTE(review): consider passing stratify=<labels> to both calls if every class has
# enough examples — not verified against this dataset.
val_text, test_text, val_labels, test_labels=train_test_split(
    holdout_text, holdout_labels, test_size=0.5, random_state=SPLIT_SEED
)

## Tokenization

In [None]:
import transformers
# Load the pretrained uncased DistilBERT tokenizer via the DistilBertTokenizer class
# already imported from transformers earlier in the notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize the training texts, with truncation and padding both enabled.
train_encodings=tokenizer(train_text, truncation=True, padding=True)

In [None]:
# Tokenize the validation texts with the same truncation/padding settings as the training set.
val_encodings=tokenizer(val_text, truncation=True, padding=True)

In [None]:
# Tokenize the test texts with the same truncation/padding settings as the training set.
test_encodings=tokenizer(test_text, truncation=True, padding=True)

## Creating the Dataset

In [None]:
class productDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their labels as a PyTorch dataset.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to a
            per-example indexable sequence.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap each split's encodings and labels in a productDataset (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    productDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Training the Classifier

In [None]:
# Hyperparameters for the Trainer run: 4 epochs, batch size 32 per device for both
# training and evaluation, 600 warmup steps, weight decay 0.05, logging to ./logs
# every 25 steps, outputs under ./results, and evaluation once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args=TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=600,
    weight_decay=0.05,
    logging_dir="./logs",
    logging_steps=25,
    evaluation_strategy="epoch"
)

# Load pretrained DistilBERT with a sequence-classification head sized for 45 classes.
# NOTE(review): presumably this requires the label ids fed to the model to be
# integers in 0..44 — verify against the values in the "NICE Category" column.
model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=45)

# Load the "accuracy" metric through the `datasets` library.
# NOTE(review): `load_metric` is reportedly deprecated in recent `datasets` releases
# (superseded by the separate `evaluate` package) — confirm the installed version still provides it.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array of per-class
            scores with the class axis last (or a tuple whose first element is
            that array); ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the reference label, as a Python float.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so this function is self-contained
    # and does not rely on a metric object loaded elsewhere at runtime.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}


# Build the Trainer: it trains `model` on train_dataset with the settings in
# training_args, evaluates on val_dataset, and reports metrics via compute_metrics.
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model, then score it on the evaluation (validation) dataset.
trainer.train()
# evaluate() returns its metrics; because it is not the last expression in this
# cell the notebook would not display them, so capture and print them explicitly.
eval_metrics=trainer.evaluate()
print(eval_metrics)

# Save the fine-tuned model under the "distilBert_model" path.
trainer.save_model("distilBert_model")