In [1]:
import pandas as pd
import numpy as np

In [2]:
# Cloud-storage locations of the hackathon CSVs (train split first, test split second).
train_path, test_path = (
    'gs://gbi_ml/classification_hackathon/bbby_train_new.csv',
    'gs://gbi_ml/classification_hackathon/bbby_test_new.csv',
)

In [3]:
# Read the CSV at `train_path` into the training DataFrame.
df = pd.read_csv(train_path)

In [29]:
# Read the CSV at `test_path` into a separate DataFrame.
test_df = pd.read_csv(test_path)

In [5]:
# Display (row count, column count) of the training frame.
df.shape

(308284, 12)

In [6]:
# Display how many training rows carry each distinct bucket_name value.
df.bucket_name.value_counts()

Hair Cleaning & Treatments           700
Tableware Variety Packs              700
Frying Pans & Skillets               700
Cookware & Bakeware Variety Packs    700
Tops                                 700
                                    ... 
Mallets & Mashers                    101
Pet Strollers                        101
Measuring Spoons                     101
Grilling Planks/Stones/Mats          100
Problems                               1
Name: bucket_name, Length: 666, dtype: int64

In [12]:
# Display the raw_product_description stored under index label 0 as a sample.
df['raw_product_description'].loc[0]

"Evergreen Fruit Fly Trap, Red You'll be able to enjoy outdoor parties and events even more thanks to this decorative accent Its stunning red crackle glass design will attract fruit flies and add a pop of color to your outdoor decor The insects enter through an opening in the top and are unable to escape For best use, place on a flat surface near your garden and fill with a non-toxic fruit fly lure Crafted from weather-resistant and outdoor-safe materials, this trap will withstand all outdoor elements Keep those pesty fruit flies away with this Fruity Fly Trap in red. The shape of a juicy apple, this fly trap will add a natural design to your outdoor space and keep away those annoying insects. At Evergreen, we are committed to producing premium, quality products to bring a touch of color and personality to your home. We respect and celebrate the uniqueness in everyone by offering one of the world's largest selections of home and garden decor."

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint (same name the model cell uses).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
def preprocess_function(text):
    """Tokenize one input with the notebook-level `tokenizer`.

    Truncation is switched on in the tokenizer call.

    Args:
        text: The value handed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for `text`.
    """
    encoded = tokenizer(text, truncation=True)
    return encoded

In [14]:
# Run preprocess_function on each raw_product_description and store the results in a new column.
df['processed_description'] = df['raw_product_description'].apply(preprocess_function)

In [30]:
# Same preprocessing as the training frame, applied to the test frame's descriptions.
test_df['processed_description'] = test_df['raw_product_description'].apply(preprocess_function)

In [15]:
from transformers import DataCollatorWithPadding

# Collator built around the shared tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
import evaluate

# Load the "accuracy" metric through `evaluate`; compute_metrics calls its .compute().
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [18]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `accuracy` metric.

    Args:
        eval_pred: Object that unpacks into a (predictions, labels) pair.

    Returns:
        The result of `accuracy.compute` comparing the arg-max of the
        predictions along axis 1 against the labels.
    """
    scores, references = eval_pred
    # The highest-scoring column in each row becomes the predicted class index.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [21]:
# Distinct bucket names from the training frame, in sorted order.
cats = sorted(df["bucket_name"].unique())

In [22]:
# Two-way lookup between integer class ids and bucket names, numbered in `cats` order.
id2label = dict(enumerate(cats))
label2id = {bucket: idx for idx, bucket in id2label.items()}

In [27]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model from the DistilBERT checkpoint, with one output
# per entry in `id2label`; both label maps are passed along to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [None]:
def _with_labels(frame):
    """Attach an integer class id to every tokenized description in `frame`.

    Args:
        frame: DataFrame holding a `processed_description` column (tokenizer
            output per row) and a `bucket_name` column (class name per row).

    Returns:
        A list with one plain dict per row: the tokenizer fields plus a
        "label" entry looked up in `label2id`.

    Raises:
        KeyError: If `frame` has no `bucket_name` column, or contains a bucket
            name that is missing from `label2id`.
    """
    return [
        {**encoding, "label": label2id[bucket]}
        for encoding, bucket in zip(frame["processed_description"], frame["bucket_name"])
    ]


# Hyperparameters for the fine-tuning run; checkpoints are written under
# "classification_model" and evaluated/saved once per epoch.
training_args = TrainingArguments(
    output_dir="classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
    # If pushing to the Hub, authenticate with `huggingface-cli login` or an
    # environment variable. Never paste an access token into the notebook.
)

trainer = Trainer(
    model=model,
    args=training_args,
    # The tokenized column alone carries no targets, so each example is paired
    # with its class id before being handed to the Trainer.
    train_dataset=_with_labels(df),
    # NOTE(review): assumes the test CSV also has a `bucket_name` column whose
    # values all appear in the training buckets — confirm.
    eval_dataset=_with_labels(test_df),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 308284
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 38536
