In [53]:
import pandas as pd
import numpy as np

import sqlite3
import torch
import evaluate

from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [2]:
con = sqlite3.connect("recipes.db")
cur = con.cursor()

In [3]:
recipes = pd.read_sql("SELECT * FROM recipes", con)

### Add a calorie classification
tutorial from https://huggingface.co/docs/transformers/en/tasks/sequence_classification

In [54]:
recipes['labels'] = np.where(recipes['calories'] < 300, 0, 1)
calories_data = recipes[['ingredients', 'calories', 'labels', 'directions']]
calories_data['combined_ingredients_directions'] = calories_data['ingredients'] + calories_data['directions']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calories_data['combined_ingredients_directions'] = calories_data['ingredients'] + calories_data['directions']


In [5]:
con.close()

In [56]:
calories_data = calories_data[['combined_ingredients_directions', 'labels']]
calories_data

Unnamed: 0,combined_ingredients_directions,labels
0,"['2 tablespoons extra-virgin olive oil, divide...",0
1,"['¼ cup sliced fresh strawberries', '1 ½ fluid...",0
2,['1 ½ cups warm water (110 to 115 degrees F/43...,0
3,"['1 cup chocolate graham crackers', '¼ cup whi...",0
4,"['2 cups lukewarm water (105 degrees F, 40 deg...",0
...,...,...
19944,"['1 ⅓ cups French-fried onions, crushed', '1 l...",1
19945,"['2 summer squash, ends trimmed', '¼ cup oliv...",0
19946,"['1 (8 ounce) container ricotta cheese', '2 la...",1
19947,"['2 cups sliced fresh peaches', '2 cups sliced...",1


In [57]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(class_data):
    return tokenizer(class_data['combined_ingredients_directions'], truncation = True)

data = Dataset.from_pandas(calories_data, split = "train")
tokenized = data.map(preprocess_function, batched = True)
tokenized_split = tokenized.train_test_split()

Map:   0%|          | 0/19949 [00:00<?, ? examples/s]

In [58]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    return accuracy.compute(predictions = predictions, references = labels)

In [59]:
id2label = {0: '<300 CALORIES', 1: '>= 300 CALORIES'}
label2id = {'<300 CALORIES': 0, '>=300 CALORIES': 1}

In [60]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels = 2, id2label = id2label, label2id = label2id)
train = TrainingArguments(
    output_dir = 'calorie_model',
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 2,
    weight_decay = 0.01,
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    push_to_hub = False
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
trainer = Trainer(model = model, args = train, train_dataset = tokenized_split["train"], eval_dataset = tokenized_split["test"], processing_class = tokenizer, data_collator = data_collator, compute_metrics = compute_metrics)
trainer.train()

  0%|          | 0/1872 [00:00<?, ?it/s]

KeyboardInterrupt: 