In [20]:
import pandas as pd
import sys
from dotenv import load_dotenv
from datasets import Dataset
import os
load_dotenv()

# Access the environment variables
ROOT = os.getenv("ROOT")
sys.path.append(ROOT)
from src.data_loader import load_reviews
from src.processing import reviews_processing
from src.nlp.sentiment_analysis import apply_reviews_sentiment
from src.recommendation import recommendation_system
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
df = load_reviews(category="All_beauty", frac=0.01)
df = reviews_processing(df=df, clean_text=False)

In [12]:
sub = df.rename(columns={"rating": "labels", "review_input": "text"})[["labels", "text"]]

In [29]:
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False)

In [13]:
def tokenize_function(row):
    return tokenizer(row['text'], padding='max_length', truncation=True, max_length=512)

# Apply the tokenizer to the DataFrame
hf_dataset = Dataset.from_pandas(sub)
tokenized_datasets = hf_dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format('torch')
train_test_split = tokenized_datasets.train_test_split(test_size=0.7)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

Map:   0%|          | 0/7015 [00:00<?, ? examples/s]

Map: 100%|██████████| 7015/7015 [00:00<00:00, 11479.62 examples/s]


In [18]:
id2label = {x: str(x) for x in range(1, 6)}
label2id = {str(x): x for x in range(1, 6)}

In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

In [83]:
trainer.train()

 20%|██        | 132/660 [04:01<16:56,  1.93s/it]
 20%|██        | 132/660 [06:55<16:56,  1.93s/it]

{'eval_loss': 0.4588784873485565, 'eval_runtime': 174.7036, 'eval_samples_per_second': 28.11, 'eval_steps_per_second': 1.757, 'epoch': 1.0}


 40%|████      | 264/660 [10:52<10:07,  1.53s/it]  