In [26]:
%reload_ext autoreload
%autoreload 2
import sys
from dotenv import load_dotenv
import os
load_dotenv()

ROOT = os.getenv("ROOT")
sys.path.append(ROOT)
from src.data_loader import load_reviews
from src.processing import reviews_processing
from src.nlp.classifier import BERTClassifier

In [27]:
df = load_reviews(category="All_beauty", frac=0.01)
df = reviews_processing(df=df, clean_text=False)
sub = df.rename(columns={"rating": "label", "review_input": "text"})[["label", "text"]]

In [28]:
classifier = BERTClassifier(df=sub)

Map: 100%|██████████| 5612/5612 [00:00<00:00, 11828.50 examples/s]
Map: 100%|██████████| 701/701 [00:00<00:00, 11687.43 examples/s]
Map: 100%|██████████| 702/702 [00:00<00:00, 12382.26 examples/s]


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
classifier.train()

                                                       
 20%|██        | 351/1755 [37:00<1:06:31,  2.84s/it]

{'eval_loss': 0.38432782888412476, 'eval_runtime': 39.1494, 'eval_samples_per_second': 17.906, 'eval_steps_per_second': 1.124, 'epoch': 1.0}


 23%|██▎       | 397/1755 [43:33<2:57:47,  7.86s/it]

KeyboardInterrupt: 

In [17]:
classifier.evaluate()

100%|██████████| 44/44 [00:28<00:00,  1.54it/s]


In [18]:
classifier.metrics

{'eval_loss': 0.6550495028495789,
 'eval_model_preparation_time': 0.0008,
 'eval_runtime': 28.64,
 'eval_samples_per_second': 24.476,
 'eval_steps_per_second': 1.536}

In [19]:
# classifier.predict()

In [11]:
classifier.push_to_hub()

model.safetensors: 100%|██████████| 268M/268M [00:26<00:00, 10.1MB/s] 


In [21]:
import plotly.graph_objects as go

# Assuming you logged training loss and evaluation loss in 'trainer.state.log_history'
log_history = classifier.trainer.state.log_history

train_loss = [x["loss"] for x in log_history if "loss" in x]
eval_loss = [x["eval_loss"] for x in log_history if "eval_loss" in x]
epochs = list(range(1, len(train_loss) + 1))

# Create a Plotly figure
fig = go.Figure()

# Add training loss
fig.add_trace(go.Scatter(
    x=epochs, y=train_loss,
    mode='lines+markers',
    name='Training Loss'
))

# Add evaluation loss
fig.add_trace(go.Scatter(
    x=epochs, y=eval_loss,
    mode='lines+markers',
    name='Evaluation Loss'
))

# Customize layout
fig.update_layout(
    title="Training vs Evaluation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    legend_title="Loss Type"
)

fig.show()


In [22]:
# Assuming you logged evaluation accuracy in 'trainer.state.log_history'
accuracies = [x["eval_accuracy"] for x in log_history if "eval_accuracy" in x]
precisions = [x["eval_precision"] for x in log_history if "eval_precision" in x]
recalls = [x["eval_recall"] for x in log_history if "eval_recall" in x]
f1_scores = [x["eval_f1"] for x in log_history if "eval_f1" in x]

# Create a Plotly figure
fig = go.Figure()

# Add accuracy
fig.add_trace(go.Scatter(
    x=epochs, y=accuracies,
    mode='lines+markers',
    name='Accuracy'
))

# Add precision
fig.add_trace(go.Scatter(
    x=epochs, y=precisions,
    mode='lines+markers',
    name='Precision'
))

# Add recall
fig.add_trace(go.Scatter(
    x=epochs, y=recalls,
    mode='lines+markers',
    name='Recall'
))

# Add F1 Score
fig.add_trace(go.Scatter(
    x=epochs, y=f1_scores,
    mode='lines+markers',
    name='F1 Score'
))

# Customize layout
fig.update_layout(
    title="Accuracy, Precision, Recall, F1 Score over Epochs",
    xaxis_title="Epochs",
    yaxis_title="Score",
    legend_title="Metric"
)

fig.show()


In [25]:
from sklearn.metrics import confusion_matrix
import plotly.express as px
import numpy as np

# Assuming y_true are the true labels and y_pred are predicted labels
y_true = [datapoint['label'] for datapoint in classifier.test_set]
y_pred = [np.argmax(pred["score"]) for pred in classifier.trainer.predict(classifier.test_set).predictions]

cm = confusion_matrix(y_true, y_pred)

# Create a heatmap using plotly express
fig = px.imshow(cm, 
                text_auto=True, 
                labels=dict(x="Predicted Label", y="True Label", color="Count"),
                x=list({x: str(x) for x in range(1, 6)}.values()), 
                y=list({x: str(x) for x in range(1, 6)}.values()))

# Customize layout
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label"
)

fig.show()


100%|██████████| 44/44 [00:28<00:00,  1.56it/s]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices