In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

# Path to the book-ratings CSV. Set the BOOKS_RATING_CSV environment variable
# to point the notebook at another copy of the file; when it is unset the
# original location is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "BOOKS_RATING_CSV",
    'C:/Users/12wkd/Desktop/archive/Books_rating.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Work on an independent copy holding only the review body and its star rating,
# so later edits never touch the frame that was loaded from disk.
data = df.loc[:, ["review/text", "review/score"]].copy()

In [4]:
# Discard every row in which the review text or the score is missing.
data = data.dropna(axis="index", how="any")

In [5]:
# Keep the first 10,000 reviews. The explicit .copy() makes the subset its own
# frame, so the column assignment below does not write into a slice of the
# larger frame (which pandas flags with SettingWithCopyWarning).
data = data.iloc[:10000].copy()

# Shift the 1..5 star ratings to 0..4 class ids for the 5-label classifier.
data["review/score"] = data["review/score"].astype(int) - 1

# Guard against out-of-range labels. This also catches an accidental second
# run of this cell, which would shift the labels again and produce -1.
assert data["review/score"].between(0, 4).all(), (
    "review/score must be in 0..4 after shifting; re-run the notebook from the "
    "data-loading cell if this cell was executed more than once"
)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for validation; the fixed random_state keeps the
# partition identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["review/text"].to_list(),
    data["review/score"].to_list(),
    test_size=0.2,
    random_state=42,
)

In [7]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the training texts first, then the validation texts, with the same
# settings: over-long reviews are truncated and every sequence in a split is
# padded to a common length.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Each item is a dict holding one tensor per encoding field plus a
    ``"labels"`` tensor for the example at the requested index.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels after checking that they line up.

        Args:
            encodings: Mapping from field name to a per-example sequence
                (anything exposing ``.items()`` whose values support ``len``
                and integer indexing).
            labels: Sequence with one label per example.

        Raises:
            ValueError: If any encoding field does not have exactly
                ``len(labels)`` entries.
        """
        expected = len(labels)
        for key, values in encodings.items():
            # A mismatch would otherwise only show up later as an IndexError
            # or as extra encoded rows that are silently never used.
            if len(values) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
    
# Wrap each split's encodings together with its labels.
train_dataset, val_dataset = (
    ReviewDataset(train_encodings, train_labels),
    ReviewDataset(val_encodings, val_labels),
)

In [9]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# The classification head is freshly (randomly) initialised when the model is
# created below -- see the warning this cell prints -- so seed PyTorch first
# to make that initialisation repeatable across runs.
torch.manual_seed(42)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Five output classes: the 1..5 star ratings, shifted to 0..4 earlier on.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch; the two strategies have to match
    # for load_best_model_at_end to be usable.
    evaluation_strategy="epoch",
    logging_dir="./logs",
    save_strategy="epoch",
    # After training, reload the checkpoint with the best validation accuracy
    # (the "accuracy" key must be produced by the compute_metrics callback).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Same value as the seed used above, stated explicitly for the training run.
    seed=42,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array (one row per example, one column per class) or a
            tuple whose first element is that array.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the label, as a float.
    """
    logits, labels = eval_pred
    # Predictions can arrive as a tuple when more than the logits is returned;
    # in that case only the first element (the logits) is scored.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Highest-scoring class along the last (class) axis.
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float(accuracy_score(labels, preds))}

In [13]:
# Assemble the Trainer from the pieces built in the earlier cells: the model,
# its training arguments, both dataset splits and the accuracy callback.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

                                        
  0%|          | 0/2500 [14:40<?, ?it/s]          

{'loss': 0.9538, 'grad_norm': 5.824182033538818, 'learning_rate': 4e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        
[A                                               

  0%|          | 0/2500 [15:32<?, ?it/s]       
[A
[A

{'eval_loss': 0.8010100722312927, 'eval_accuracy': 0.6695, 'eval_runtime': 52.8445, 'eval_samples_per_second': 37.847, 'eval_steps_per_second': 0.606, 'epoch': 1.0}


                                        
  0%|          | 0/2500 [27:10<?, ?it/s]           

{'loss': 0.6874, 'grad_norm': 11.292871475219727, 'learning_rate': 3e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        
[A                                                

  0%|          | 0/2500 [28:04<?, ?it/s]       
[A
[A

{'eval_loss': 0.8380127549171448, 'eval_accuracy': 0.65, 'eval_runtime': 53.5649, 'eval_samples_per_second': 37.338, 'eval_steps_per_second': 0.597, 'epoch': 2.0}


                                        
  0%|          | 0/2500 [39:41<?, ?it/s]           

{'loss': 0.444, 'grad_norm': 6.318512916564941, 'learning_rate': 2e-05, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                                         
  0%|          | 0/2500 [40:35<?, ?it/s]           
[A
[A

{'eval_loss': 0.9919624924659729, 'eval_accuracy': 0.6585, 'eval_runtime': 53.4326, 'eval_samples_per_second': 37.43, 'eval_steps_per_second': 0.599, 'epoch': 3.0}


                                        
  0%|          | 0/2500 [52:07<?, ?it/s]           

{'loss': 0.2634, 'grad_norm': 23.274791717529297, 'learning_rate': 1e-05, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                                         
  0%|          | 0/2500 [52:59<?, ?it/s]           
[A
[A

{'eval_loss': 1.215038776397705, 'eval_accuracy': 0.6655, 'eval_runtime': 52.034, 'eval_samples_per_second': 38.436, 'eval_steps_per_second': 0.615, 'epoch': 4.0}


                                        
  0%|          | 0/2500 [1:04:31<?, ?it/s]           

{'loss': 0.1369, 'grad_norm': 3.4598543643951416, 'learning_rate': 0.0, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                          

[A[A                                         
  0%|          | 0/2500 [1:05:27<?, ?it/s]           
[A
[A

{'eval_loss': 1.4668965339660645, 'eval_accuracy': 0.655, 'eval_runtime': 52.9344, 'eval_samples_per_second': 37.783, 'eval_steps_per_second': 0.605, 'epoch': 5.0}


                                          
100%|██████████| 2500/2500 [1:03:14<00:00,  1.52s/it]

{'train_runtime': 3794.2856, 'train_samples_per_second': 10.542, 'train_steps_per_second': 0.659, 'train_loss': 0.49710484619140627, 'epoch': 5.0}





TrainOutput(global_step=2500, training_loss=0.49710484619140627, metrics={'train_runtime': 3794.2856, 'train_samples_per_second': 10.542, 'train_steps_per_second': 0.659, 'total_flos': 1.052472569856e+16, 'train_loss': 0.49710484619140627, 'epoch': 5.0})

In [18]:
def predict_score(text):
    """Predict the star rating (1-5) for one review or a batch of reviews.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``. The model
    is switched to evaluation mode as a side effect.

    Args:
        text: A single review string, or a list of review strings.

    Returns:
        int | list[int]: For a string, the predicted rating as an ``int``.
        For a list, one predicted rating per input, in order.
    """
    is_single = isinstance(text, str)
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax does not change which class scores highest, so argmax is taken
    # on the logits directly. Class ids are 0..4; add 1 to get 1..5 stars.
    ratings = (logits.argmax(dim=-1) + 1).tolist()
    return ratings[0] if is_single else ratings
    

# Smoke test: a clearly negative sentence should land on the lowest rating.
print(predict_score(text="This movie was suck!"))

1


In [26]:
# Second ad-hoc probe of the fine-tuned classifier with another negative sentence.
print(predict_score(text="jaehoon is dump!"))

1
