In [4]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch


# Read the train and test CSV splits into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/DELL/Downloads/movie_train.csv',
        'C:/Users/DELL/Downloads/movie_test.csv',
    )
)

# Instantiate the tokenizer that matches the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization helper used when mapping over the datasets
def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with truncation enabled, ``padding='max_length'``
    and ``max_length=128``; its return value is passed back unchanged.
    """
    texts = examples['text']
    encoding_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(texts, **encoding_options)

# Wrap each DataFrame in a `Dataset` and tokenize it in batches
# (train split first, then test split).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame).map(preprocess_function, batched=True)
    for frame in (train_data, test_data)
)

# Load the pretrained 'roberta-base' checkpoint for 2-label sequence classification.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

Map: 100%|██████████| 32000/32000 [00:26<00:00, 1226.67 examples/s]
Map: 100%|██████████| 8000/8000 [00:07<00:00, 1126.06 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:


# Configure the fine-tuning run.
training_args = TrainingArguments(
    # Directory the Trainer writes its outputs to.
    output_dir='./results',
    # Schedule: four epochs, with an evaluation pass at the end of each epoch.
    num_train_epochs=4,
    evaluation_strategy="epoch",
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimizer / learning-rate schedule settings.
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)

# Build the Trainer; the test split is used as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize

In [7]:
from sklearn.utils import resample
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [8]:
import torch
from datasets import Dataset, load_metric
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [9]:
import torch
from datasets import Dataset, load_metric
import numpy as np


def compute_metrics(eval_pred):
    """Compute binary-classification metrics for one evaluation pass.

    The metrics are computed directly with numpy instead of
    ``datasets.load_metric``: that loader is deprecated, was being invoked
    four times on every evaluation (hence the repeated ``trust_remote_code``
    warnings), and needs metric scripts fetched at run time.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            one row per example and one column per class; ``labels`` holds
            the reference class ids. Label ``1`` is treated as the positive
            class, matching ``average='binary'``.

    Returns:
        dict: Python floats under the keys ``"accuracy"``, ``"f1"``,
        ``"precision"`` and ``"recall"``. Any ratio whose denominator is
        zero is reported as ``0.0``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(np.asarray(logits), axis=1)
    references = np.asarray(labels).reshape(-1)

    predicted_pos = predictions == 1
    actual_pos = references == 1

    # Confusion-matrix counts for the positive class.
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    n_correct = int(np.sum(predictions == references))

    def _ratio(numerator, denominator):
        # Guard against empty input or a class that never occurs.
        return float(numerator) / denominator if denominator else 0.0

    return {
        "accuracy": _ratio(n_correct, references.size),
        # F1 written as 2TP / (2TP + FP + FN), equivalent to the harmonic
        # mean of precision and recall but well-defined when both are zero.
        "f1": _ratio(2 * true_pos, 2 * true_pos + false_pos + false_neg),
        "precision": _ratio(true_pos, true_pos + false_pos),
        "recall": _ratio(true_pos, true_pos + false_neg),
    }

# Rebuild the Trainer with `compute_metrics` attached; the `trainer` created in
# an earlier cell was constructed without it and is replaced by this one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune using the settings in `training_args`.
trainer.train()
# Final evaluation on `eval_dataset`. This is the last expression of the cell,
# so the returned metrics are shown as the cell output.
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  3%|▎         | 500/16000 [17:07<6:36:40,  1.54s/it] 

{'loss': 0.515, 'grad_norm': 9.450413703918457, 'learning_rate': 5e-05, 'epoch': 0.12}


  6%|▋         | 1000/16000 [29:54<6:56:11,  1.66s/it]

{'loss': 0.4468, 'grad_norm': 3.3079748153686523, 'learning_rate': 4.8387096774193554e-05, 'epoch': 0.25}


  9%|▉         | 1500/16000 [44:30<7:02:58,  1.75s/it]

{'loss': 0.7078, 'grad_norm': 3.2141053676605225, 'learning_rate': 4.67741935483871e-05, 'epoch': 0.38}


 12%|█▎        | 2000/16000 [58:58<5:47:30,  1.49s/it]

{'loss': 0.5406, 'grad_norm': 8.625186920166016, 'learning_rate': 4.516129032258064e-05, 'epoch': 0.5}


 16%|█▌        | 2500/16000 [1:11:07<5:18:27,  1.42s/it]

{'loss': 0.4643, 'grad_norm': 17.539926528930664, 'learning_rate': 4.3548387096774194e-05, 'epoch': 0.62}


 19%|█▉        | 3000/16000 [1:23:10<5:13:56,  1.45s/it]

{'loss': 0.4511, 'grad_norm': 0.6657257080078125, 'learning_rate': 4.1935483870967746e-05, 'epoch': 0.75}


 22%|██▏       | 3500/16000 [1:42:38<8:49:20,  2.54s/it] 

{'loss': 0.4423, 'grad_norm': 1.084800362586975, 'learning_rate': 4.032258064516129e-05, 'epoch': 0.88}


 25%|██▌       | 4000/16000 [1:57:57<4:55:35,  1.48s/it] 

{'loss': 0.4392, 'grad_norm': 1.0629304647445679, 'learning_rate': 3.870967741935484e-05, 'epoch': 1.0}


  f1 = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                        
 25%|██▌       | 4000/16000 [2:04:03<4:55:35,  1.48s/it]

{'eval_loss': 0.5020880103111267, 'eval_accuracy': 0.807125, 'eval_f1': 0.8301596037424326, 'eval_precision': 0.7370992963252541, 'eval_recall': 0.9501133786848073, 'eval_runtime': 364.7145, 'eval_samples_per_second': 21.935, 'eval_steps_per_second': 1.371, 'epoch': 1.0}


 28%|██▊       | 4500/16000 [2:15:43<4:25:38,  1.39s/it]   

{'loss': 0.4351, 'grad_norm': 0.6704543232917786, 'learning_rate': 3.7096774193548386e-05, 'epoch': 1.12}


 31%|███▏      | 5000/16000 [2:27:31<4:16:29,  1.40s/it]

{'loss': 0.3819, 'grad_norm': 1.6181641817092896, 'learning_rate': 3.548387096774194e-05, 'epoch': 1.25}


 34%|███▍      | 5500/16000 [2:39:08<4:04:00,  1.39s/it]

{'loss': 0.3919, 'grad_norm': 2.2444615364074707, 'learning_rate': 3.387096774193548e-05, 'epoch': 1.38}


 38%|███▊      | 6000/16000 [2:50:39<3:51:07,  1.39s/it]

{'loss': 0.3767, 'grad_norm': 25.719499588012695, 'learning_rate': 3.2258064516129034e-05, 'epoch': 1.5}


 41%|████      | 6500/16000 [3:02:14<3:39:34,  1.39s/it]

{'loss': 0.3898, 'grad_norm': 3.8124265670776367, 'learning_rate': 3.0645161290322585e-05, 'epoch': 1.62}


 44%|████▍     | 7000/16000 [3:13:51<3:27:28,  1.38s/it]

{'loss': 0.3815, 'grad_norm': 4.126402854919434, 'learning_rate': 2.9032258064516133e-05, 'epoch': 1.75}


 47%|████▋     | 7500/16000 [3:25:26<3:15:59,  1.38s/it]

{'loss': 0.3633, 'grad_norm': 0.8641723990440369, 'learning_rate': 2.7419354838709678e-05, 'epoch': 1.88}


 50%|█████     | 8000/16000 [3:36:58<3:04:15,  1.38s/it]

{'loss': 0.3391, 'grad_norm': 1.4563385248184204, 'learning_rate': 2.5806451612903226e-05, 'epoch': 2.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                        
 50%|█████     | 8000/16000 [3:42:36<3:04:15,  1.38s/it]

{'eval_loss': 0.3840793967247009, 'eval_accuracy': 0.876375, 'eval_f1': 0.8750789440444613, 'eval_precision': 0.8774062816616008, 'eval_recall': 0.872763920382968, 'eval_runtime': 336.2701, 'eval_samples_per_second': 23.79, 'eval_steps_per_second': 1.487, 'epoch': 2.0}


 53%|█████▎    | 8500/16000 [3:54:06<2:52:08,  1.38s/it]   

{'loss': 0.308, 'grad_norm': 2.269658088684082, 'learning_rate': 2.4193548387096777e-05, 'epoch': 2.12}


 56%|█████▋    | 9000/16000 [4:05:39<2:41:16,  1.38s/it]

{'loss': 0.2964, 'grad_norm': 0.8143106698989868, 'learning_rate': 2.258064516129032e-05, 'epoch': 2.25}


 59%|█████▉    | 9500/16000 [4:17:08<2:27:57,  1.37s/it]

{'loss': 0.3034, 'grad_norm': 3.648118019104004, 'learning_rate': 2.0967741935483873e-05, 'epoch': 2.38}


 62%|██████▎   | 10000/16000 [4:28:42<2:17:46,  1.38s/it]

{'loss': 0.292, 'grad_norm': 0.3808221220970154, 'learning_rate': 1.935483870967742e-05, 'epoch': 2.5}


 66%|██████▌   | 10500/16000 [4:40:12<2:05:45,  1.37s/it]

{'loss': 0.2997, 'grad_norm': 22.708032608032227, 'learning_rate': 1.774193548387097e-05, 'epoch': 2.62}


 69%|██████▉   | 11000/16000 [4:51:42<1:55:23,  1.38s/it]

{'loss': 0.2843, 'grad_norm': 2.294463872909546, 'learning_rate': 1.6129032258064517e-05, 'epoch': 2.75}


 72%|███████▏  | 11500/16000 [5:03:13<1:43:00,  1.37s/it]

{'loss': 0.277, 'grad_norm': 3.4204230308532715, 'learning_rate': 1.4516129032258066e-05, 'epoch': 2.88}


 75%|███████▌  | 12000/16000 [5:14:44<1:31:53,  1.38s/it]

{'loss': 0.2834, 'grad_norm': 3.560669183731079, 'learning_rate': 1.2903225806451613e-05, 'epoch': 3.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                         
 75%|███████▌  | 12000/16000 [5:20:22<1:31:53,  1.38s/it]

{'eval_loss': 0.3838668167591095, 'eval_accuracy': 0.88425, 'eval_f1': 0.8885411651420317, 'eval_precision': 0.8506568333717447, 'eval_recall': 0.9299571680524061, 'eval_runtime': 336.4983, 'eval_samples_per_second': 23.774, 'eval_steps_per_second': 1.486, 'epoch': 3.0}


 78%|███████▊  | 12500/16000 [5:31:47<1:19:54,  1.37s/it]   

{'loss': 0.2354, 'grad_norm': 7.879631519317627, 'learning_rate': 1.129032258064516e-05, 'epoch': 3.12}


 81%|████████▏ | 13000/16000 [5:43:14<1:08:37,  1.37s/it]

{'loss': 0.2077, 'grad_norm': 0.18995659053325653, 'learning_rate': 9.67741935483871e-06, 'epoch': 3.25}


 84%|████████▍ | 13500/16000 [5:54:41<57:08,  1.37s/it]  

{'loss': 0.2151, 'grad_norm': 0.6865127086639404, 'learning_rate': 8.064516129032258e-06, 'epoch': 3.38}


 88%|████████▊ | 14000/16000 [6:06:11<45:49,  1.37s/it]  

{'loss': 0.1828, 'grad_norm': 13.350358963012695, 'learning_rate': 6.451612903225806e-06, 'epoch': 3.5}


 91%|█████████ | 14500/16000 [6:17:40<34:29,  1.38s/it]

{'loss': 0.2207, 'grad_norm': 0.46735748648643494, 'learning_rate': 4.838709677419355e-06, 'epoch': 3.62}


 94%|█████████▍| 15000/16000 [6:29:08<22:53,  1.37s/it]

{'loss': 0.1913, 'grad_norm': 32.650081634521484, 'learning_rate': 3.225806451612903e-06, 'epoch': 3.75}


 97%|█████████▋| 15500/16000 [6:40:36<11:30,  1.38s/it]

{'loss': 0.2127, 'grad_norm': 3.3616526126861572, 'learning_rate': 1.6129032258064516e-06, 'epoch': 3.88}


100%|██████████| 16000/16000 [6:52:05<00:00,  1.38s/it]

{'loss': 0.2071, 'grad_norm': 0.17462489008903503, 'learning_rate': 0.0, 'epoch': 4.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
                                                       
100%|██████████| 16000/16000 [6:57:42<00:00,  1.57s/it]


{'eval_loss': 0.4724868834018707, 'eval_accuracy': 0.891125, 'eval_f1': 0.8905091137649277, 'eval_precision': 0.8886101354741596, 'eval_recall': 0.892416225749559, 'eval_runtime': 336.0427, 'eval_samples_per_second': 23.806, 'eval_steps_per_second': 1.488, 'epoch': 4.0}
{'train_runtime': 25062.8416, 'train_samples_per_second': 5.107, 'train_steps_per_second': 0.638, 'train_loss': 0.34635793495178224, 'epoch': 4.0}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 500/500 [05:32<00:00,  1.50it/s]


{'eval_loss': 0.4724868834018707,
 'eval_accuracy': 0.891125,
 'eval_f1': 0.8905091137649277,
 'eval_precision': 0.8886101354741596,
 'eval_recall': 0.892416225749559,
 'eval_runtime': 333.2138,
 'eval_samples_per_second': 24.009,
 'eval_steps_per_second': 1.501,
 'epoch': 4.0}

In [10]:
# Save the trained model and the tokenizer into the same directory.
model_path = "C:/Users/DELL/Downloads/modelmovie"  # hard-coded absolute local path; adjust per machine
trainer.model.save_pretrained(model_path)
# Last expression of the cell: its return value is shown as the cell output.
tokenizer.save_pretrained(model_path)


('C:/Users/DELL/Downloads/modelmovie\\tokenizer_config.json',
 'C:/Users/DELL/Downloads/modelmovie\\special_tokens_map.json',
 'C:/Users/DELL/Downloads/modelmovie\\vocab.json',
 'C:/Users/DELL/Downloads/modelmovie\\merges.txt',
 'C:/Users/DELL/Downloads/modelmovie\\added_tokens.json')