In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw review table and show its column names as the cell output.
REVIEW_CSV = '/home/yamanishi/project/trip_recommend/data/jalan/review/review_all_period_.csv'
df_review = pd.read_csv(filepath_or_buffer=REVIEW_CSV)
df_review.columns

Index(['Unnamed: 0', 'spot', 'pref', 'title', 'review', 'rating', 'tag', 'sex',
       'age', 'name', 'url', 'visit_time'],
      dtype='object')

In [3]:
# Build the label vocabulary from the distinct ratings. Sorting (rather than
# relying on set iteration order) keeps the rating <-> id mapping stable across
# runs; rows whose rating is missing are not part of the vocabulary and get a
# NaN label from `.map`.
categories = sorted(df_review['rating'].dropna().unique().tolist())
id2cat = dict(enumerate(categories))
cat2id = {category: idx for idx, category in id2cat.items()}
print(id2cat)
print(cat2id)
df_review['label'] = df_review['rating'].map(cat2id)

# Shuffle the rows just in case (fixed seed so the order is reproducible),
# then reduce the dataset to the review text and its category-id column.
review_data = (
    df_review
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
    .rename(columns={'review': 'text'})
    [['text', 'label']]
)

{0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0}
{1.0: 0, 2.0: 1, 3.0: 2, 4.0: 3, 5.0: 4}


In [4]:
# 80/10/10 split: take 80% for training, then cut the remaining hold-out in half
# for validation and test. `random_state=0` mirrors the `seed=0` used for the
# Hugging Face split further down, so re-running the cell yields the same rows.
train_df, holdout_df = train_test_split(review_data, train_size=0.8, random_state=0)
valid_df, test_df = train_test_split(holdout_df, train_size=0.5, random_state=0)
print(f'train size:{len(train_df)}, valid_size: {len(valid_df)}, test size: {len(test_df)}')

train size:2852767, valid_size: 356596, test size: 356596


In [5]:
# Persist each split to its own CSV file. `to_csv` is called with its defaults,
# so the DataFrame index is written as well.
split_destinations = (
    (train_df, '/home/yamanishi/project/trip_recommend/data/review_classification/train.csv'),
    (valid_df, '/home/yamanishi/project/trip_recommend/data/review_classification/valid.csv'),
    (test_df, '/home/yamanishi/project/trip_recommend/data/review_classification/test.csv'),
)
for split_frame, csv_path in split_destinations:
    split_frame.to_csv(csv_path)

In [6]:
from datasets import Dataset

# Wrap the (text, label) frame in a Hugging Face Dataset and hold out 20% of it
# as the "test" split; the seed makes the split repeatable.
dataset_packed = Dataset.from_pandas(review_data)
split_options = {'test_size': 0.2, 'seed': 0}
dataset_split = dataset_packed.train_test_split(**split_options)
print(dataset_split)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2852767
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 713192
    })
})


In [7]:
from transformers import AutoTokenizer

# Tokenizer belonging to the Japanese BERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: a batch from the dataset; only its "text" entry is read.

    Returns:
        Whatever the tokenizer returns for those texts, with each sequence
        truncated to at most 512 tokens. No padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_dataset = dataset_split.map(preprocess_function, batched=True)

 56%|█████▌    | 1591/2853 [07:46<06:23,  3.29ba/s]

In [None]:
# Serialize the tokenized splits to disk so they can be reloaded later.
tokenized_pickle_path = '/home/yamanishi/project/trip_recommend/data/review_classification/tokenized_text.pkl'
with open(tokenized_pickle_path, mode='wb') as pickle_file:
    pickle.dump(tokenized_dataset, pickle_file)

In [None]:
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Collator built from the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pretrained Japanese BERT with a classification head sized for 5 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-v2",
    num_labels=5,
)

Downloading: 100%|██████████| 447M/447M [00:08<00:00, 51.4MB/s] 
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassif

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: object exposing `label_ids` (gold labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        dict with 'accuracy' and the weighted-average 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [None]:
# `os` is not imported anywhere else in the notebook; import it here so the cell
# runs on a fresh kernel instead of raising NameError.
import os

# Expose only GPU 0 to this process. NOTE(review): this has to take effect
# before CUDA is first initialised in the kernel - confirm no earlier cell
# touches the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Evaluate, log and checkpoint once per epoch; keep only the latest checkpoint.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    no_cuda=False,  # False when a GPU should be used
)

# NOTE(review): the per-epoch evaluation runs on the 'test' split, which is also
# the split used for the final prediction cells - there is no separate
# validation split in `tokenized_dataset`.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


TypeError: Trainer.__init__() got an unexpected keyword argument 'label'

In [None]:
# Persist the outcome of training: first the trainer's own state, then the
# fine-tuned model. Neither call is given a path, so the destination is
# whatever the trainer was configured with (presumably `output_dir="./results"`
# from the TrainingArguments cell - confirm against the transformers docs).
trainer.save_state()
trainer.save_model()

In [None]:
# Run inference on the held-out split; the listed output keys are excluded from
# the gathered predictions.
pred_result = trainer.predict(
    tokenized_dataset['test'],
    ignore_keys=['loss', 'last_hidden_state', 'hidden_states'],
)
# Predicted class id per example = arg-max over the class axis.
pred_label = pred_result.predictions.argmax(axis=1).tolist()

# Show a short preview only: printing the full list would dump one integer per
# test example into the notebook output.
PREVIEW_SIZE = 20
print(f'{len(pred_label)} predictions; first {PREVIEW_SIZE}: {pred_label[:PREVIEW_SIZE]}')

In [None]:
# The result of `trainer.predict` only carries predictions, label_ids and
# metrics - it has no `attention` attribute - so the attention weights are
# obtained from a direct forward pass with `output_attentions=True`.
# Only a few examples are used to keep the returned per-layer tensors small.
ATTENTION_SAMPLE_SIZE = 8
test_split = tokenized_dataset['test']
attention_sample = test_split.select(range(min(ATTENTION_SAMPLE_SIZE, len(test_split))))

# Keep just the tokenizer-produced model inputs, pad them into one batch with
# the same collator the Trainer uses, and move the batch to the model's device.
attention_features = [
    {name: example[name] for name in tokenizer.model_input_names if name in example}
    for example in attention_sample
]
attention_batch = {
    name: tensor.to(model.device)
    for name, tensor in data_collator(attention_features).items()
}

model.eval()
with torch.no_grad():
    attention_outputs = model(**attention_batch, output_attentions=True)

# One attention tensor per layer; print a summary instead of the raw values.
pred_attention = attention_outputs.attentions
print(f'{len(pred_attention)} layers, each of shape {tuple(pred_attention[0].shape)}')

In [None]:
from sklearn.metrics import classification_report
# Gold labels live in the 'label' column (the dataset has no 'review_id'
# column). `target_names` must be strings, and `categories` holds the numeric
# ratings, so they are converted; `labels` pins the id order so that name i
# always describes label id i even if a class is absent from the predictions.
label_ids = list(range(len(categories)))
print(classification_report(
    tokenized_dataset['test']['label'],
    pred_label,
    labels=label_ids,
    target_names=[str(category) for category in categories],
))