pip3 install pandas
pip3 install torch
pip3 install scikit-learn
pip3 install transformers

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import XLNetForSequenceClassification, XLNetTokenizer, Trainer, TrainingArguments

In [3]:
filepath = 'data.csv'
df = pd.read_csv(filepath)
df['summary'] = df['summary'].str.lower()

In [4]:
genre_map = {genre: i for i, genre in enumerate(df['genre'].unique())}

In [5]:
df['genre'] = df['genre'].map(genre_map)

In [6]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['genre'])

pip3 install sentencepiece

In [7]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=len(df['genre'].unique()))

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [8]:
train_encodings = tokenizer(train_df['summary'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['summary'].tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
class GenreDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = GenreDataset(train_encodings, train_df['genre'].tolist())
val_dataset = GenreDataset(val_encodings, val_df['genre'].tolist())

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()



  0%|          | 0/11175 [00:00<?, ?it/s]

: 

: 