In [None]:
!pip install transformers==4.28.0 datasets evaluate ftfy > null

In [None]:
import re
import ftfy
import evaluate
import time
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report

In [None]:
# --- Paths -------------------------------------------------------------------
DIR_PATH = '/content/'
MODEL_SAVE_PATH = f"{DIR_PATH}sentiment_model"   # passed to TrainingArguments(output_dir=...)
TRAIN_DATA_PATH = f"{DIR_PATH}train.csv"

# --- Model / tokenization ----------------------------------------------------
MODEL_NAME = "bhadresh-savani/distilbert-base-uncased-emotion"
MAX_LENGTH = 48      # max_length handed to the tokenizer when encoding texts
BATCH_SIZE = 334     # per-device batch size for both training and evaluation

# --- Hardware ----------------------------------------------------------------
DEVICE = 'cuda:0'    # NOTE(review): not referenced by any cell visible here

In [None]:
import pandas as pd

# Load the training data from the configured TRAIN_DATA_PATH (instead of a
# second hard-coded copy of the same path), keep only the two columns used
# below, drop rows with missing values, and rename the columns to the
# `text` / `label` names the rest of the notebook works with.
df = (
    pd.read_csv(TRAIN_DATA_PATH)[['Text', 'Sentiment']]
    .dropna()
    .rename(columns={'Text': 'text', 'Sentiment': 'label'})
)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Sanity check: number of missing values per column (rows with NaN were
# already removed with dropna() when `df` was built).
df.isna().sum()

In [None]:
# Row position where the validation part starts: the first 90% of rows are
# used for training, the remainder for validation.
valid_split = int(0.9 * len(df))

In [None]:
# Ordered 90/10 split (no shuffling): rows before `valid_split` train, the
# rest validate. preserve_index=False keeps the pandas index -- which is no
# longer a plain range once dropna() has removed rows -- from being carried
# into the datasets as an extra column.
ds = DatasetDict({
    "train": Dataset.from_pandas(df[:valid_split], preserve_index=False),
    "valid": Dataset.from_pandas(df[valid_split:], preserve_index=False),
})

# Build the label vocabulary from BOTH splits: a label that happens to occur
# only in the validation rows would otherwise raise KeyError when the
# preprocessing step maps it through label2id.
unique_sorted_labels = sorted(set(ds['train']['label']) | set(ds['valid']['label']))
id2label = dict(enumerate(unique_sorted_labels))        # class index -> label
label2id = {tag: i for i, tag in id2label.items()}      # label -> class index
print(f"{len(id2label)} label")
print(id2label)


# NOTE(review): both functions below are defined again further down the
# notebook (the later preprocess_function also tokenizes). The later
# definitions are the ones in effect when ds.map(...) runs.

def preprocess_text(text, translate=True):
    """
    Clean a single text: fix encodings, then lowercase and strip whitespace.

    ftfy runs on the raw text *before* lowercasing. Mojibake sequences are
    case-sensitive, so lowercasing first can turn a repairable sequence into
    one ftfy no longer recognises.

    Args:
        text: the raw input string.
        translate: unused; kept so existing callers keep working.

    Returns:
        The cleaned, lowercased, stripped string.
    """
    return ftfy.fix_text(text).lower().strip()

def preprocess_function(batch):
    """
    Clean the texts of a batch and convert its labels to class ids.

    Args:
        batch: mapping with a "text" list of strings and a "label" list of
            label values that are keys of `label2id`.

    Returns:
        The same `batch` object, with "text" replaced by the cleaned texts
        and "label" replaced by the integer ids from `label2id`.
    """
    # preprocess_text already lowercases, so no extra .lower() here (it would
    # also defeat the fix-encoding-first ordering).
    batch['text'] = [preprocess_text(text) for text in batch['text']]
    batch["label"] = [label2id[label] for label in batch["label"]]
    return batch


In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by
# MODEL_NAME, so texts are encoded the way that checkpoint expects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Preprocess and vectorize texts

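From the GPT paper (Radford et al., 2018, *Improving Language Understanding by Generative Pre-Training*):

> We use the `ftfy` library to clean the raw text in BooksCorpus, standardize some punctuation and whitespace, and use the `spaCy` tokenizer.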

In [None]:
def preprocess_text(text, translate=True):
    """
    Clean a single text: fix encodings, then lowercase and strip whitespace.

    ftfy runs on the raw text *before* lowercasing. Mojibake sequences are
    case-sensitive, so lowercasing first can turn a repairable sequence into
    one ftfy no longer recognises.

    Args:
        text: the raw input string.
        translate: unused; kept so existing callers keep working.

    Returns:
        The cleaned, lowercased, stripped string.
    """
    return ftfy.fix_text(text).lower().strip()

def preprocess_function(batch):
    """
    Clean and tokenize a batch of texts and convert its labels to class ids.

    Args:
        batch: mapping with a "text" list of strings and a "label" list of
            label values that are keys of `label2id`.

    Returns:
        The tokenizer output for the cleaned texts (truncated to MAX_LENGTH
        tokens) with an added "label" entry holding the integer class ids.
        As before, `batch["text"]` is also overwritten with the cleaned texts.
    """
    # preprocess_text already lowercases, so no extra .lower() here (it would
    # also defeat the fix-encoding-first ordering).
    batch['text'] = [preprocess_text(text) for text in batch['text']]
    # No padding at this stage: the DataCollatorWithPadding handed to the
    # Trainer pads each training batch to its own longest sequence, so padding
    # here as well would only store extra pad tokens.
    tokenizer_batch = tokenizer(batch["text"], truncation=True,
                                max_length=MAX_LENGTH)
    tokenizer_batch["label"] = [label2id[label] for label in batch["label"]]
    return tokenizer_batch


In [None]:
# Apply preprocess_function to both splits. batched=True means the function
# receives whole batches (lists of texts/labels) rather than single rows.
tokenized_ds = ds.map(preprocess_function, batched=True)

In [None]:
# Display the processed splits to inspect their columns and sizes.
tokenized_ds

Collate data with padding

In [None]:
# Collator passed to the Trainer below; it uses the tokenizer to pad the
# examples of a batch so they can be stacked together.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

evaluation

In [None]:
# Accuracy metric from the `evaluate` library; used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the `accuracy` metric.

    Args:
        eval_pred: pair of (model outputs, reference labels); the model
            outputs hold one score per class for each example.

    Returns:
        Whatever `accuracy.compute` returns for the predicted class ids.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)


In [None]:
# Display the class-index -> label mapping that is handed to the model below.
id2label

define a model

In [None]:
# Start from the pretrained checkpoint and configure the classification head
# for this notebook's labels. ignore_mismatched_sizes=True is passed so the
# load does not fail if the checkpoint's head has a different number of
# classes than len(id2label).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Report how many parameters will be updated during fine-tuning.
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(trainable_param_count)

# Alternative kept for reference: load a previously saved checkpoint instead.
#model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/due_model_14/checkpoint-7125")

define a trainer for a model

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_PATH,
    # -- optimisation --
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # -- reporting / checkpointing --
    logging_steps=100,
    evaluation_strategy="epoch",   # evaluate on the validation split every epoch
    save_strategy="no",            # checkpoint saving is switched off
    #load_best_model_at_end=True,
)

# Wire the model, data, tokenizer, collator and metric together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["valid"],
    compute_metrics=compute_metrics,
)

train the model

In [None]:
# Run fine-tuning and measure how long it takes.
start_time = time.time()
trainer.train()
end_time = time.time()
# Wall-clock duration of trainer.train() in seconds.
time_ellapsed = end_time - start_time
print(time_ellapsed)

In [None]:
# Persist the fine-tuned model and its tokenizer. Both paths are relative, so
# the "sentiment_model" and "tokenizer" directories are created under the
# current working directory (note: this is not MODEL_SAVE_PATH).
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("tokenizer")

In [None]:
# `pipeline` was never imported anywhere in the notebook, so this cell raised
# NameError; import it here next to its only use.
from transformers import pipeline

# Build the inference pipeline from the artifacts written by the save cell
# above. That cell saves to the relative "sentiment_model" and "tokenizer"
# directories, so the same locations are loaded here (the previous
# /kaggle/output/... paths did not match where the files were saved).
pipe = pipeline("text-classification",
                model="sentiment_model",
                tokenizer="tokenizer")

import pandas as pd

# Load the test set to predict on.
# NOTE: this rebinds `df`, which until now held the training data.
df = pd.read_csv("/kaggle/input/unit-3-nlp-txt-classification/test.csv")

In [None]:
from tqdm import tqdm

# Score every test row and collect (id, predicted label) pairs.
#  * The text goes through preprocess_text(), the same cleaning that was
#    applied to the training texts.
#  * truncation / max_length mirror the training-time tokenization, so an
#    over-long text is cut instead of overflowing the model's input size.
#  * Missing texts are scored as empty strings so every id gets a prediction.
#  * total= lets tqdm show real progress (a bare generator has no length).
preds = []
for row_id, text in tqdm(zip(df['id'], df['Text'].fillna('')), total=len(df)):
    prediction = pipe(preprocess_text(text), truncation=True, max_length=MAX_LENGTH)
    # For a single string the pipeline returns a one-element list of
    # {"label": ..., "score": ...} dicts.
    preds.append((row_id, prediction[0]['label']))

In [None]:
# Turn the (id, label) pairs into the two columns of the submission file.
submit_dict = {
    'id': [row_id for row_id, _ in preds],
    'Sentiment': [label for _, label in preds],
}

In [None]:
# Write the predictions to submission.csv in the current working directory;
# index=False keeps the DataFrame's row index out of the file.
pd.DataFrame(submit_dict).to_csv("submission.csv", index=False)