In [26]:
!pip install datasets==2.1.0
!wandb disabled
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [24]:
def compute_metrics(pred):
    """Turn a prediction object into a dict of classification scores.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            argmax over the last axis of ``predictions``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        F1, precision and recall are computed with ``average="weighted"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted = {"average": "weighted"}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, **weighted),
        "precision": precision_score(y_true, y_pred, **weighted),
        "recall": recall_score(y_true, y_pred, **weighted),
    }

In [23]:
def build_train_and_test(model_complexity, num_labels, seq_max_length, epochs, train_ds, val_ds, test_ds, batch_size=64):
    """Fine-tune a BERT sequence classifier and score it on the test split.

    The three splits must be ``datasets.Dataset`` objects that provide a
    ``text`` column and a ``label`` column.

    Args:
        model_complexity: ``"small"``, ``"normal"`` or ``"large"``; selects
            which pretrained checkpoint is fine-tuned.
        num_labels: Number of target classes passed to the classification model.
        seq_max_length: ``max_length`` handed to the tokenizer; longer inputs
            are truncated.
        epochs: Number of training epochs.
        train_ds: Training split.
        val_ds: Validation split, used as the trainer's evaluation dataset
            (evaluation strategy is ``"epoch"``).
        test_ds: Split scored once with ``trainer.predict`` after training.
        batch_size: Per-device batch size for both training and evaluation.

    Returns:
        Tuple ``(trainer, metrics)`` where ``metrics`` is the ``metrics``
        attribute of the ``trainer.predict(test split)`` output.

    Raises:
        ValueError: If ``model_complexity`` is not one of the supported names.
    """
    checkpoints = {
        "small": "prajjwal1/bert-tiny",
        "normal": "bert-base-uncased",
        "large": "bert-large-uncased",
    }
    if model_complexity not in checkpoints:
        # Fail loudly: returning None here would only surface later as an
        # unrelated unpacking error at the call site.
        raise ValueError(
            f"Unknown model_complexity {model_complexity!r}; expected one of {sorted(checkpoints)}"
        )
    bert_model_version = checkpoints[model_complexity]

    tokenizer = AutoTokenizer.from_pretrained(bert_model_version)

    def tokenize(batch):
        return tokenizer(batch["text"], padding=True, truncation=True, max_length=seq_max_length)

    def encode(split):
        # batch_size=None hands the whole split to the tokenizer in one call,
        # so padding=True pads every row of that split to a common length.
        encoded = split.map(tokenize, batched=True, batch_size=None)
        encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        return encoded

    train_ds_encoded = encode(train_ds)
    val_ds_encoded = encode(val_ds)
    test_ds_encoded = encode(test_ds)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        bert_model_version, num_labels=num_labels
    ).to(device)

    logging_steps = 1
    model_name = "The Warsaw Institute of Technology - EITI - NLP Project"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=epochs,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="error",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_ds_encoded,
        eval_dataset=val_ds_encoded,
        tokenizer=tokenizer,
    )
    trainer.train()
    preds_output = trainer.predict(test_ds_encoded)
    return trainer, preds_output.metrics

In [22]:
#emotions
def load_emotions_dataset():
    """Load the emotions train/val/test text files as labelled datasets.

    Every line of the input files is expected to look like
    ``<sentence>;<emotion>``. The sentence is stored in ``text`` and the
    emotion is converted to an integer id stored in ``label``.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)``.

    Raises:
        ValueError: If a line contains no ``;`` separator.
        KeyError: If the emotion name is not one of the six known emotions.
    """
    emotion_dict = {'surprise':0, 'love':1 , 'joy':2 , 'fear': 3, 'sadness': 4, 'anger':5}

    def split_data(data):
        # Split on the LAST ';' only, so a sentence that itself contains a
        # semicolon stays intact instead of being cut at its first ';'.
        sentence, emotion = data['text'].rsplit(';', 1)
        data['text'] = sentence
        data['label'] = emotion_dict[emotion.strip()]
        return data

    def load_split(file_name):
        # The "text" loader yields one record per line under the 'train' key.
        raw = load_dataset("text", data_files="../input/emotions-dataset-for-nlp/" + file_name)['train']
        return raw.map(split_data)

    train_ds = load_split("train.txt")
    val_ds = load_split("val.txt")
    test_ds = load_split("test.txt")
    return train_ds, val_ds, test_ds

In [None]:
#sms
def load_sms_dataset():
    """Load the SMS spam CSV and return 60/20/20 train/val/test datasets.

    Only the CSV columns ``v1`` (class name) and ``v2`` (message) are kept;
    they are renamed to ``label`` and ``text``. ``spam`` is encoded as 0 and
    ``ham`` as 1; any other class name becomes NaN.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)`` of ``datasets.Dataset`` objects
        with the columns ``label`` and ``text``.
    """
    label_ids = {'spam': 0, 'ham': 1}
    raw = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding='ISO-8859-1')
    # rename() returns a new frame, so the label assignment below never
    # writes into a slice of `raw` (no chained-assignment warning).
    df = raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})
    df['label'] = df['label'].map(label_ids)

    # 60% train; the remaining 40% is halved into validation and test.
    train_df, holdout_df = train_test_split(df, test_size=.4, random_state=303)
    val_df, test_df = train_test_split(holdout_df, test_size=.5, random_state=303)

    def to_dataset(frame):
        # preserve_index=False keeps the shuffled pandas index out of the
        # dataset, so no "__index_level_0__" column has to be removed later.
        return Dataset.from_pandas(frame, preserve_index=False)

    return to_dataset(train_df), to_dataset(val_df), to_dataset(test_df)

In [21]:
#bbc
def load_bbc_dataset():
    """Load the BBC news CSV and return disjoint 60/20/20 train/val/test datasets.

    The two CSV columns are renamed positionally to ``label`` and ``text``
    (this assumes the file stores the category first and the article second —
    TODO confirm against the CSV header). Category names are encoded as
    integer ids; unknown categories become NaN.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)`` of ``datasets.Dataset`` objects
        with the columns ``text`` and ``label``.
    """
    category_ids = {'sport':0,'business':1,'tech':2,'entertainment':3,'politics':4}

    raw = load_dataset('csv', data_files='../input/newsgroup20bbcnews/bbc-text.csv')['train']
    raw.set_format(type='pandas')
    df = raw[:]
    df.columns = ['label', 'text']
    df = df.reindex(columns=['text', 'label'])
    df['label'] = df['label'].map(category_ids)

    train_df, holdout_df = train_test_split(df, test_size=0.4, random_state=303)
    # Split the 40% hold-out, NOT the full frame: splitting `df` again would
    # put training rows into the validation and test sets (data leakage).
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=303)

    def to_dataset(frame):
        # preserve_index=False keeps the shuffled pandas index out of the
        # dataset, so no "__index_level_0__" column has to be removed later.
        return Dataset.from_pandas(frame, preserve_index=False)

    return to_dataset(train_df), to_dataset(val_df), to_dataset(test_df)

In [20]:
#imdb
def load_imdb_dataset():
    """Load IMDB and carve validation and test sets out of its ``test`` split.

    The ``train`` split is returned unchanged. The ``test`` split is converted
    to pandas and halved (``random_state=303``) into a validation and a test
    dataset.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)``.
    """
    # Load once and pick both splits from the same result instead of calling
    # load_dataset("imdb") a second time.
    imdb = load_dataset("imdb")
    train_ds = imdb['train']
    official_test = imdb['test']

    official_test.set_format(type="pandas")
    df = official_test[:]
    val_df, test_df = train_test_split(df, test_size=0.5, random_state=303)

    def to_dataset(frame):
        # Positional rename: assumes the frame has exactly two columns, the
        # review text first and the label second — TODO confirm.
        frame.columns = ['text', 'label']
        # preserve_index=False keeps the shuffled pandas index out of the
        # dataset, so no "__index_level_0__" column has to be removed later.
        return Dataset.from_pandas(frame, preserve_index=False)

    return train_ds, to_dataset(val_df), to_dataset(test_df)

In [19]:
#disaster tweets 
def load_tweets_dataset():
    """Load the disaster-tweets training CSV and split it 60/20/20.

    Only ``train.csv`` is read: all three returned splits are carved out of
    it. The ``location``, ``keyword`` and ``id`` columns are dropped and
    ``target`` is renamed to ``label``.

    Returns:
        Tuple ``(train_ds, val_ds, test_ds)`` of ``datasets.Dataset`` objects
        with the columns ``text`` and ``label``.
    """
    labelled = load_dataset('csv', data_files='/kaggle/input/nlpgettingstarted/train.csv')['train']
    for column in ("location", "keyword", "id"):
        labelled = labelled.remove_columns(column)
    labelled = labelled.rename_column("target", "label")

    labelled.set_format(type="pandas")
    df = labelled[:]

    # 60% train; the remaining 40% is halved into validation and test.
    train_df, holdout_df = train_test_split(df, test_size=0.4, random_state=303)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=303)

    def to_dataset(frame):
        # Positional rename: assumes the two remaining columns are the tweet
        # text followed by the label — TODO confirm against the CSV header.
        frame.columns = ['text', 'label']
        # preserve_index=False keeps the shuffled pandas index out of the
        # dataset, so no "__index_level_0__" column has to be removed later.
        return Dataset.from_pandas(frame, preserve_index=False)

    return to_dataset(train_df), to_dataset(val_df), to_dataset(test_df)

In [27]:
def load_under_study_dataset(ds_name):
    """Return the ``(train_ds, val_ds, test_ds)`` splits of the named dataset.

    Args:
        ds_name: One of ``"emotions"``, ``"sms"``, ``"bbc"``, ``"tweets"``
            or ``"imdb"``.

    Returns:
        Whatever the matching ``load_*_dataset`` function returns.

    Raises:
        ValueError: If ``ds_name`` is not a known dataset name.
    """
    # An if-chain (rather than a dict of functions) is deliberate: only the
    # selected loader's name is looked up, so a loader whose cell has not been
    # executed does not break the others.
    if ds_name == "emotions":
        return load_emotions_dataset()
    if ds_name == "sms":
        return load_sms_dataset()
    if ds_name == "bbc":
        return load_bbc_dataset()
    if ds_name == "tweets":
        return load_tweets_dataset()
    if ds_name == 'imdb':
        return load_imdb_dataset()
    # Fail loudly: falling through would return None and only surface later
    # as an unrelated unpacking error at the call site.
    raise ValueError(
        f"Unknown dataset {ds_name!r}; expected one of "
        "'emotions', 'sms', 'bbc', 'tweets', 'imdb'"
    )

In [28]:
# Experiment: "small" model preset on the emotions dataset, 20 epochs.
dataset_name = "emotions"
bert_version = "small"
epochs = 20
seq_max_length = 50

train_ds, val_ds, test_ds = load_under_study_dataset(dataset_name)
# The number of classes is the number of distinct labels in the training split.
num_labels = len(set(train_ds['label']))

trainer, metrics = build_train_and_test(
    model_complexity=bert_version,
    num_labels=num_labels,
    seq_max_length=seq_max_length,
    epochs=epochs,
    train_ds=train_ds,
    val_ds=val_ds,
    test_ds=test_ds,
)
print(metrics)

In [29]:
# Experiment: "large" model preset on the emotions dataset, 2 epochs.
dataset_name = "emotions"
bert_version = "large"
epochs = 2
seq_max_length = 50

train_ds, val_ds, test_ds = load_under_study_dataset(dataset_name)
# The number of classes is the number of distinct labels in the training split.
num_labels = len(set(train_ds['label']))

trainer, metrics = build_train_and_test(
    model_complexity=bert_version,
    num_labels=num_labels,
    seq_max_length=seq_max_length,
    epochs=epochs,
    train_ds=train_ds,
    val_ds=val_ds,
    test_ds=test_ds,
)
print(metrics)