In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

In [2]:
test_df = pd.read_csv('../datasets/test_essays.csv')
submission_df = pd.read_csv('../datasets/sample_submission.csv')
train_df = pd.read_csv("../datasets/train_v2_drcat_02.csv")
kf_df = pd.read_csv('../datasets/kf_df.csv')

In [3]:
kf_df = kf_df.rename(columns={'prompt_title': 'prompt_name'})
kf_df['label'] = 1
kf_df['source'] = 'kf'
kf_df['RDizzl3_seven'] = False

In [4]:
train_df = pd.concat([train_df, kf_df[train_df.columns].sample(30000, random_state=42)])

In [5]:
train_df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
...,...,...,...,...,...
29644,The article discusses the concept of domestica...,1,Are Humans More Like Wolves or Dogs?,kf,False
42301,Background noise can make it difficult to hear...,1,I Can’t Hear Myself Think! How the Brain Deals...,kf,False
46584,"Long ago, there were many different species of...",1,What Would the Child of a Human and a Neandert...,kf,False
52305,"Chemotherapy drugs are used to treat cancer, b...",1,Getting to the Bottom of Cancer Treatment Pain,kf,False


# Step 1. Text Preprocessing

In [6]:
train_df["words_count"] = train_df["text"].apply(lambda x: len(x.split(" ")))

In [7]:
train_df

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,words_count
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,378
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,432
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,179
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,221
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,334
...,...,...,...,...,...,...
29644,The article discusses the concept of domestica...,1,Are Humans More Like Wolves or Dogs?,kf,False,194
42301,Background noise can make it difficult to hear...,1,I Can’t Hear Myself Think! How the Brain Deals...,kf,False,199
46584,"Long ago, there were many different species of...",1,What Would the Child of a Human and a Neandert...,kf,False,123
52305,"Chemotherapy drugs are used to treat cancer, b...",1,Getting to the Bottom of Cancer Treatment Pain,kf,False,241


In [8]:
train_df.query("label == 0")["words_count"].mean()

424.90153812429213

In [9]:
train_df.query("label == 1")["words_count"].mean()

228.13765079899784

In [10]:
train_df["generated"] = train_df["label"].apply(lambda x: 1.0 if x == 1 else 0.0)
train_df["human"] = train_df["label"].apply(lambda x: 1.0 if x == 0 else 0.0)

# Step 2. Modeling

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size=0.30, random_state=42, shuffle=True, stratify=train_df["label"])

In [12]:
train.groupby("label").count()

Unnamed: 0_level_0,text,prompt_name,source,RDizzl3_seven,words_count,generated,human
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,19159,19159,19159,19159,19159,19159,19159
1,33248,33248,33248,33248,33248,33248,33248


In [13]:
test.groupby("label").count()

Unnamed: 0_level_0,text,prompt_name,source,RDizzl3_seven,words_count,generated,human
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,8212,8212,8212,8212,8212,8212,8212
1,14249,14249,14249,14249,14249,14249,14249


In [14]:
train.to_csv("train.csv")
test.to_csv("test.csv")

# Model Training

In [None]:
classification_trainer = ClassificationTrainer(
    pretrained_transformer_name='cointegrated/rubert-tiny2',
    dataset_dct={'train':'train.csv', 'test': 'test.csv'},
    warmup_steps=100,
    num_train_epochs=3
)

classification_trainer.trainer.train()

Gasoline stored in the fuel tank of a vehicle can escape from the vehicle and pollute the environment, even when the vehicle is not running. This occurs because gasoline is volatile and can change from liquid to gas, which can pass into the air. Evaporated gasoline escaping from fuel tanks is a significant source of environmental pollution with volatile organic compounds (VOCs), which can harm the environment and human health. To prevent this leakage of gasoline, modern vehicles are equipped with a canister packed with particles of activated carbon, which captures the gasoline molecules in a maze of carbon molecules. Activated carbon is a charcoal material widely used for the purification of drinking water and natural gas. The adsorption of evaporated gasoline on activated carbon can be compared to the Labyrinth of the Minotaur. The labyrinth passages must be cleaned so that they can adsorb new VOCs the next day. The vehicle’s engine acts as the Minotaur, by feeding on the VOCs. The ne

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ff

Epoch,Training Loss,Validation Loss


In [None]:
metrics = classification_trainer.trainer.evaluate()

classification_trainer.trainer.log_metrics("after_train_eval", metrics)
classification_trainer.trainer.save_metrics("after_train_eval",
                                            metrics)



In [15]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from transformers import EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

LABELS = ['generated', 'human']
id2label = {idx:label for idx, label in enumerate(LABELS)}
label2id = {label:idx for idx, label in enumerate(LABELS)}

def read_csv_binary(filename):
    data = pd.read_csv(filename)
    texts = data['text'].tolist()
    labels = data[LABELS].values


    return texts, labels

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}
    return metrics

class LLMDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }

        item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return len(self.labels)


class ClassificationTrainer():
    def __init__(self,
                 pretrained_transformer_name='distilbert-base-cased',
                 dataset_dct={'train':'train.csv', 'test':'test.csv', 'val':'val.csv'},
                 warmup_steps=500,
                 num_train_epochs=3):


        max_samples = {
            'train': 100000,
            'val': 100000,
            'test': 100000,
        }

        train_texts, train_labels = read_csv_binary(dataset_dct['train'])

        if 'test' not in dataset_dct:
            train_texts, test_texts, train_labels, test_labels = train_test_split(
                train_texts, train_labels, test_size=.1)
        else:
            test_texts, test_labels = read_csv_binary(dataset_dct['test'])

        if 'val' not in dataset_dct:
            train_texts, val_texts, train_labels, val_labels = train_test_split(
                train_texts, train_labels, test_size=.1)
        else:
            val_texts, val_labels = read_csv_binary(dataset_dct['val'])

        train_texts = train_texts[:max_samples['train']]
        val_texts = val_texts[:max_samples['val']]
        test_texts = test_texts[:max_samples['test']]

        train_labels = train_labels[:max_samples['train']]
        val_labels = val_labels[:max_samples['val']]
        test_labels = test_labels[:max_samples['test']]

        print(train_texts[0])
        print(train_labels[0])

        self.tokenizer = BertTokenizerFast.from_pretrained(
                pretrained_transformer_name)

#посмотреть по парамкетрам
        train_encodings = self.tokenizer(train_texts, truncation=True, max_length=256, padding=True)
        val_encodings = self.tokenizer(val_texts, truncation=True, max_length=256, padding=True)
        test_encodings = self.tokenizer(test_texts, truncation=True, max_length=256, padding=True)

        self.train_dataset = LLMDDataset(train_encodings, train_labels)
        self.val_dataset = LLMDDataset(val_encodings, val_labels)
        self.test_dataset = LLMDDataset(test_encodings, test_labels)

        self.model = DistilBertForSequenceClassification.from_pretrained(
                pretrained_transformer_name, num_labels=len(LABELS), problem_type="multi_label_classification",  id2label=id2label, label2id=label2id)

        self.metric = {metric:load_metric(metric) for metric in ['f1', 'precision', 'recall', 'accuracy']}

        self.training_args = TrainingArguments(
            output_dir='./results',  # output directory
            num_train_epochs=num_train_epochs, # total number of training epochs
            per_device_train_batch_size=
            8,  # batch size per device during training
            per_device_eval_batch_size=8,  # batch size for evaluation
            warmup_steps=
            warmup_steps,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_strategy='epoch',
            evaluation_strategy='epoch',
            save_strategy='epoch',
            save_total_limit = 3,
        )

        self.trainer = Trainer(
            model=self.
            model,  # the instantiated 🤗 Transformers model to be trained
            args=self.training_args,  # training arguments, defined above
            train_dataset=self.train_dataset,  # training dataset
            eval_dataset=self.val_dataset,  # evaluation dataset
            compute_metrics=self.compute_metrics,
        )


    def compute_metrics(self, p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                tuple) else p.predictions
        result = multi_label_metrics(
            predictions=preds,
            labels=p.label_ids)
        return result


    def inference(self, predict_dataset=None):
        if predict_dataset is None:
            predict_dataset = self.test_dataset
        predictions = self.trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
        predictions = np.argmax(predictions, axis=1)

        return predictions

In [None]:
text = 'Всем привет!'



encoding = classification_trainer.tokenizer(text, return_tensorse='pt')
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)


logits = outputs.logits

In [None]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

In [None]:
def inference(trainer, predict_dataset=None):
    if predict_dataset is None:
        predict_dataset = trainer.test_dataset
    predictions = trainer.trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
    predictions = np.argmax(predictions, axis=1)

    return predictions

In [None]:
preds = inference(classification_trainer)
test["pred_label"] = [id2label[x] for x in preds]

In [None]:
results = test[["text", "label", "pred_label"]]

# Results

In [None]:
LABELS

['reminder', 'famil', 'condition', 'basis', 'preambs', 'requests', 'inf_task']

In [None]:
results[results["label"] == "basis"].sample(100).to_excel("model_preds_test_basis.xlsx")

In [None]:
results[results["label"] == "reminder"].sample(100).to_excel("model_preds_test_reminder.xlsx")

In [None]:
results[results["label"] == "famil"].sample(100).to_excel("model_preds_test_famil.xlsx")

In [None]:
results[results["label"] == "condition"].sample(100).to_excel("model_preds_test_condition.xlsx")

In [None]:
results[results["label"] == "preambs"].sample(100).to_excel("model_preds_test_preambs.xlsx")

In [None]:
results[results["label"] == "requests"].to_excel("model_preds_test_requests.xlsx")

In [None]:
results[results["label"] == "inf_task"].sample(100).to_excel("model_preds_test_inf_task.xlsx")

In [None]:
results[results["label"] == "requests"]

Unnamed: 0,text,label,pred_label


# Save Trained Model

In [None]:
!cp -r /content/results $DATA_PATH_CLOUD