In [27]:
from transformers import DistilBertPreTrainedModel, PretrainedConfig, DistilBertModel
from transformers.models.distilbert.modeling_distilbert import SequenceClassifierOutput
import torch
from torch import nn
from typing import Optional
from torch.nn import MSELoss, CrossEntropyLoss
from typing import Union
from typing import Tuple
import numpy as np


class PermDistilBertForSequenceClassification(DistilBertPreTrainedModel):
    """DistilBERT sequence classifier whose pooled features are permuted per source.

    The first-token hidden state is passed through `pre_classifier`, ReLU and
    dropout; its feature axis is then re-ordered by a fixed permutation selected
    by `source` before being fed to the final linear `classifier`.

    `pemb` holds `num_sources` random permutations followed by the identity
    permutation, so valid permutation ids are `0 .. num_sources` and
    `num_sources` means "leave the features in their original order".
    """

    def __init__(self, config: PretrainedConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.num_sources = 2
        self.config = config

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        # One random permutation of the feature indices per source, plus the
        # identity permutation stored last (index `num_sources`).
        self.pemb = torch.stack(
            [torch.randperm(config.dim) for _ in range(self.num_sources)]
            + [torch.arange(0, config.dim)]
        )
        # Stored as a frozen parameter so it follows the module across devices
        # and is saved in the state dict, but is never updated by the optimizer.
        self.pemb = torch.nn.Parameter(self.pemb, requires_grad=False)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        # Initialize weights and apply final processing
        self.post_init()

    def get_position_embeddings(self) -> nn.Embedding:
        """
        Returns the position embeddings
        """
        return self.distilbert.get_position_embeddings()

    def resize_position_embeddings(self, new_num_position_embeddings: int):
        """
        Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`.

        Arguments:
            new_num_position_embeddings (`int`):
                The number of new position embedding matrix. If position embeddings are learned, increasing the size
                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
                the size will remove vectors from the end.
        """
        self.distilbert.resize_position_embeddings(new_num_position_embeddings)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        source: Optional[Union[int, torch.Tensor]] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.Tensor, ...]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        source (`int` or `torch.Tensor`, *optional*):
            Row(s) of `self.pemb` used to permute the pooled features. Either a single id, which is applied to
            every example in the batch, or one id per example (`batch_size` values). Ids `0 .. num_sources - 1`
            select a random permutation and `num_sources` selects the identity. When omitted, the identity
            permutation is used for the whole batch.

        Raises:
            ValueError: if `source` holds neither one id nor exactly `batch_size` ids.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.functional.relu(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)

        # Build one integer permutation id per example, directly on the same
        # device as the features (no NumPy/CPU round trip).
        batch_size = pooled_output.shape[0]
        if source is None:
            perm_ids = torch.full(
                (batch_size,),
                self.num_sources,  # identity permutation
                dtype=torch.long,
                device=pooled_output.device,
            )
        else:
            perm_ids = torch.as_tensor(
                source, dtype=torch.long, device=pooled_output.device
            ).reshape(-1)
            if perm_ids.numel() == 1:
                # A single id applies to every example in the batch.
                perm_ids = perm_ids.expand(batch_size)
            elif perm_ids.numel() != batch_size:
                raise ValueError(
                    f"`source` must hold 1 or {batch_size} ids, got {perm_ids.numel()}"
                )

        perms = self.pemb[perm_ids]  # (bs, dim)
        # out[b, j] = pooled_output[b, perms[b, j]]
        sub_permuted = torch.gather(pooled_output, -1, perms)  # (bs, dim)

        logits = self.classifier(sub_permuted)  # (bs, num_labels)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count/dtype and cache
            # it on the config, as the stock HF classification heads do.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                # Referenced through `nn` because only MSELoss and
                # CrossEntropyLoss are imported by name in this file.
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + distilbert_output[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )

In [2]:
from transformers import DistilBertConfig

# Architecture hyper-parameters are spelled with DistilBERT's own field names
# (`dim`, `hidden_dim`, `n_heads`, `n_layers`, `dropout`, `attention_dropout`);
# the model class in this notebook reads `config.dim` directly. BERT-style names
# such as `intermediate_size` or `hidden_dropout_prob` are not DistilBERT fields.
# NOTE(review): sep/cls ids here (3/4) differ from the ids the tokenizer emits
# in the sample encoding further down (101 ... 102) — confirm whether anything
# downstream reads them.
config = DistilBertConfig(
    vocab_size=30522,
    max_position_embeddings=512,
    n_heads=12,
    n_layers=6,
    dim=768,
    hidden_dim=3072,
    dropout=0.1,
    attention_dropout=0.1,
    pad_token_id=0,
    eos_token_id=2,
    bos_token_id=1,
    sep_token_id=3,
    cls_token_id=4,
    num_labels=2,
    problem_type="single_label_classification",
    output_attentions=False,
    output_hidden_states=False,
    use_cache=True,
)


In [28]:
# Build the classifier directly from `config` (no `from_pretrained` call here),
# so the weights are whatever the constructor initialises them to.
model = PermDistilBertForSequenceClassification(config)

In [29]:
from transformers import DistilBertTokenizer

# Load the tokenizer published under the 'distilbert-base-uncased' checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [30]:
# `model_max_length` is the attribute current tokenizers consult for their
# default truncation/padding length; the legacy `max_len` name is not read.
tokenizer.model_max_length = 512

In [31]:
# Encode one sentence, padded/truncated to 512 tokens, as PyTorch tensors.
inputs = tokenizer(
    "This is a test sentence.",
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=512,
)
# Permutation id 2 is the identity row of the model's permutation table.
inputs["source"] = torch.tensor(2)
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231, 6251, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

In [69]:
import numpy as np
# Evaluation mode for the forward pass below.
model.eval()
#model.cpu()

# Set random seed for reproducibility
torch.manual_seed(0)
np.random.seed(0)

with torch.inference_mode():
    inputs = tokenizer(
        "This is a scary movie",
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Select permutation 1 for this example.
    inputs["source"] = torch.tensor(1)

    print(model(**inputs))


SequenceClassifierOutput(loss=None, logits=tensor([[ 1.8280, -1.4565]]), hidden_states=None, attentions=None)


In [37]:
import pandas as pd
# Load the review table; later cells read its `text`, `label` and `domain` columns.
df = pd.read_csv("DeconDTN/dataToy/horror_family.csv")

In [38]:
# Preview the first two rows of the loaded table.
df.head(2)

Unnamed: 0,id,text,label,score,url,tconst,id_w_tag,set,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,domain
0,8286,Larry Fessenden has been thrashed by most of t...,pos,7,http://www.imdb.com/title/tt0275067/usercomments,tt0275067,pos_8286,train,movie,Wendigo,Wendigo,0,2001,\N,91,"Horror,Mystery,Thriller",Horror
1,8279,This film is more about how children make sens...,pos,10,http://www.imdb.com/title/tt0275067/usercomments,tt0275067,pos_8279,train,movie,Wendigo,Wendigo,0,2001,\N,91,"Horror,Mystery,Thriller",Horror


In [39]:


#prepare df for training
# Shuffle with a fixed seed (the same value the later train/test split uses) so
# the row order is reproducible, drop rows with missing values, then keep the
# first row per distinct review text. De-duplicating on `text` also removes
# fully identical rows, so a separate full-row pass is unnecessary.
df = (
    df.sample(frac=1, random_state=42)
    .dropna()
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

#tokenize text
# Each row gets its own encoding, padded/truncated to 512 tokens.
df['bert'] = df['text'].apply(lambda x: tokenizer(x, return_tensors="pt", truncation=True, padding="max_length", max_length=512))

#encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Map the string labels in `label` to integer class ids.
df['labels'] = le.fit_transform(df['label'])

In [40]:
# Preview the first three rows, now including the `bert` and `labels` columns.
df.head(3)

Unnamed: 0,id,text,label,score,url,tconst,id_w_tag,set,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,domain,bert,labels
0,12389,"Bored with the normal, run-of-the-mill staple ...",pos,10,http://www.imdb.com/title/tt0076683/usercomments,tt0076683,pos_12389,train,movie,The Sentinel,The Sentinel,0,1977,\N,92,Horror,Horror,"[input_ids, attention_mask]",1
1,6283,What an incredible fall for Sean Ellis.<br /><...,neg,2,http://www.imdb.com/title/tt0906734/usercomments,tt0906734,neg_6283,train,movie,The Broken,The Broken,0,2008,\N,93,"Drama,Horror,Thriller",Horror,"[input_ids, attention_mask]",0
2,4821,Fiction film (it lists as based on a story tho...,neg,4,http://www.imdb.com/title/tt0078203/usercomments,tt0078203,neg_4821,train,movie,Sasquatch: The Legend of Bigfoot,Sasquatch: The Legend of Bigfoot,0,1976,\N,95,"Adventure,Horror,Mystery",Horror,"[input_ids, attention_mask]",0


In [41]:
import torch
#add labels and source to data for BERT
# Each encoding is mutated in place: it gains the integer class label and a
# source id (1 for the "Horror" domain, 0 otherwise), and its token tensors are
# squeezed to drop the singleton axis left by per-row tokenisation.
for enc, label, domain in zip(df["bert"], df["labels"], df["domain"]):
    enc.update({
        "labels": torch.tensor(int(label), dtype=torch.long),
        "source": torch.tensor(int(domain == "Horror"), dtype=torch.long),
        "input_ids": enc["input_ids"].squeeze(),
        "attention_mask": enc["attention_mask"].squeeze(),
    })

In [45]:

#train model
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for one evaluation pass.

    `pred.predictions` is reduced to class ids with an argmax over the last
    axis and compared against `pred.label_ids`; precision/recall/F1 use
    sklearn's 'binary' averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    # prf is (precision, recall, f1, support)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }
    
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    # evaluation and checkpointing cadence
    evaluation_strategy="epoch",
    eval_steps=10,
    save_strategy="epoch",
    save_steps=10,
    save_total_limit=2,
    # checkpoint selection by the 'f1' entry that compute_metrics reports
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)



In [46]:
#split into train and test
from sklearn.model_selection import train_test_split
# Renumber rows, hold out 20% for evaluation with a fixed seed, then renumber
# each part so both start at index 0.
df = df.reset_index(drop=True)
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

In [47]:
# Sanity-check one training example: tensor shapes, then label and source values.
x = 4
sample = train_df["bert"].values[x]
for key in ("input_ids", "attention_mask"):
    print(sample[key].shape)
for key in ("labels", "source"):
    print(sample[key])

torch.Size([512])
torch.Size([512])
tensor(0)
tensor(0)


In [48]:
# Show the label tensor stored on the first training example.
train_df.head(1)["bert"].values[0]["labels"]

tensor(1)

In [49]:
# The datasets are passed as plain lists of per-example encodings so that item
# access is positional and does not depend on the DataFrame's index labels.
trainer = Trainer(
    model=model,                               # the instantiated 🤗 Transformers model to be trained
    args=training_args,                        # training arguments, defined above
    train_dataset=train_df["bert"].tolist(),   # training dataset
    eval_dataset=test_df["bert"].tolist(),     # evaluation dataset
    compute_metrics=compute_metrics
)



In [50]:
# Re-display the first training example's label tensor after building the Trainer.
train_df.head(1)["bert"].values[0]["labels"]

tensor(1)

In [51]:

# Run training with the Trainer built above; the returned TrainOutput is shown as the cell result.
trainer.train()


  0%|          | 0/449 [00:00<?, ?it/s]

{'loss': 0.7553, 'learning_rate': 1e-05, 'epoch': 0.02}
{'loss': 0.5789, 'learning_rate': 2e-05, 'epoch': 0.04}
{'loss': 0.7095, 'learning_rate': 3e-05, 'epoch': 0.07}
{'loss': 0.6802, 'learning_rate': 4e-05, 'epoch': 0.09}
{'loss': 0.6497, 'learning_rate': 5e-05, 'epoch': 0.11}
{'loss': 0.6957, 'learning_rate': 4.8746867167919805e-05, 'epoch': 0.13}
{'loss': 0.5588, 'learning_rate': 4.74937343358396e-05, 'epoch': 0.16}
{'loss': 0.7008, 'learning_rate': 4.62406015037594e-05, 'epoch': 0.18}
{'loss': 0.6151, 'learning_rate': 4.49874686716792e-05, 'epoch': 0.2}
{'loss': 0.6272, 'learning_rate': 4.3734335839599e-05, 'epoch': 0.22}
{'loss': 0.5757, 'learning_rate': 4.24812030075188e-05, 'epoch': 0.24}
{'loss': 0.5232, 'learning_rate': 4.12280701754386e-05, 'epoch': 0.27}
{'loss': 0.6498, 'learning_rate': 3.9974937343358395e-05, 'epoch': 0.29}
{'loss': 0.5833, 'learning_rate': 3.87218045112782e-05, 'epoch': 0.31}
{'loss': 0.6592, 'learning_rate': 3.7468671679198e-05, 'epoch': 0.33}
{'loss': 

  0%|          | 0/113 [00:00<?, ?it/s]

{'eval_loss': 0.3961559534072876, 'eval_accuracy': 0.8226381461675579, 'eval_f1': 0.7464968152866241, 'eval_precision': 0.7493606138107417, 'eval_recall': 0.7436548223350253, 'eval_runtime': 14.52, 'eval_samples_per_second': 77.273, 'eval_steps_per_second': 7.782, 'epoch': 1.0}
{'train_runtime': 206.9574, 'train_samples_per_second': 21.676, 'train_steps_per_second': 2.17, 'train_loss': 0.5708138586949134, 'epoch': 1.0}


TrainOutput(global_step=449, training_loss=0.5708138586949134, metrics={'train_runtime': 206.9574, 'train_samples_per_second': 21.676, 'train_steps_per_second': 2.17, 'train_loss': 0.5708138586949134, 'epoch': 1.0})