# Extract features from a penultimate layer in Emotion English DistilRoBERTa-base model

In [1]:
# install the transformers library
!pip install transformers

# import required packages
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# work with cuda
device = torch.device('cuda')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 34.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 68.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


In [2]:
# load tokenizer and model
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

## Preparation of dataset

### **Option 1** list of texts



In [3]:
# create list of texts
pred_texts = ['I like that', 'That is annoying', 'This is great!', 'Wouldn´t recommend it.']

### **Option 2** texts from csv file

In [7]:
# run cell and select file for upload
from google.colab import files
files.upload()

Saving test.csv to test.csv


{'test.csv': b'\xef\xbb\xbftext\r\nI like that\r\nThat is annoying\r\nThis is great\r\nWouldn\xc2\xb4t recommend it.\r\n'}

In [8]:
# specify your filename

# note: you can right-click on your file and copy-paste the path to it here
file_name = "/content/test.csv" 

 # select the column in your csv that contains the text to be classified
text_column = "text"

# read in csv
df_pred = pd.read_csv(file_name)
pred_texts = df_pred[text_column].dropna().astype('str').tolist()

## Functions for extracting

In [9]:
def get_features(name):
    def hook(model, input, output):
        features[name] = output.detach()
    return hook

In [10]:
model.classifier.dense.register_forward_hook(get_features('feats'))

<torch.utils.hooks.RemovableHandle at 0x7f81f1445850>

## Extract features from penultimate layer

In [11]:
# placeholders
PREDICTIONS = []
FEATS = []

# placeholder for batch features
features = {}

for idx, inputs in enumerate(pred_texts):
     
    inputs = tokenizer(inputs, return_tensors="pt")
    inputs = inputs.to(device)

    predictions=model(**inputs).logits

    PREDICTIONS.append(predictions.detach().cpu().numpy())
    FEATS.append(features['feats'].cpu().numpy())
       

In [12]:
# Inspect features

PREDICTIONS = np.concatenate(PREDICTIONS)
FEATS = np.concatenate(FEATS)

print('preds shape:', PREDICTIONS.shape)
print('feats shape:', FEATS.shape)

preds shape: (4, 7)
feats shape: (4, 768)


#Preprocessing features for PCA

In [13]:
standarizedFeatures = StandardScaler().fit_transform(FEATS[:,:])

pd.DataFrame(data = standarizedFeatures).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.817929,1.257764,-0.863248,0.688848,-1.310505,0.213265,1.10059,0.611315,-0.145968,0.627812,...,-1.035896,0.537052,1.13929,0.432907,0.628366,1.356244,-1.000858,-1.356136,1.426031,-0.064119
1,0.170922,-0.571639,-0.378365,-0.607493,0.746096,1.455868,-0.712945,-0.742912,0.478065,-0.71861,...,0.972747,1.216766,0.442185,-1.566121,-0.274001,-1.014161,0.896351,0.768792,-0.091177,1.416892
2,-0.919973,0.620233,-0.461054,1.214936,-0.602849,-1.304654,0.856243,1.303885,1.19318,-1.196641,...,-0.963084,-0.288219,-1.583189,-0.032464,1.139498,0.561615,-0.99402,-0.542201,0.063054,-1.408954
3,1.56698,-1.306358,1.702666,-1.296291,1.167258,-0.364479,-1.243888,-1.172288,-1.525277,1.28744,...,1.026232,-1.4656,0.001715,1.165678,-1.493862,-0.903697,1.098527,1.129545,-1.397908,0.056181


#Testing minimum number of principal components to reach 95% of the variance
####Calculating PCA

In [14]:
# Make an instance of the Model
pca = PCA(n_components=.95)

principalComponents = pca.fit_transform(standarizedFeatures)

pd.DataFrame(data = principalComponents).head()

Unnamed: 0,0,1,2
0,23.070829,5.513481,-13.360316
1,-21.46171,18.036915,4.359941
2,21.284191,-6.320959,13.759363
3,-22.893309,-17.229439,-4.758966


#Normalizing each principal component

In [15]:
def min_max_scaling(series):
    return (series - series.min()) / (series.max() - series.min())

In [16]:
for i in range(principalComponents.shape[1]):
    principalComponents[:,i] = min_max_scaling(principalComponents[:,i])

pd.DataFrame(data = principalComponents).head()

Unnamed: 0,0,1,2
0,1.0,0.64489,0.0
1,0.031146,1.0,0.65341
2,0.96113,0.309317,1.0
3,0.0,0.0,0.317163


In [17]:
# Normalize results

for i in range(PREDICTIONS.shape[0]):
    PREDICTIONS[i,:] = min_max_scaling(PREDICTIONS[i,:])
pd.DataFrame(data = PREDICTIONS).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.158374,0.178397,0.0,1.0,0.278307,0.255064,0.220727
1,1.0,0.964752,0.162139,0.0,0.644977,0.610315,0.227512
2,0.072741,0.129778,0.0,1.0,0.61705,0.136118,0.418114
3,0.473354,0.795871,0.662041,0.0,1.0,0.898824,0.136021
