# Extract features from a penultimate layer in Emotion English DistilRoBERTa-base model

In [1]:
# install the transformers library
!pip install transformers

# import required packages
import csv
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# work with cuda
device = torch.device('cuda')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 10.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 56.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [2]:
# load tokenizer and model
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

## Preparation of dataset

### **Option 1** list of texts



In [3]:
# create list of texts
pred_texts = ['I like that', 'That is annoying', 'This is great!', 'Wouldn´t recommend it.']

### **Option 2** texts from csv file

In [20]:
# run cell and select file for upload
from google.colab import files
files.upload()

Saving tweets.csv to tweets.csv


{'tweets.csv': b'text\\hashtag\r\n#DonaldJTrump what an absolute  -------&gt; clown  @RonDeSantisFL we have no skin in your political theater but will donate $ to your campaign as long as you commit to running for GOP leadership   .  https://t.co/2mCj0Pgk0j\\#DonaldJTrump\r\nBecause he\'s the hero the US deserves, and the one it needs right now. So he shall run in 2024, because he can take it. #DonaldJTrump  https://t.co/4wQbAWsrcy\\#DonaldJTrump\r\n#GetOut_JoeBiden Now its 2022 it\'s not 1965 that you would play with Pakistan and Pakistani nation bear that we will retaliate we will fight against ur regime change and against your facilitators congratulations #DonaldJTrump from Pakistan peaceful partnership from Pak America  https://t.co/Rb8mnqPOk1\\#DonaldJTrump\r\n2/16/2018 #BlackPanther  1  138 days before \xf0\x9f\x87\xba\xf0\x9f\x87\xb8 (24)(2) \xf0\x9f\x8e\x82 7/4/1786.   3/11/2020 #covid19  \xf0\x9f\x87\xba\xf0\x9f\x87\xb8 shutdown led by #NBA  in its 74th season (24) days after 

In [21]:
# specify your filename

# note: you can right-click on your file and copy-paste the path to it here
file_name = "/content/tweets.csv" 

 # select the column in your csv that contains the text to be classified
text_column = "text"
hashtag_column = "hashtag"

# read in csv
reader = csv.DictReader(open(file_name, "r"), delimiter='\\')
df_pred = pd.DataFrame.from_dict(reader)
# pred_texts = df_pred[text_column].dropna().astype('str').tolist()
hashtags_col = df_pred[hashtag_column]
hashtags = df_pred[hashtag_column].unique()
pred_texts = df_pred[text_column]

## Functions for extracting

In [4]:
def get_features(name):
    def hook(model, input, output):
        features[name] = output.detach()
    return hook

In [5]:
model.classifier.dense.register_forward_hook(get_features('feats'))

<torch.utils.hooks.RemovableHandle at 0x7f57806e3310>

## Extract features from penultimate layer

In [22]:
# placeholders
PREDICTIONS = []
FEATS = []

# placeholder for batch features
features = {}

for idx, inputs in enumerate(pred_texts):
     
    inputs = tokenizer(inputs, return_tensors="pt")
    inputs = inputs.to(device)

    predictions=model(**inputs).logits

    PREDICTIONS.append(predictions.detach().cpu().numpy())
    FEATS.append(features['feats'].cpu().numpy())
       

In [23]:
# Inspect features

PREDICTIONS = np.concatenate(PREDICTIONS)
FEATS = np.concatenate(FEATS)

print('preds shape:', PREDICTIONS.shape)
print('feats shape:', FEATS.shape)

preds shape: (324, 7)
feats shape: (324, 768)


#Preprocessing features for PCA

In [24]:
standarizedFeatures = StandardScaler().fit_transform(FEATS[:,:])

pd.DataFrame(data = standarizedFeatures).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.872464,0.064147,0.427233,-0.578533,1.242161,1.152498,0.127263,-0.911984,-0.022298,0.704593,...,0.145501,-0.140054,0.659769,0.106754,-0.463577,-0.262075,0.885787,0.090685,-0.698118,0.635205
1,-0.052887,-0.169106,1.158909,-0.204618,0.04885,-0.453513,0.747085,-0.230901,0.186167,-0.351402,...,-0.504003,-0.906552,0.12643,-0.110451,0.439073,-0.279168,0.48461,0.426265,0.04095,-0.011956
2,-0.880963,0.064945,1.791458,1.023596,1.704791,1.132291,-0.023926,-1.735802,0.254393,0.600428,...,1.049334,0.391537,1.476609,0.083158,0.093334,-1.044701,0.386283,1.996325,-0.133311,1.233625
3,1.894188,-0.902334,0.247537,0.087905,0.231944,0.145786,0.482352,1.393703,1.07042,-0.395482,...,-0.651107,-0.954717,-1.722784,-1.527169,-0.237071,-1.160343,-0.026315,-1.458684,0.268128,-1.468524
4,0.024097,-0.03918,0.000771,-0.041756,0.109106,0.91208,0.872299,-0.527255,1.329801,-0.460445,...,-0.342438,-0.148266,-0.724006,-1.050911,0.607418,0.243679,-0.215478,0.436381,0.094415,-0.541247


#Testing minimum number of principal components to reach 95% of the variance
####Calculating PCA

In [25]:
pca = PCA(n_components=.95)

principalComponentsTest = pca.fit_transform(standarizedFeatures)

pd.DataFrame(data = principalComponentsTest).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,-7.586745,-3.434965,-3.867023,-0.473855,-0.172142,-8.35956,1.129683,1.486224,2.592638,-0.100944,...,-0.543674,0.484212,0.297838,-0.42746,1.020526,-1.157686,-1.212551,0.457701,-0.213781,0.320458
1,1.9145,2.813011,-3.016841,3.582977,8.038706,-4.170752,-5.869693,1.888704,-3.97556,-0.4477,...,-0.178521,-0.366112,0.23421,-0.340833,-0.187213,-0.069184,-1.659973,-0.043605,0.59227,-0.762543
2,-18.833324,-6.222909,4.094088,23.351583,1.404981,-10.178376,-4.101332,1.974478,-3.250144,3.857056,...,-0.399162,0.396865,-1.389331,3.565921,0.790949,0.855988,0.30122,0.238548,-0.636768,0.303585
3,-0.550848,17.441364,14.095762,-5.856034,-14.581955,0.508118,-3.747158,-3.724459,-1.773732,-0.640738,...,-1.310769,0.689125,0.211305,0.46262,-1.078548,-1.067686,0.812261,0.219551,-0.220107,0.880703
4,4.294119,5.917418,-2.973536,3.229643,2.277841,-4.637394,-2.940664,-3.010877,-1.134917,0.560473,...,0.36864,-0.367827,-0.183421,0.018827,0.937891,-1.228237,-0.814747,0.399599,0.316812,-0.427564


In [26]:
# Make an instance of the Model
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(standarizedFeatures)

pd.DataFrame(data = principalComponents).head()

Unnamed: 0,0,1
0,-7.586689,-3.434999
1,1.914501,2.813001
2,-18.833303,-6.222893
3,-0.550885,17.441391
4,4.294106,5.917397


In [27]:
pca2 = PCA(n_components=3)

principalComponents2 = pca2.fit_transform(standarizedFeatures)

pd.DataFrame(data = principalComponents2).head()

Unnamed: 0,0,1,2
0,-7.586711,-3.435002,-3.867057
1,1.914499,2.812996,-3.01687
2,-18.833315,-6.222907,4.094082
3,-0.550893,17.441406,14.095766
4,4.294104,5.917404,-2.973552


#Normalizing each principal component

In [28]:
# Scale data between -1 and 1
principalComponents = pd.DataFrame(data = principalComponents)
scaler = MinMaxScaler(feature_range=(-1, 1))
principalComponents = pd.DataFrame(data = scaler.fit_transform(principalComponents))

principalComponents.head()

Unnamed: 0,0,1
0,-0.492643,-0.360239
1,-0.219164,-0.16778
2,-0.816361,-0.446115
3,-0.290127,0.282821
4,-0.15067,-0.072155


In [29]:
principalComponents2 = pd.DataFrame(data = principalComponents2)
principalComponents2 = pd.DataFrame(data = scaler.fit_transform(principalComponents2))

principalComponents2.head()

Unnamed: 0,0,1,2
0,-0.492643,-0.360239,-0.405932
1,-0.219164,-0.167781,-0.38044
2,-0.816361,-0.446115,-0.167231
3,-0.290127,0.282821,0.132653
4,-0.15067,-0.072155,-0.379142


In [15]:
# def min_max_scaling(series):
#     return (series - series.min()) / (series.max() - series.min())

In [30]:
# for i in range(principalComponents.shape[1]):
#     principalComponents[:,i] = min_max_scaling(principalComponents[:,i])


pred_texts = pd.concat([pred_texts, hashtags_col], axis=1)
finalData = pd.concat([pred_texts, principalComponents], axis=1)
finalData.head()

Unnamed: 0,text,hashtag,0,1
0,#DonaldJTrump what an absolute -------&gt; cl...,#DonaldJTrump,-0.492643,-0.360239
1,"Because he's the hero the US deserves, and the...",#DonaldJTrump,-0.219164,-0.16778
2,#GetOut_JoeBiden Now its 2022 it's not 1965 th...,#DonaldJTrump,-0.816361,-0.446115
3,2/16/2018 #BlackPanther 1 138 days before 🇺🇸...,#DonaldJTrump,-0.290127,0.282821
4,Do people out there recall that #DonaldJTrump ...,#DonaldJTrump,-0.15067,-0.072155


In [33]:
# for i in range(principalComponents2.shape[1]):
#     principalComponents2[:,i] = min_max_scaling(principalComponents2[:,i])

finalData2 = pd.concat([pred_texts, principalComponents2], axis=1)
finalData2.head()

Unnamed: 0,text,hashtag,0,1,2
0,#DonaldJTrump what an absolute -------&gt; cl...,#DonaldJTrump,-0.492643,-0.360239,-0.405932
1,"Because he's the hero the US deserves, and the...",#DonaldJTrump,-0.219164,-0.167781,-0.38044
2,#GetOut_JoeBiden Now its 2022 it's not 1965 th...,#DonaldJTrump,-0.816361,-0.446115,-0.167231
3,2/16/2018 #BlackPanther 1 138 days before 🇺🇸...,#DonaldJTrump,-0.290127,0.282821,0.132653
4,Do people out there recall that #DonaldJTrump ...,#DonaldJTrump,-0.15067,-0.072155,-0.379142


In [34]:
# Normalize results

for i in range(PREDICTIONS.shape[0]):
    PREDICTIONS[i,:] = min_max_scaling(PREDICTIONS[i,:])
pd.DataFrame(data = PREDICTIONS).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.581599,0.0,1.0,0.495911,0.114639,0.392697,0.242955
1,0.619459,0.0,1.0,0.653886,0.312912,0.373442,0.267734
2,1.0,0.0,0.984902,0.212585,0.02569,0.150115,0.116667
3,0.910536,0.0,1.0,0.731483,0.528817,0.988611,0.712866
4,0.675931,0.0,1.0,0.719822,0.339041,0.444265,0.426783


#Creating JSON to frontend

In [35]:
import json

In [36]:
hashtagDf = finalData2[finalData2[hashtag_column] == '#WorldCup']

print(len(hashtagDf))
for index, row in hashtagDf.iterrows():
  print(row)

45
text       I'll give £100 to anyone who can  hack the sou...
hashtag                                            #WorldCup
0                                                  -0.135335
1                                                  -0.311142
2                                                  -0.410626
Name: 151, dtype: object
text       11 days to #WorldCup and other tournaments are...
hashtag                                            #WorldCup
0                                                   0.101981
1                                                        1.0
2                                                  -0.342216
Name: 152, dtype: object
text       Fuck You @SpursOfficial it's all bout @Cymru i...
hashtag                                            #WorldCup
0                                                  -0.682599
1                                                  -0.250207
2                                                   0.486712
Name: 153, dtype: object
text   

In [37]:
colors = [
    "#e60049",
    "#0bb4ff",
    "#50e991",
    "#e6d800",
    "#9b19f5",
    "#ffa300",
    "#dc0ab4",
    "#b3d4ff",
    "#00bfa0"
]

def saveAsJSON (componentsNum, dataframe):
  data = {'data':[]}

  for i in range(len(hashtags)):
    hashtagDf = dataframe[dataframe[hashtag_column] == hashtags[i]]
    new_group = {
      "id": str(i),
      "text": hashtags[i],
      "color": colors[i],
      "twitts": [],
    }
    twittsData = []
    for index, row in hashtagDf.iterrows():
      if componentsNum == 2:
        tweetfeature = {
            'x': row[0],
            'y': row[1],
            'text': row['text']
        }
      elif componentsNum == 3:
        tweetfeature = {
            'x': row[0],
            'y': row[1],
            'z': row[2],
            'text': row['text']
        }
      twittsData.append(tweetfeature)
    new_group['twitts'] = twittsData
    data["data"].append(new_group)
  return data


In [39]:
twittsData3dim = saveAsJSON(3, finalData2)
twittsData2dim = saveAsJSON(2, finalData)



In [40]:
with open('twittsData3dim.json', 'w', encoding='UTF8') as outfile1:
  json.dump(twittsData3dim, outfile1)

with open('twittsData2dim.json', 'w', encoding='UTF8') as outfile2:
  json.dump(twittsData2dim, outfile2)

In [41]:
files.download('twittsData3dim.json')
files.download('twittsData2dim.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>