<a href="https://colab.research.google.com/github/MorenoSara/Few-Shot_Text_Classification/blob/main/Few_shot_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers.util import cos_sim
import numpy as np
import scipy
import random

random_seed = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if device == 'cuda':
  torch.cuda.set_device(gpu_no)
  torch.cuda.manual_seed(random_seed)

### Load data

In [3]:
train_dataset = pd.read_excel('train.xlsx', index_col=0) # 32889 samples
train_dataset.head(10)

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,5,43,115,Medical,Parkinson's Disease,deep brain stimulation; basal ganglia; parkin...,The subthalamic nucleus (STN) and globus palli...
1,0,1,1,CS,Machine learning,Supervised classification; Label ranking prob...,Preference learning is the branch of machine l...
2,0,7,7,CS,Parallel computing,Track fitting; Track reconstruction; Multiple...,Modern semiconductor detectors allow for charg...
3,6,6,131,biochemistry,Polymerase chain reaction,bacterial community; calves; probiotics; rume...,The objective of this study was to assess the ...
4,6,6,131,biochemistry,Polymerase chain reaction,Bladder cancer; glycoprotein nonmetastatic me...,Glycoprotein nonmetastatic melanoma protein B ...
5,3,1,53,MAE,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...
6,6,2,127,biochemistry,Human Metabolism,BORON; MANGANESE; MEMBRANES; CELL WALLS; NADH...,Boron has been recognized since 1923 as an ess...
7,2,7,40,Psychology,Depression,putamen; basal ganglia; magnetic resonance im...,Putamen volume is seen to alter in neurologica...
8,0,4,4,CS,Operating systems,Wireless sensor networks; localized interacti...,Wireless sensor networks (WSNs) are characteri...
9,2,4,37,Psychology,Prosocial behavior,Eye contact effects; Therapeutic implications...,Introduction. The perception of a direct gaze ...


In [4]:
REMAP_LEV1 = {'CS': 'Computer Science', 
              'Civil': 'Civil Engineering', 
              'ECE': 'Electrical Engineering', 
              'Psychology': 'Psychology', 
              'MAE': 'Mechanical Engineering', 
              'Medical': 'Medical Science', 
              'biochemistry': 'Biochemistry'}

In [5]:
def get_mapped_labels(data, mapping_dict):
  labels = [l.strip() for l in data]
  return list(map(lambda l: mapping_dict[l], labels))

In [6]:
labels = get_mapped_labels(set(train_dataset['Domain']), REMAP_LEV1)
abstracts = train_dataset['Abstract']

In [7]:
labels

['Civil Engineering',
 'Biochemistry',
 'Medical Science',
 'Computer Science',
 'Electrical Engineering',
 'Mechanical Engineering',
 'Psychology']

### Compute entropy

In [8]:
st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

In [9]:
labels_embeddings = st_model.encode(labels)
# doc_embeddings = st_model.encode(abstracts, batch_size = 256, show_progress_bar=True) # directly encode the entire documents 

In [10]:
#with open('doc_embeddings.txt','wb') as f:
    #for line in np.matrix(doc_embeddings):
        #np.savetxt(f, line)

In [11]:
df = pd.read_csv('/content/drive/MyDrive/doc_embeddings.txt', sep = ' ', header=None) 
# retrieve corresponding document using train_dataset.iloc[i]

In [12]:
def floored_cosine_tensors(X, Y):
  sim = cos_sim(X, Y)
  return np.maximum(0, sim)

In [13]:
def get_entropies(sentences, labels_embeddings):
  probs = floored_cosine_tensors(sentences, labels_embeddings)
  normalized_probs = probs/np.linalg.norm(probs)
  normalized_entropy = scipy.stats.entropy(normalized_probs, axis = 1)/np.log(labels_embeddings.shape[0])
  return normalized_entropy

In [14]:
df = df.astype(np.float32)
doc_embeddings = np.matrix(df)
entropies = get_entropies(doc_embeddings, labels_embeddings)

  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


In [15]:
train_df = train_dataset
train_df['Entropy'] = entropies
train_df['Domain'] = get_mapped_labels(train_df['Domain'], REMAP_LEV1)
train_df.dropna(inplace=True)
train_df

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,5,43,115,Medical Science,Parkinson's Disease,deep brain stimulation; basal ganglia; parkin...,The subthalamic nucleus (STN) and globus palli...,0.577499
1,0,1,1,Computer Science,Machine learning,Supervised classification; Label ranking prob...,Preference learning is the branch of machine l...,0.784086
2,0,7,7,Computer Science,Parallel computing,Track fitting; Track reconstruction; Multiple...,Modern semiconductor detectors allow for charg...,0.543961
3,6,6,131,Biochemistry,Polymerase chain reaction,bacterial community; calves; probiotics; rume...,The objective of this study was to assess the ...,0.358490
4,6,6,131,Biochemistry,Polymerase chain reaction,Bladder cancer; glycoprotein nonmetastatic me...,Glycoprotein nonmetastatic melanoma protein B ...,0.337900
...,...,...,...,...,...,...,...,...
32884,5,26,98,Medical Science,HIV/AIDS,Biomarkers; Cytokine; HIV; Gingival crevicula...,Objective: This study evaluates the potential ...,0.349118
32885,3,3,55,Mechanical Engineering,Machine design,Red Catuai; Poisson's ratio; density; elastic...,The finite element method has been employed in...,0.766171
32886,2,18,51,Psychology,Problem-solving,health care issues; health care utilization; ...,The University Hospital of Zurich offers a tex...,0.699593
32887,5,41,113,Medical Science,Overactive Bladder,attentional and interoceptive networks; brain...,BACKGROUND: Treatment of urgency urinary incon...,0.828843


Create a dataframe for each label

In [16]:
dfs = {}
for label in labels:
  dfs[f"{label.replace(' ', '_')}"] = train_df[train_df['Domain'] == label]

In [17]:
from torch.utils.data.dataset import Dataset
class document_class(Dataset):
  def __init__(self, documents, labels):
    self.train_df = []
    for id, doc in enumerate(documents):
      curr_doc = [labels[id]]
      curr_doc.append(doc) 
      self.train_df.append(curr_doc)
  
  def __getitem__(self, index):
    return self.train_df[index] 

  def __len__(self):
    return len(self.train_df)

In [18]:
def my_collate_fn(batch):
  documents = []
  labels = []
  for doc in batch:
    documents.append(doc[1])
    labels.append(list(doc[0]))
  return (documents, torch.Tensor(labels))

In [19]:
def save_model(model, model_path):
    """Save model."""
    torch.save(model.state_dict(), model_path)

def load_model(model, model_path, use_cuda=True):
    """Load model."""
    map_location = 'cpu'
    if use_cuda and torch.cuda.is_available():
        map_location = 'cuda:0'
    model.load_state_dict(torch.load(model_path, map_location))
    return model

# Models

In [None]:
from torch import nn
class FreezeSTTrainClassifier(nn.Module):
  def __init__(self, sentence_transformer_model, st_embedding_dimension, num_classes, device, dropout = 0.3):
    super().__init__()
    self.st = SentenceTransformer(sentence_transformer_model)
    for param in self.st.parameters():
      param.requires_grad = False
    self.classification = nn.Linear(in_features=st_embedding_dimension, out_features=num_classes)
    torch.nn.init.eye_(self.classification.weight)
    self.device = device
    self.dropout = nn.Dropout(dropout)
    self.to(device)

  def forward(self, documents):
    with torch.no_grad():
      docs = self.st.encode(documents, convert_to_tensor=True) # exploit pretrained sentence transformer
    docs = self.dropout(docs)
    probs = self.classification(docs) # assign a score to each class for every document
    return probs

In [287]:
from torch import nn
class Classifier(nn.Module):
  def __init__(self, in_dim, out_dim, device, dropout = 0.3):
    super().__init__()
    self.classification = nn.Linear(in_features=in_dim, out_features=out_dim)
    torch.nn.init.eye_(self.classification.weight)
    self.device = device
    self.dropout = nn.Dropout(dropout)
    self.to(device)

  def forward(self, documents_embeddings):
    docs = self.dropout(documents_embeddings)
    scores = self.classification(docs) # assign a score to each class for every document
    return scores

# Few-shot text classification
Works also for one shot classification

In [45]:
N_shots = 2# number of examples for each class

### Random

In [46]:
from numpy.random import randint
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    N_shots_data.append(dfs[k].iloc[randint(0, len(dfs[k]), N_shots)]) # sort the values according to their entropy 
dataframes = pd.concat(N_shots_data).reset_index(drop=True)

In [47]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,4,8,69,Civil Engineering,Solar Energy,Open-cell foam; Complex index of refraction; ...,A set of 96 open-cell foams with growing poros...,0.834704
1,4,3,64,Civil Engineering,Rainwater Harvesting,Wheat-maize double-cropping system; Rainwater...,Developing water-saving cultivation techniques...,0.486651
2,6,3,128,Biochemistry,Immunology,T cell; apoptosis; melanoma; immunotherapy; I...,Advancements in adoptive cell transfer therapy...,0.250598
3,6,6,131,Biochemistry,Polymerase chain reaction,Alcohol; extended-release naltrexone; HIV; in...,Background and aimsHIV-infected people with su...,0.432195
4,5,33,105,Medical Science,Medicare,Alzheimer's disease; amyloid PET imaging; evi...,We examine a recent dispute regarding the Cent...,0.660595
5,5,30,102,Medical Science,Kidney Health,Acute kidney injury; chronic kidney injury; m...,Acute kidney injury (AKI) leads to chronic kid...,0.472923
6,0,13,13,Computer Science,Algorithm design,Multiple sequence alignment; Iterated local s...,Background: Aligning multiple sequences arises...,0.463577
7,0,1,1,Computer Science,Machine learning,Radial basis function neural networks; Suppor...,The grasslands of Western Jilin Province in Ch...,0.779157
8,1,9,25,Electrical Engineering,Analog signal processing,Translinear; Analog signal processing; Low su...,This paper presents a translinear topology sui...,0.706136
9,1,7,24,Electrical Engineering,Microcontroller,automated microscopy; remote controlled micro...,Photosynthesis research employs several biophy...,0.804999


### Entropy

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1].values] # sort the values according to their entropy 
    ordered = ordered[ordered['Entropy'] <= 0.8][:N_shots*2]
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # list successively used to compute the centroid

    for i in range(N_shots): 
     # select the document with the highest entropy
      first = ordered.iloc[0] # the first element has the highest entropy
      N_shots_data.append(pd.DataFrame.from_records([first.to_dict()])) # save the data of the document
      N_shots_embeddings.append(ordered_embeddings[0])
      ordered.drop(ordered.index[0], inplace=True) # drop the row corresponding to the element with the higest entropy for the successive iterations
      ordered_embeddings = ordered_embeddings[1:] # drop the embedding corresponding to the element with the higest entropy for the successive iterations
dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [49]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,4,9,70,Civil Engineering,Construction Management,Construction management; Health hazards; Cons...,A theory-based intervention strategy to improv...,0.799878
1,4,2,63,Civil Engineering,Remote Sensing,Drought monitoring; crop yield; vegetation co...,Drought is a natural climatic phenomenon that ...,0.79961
2,6,3,128,Biochemistry,Immunology,time-delay; cancer immunotherapy; gene-regula...,"In this paper, we analyse the local stability ...",0.798928
3,6,3,128,Biochemistry,Immunology,Laboratory Medicine; Medical Biochemistry; ed...,Medical biochemistry is the usual name for cli...,0.798081
4,5,9,81,Medical Science,Skin Care,Early mobility in intensive care unit; Mobili...,BACKGROUND: Pressure ulcer formation continues...,0.799966
5,5,22,94,Medical Science,Healthy Sleep,breathing; sleep stages; autonomic regulation...,Healthy sleep can be characterized by several ...,0.799947
6,0,7,7,Computer Science,Parallel computing,Fast fuzzy c-means algorithm; Image segmentat...,The research on underwater image segmentation ...,0.799874
7,0,0,0,Computer Science,Computer vision,cell division rate; computer vision; elementa...,To understand how root growth responds to temp...,0.79967
8,1,13,30,Electrical Engineering,State space representation,Fuzzy transform; Concentrated solar collector...,This paper deals with the control of concentra...,0.799784
9,1,14,31,Electrical Engineering,PID controller,8 DOF; Fuzzy logic controller; Integrated sea...,Passenger travel comfort is important while an...,0.799771


### Distance

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values] # sort the values according to their entropy 
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # list successively used to compute the centroid

    for i in range(N_shots): 

      if i == 0: # select the document with the highest entropy
        first = ordered.iloc[0] # the first element has the highest entropy
        N_shots_data.append(pd.DataFrame.from_records([first.to_dict()])) # save the data of the document
        N_shots_embeddings.append(ordered_embeddings[0])
        ordered.drop(ordered.index[0], inplace=True) # drop the row corresponding to the element with the higest entropy for the successive iterations
        ordered['distance'] = np.linalg.norm(ordered_embeddings[0]-ordered_embeddings[1:], axis = 1) # Entropy * Euclidean_distance(highest_entropy - all_other_points)
        ordered_embeddings = ordered_embeddings[1:] # drop the embedding corresponding to the element with the higest entropy for the successive iterations
        continue # go to the second point

      first_id = np.argmax(ordered['distance']) # retrieve the id of the documnet with the highest distance*entropy
      first = ordered.iloc[first_id] # get the corresponding document
      N_shots_embeddings.append(ordered_embeddings[first_id]) # append in the list its embeddings
      centroid = np.mean(N_shots_embeddings, axis=0) # compute the centroid of the documente selected so far
      ordered.drop(ordered.index[first_id], inplace=True) # remove the document from the dataframe
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding
      ordered['distance'] = np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Entropy * Euclidean_distance(centroid - remaining_points)
      top = pd.DataFrame.from_records([ordered.iloc[np.argmax(ordered['distance'])].to_dict()]) # retrieve the data of the document with the highest Entropy*Distance
      N_shots_data.append(top) # save it

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [51]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance
0,4,0,61,Civil Engineering,Ambient Intelligence,pressure sensor; ambient intelligence; eating...,We present a novel sensor system for the suppo...,0.984267,
1,4,0,61,Civil Engineering,Ambient Intelligence,Dynamic heuristics-greedy search; cloud compu...,Research supervision services like providing s...,0.976283,1.112879
2,6,2,127,Biochemistry,Human Metabolism,early warning system; driver's biocybernetic ...,The paper describes a method and a device for ...,0.975556,
3,6,2,127,Biochemistry,Human Metabolism,thermal treatament; mineral content; dynamic;...,The quality of food products is a means used f...,0.970103,1.089452
4,5,9,81,Medical Science,Skin Care,delivery; Delphi method; formulation skin car...,OBJECTIVES: What are the opportunities to inno...,0.986114,
5,5,9,81,Medical Science,Skin Care,neural network; classification; the status of...,With the continuous development of peoples' li...,0.97867,1.135423
6,0,6,6,Computer Science,Image processing,image processing; nonlinear diffusion equatio...,A system of nonlinear parabolic equations desc...,0.995269,
7,0,0,0,Computer Science,Computer vision,fruit set rate; yield prediction; computer vi...,BACKGROUND: Grapevine flower number per inflor...,0.990512,1.014879
8,1,6,23,Electrical Engineering,Electrical network,Collinear; non-collinear; congruent; electric...,"E T Bell, the famous author of 'Men of Mathema...",0.989183,
9,1,2,19,Electrical Engineering,Electrical circuits,Frank Norris; McTeague; entropy; posthumanism...,This essay reinterprets Frank Norris's novel M...,0.987258,0.995665


### Entropy * Distance from centroid

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values] # sort the values according to their entropy 
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # list successively used to compute the centroid

    for i in range(N_shots): 

      if i == 0: # select the document with the highest entropy
        first = ordered.iloc[0] # the first element has the highest entropy
        N_shots_data.append(pd.DataFrame.from_records([first.to_dict()])) # save the data of the document
        N_shots_embeddings.append(ordered_embeddings[0])
        ordered.drop(ordered.index[0], inplace=True) # drop the row corresponding to the element with the higest entropy for the successive iterations
        ordered['distance*entropy'] = ordered['Entropy']*np.linalg.norm(ordered_embeddings[0]-ordered_embeddings[1:], axis = 1) # Entropy * Euclidean_distance(highest_entropy - all_other_points)
        ordered_embeddings = ordered_embeddings[1:] # drop the embedding corresponding to the element with the higest entropy for the successive iterations
        continue # go to the second point

      first_id = np.argmax(ordered['distance*entropy']) # retrieve the id of the documnet with the highest distance*entropy
      first = ordered.iloc[first_id] # get the corresponding document
      N_shots_embeddings.append(ordered_embeddings[first_id]) # append in the list its embeddings
      centroid = np.mean(N_shots_embeddings, axis=0) # compute the centroid of the documente selected so far
      ordered.drop(ordered.index[first_id], inplace=True) # remove the document from the dataframe
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding
      ordered['distance*entropy'] = ordered['Entropy']*np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Entropy * Euclidean_distance(centroid - remaining_points)
      top = pd.DataFrame.from_records([ordered.iloc[np.argmax(ordered['distance*entropy'])].to_dict()]) # retrieve the data of the document with the highest Entropy*Distance
      N_shots_data.append(top) # save it

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [53]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance*entropy
0,4,0,61,Civil Engineering,Ambient Intelligence,pressure sensor; ambient intelligence; eating...,We present a novel sensor system for the suppo...,0.984267,
1,4,0,61,Civil Engineering,Ambient Intelligence,Dynamic heuristics-greedy search; cloud compu...,Research supervision services like providing s...,0.976283,1.086485
2,6,2,127,Biochemistry,Human Metabolism,early warning system; driver's biocybernetic ...,The paper describes a method and a device for ...,0.975556,
3,6,2,127,Biochemistry,Human Metabolism,thermal treatament; mineral content; dynamic;...,The quality of food products is a means used f...,0.970103,1.05688
4,5,9,81,Medical Science,Skin Care,delivery; Delphi method; formulation skin car...,OBJECTIVES: What are the opportunities to inno...,0.986114,
5,5,9,81,Medical Science,Skin Care,neural network; classification; the status of...,With the continuous development of peoples' li...,0.97867,1.111204
6,0,6,6,Computer Science,Image processing,image processing; nonlinear diffusion equatio...,A system of nonlinear parabolic equations desc...,0.995269,
7,0,0,0,Computer Science,Computer vision,chicken egg; P-Tile; computer vision; morphol...,An Omega-3 chicken egg is a chicken egg produc...,0.989819,1.004859
8,1,6,23,Electrical Engineering,Electrical network,Collinear; non-collinear; congruent; electric...,"E T Bell, the famous author of 'Men of Mathema...",0.989183,
9,1,2,19,Electrical Engineering,Electrical circuits,Frank Norris; McTeague; entropy; posthumanism...,This essay reinterprets Frank Norris's novel M...,0.987258,0.982979


### Entropy + Distance from centroid

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values] # sort the values according to their entropy 
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # list successively used to compute the centroid

    for i in range(N_shots): 

      if i == 0: # select the document with the highest entropy
        first = ordered.iloc[0] # the first element has the highest entropy
        N_shots_data.append(pd.DataFrame.from_records([first.to_dict()])) # save the data of the document
        N_shots_embeddings.append(ordered_embeddings[0])
        ordered.drop(ordered.index[0], inplace=True) # drop the row corresponding to the element with the higest entropy for the successive iterations
        ordered['distance+entropy'] = ordered['Entropy']+np.linalg.norm(ordered_embeddings[0]-ordered_embeddings[1:], axis = 1) # Entropy + Euclidean_distance(highest_entropy - all_other_points)
        ordered_embeddings = ordered_embeddings[1:] # drop the embedding corresponding to the element with the higest entropy for the successive iterations
        continue # go to the second point

      first_id = np.argmax(ordered['distance+entropy']) # retrieve the id of the documnet with the highest distance*entropy
      first = ordered.iloc[first_id] # get the corresponding document
      N_shots_embeddings.append(ordered_embeddings[first_id]) # append in the list its embeddings
      centroid = np.mean(N_shots_embeddings, axis=0) # compute the centroid of the documente selected so far
      ordered.drop(ordered.index[first_id], inplace=True) # remove the document from the dataframe
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding
      ordered['distance+entropy'] = ordered['Entropy']+np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Entropy + Euclidean_distance(centroid - remaining_points)
      top = pd.DataFrame.from_records([ordered.iloc[np.argmax(ordered['distance+entropy'])].to_dict()]) # retrieve the data of the document with the highest Entropy+Distance
      N_shots_data.append(top) # save it
      
dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [55]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance+entropy
0,4,0,61,Civil Engineering,Ambient Intelligence,pressure sensor; ambient intelligence; eating...,We present a novel sensor system for the suppo...,0.984267,
1,4,0,61,Civil Engineering,Ambient Intelligence,Dynamic heuristics-greedy search; cloud compu...,Research supervision services like providing s...,0.976283,2.089162
2,6,2,127,Biochemistry,Human Metabolism,early warning system; driver's biocybernetic ...,The paper describes a method and a device for ...,0.975556,
3,6,2,127,Biochemistry,Human Metabolism,thermal treatament; mineral content; dynamic;...,The quality of food products is a means used f...,0.970103,2.059555
4,5,9,81,Medical Science,Skin Care,delivery; Delphi method; formulation skin car...,OBJECTIVES: What are the opportunities to inno...,0.986114,
5,5,9,81,Medical Science,Skin Care,neural network; classification; the status of...,With the continuous development of peoples' li...,0.97867,2.114093
6,0,6,6,Computer Science,Image processing,image processing; nonlinear diffusion equatio...,A system of nonlinear parabolic equations desc...,0.995269,
7,0,0,0,Computer Science,Computer vision,chicken egg; P-Tile; computer vision; morphol...,An Omega-3 chicken egg is a chicken egg produc...,0.989819,2.005013
8,1,6,23,Electrical Engineering,Electrical network,Collinear; non-collinear; congruent; electric...,"E T Bell, the famous author of 'Men of Mathema...",0.989183,
9,1,2,19,Electrical Engineering,Electrical circuits,Frank Norris; McTeague; entropy; posthumanism...,This essay reinterprets Frank Norris's novel M...,0.987258,1.982924


### $\alpha Entropy+ (1-\alpha) Distance$

In [56]:
alpha = 0.7 

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values] # sort the values according to their entropy 
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # list successively used to compute the centroid

    for i in range(N_shots): 

      if i == 0: # select the document with the highest entropy
        first = ordered.iloc[0] # the first element has the highest entropy
        N_shots_data.append(pd.DataFrame.from_records([first.to_dict()])) # save the data of the document
        N_shots_embeddings.append(ordered_embeddings[0])
        ordered.drop(ordered.index[0], inplace=True) # drop the row corresponding to the element with the higest entropy for the successive iterations
        ordered['aE+(1-a)D'] = alpha * ordered['Entropy']+(1-alpha)*np.linalg.norm(ordered_embeddings[0]-ordered_embeddings[1:], axis = 1) # alpha * Entropy + (1 - alpha) Euclidean_distance(highest_entropy - all_other_points)
        ordered_embeddings = ordered_embeddings[1:] # drop the embedding corresponding to the element with the higest entropy for the successive iterations
        continue # go to the second point

      first_id = np.argmax(ordered['aE+(1-a)D']) # retrieve the id of the documnet with the highest alpha * Entropy + (1-alpha)distance
      first = ordered.iloc[first_id] # get the corresponding document
      N_shots_embeddings.append(ordered_embeddings[first_id]) # append in the list its embeddings
      centroid = np.mean(N_shots_embeddings, axis=0) # compute the centroid of the documente selected so far
      ordered.drop(ordered.index[first_id], inplace=True) # remove the document from the dataframe
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding
      ordered['aE+(1-a)D'] = alpha * ordered['Entropy']+(1-alpha)*np.linalg.norm(centroid-ordered_embeddings, axis = 1) # alpha * Entropy + (1-alpha)Euclidean_distance(centroid - remaining_points)
      top = pd.DataFrame.from_records([ordered.iloc[np.argmax(ordered['aE+(1-a)D'])].to_dict()]) # retrieve the data of the document with the highest Entropy+Distance
      N_shots_data.append(top) # save it
      
dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [58]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,aE+(1-a)D
0,4,0,61,Civil Engineering,Ambient Intelligence,pressure sensor; ambient intelligence; eating...,We present a novel sensor system for the suppo...,0.984267,
1,4,0,61,Civil Engineering,Ambient Intelligence,Dynamic heuristics-greedy search; cloud compu...,Research supervision services like providing s...,0.976283,1.017262
2,6,2,127,Biochemistry,Human Metabolism,early warning system; driver's biocybernetic ...,The paper describes a method and a device for ...,0.975556,
3,6,2,127,Biochemistry,Human Metabolism,thermal treatament; mineral content; dynamic;...,The quality of food products is a means used f...,0.970103,1.005908
4,5,9,81,Medical Science,Skin Care,delivery; Delphi method; formulation skin car...,OBJECTIVES: What are the opportunities to inno...,0.986114,
5,5,9,81,Medical Science,Skin Care,neural network; classification; the status of...,With the continuous development of peoples' li...,0.97867,1.025696
6,0,6,6,Computer Science,Image processing,image processing; nonlinear diffusion equatio...,A system of nonlinear parabolic equations desc...,0.995269,
7,0,0,0,Computer Science,Computer vision,chicken egg; P-Tile; computer vision; morphol...,An Omega-3 chicken egg is a chicken egg produc...,0.989819,0.997431
8,1,6,23,Electrical Engineering,Electrical network,Collinear; non-collinear; congruent; electric...,"E T Bell, the famous author of 'Men of Mathema...",0.989183,
9,1,2,19,Electrical Engineering,Electrical circuits,Frank Norris; McTeague; entropy; posthumanism...,This essay reinterprets Frank Norris's novel M...,0.987258,0.98978


### Test set


In [22]:
test_dataset = pd.read_excel('test.xlsx', index_col=0)
test_docs = test_dataset['Abstract']
test_labels = get_mapped_labels(test_dataset['Domain'], REMAP_LEV1)

## Unsupervised

In [24]:
from torch import Tensor, sum, exp, mm, bmm, nn
class NContrastiveLoss(nn.Module):
    '''
    Compute generalised Contrastive Loss, where there is 1 positive and N negative labels
    for each instance. The embedding of the instance gets pulled close to the positive
    label embedding while being pushed away from each of the N negative labels embeddings.
    https://papers.nips.cc/paper/2016/hash/6b180037abbebea991d8b1232f8a8ca9-Abstract.html
    '''
    def __init__(self) -> None:
        super(NContrastiveLoss, self).__init__()

    def forward(self, anchor: Tensor, positive: Tensor, negatives: Tensor) -> torch.float:
        '''
        Pulls anchor and positive closer together and pushes anchor and negatives further
        apart.
        For each example in the batch, there is 1 anchor, 1 positive and N negatives.
        The loss formulated here optimizes the dot product.

        Parameters
        ----------
        anchor: 2D tensor
                batch of anchors embeddings
        positive: 2D tensor
                  batch of positive embedding
        negatives : 3D tensor
                    batch of N-negatives embeddings per

        Returns
        -------
        Float tensor
            Sum of N-contrastive-loss for each element of the batch.
        '''
        # Make anchor and positive tensors 3D, by expanding empty dimension 1.
        batch_size = len(anchor)
        anchor = anchor.unsqueeze(1)
        positive = positive.unsqueeze(1)
        # Compute loss.
        A = exp(bmm(anchor, positive.transpose(2, 1))).view(batch_size)
        B = sum(exp(bmm(anchor, negatives.transpose(2, 1)).squeeze(1)), dim=-1)
        return -sum(torch.log(A / (A + B)), dim=-1) / batch_size

In [29]:
def floored_cosine_knn(x, y):
  norm_x = x/np.linalg.norm(x)
  norm_y = y/np.linalg.norm(y)
  return max(0.0, 1 - np.dot(norm_x, norm_y))

### Freeze sentence transformer and train classifier

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

few_shot_unsupervised_model = FreezeSTTrainClassifier('sentence-transformers/all-mpnet-base-v2', 768, 768, device, 0.3)

lr = 1e-2
epochs = 4
batch_size = 256

criterion = NContrastiveLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, few_shot_unsupervised_model.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

training_documents = document_class(dataframes['Abstract'], dataframes['Domain'])
training_dataloader = DataLoader(training_documents, batch_size=batch_size, shuffle=True)

In [None]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf
for epoch in range(epochs):

  training_loss = 0

  few_shot_unsupervised_model.train()

  for batch, (training_labels, docs) in enumerate(training_dataloader):
    document_embeddings = few_shot_unsupervised_model(docs)
    positives = few_shot_unsupervised_model(training_labels)
    negatives = []
    for idx in range(len(positives)):
      x = torch.cat((positives[:idx],positives[idx+1:]))
      negatives.append(x)
    negatives = torch.stack(negatives)
    loss = criterion(document_embeddings, positives, negatives)
    optimizer.zero_grad()
    loss.backward()
    training_loss += loss
    optimizer.step()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')
  writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  few_shot_unsupervised_model.eval()
  test_document_embeddings = []

  with torch.no_grad():
    label_embeddings = few_shot_unsupervised_model(labels)

  test_dataloader = DataLoader(list(test_docs), batch_size=256, shuffle=False)
  for docs in tqdm(test_dataloader):
    with torch.no_grad():
      embs = few_shot_unsupervised_model(docs)
    test_document_embeddings.extend(embs)

  test_document_embeddings = torch.stack(test_document_embeddings)
  knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
  knn.fit(label_embeddings.cpu(), labels)
  y_pred = knn.predict(test_document_embeddings.cpu())
  F1 = f1_score(test_labels, y_pred, average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

Batch: 0/1, epoch: 0/4. Training loss: 0.001.


100%|██████████| 37/37 [06:31<00:00, 10.59s/it]


Epoch: 0, F1 macro: 0.5851738226839022
Batch: 0/1, epoch: 1/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 1, F1 macro: 0.5847957419510921
Batch: 0/1, epoch: 2/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 2, F1 macro: 0.5844989402337981
Batch: 0/1, epoch: 3/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 3, F1 macro: 0.5834537498872712


### Only classifier

In [None]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

test_embeddings = model.encode(list(test_docs), convert_to_tensor=True, show_progress_bar = True)
lab_embeddings = model.encode(labels, convert_to_tensor=True)

In [296]:
import torch.optim as optim
from torch.utils.data import DataLoader

classifier_unsupervised = Classifier(768, 768, device)

lr = 1e-3
epochs = 3
batch_size = 256

criterion = NContrastiveLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, classifier_unsupervised.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

documents = tuple(zip(labels_embeddings, np.asarray(doc_embeddings)))
training_dataloader = DataLoader(documents, batch_size=batch_size, shuffle=True, worker_init_fn=seed_worker, generator=g,)

In [297]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf
for epoch in range(epochs):

  training_loss = 0

  classifier_unsupervised.train()

  for batch, (training_labels, docs) in enumerate(training_dataloader):
    docs = docs.to(device)
    training_labels = training_labels.to(device)
    document_embeddings = classifier_unsupervised(docs)
    positives = classifier_unsupervised(training_labels)
    negatives = []
    for idx in range(len(positives)):
      x = torch.cat((positives[:idx],positives[idx+1:]))
      negatives.append(x)
    negatives = torch.stack(negatives)
    loss = criterion(document_embeddings, positives, negatives)
    optimizer.zero_grad()
    loss.backward()
    training_loss += loss
    optimizer.step()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')
  writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  classifier_unsupervised.eval()
  test_document_embeddings = []

  with torch.no_grad():
    label_embeddings = classifier_unsupervised(lab_embeddings)

  test_dataloader = DataLoader(test_embeddings, batch_size=256, shuffle=False, worker_init_fn=seed_worker, generator=g,)
  for docs in tqdm(test_dataloader):
    with torch.no_grad():
      embs = classifier_unsupervised(docs)
    test_document_embeddings.extend(embs)

  test_document_embeddings = torch.stack(test_document_embeddings)
  knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
  knn.fit(label_embeddings.cpu(), labels)
  y_pred = knn.predict(test_document_embeddings.cpu())
  F1 = f1_score(test_labels, y_pred, average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

Batch: 0/1, epoch: 0/3. Training loss: 1.977.


100%|██████████| 37/37 [00:00<00:00, 979.22it/s]


Epoch: 0, F1 macro: 0.5936985521676835
Batch: 0/1, epoch: 1/3. Training loss: 1.850.


100%|██████████| 37/37 [00:00<00:00, 1070.63it/s]


Epoch: 1, F1 macro: 0.5842909769194541
Batch: 0/1, epoch: 2/3. Training loss: 1.723.


100%|██████████| 37/37 [00:00<00:00, 970.41it/s]


Epoch: 2, F1 macro: 0.5596005901266742


In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs

### Finetune the sentence transformer



In [34]:
from transformers import AutoTokenizer, AutoModel
from torch import nn
class TextEncoder(nn.Module):
    def __init__(self, device, model_name: str = 'sentence-transformers/all-mpnet-base-v2') -> None:
        super(TextEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)

    # def forward(self, text: Union[str, List[str]]) -> Tensor:
    def forward(self, text) -> torch.Tensor:
        inp = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        inp = inp.to(device)
        out = self.model(**inp)[0]  # First element of model_output contains all token embeddings.
        out = self.mean_pooling(out, inp['attention_mask'])
        if isinstance(text, str):  # If input is just 1 string -> return 1D embeddings.
            out = out.squeeze(0)
        return nn.functional.normalize(out, p=2, dim=-1)

    def mean_pooling(self, token_embeddings, attention_mask):
        input_mask_expanded = \
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [35]:
class complete_model_finetuning(nn.Module):
  def __init__(self, sentence_transformer_model, st_embedding_dimension, num_classes, device):
    super().__init__()
    self.st = TextEncoder(device, sentence_transformer_model)
    self.device = device
    self.to(device)

  def forward(self, documents):
    docs = self.st(documents) # assign a score to each class for every document
    return docs

In [41]:
import torch.optim as optim
from torch.utils.data import DataLoader

finetuned_model = complete_model_finetuning('sentence-transformers/all-mpnet-base-v2', 768, 768, device)
lr = 1e-3
epochs = 3
batch_size = 128

criterion = NContrastiveLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, finetuned_model.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

training_documents = document_class(dataframes['Abstract'], dataframes['Domain'])
training_dataloader = DataLoader(training_documents, batch_size=batch_size, shuffle=True)

In [43]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf

finetuned_model.eval()
test_document_embeddings = []

with torch.no_grad():
  label_embeddings = finetuned_model(labels)

test_dataloader = DataLoader(list(test_docs), batch_size=128, shuffle=False)
for docs in tqdm(test_dataloader):
  with torch.no_grad():
    embs = finetuned_model(docs)
  test_document_embeddings.extend(embs)

test_document_embeddings = torch.stack(test_document_embeddings)
knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
knn.fit(label_embeddings.cpu(), labels)
y_pred = knn.predict(test_document_embeddings.cpu())
F1 = f1_score(test_labels, y_pred, average='macro')
print(f'Epoch: {epoch}, F1 macro: {F1}')

writer.add_scalar("F1_macro/epoch", F1, 0)

for epoch in range(epochs):
  

  training_loss = 0

  finetuned_model.train()

  for batch, (training_labels, docs) in enumerate(training_dataloader):
    document_embeddings = finetuned_model(list(docs))
    positives = finetuned_model(list(training_labels))
    negatives = []
    for idx in range(len(positives)):
      x = torch.cat((positives[:idx],positives[idx+1:]))
      negatives.append(x)
    negatives = torch.stack(negatives)
    loss = criterion(document_embeddings, positives, negatives)
    optimizer.zero_grad()
    loss.backward()
    training_loss += loss
    optimizer.step()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')
  writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  finetuned_model.eval()
  test_document_embeddings = []

  with torch.no_grad():
    label_embeddings = finetuned_model(labels)

  test_dataloader = DataLoader(list(test_docs), batch_size=128, shuffle=False)
  for docs in tqdm(test_dataloader):
    with torch.no_grad():
      embs = finetuned_model(docs)
    test_document_embeddings.extend(embs)

  test_document_embeddings = torch.stack(test_document_embeddings)
  knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
  knn.fit(label_embeddings.cpu(), labels)
  y_pred = knn.predict(test_document_embeddings.cpu())
  F1 = f1_score(test_labels, y_pred, average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

100%|██████████| 74/74 [05:32<00:00,  4.49s/it]


Epoch: 1, F1 macro: 0.5958131839993924
Batch: 0/1, epoch: 0/3. Training loss: 1.947.


100%|██████████| 74/74 [05:35<00:00,  4.53s/it]


Epoch: 0, F1 macro: 0.19327256312110172
Batch: 0/1, epoch: 1/3. Training loss: 1.811.


100%|██████████| 74/74 [05:32<00:00,  4.50s/it]


Epoch: 1, F1 macro: 0.031276656241114593
Batch: 0/1, epoch: 2/3. Training loss: 1.962.


100%|██████████| 74/74 [05:32<00:00,  4.49s/it]


Epoch: 2, F1 macro: 0.030474156985147327


In [None]:
!pip install tensorboard
%load_ext tensorboard
%tensorboard --logdir=runs

## Supervised

In [None]:
from tqdm import tqdm
def predict(model, data, batch_size, device):
  model.to(device)
  model.eval()
  dataloader = DataLoader(data, batch_size=batch_size, shuffle=False, collate_fn=my_collate_fn)
  y_pred = []
  for docs, _ in tqdm(dataloader):
    with torch.no_grad():
      y_pred.extend(model(docs).cpu().numpy())
  return y_pred

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
integer_labels = le.fit_transform(labels)

int_top_1_labels = le.transform(dataframes['Domain'])

ohe = OneHotEncoder(sparse=False)
ohe.fit(integer_labels.reshape(-1,1))

ohe_top_1_labels = ohe.transform(int_top_1_labels.reshape(-1,1)) # (32889, 7)

In [None]:
int_test_labels = le.transform(test_labels)
ohe_test_labels = ohe.transform(int_test_labels.reshape(-1,1)) # (9398, 7)

test_documents = document_class(test_docs, ohe_test_labels)

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

supervised_model = FreezeSTTrainClassifier('sentence-transformers/all-mpnet-base-v2', 768, len(labels), device)

lr = 1e-1
epochs = 2
batch_size = 256

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, supervised_model.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

training_documents = document_class(dataframes['Abstract'], ohe_top_1_labels)
training_dataloader = DataLoader(training_documents, batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)

In [None]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf
for epoch in range(epochs):

  training_loss = 0

  supervised_model.train()

  for batch, (docs, training_labels) in enumerate(training_dataloader):
    training_labels = training_labels.to(device)

    probabilities = supervised_model(docs)
    loss = criterion(probabilities, training_labels)
    training_loss += loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')
    writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  supervised_model.eval()
  y_pred = predict(supervised_model, test_documents, 256, device)
  F1 = f1_score(int_test_labels, np.argmax(np.asarray(y_pred), axis = 1), average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs