<a href="https://colab.research.google.com/github/MorenoSara/Few-Shot_Text_Classification/blob/main/Few_shot_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup: install (or upgrade to) the latest sentence-transformers release via the shell.
!pip install -U sentence-transformers

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers.util import cos_sim
import numpy as np
import scipy

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load data

In [None]:
# Read the training spreadsheet from the working directory; the first spreadsheet column becomes the index.
train_dataset = pd.read_excel('train.xlsx', index_col=0) # 32889 samples
# Preview the first 10 rows.
train_dataset.head(10)

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,5,43,115,Medical,Parkinson's Disease,deep brain stimulation; basal ganglia; parkin...,The subthalamic nucleus (STN) and globus palli...
1,0,1,1,CS,Machine learning,Supervised classification; Label ranking prob...,Preference learning is the branch of machine l...
2,0,7,7,CS,Parallel computing,Track fitting; Track reconstruction; Multiple...,Modern semiconductor detectors allow for charg...
3,6,6,131,biochemistry,Polymerase chain reaction,bacterial community; calves; probiotics; rume...,The objective of this study was to assess the ...
4,6,6,131,biochemistry,Polymerase chain reaction,Bladder cancer; glycoprotein nonmetastatic me...,Glycoprotein nonmetastatic melanoma protein B ...
5,3,1,53,MAE,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...
6,6,2,127,biochemistry,Human Metabolism,BORON; MANGANESE; MEMBRANES; CELL WALLS; NADH...,Boron has been recognized since 1923 as an ess...
7,2,7,40,Psychology,Depression,putamen; basal ganglia; magnetic resonance im...,Putamen volume is seen to alter in neurologica...
8,0,4,4,CS,Operating systems,Wireless sensor networks; localized interacti...,Wireless sensor networks (WSNs) are characteri...
9,2,4,37,Psychology,Prosocial behavior,Eye contact effects; Therapeutic implications...,Introduction. The perception of a direct gaze ...


In [None]:
# Raw 'Domain' codes found in the dataset -> human-readable class names.
REMAP_LEV1 = dict(
    CS='Computer Science',
    Civil='Civil Engineering',
    ECE='Electrical Engineering',
    Psychology='Psychology',
    MAE='Mechanical Engineering',
    Medical='Medical Science',
    biochemistry='Biochemistry',
)

In [None]:
def get_mapped_labels(data, mapping_dict):
  """Translate raw label strings into their mapped names.

  Args:
    data: iterable of strings; each one is stripped of surrounding whitespace first.
    mapping_dict: dict from stripped raw label to mapped name.

  Returns:
    A list of mapped names, in the iteration order of `data`.

  Raises:
    KeyError: if a stripped label is not a key of `mapping_dict`.
  """
  # Strip everything first, then look up, so all stripping happens before any lookup.
  cleaned = [raw.strip() for raw in data]
  return [mapping_dict[name] for name in cleaned]

In [None]:
# Class names present in the data. Every raw 'Domain' value is stripped and remapped BEFORE
# de-duplication (so 'CS' and 'CS ' collapse into one class), and the result is sorted so the
# label order -- and anything indexed by it -- is identical on every run. Iterating a set of
# strings directly gives an order that can change between kernel sessions.
labels = sorted(set(get_mapped_labels(train_dataset['Domain'], REMAP_LEV1)))
# Abstract text of every training document (a pandas Series sharing train_dataset's index).
abstracts = train_dataset['Abstract']

In [None]:
labels

['Medical Science',
 'Psychology',
 'Electrical Engineering',
 'Computer Science',
 'Civil Engineering',
 'Mechanical Engineering',
 'Biochemistry']

### Compute entropy

In [None]:
# Load the pretrained all-mpnet-base-v2 sentence encoder onto the selected device (downloaded on first use, see the progress bars below).
st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# Embed the class names themselves; documents are later compared against these vectors.
labels_embeddings = st_model.encode(labels)
# The line below (disabled) would embed the first 200 abstracts directly; a later cell loads document embeddings from a file instead.
# doc_embeddings = st_model.encode(abstracts[:200], batch_size = 32, show_progress_bar=True) # directly encode the entire documents 

In [None]:
#with open('doc_embeddings.txt','wb') as f:
    #for line in np.matrix(doc_embeddings):
        #np.savetxt(f, line)

In [None]:
# Load document embeddings from a space-separated text file with no header row.
# NOTE(review): the path is a hard-coded Google Drive location and no cell visible here mounts Drive -- confirm how the file is made available.
# NOTE(review): presumably one row per training document, in the same order as train_dataset -- verify.
df = pd.read_csv('/content/drive/MyDrive/doc_embeddings.txt', sep = ' ', header=None) 
# retrieve corresponding document using train_dataset.iloc[i]

In [None]:
def floored_cosine_tensors(X, Y):
  """Pairwise cosine similarity between X and Y with negative values replaced by 0.

  Computes sentence_transformers.util.cos_sim(X, Y) and takes the element-wise maximum with 0.
  """
  sim = cos_sim(X, Y)
  # NOTE(review): cos_sim presumably returns a torch tensor, so the container type of this
  # numpy-ufunc result (tensor vs ndarray) depends on the installed torch/numpy versions -- confirm.
  return np.maximum(0, sim)

In [None]:
def get_entropies(sentences, labels_embeddings):
  """Normalized entropy of each sentence's similarity distribution over the labels.

  For every row of `sentences`, the floored cosine similarities to all label embeddings are
  turned into a probability distribution (each row divided by its own sum) and the Shannon
  entropy of that distribution is divided by log(number of labels).

  Args:
    sentences: 2-D array-like of sentence embeddings, one row per sentence.
    labels_embeddings: 2-D array of label embeddings, one row per label.

  Returns:
    1-D numpy array with one value per sentence. A row whose floored similarities are all
    zero has no defined distribution and yields NaN.
  """
  sims = np.asarray(floored_cosine_tensors(sentences, labels_embeddings))
  # Normalize per row. The previous code divided the whole matrix by its Frobenius norm,
  # which does not make rows sum to 1 and only worked because scipy renormalizes internally.
  row_totals = sims.sum(axis=1, keepdims=True)
  with np.errstate(divide='ignore', invalid='ignore'): # an all-zero row deliberately becomes NaN
    probs = sims / row_totals
  # Dividing by log(#labels) rescales the entropy so that a uniform distribution scores 1.
  normalized_entropy = scipy.stats.entropy(probs, axis = 1)/np.log(labels_embeddings.shape[0])
  return normalized_entropy

In [None]:
# Cast the loaded embeddings to float32 before computing similarities.
df = df.astype(np.float32)
# NOTE(review): NumPy documents np.matrix as no longer recommended; consider a plain ndarray here -- confirm nothing downstream relies on matrix semantics.
doc_embeddings = np.matrix(df)
# One normalized-entropy value per row of doc_embeddings.
entropies = get_entropies(doc_embeddings, labels_embeddings)

  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


In [None]:
# NOTE: this is an alias, not a copy -- every change made to train_df below is also made to train_dataset.
train_df = train_dataset
# Attach the per-document entropy as a new column.
train_df['Entropy'] = entropies
# Replace the raw domain codes with their readable names.
# NOTE(review): not re-runnable -- on a second execution the already-mapped names (e.g. 'Computer Science') are not keys of REMAP_LEV1 and the lookup raises KeyError.
train_df['Domain'] = get_mapped_labels(train_df['Domain'], REMAP_LEV1)
# Drop, in place, every row that has a missing value in any column (this includes rows whose entropy is NaN).
train_df.dropna(inplace=True)
# Display the resulting frame.
train_df

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,5,43,115,Medical Science,Parkinson's Disease,deep brain stimulation; basal ganglia; parkin...,The subthalamic nucleus (STN) and globus palli...,0.577499
1,0,1,1,Computer Science,Machine learning,Supervised classification; Label ranking prob...,Preference learning is the branch of machine l...,0.784086
2,0,7,7,Computer Science,Parallel computing,Track fitting; Track reconstruction; Multiple...,Modern semiconductor detectors allow for charg...,0.543961
3,6,6,131,Biochemistry,Polymerase chain reaction,bacterial community; calves; probiotics; rume...,The objective of this study was to assess the ...,0.358490
4,6,6,131,Biochemistry,Polymerase chain reaction,Bladder cancer; glycoprotein nonmetastatic me...,Glycoprotein nonmetastatic melanoma protein B ...,0.337900
...,...,...,...,...,...,...,...,...
32884,5,26,98,Medical Science,HIV/AIDS,Biomarkers; Cytokine; HIV; Gingival crevicula...,Objective: This study evaluates the potential ...,0.349118
32885,3,3,55,Mechanical Engineering,Machine design,Red Catuai; Poisson's ratio; density; elastic...,The finite element method has been employed in...,0.766171
32886,2,18,51,Psychology,Problem-solving,health care issues; health care utilization; ...,The University Hospital of Zurich offers a tex...,0.699593
32887,5,41,113,Medical Science,Overactive Bladder,attentional and interoceptive networks; brain...,BACKGROUND: Treatment of urgency urinary incon...,0.828843


Create a dataframe for each label

In [None]:
# One sub-frame of train_df per class, keyed by the class name with spaces replaced by underscores.
dfs = dict()
for label in labels:
  dfs[label.replace(' ', '_')] = train_df.loc[train_df['Domain'] == label]

In [None]:
from torch.utils.data.dataset import Dataset
class document_class(Dataset):
  """Dataset of [label, document] pairs.

  `labels[i]` is paired with the i-th element of `documents`; item i is the
  two-element list `[labels[i], documents[i]]`.
  """

  def __init__(self, documents, labels):
    # Build all samples up front; each one is a list so it stays mutable like before.
    self.train_df = [[labels[position], text] for position, text in enumerate(documents)]

  def __len__(self):
    return len(self.train_df)

  def __getitem__(self, index):
    return self.train_df[index]

In [None]:
def my_collate_fn(batch):
  """Collate [label, document] samples into (documents, label tensor).

  Args:
    batch: sequence of samples where sample[0] is an iterable label vector and
      sample[1] is the document.

  Returns:
    A tuple (list of documents, torch.Tensor built from the list of label vectors).
  """
  documents = [sample[1] for sample in batch]
  labels = [list(sample[0]) for sample in batch]
  return (documents, torch.Tensor(labels))

In [None]:
def save_model(model, model_path):
    """Write the model's state dict (not the whole module) to `model_path`."""
    weights = model.state_dict()
    torch.save(weights, model_path)

def load_model(model, model_path, use_cuda=True):
    """Load a saved state dict into `model` and return the same model object.

    The weights are mapped to 'cuda:0' only when `use_cuda` is true and CUDA is
    available; otherwise they are mapped to the CPU.
    """
    wants_gpu = use_cuda and torch.cuda.is_available()
    map_location = 'cuda:0' if wants_gpu else 'cpu'
    state = torch.load(model_path, map_location)
    model.load_state_dict(state)
    return model

# Model class - freeze sentence transformer and train classification layer

In [None]:
from torch import nn
class FreezeSTTrainClassifier(nn.Module):
  """Frozen SentenceTransformer encoder followed by dropout and a trainable linear layer.

  Args:
    sentence_transformer_model: name or path passed to SentenceTransformer.
    st_embedding_dimension: input size of the linear layer.
    num_classes: output size of the linear layer.
    device: device the whole module is moved to.
    dropout: dropout probability applied to the embeddings before the linear layer.
  """

  def __init__(self, sentence_transformer_model, st_embedding_dimension, num_classes, device, dropout = 0.3):
    super().__init__()
    self.device = device

    # Encoder: every parameter is excluded from gradient computation.
    self.st = SentenceTransformer(sentence_transformer_model)
    for frozen_param in self.st.parameters():
      frozen_param.requires_grad = False

    # Linear head; its weight matrix starts as a (rectangular) identity.
    self.classification = nn.Linear(in_features=st_embedding_dimension, out_features=num_classes)
    nn.init.eye_(self.classification.weight)

    self.dropout = nn.Dropout(dropout)
    self.to(device)

  def forward(self, documents):
    """Return one score per class for every document (raw linear outputs, no softmax applied here)."""
    with torch.no_grad():
      embedded = self.st.encode(documents, convert_to_tensor=True) # exploit pretrained sentence transformer
    return self.classification(self.dropout(embedded))

# Few-shot text classification
Also works for one-shot classification (`N_shots = 1`).

In [None]:
N_shots = 1  # number of examples selected for each class by the selection cells below

### Random

In [None]:
from numpy.random import randint  # no longer used by this cell; kept so the name stays available to any later cell that expects it
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    # Draw distinct row positions uniformly at random. randint() sampled WITH replacement,
    # so the same document could be returned more than once for a class.
    n_pick = min(N_shots, len(dfs[k])) # a class with fewer than N_shots documents contributes all of them
    picked = np.random.choice(len(dfs[k]), size=n_pick, replace=False)
    N_shots_data.append(dfs[k].iloc[picked])
dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

In [None]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,6,7,132,Biochemistry,Northern blotting,alternative splicing; double sex; sex differe...,Many basic cellular processes are shared acros...,0.451398
1,6,3,128,Biochemistry,Immunology,Multiple sclerosis; immunology,Background: Glycosylation alterations have bee...,0.334992
2,6,7,132,Biochemistry,Northern blotting,Mac-2BP; hTERT; gastric cancer,Mac-2 binding protein (Mac-2BP) is a secreted ...,0.34053
3,0,14,14,Computer Science,Computer programming,computer programming; flowchart-based environ...,Intelligent tutoring and personalization are c...,0.846882
4,0,3,3,Computer Science,Cryptography,Logic synthesis; Boolean functions; affine eq...,Affine equivalence classification of Boolean f...,0.121161
5,0,14,14,Computer Science,Computer programming,Teaching with innovation; learning by doing; ...,In this paper a work related to using Web3D-ba...,0.953498
6,5,49,121,Medical Science,Smoking Cessation,recurrent preterm birth; smoking cessation ...,BACKGROUND: Women with at least 1 prior occurr...,0.423962
7,5,52,124,Medical Science,Stress Management,Information security stress; Information secu...,Organizations are intensifying their informati...,0.969191
8,5,52,124,Medical Science,Stress Management,cerebral palsy; quality of life; pediatric di...,Background: Cerebral palsy (CP) is the most co...,0.717612
9,1,13,30,Electrical Engineering,State space representation,Adaptive Control; Iterative Methods; Least-Sq...,In order to adapt to a milling machines changi...,0.666075


### Entropy

In [None]:
# Only documents whose entropy does not exceed this cap are eligible.
ENTROPY_CAP = 0.4

N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1].values] # rows in descending entropy order
    ordered = ordered[ordered['Entropy'] <= ENTROPY_CAP]

    # Take the N_shots eligible documents with the highest entropy.
    # No embeddings are needed for this criterion, so none are computed; slicing (rather than
    # repeatedly reading row 0 and dropping it in place) also copes with a class that has
    # fewer than N_shots eligible documents instead of raising IndexError.
    selected = ordered.iloc[:N_shots]
    if len(selected) < N_shots:
      print(f"{k}: only {len(selected)} of {N_shots} requested documents have Entropy <= {ENTROPY_CAP}")
    N_shots_data.append(selected)
dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
0,5,7,40,Medical Science,Depression,rape; socioeconomic status; social support; d...,Objective: To investigate association of the s...,0.699905
1,2,8,41,Psychology,Borderline personality disorder,Addiction rehabilitation center; Health servi...,Objectives: This study aimed to: (1) identify ...,0.699935
2,1,14,31,Electrical Engineering,PID controller,Lower extremity; Rehabilitation exoskeleton; ...,This paper presents an active disturbance reje...,0.699475
3,0,13,13,Computer Science,Algorithm design,Aggregation; Model optimization; Reduction al...,It was an important part of the aggregation si...,0.699878
4,4,1,62,Civil Engineering,Geotextile,geotechnical applications; fabric drape; jute...,From the very inception of the Indian Jute Ind...,0.699842
5,3,4,56,Mechanical Engineering,Fluid mechanics,Smooth particle hydrodynamics (SPH); Chemotax...,"Chemotaxis, the microorganisms autonomous moti...",0.699326
6,6,1,126,Biochemistry,Cell biology,allogeneicity; bioengineered organs; extracel...,Purpose of review Organ donation in the United...,0.699368


### Distance

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    # Candidate pool: the N_shots*2 documents of this class with the highest entropy, in descending order.
    # .copy() yields an independent frame, so adding the score column below leaves dfs[k] untouched
    # and does not trigger pandas' SettingWithCopyWarning.
    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values].copy()
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # embeddings of the documents selected so far, used to compute the centroid
    first_id = 0 # the seed is the highest-entropy document, i.e. position 0 of the pool

    for i in range(min(N_shots, len(ordered))): # never request more shots than the pool holds

      if i > 0: # later picks: the remaining document farthest from the centroid of the picks so far
        centroid = np.mean(N_shots_embeddings, axis=0) # with a single pick this is the seed embedding itself
        distances = np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Euclidean_distance(centroid - remaining_points)
        ordered['distance'] = distances
        first_id = int(np.argmax(distances)) # position in the pool, valid for both .iloc and the embedding array

      # Save the document that was actually chosen and add that same document's embedding to the
      # centroid. Previously the chosen document was dropped unsaved and a different one was recorded.
      N_shots_data.append(ordered.iloc[[first_id]])
      N_shots_embeddings.append(ordered_embeddings[first_id])
      ordered = ordered.drop(ordered.index[first_id]) # remove the document from the pool
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance
0,0,6,6,Computer Science,Image processing,history and philosophy of astronomy; techniqu...,Astronomy has a rich tradition of using color ...,0.975657,
1,0,14,14,Computer Science,Computer programming,Boundaries; computer programming; digital jou...,Amid growing calls for greater collaboration b...,0.926477,1.127094
2,0,1,1,Computer Science,Machine learning,Additive manufacturing; Three-dimensional (3D...,Medical additive manufacturing requires standa...,0.923507,1.090925
3,5,48,120,Medical Science,Senior Health,agenda-setting; health policy; professional d...,The filtering of potential policy issues from ...,0.916925,
4,5,25,97,Medical Science,Hereditary Angioedema,Swelling; Angioedema; Trachea; Hyperelasticit...,"Angioedema, the rapid swelling of under-skin t...",0.766006,1.209832
5,5,52,124,Medical Science,Stress Management,Group-level job resources; Job demands resour...,Objectives: This study adds a multilevel persp...,0.808135,1.113796
6,3,1,53,Mechanical Engineering,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...,0.937003,
7,3,7,59,Mechanical Engineering,Materials Engineering,Virtual laboratory; Video tutorials; Concrete...,This paper presents a teaching approach aiming...,0.915513,1.146891
8,3,2,54,Mechanical Engineering,Manufacturing engineering,Functional clothing; sports wear; insulation;...,Increasing quality requirements for functional...,0.92943,1.068232
9,2,3,36,Psychology,Nonverbal communication,Language; culture; intercultural competence; ...,The article focuses on development of intercul...,0.908942,


### Entropy * Distance from centroid

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    # Candidate pool: the N_shots*2 documents of this class with the highest entropy, in descending order.
    # .copy() yields an independent frame, so adding the score column below leaves dfs[k] untouched
    # and does not trigger pandas' SettingWithCopyWarning.
    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values].copy()
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # embeddings of the documents selected so far, used to compute the centroid
    first_id = 0 # the seed is the highest-entropy document, i.e. position 0 of the pool

    for i in range(min(N_shots, len(ordered))): # never request more shots than the pool holds

      if i > 0: # later picks: the remaining document with the highest Entropy * distance from the centroid
        centroid = np.mean(N_shots_embeddings, axis=0) # with a single pick this is the seed embedding itself
        scores = ordered['Entropy'].values*np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Entropy * Euclidean_distance(centroid - remaining_points)
        ordered['distance*entropy'] = scores
        first_id = int(np.argmax(scores)) # position in the pool, valid for both .iloc and the embedding array

      # Save the document that was actually chosen and add that same document's embedding to the
      # centroid. Previously the chosen document was dropped unsaved and a different one was recorded.
      N_shots_data.append(ordered.iloc[[first_id]])
      N_shots_embeddings.append(ordered_embeddings[first_id])
      ordered = ordered.drop(ordered.index[first_id]) # remove the document from the pool
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance*entropy
0,0,6,6,Computer Science,Image processing,history and philosophy of astronomy; techniqu...,Astronomy has a rich tradition of using color ...,0.975657,
1,0,0,0,Computer Science,Computer vision,Scilab toolbox development; system identifica...,We present a collaborative attempt to build se...,0.928948,1.04436
2,0,14,14,Computer Science,Computer programming,Boundaries; computer programming; digital jou...,Amid growing calls for greater collaboration b...,0.926477,1.008082
3,5,48,120,Medical Science,Senior Health,agenda-setting; health policy; professional d...,The filtering of potential policy issues from ...,0.916925,
4,5,52,124,Medical Science,Stress Management,Group-level job resources; Job demands resour...,Objectives: This study adds a multilevel persp...,0.808135,0.921048
5,5,25,97,Medical Science,Hereditary Angioedema,Swelling; Angioedema; Trachea; Hyperelasticit...,"Angioedema, the rapid swelling of under-skin t...",0.766006,0.859889
6,3,1,53,Mechanical Engineering,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...,0.937003,
7,3,7,59,Mechanical Engineering,Materials Engineering,Virtual laboratory; Video tutorials; Concrete...,This paper presents a teaching approach aiming...,0.915513,1.016281
8,3,3,55,Mechanical Engineering,Machine design,Iron ore; Magnetic separation; Process instru...,In this work an ultrasound-based measurement m...,0.655212,0.676675
9,2,3,36,Psychology,Nonverbal communication,Language; culture; intercultural competence; ...,The article focuses on development of intercul...,0.908942,


In [None]:
# Inspect the candidate pool of one class: the N_shots*2 'Electrical_Engineering' rows with the highest entropy, in descending order.
dfs['Electrical_Engineering'].iloc[(dfs['Electrical_Engineering']['Entropy']).argsort()[::-1][:N_shots*2].values]

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy
67,1,5,22,Electrical Engineering,System identification,Fuzzy wavelet neural networks; System identif...,Food product safety is one of the most promisi...,0.946384
91,1,2,19,Electrical Engineering,Electrical circuits,acoustic signal; induction motor; feature ext...,A correct diagnosis of electrical circuits is ...,0.859619
83,1,13,30,Electrical Engineering,State space representation,System theory; process modelling; titration c...,Environmental problems have positive slope and...,0.816843
191,1,5,22,Electrical Engineering,System identification,Chaos; parameter identification; hidden attra...,Parameter estimation plays an important role i...,0.796248
95,1,9,25,Electrical Engineering,Electrical generator,Apex locator; electric pulp tester; electroca...,Spinal cord stimulation has been a therapeutic...,0.776884
80,1,9,25,Electrical Engineering,Analog signal processing,Analogue electronic circuits; Particle tracki...,This paper presents the first pixel detector r...,0.652375


### Entropy + Distance from centroid

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    # Candidate pool: the N_shots*2 documents of this class with the highest entropy, in descending order.
    # .copy() yields an independent frame, so adding the score column below leaves dfs[k] untouched
    # and does not trigger pandas' SettingWithCopyWarning.
    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values].copy()
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # embeddings of the documents selected so far, used to compute the centroid
    first_id = 0 # the seed is the highest-entropy document, i.e. position 0 of the pool

    for i in range(min(N_shots, len(ordered))): # never request more shots than the pool holds

      if i > 0: # later picks: the remaining document with the highest Entropy + distance from the centroid
        centroid = np.mean(N_shots_embeddings, axis=0) # with a single pick this is the seed embedding itself
        scores = ordered['Entropy'].values+np.linalg.norm(centroid-ordered_embeddings, axis = 1) # Entropy + Euclidean_distance(centroid - remaining_points)
        ordered['distance+entropy'] = scores
        first_id = int(np.argmax(scores)) # position in the pool, valid for both .iloc and the embedding array

      # Save the document that was actually chosen and add that same document's embedding to the
      # centroid. Previously the chosen document was dropped unsaved and a different one was recorded.
      N_shots_data.append(ordered.iloc[[first_id]])
      N_shots_embeddings.append(ordered_embeddings[first_id])
      ordered = ordered.drop(ordered.index[first_id]) # remove the document from the pool
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,distance+entropy
0,0,6,6,Computer Science,Image processing,history and philosophy of astronomy; techniqu...,Astronomy has a rich tradition of using color ...,0.975657,
1,0,14,14,Computer Science,Computer programming,Boundaries; computer programming; digital jou...,Amid growing calls for greater collaboration b...,0.926477,2.053572
2,0,1,1,Computer Science,Machine learning,Additive manufacturing; Three-dimensional (3D...,Medical additive manufacturing requires standa...,0.923507,2.014431
3,5,48,120,Medical Science,Senior Health,agenda-setting; health policy; professional d...,The filtering of potential policy issues from ...,0.916925,
4,5,52,124,Medical Science,Stress Management,Group-level job resources; Job demands resour...,Objectives: This study adds a multilevel persp...,0.808135,1.947855
5,5,25,97,Medical Science,Hereditary Angioedema,Swelling; Angioedema; Trachea; Hyperelasticit...,"Angioedema, the rapid swelling of under-skin t...",0.766006,1.888568
6,3,1,53,Mechanical Engineering,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...,0.937003,
7,3,7,59,Mechanical Engineering,Materials Engineering,Virtual laboratory; Video tutorials; Concrete...,This paper presents a teaching approach aiming...,0.915513,2.02558
8,3,3,55,Mechanical Engineering,Machine design,Iron ore; Magnetic separation; Process instru...,In this work an ultrasound-based measurement m...,0.655212,1.68797
9,2,3,36,Psychology,Nonverbal communication,Language; culture; intercultural competence; ...,The article focuses on development of intercul...,0.908942,


### $\alpha \cdot \mathrm{Entropy} + (1-\alpha) \cdot \mathrm{Distance}$

In [None]:
# Weight of the entropy term in the combined score alpha * Entropy + (1 - alpha) * distance; the distance term gets 1 - alpha.
alpha = 0.7 

In [None]:
N_shots_data = []

for k in dfs.keys(): # for each dataframe containing the documents relative to a single label
  if len(dfs[k]) > 0: # if the dataframe has at least an entry

    # Candidate pool: the N_shots*2 documents of this class with the highest entropy, in descending order.
    # .copy() yields an independent frame, so adding the score column below leaves dfs[k] untouched
    # and does not trigger pandas' SettingWithCopyWarning.
    ordered = dfs[k].iloc[(dfs[k]['Entropy']).argsort()[::-1][:N_shots*2].values].copy()
    ordered_embeddings = st_model.encode(list(ordered['Abstract']), batch_size = 256, show_progress_bar=True) # compute the documents embeddings

    N_shots_embeddings = [] # embeddings of the documents selected so far, used to compute the centroid
    first_id = 0 # the seed is the highest-entropy document, i.e. position 0 of the pool

    for i in range(min(N_shots, len(ordered))): # never request more shots than the pool holds

      if i > 0: # later picks: the remaining document with the highest alpha * Entropy + (1 - alpha) * distance
        centroid = np.mean(N_shots_embeddings, axis=0) # with a single pick this is the seed embedding itself
        scores = alpha * ordered['Entropy'].values+(1-alpha)*np.linalg.norm(centroid-ordered_embeddings, axis = 1) # alpha * Entropy + (1-alpha) * Euclidean_distance(centroid - remaining_points)
        ordered['aE+(1-a)D'] = scores
        first_id = int(np.argmax(scores)) # position in the pool, valid for both .iloc and the embedding array

      # Save the document that was actually chosen and add that same document's embedding to the
      # centroid. Previously the chosen document was dropped unsaved and a different one was recorded.
      N_shots_data.append(ordered.iloc[[first_id]])
      N_shots_embeddings.append(ordered_embeddings[first_id])
      ordered = ordered.drop(ordered.index[first_id]) # remove the document from the pool
      ordered_embeddings = np.delete(ordered_embeddings, first_id, 0) # remove its embedding

dataframes = pd.concat(N_shots_data).reset_index(drop=True) # input of concat is a list of dataframes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the few-shot set built by concatenating the selected documents of every label.
dataframes

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract,Entropy,aE+(1-a)D
0,0,6,6,Computer Science,Image processing,history and philosophy of astronomy; techniqu...,Astronomy has a rich tradition of using color ...,0.975657,
1,0,0,0,Computer Science,Computer vision,Scilab toolbox development; system identifica...,We present a collaborative attempt to build se...,0.928948,0.987535
2,0,14,14,Computer Science,Computer programming,Boundaries; computer programming; digital jou...,Amid growing calls for greater collaboration b...,0.926477,0.974958
3,5,48,120,Medical Science,Senior Health,agenda-setting; health policy; professional d...,The filtering of potential policy issues from ...,0.916925,
4,5,52,124,Medical Science,Stress Management,Group-level job resources; Job demands resour...,Objectives: This study adds a multilevel persp...,0.808135,0.907611
5,5,51,123,Medical Science,Sprains and Strains,*Ankle Joint [surgery]; Chronic Disease; Join...,Background Chronic lateral ankle instability o...,0.777258,0.873189
6,3,1,53,Mechanical Engineering,Hydraulics,Groundwater hydraulics; Aquitard; Bangladesh;...,Identifying flow processes in multi-aquifer fl...,0.937003,
7,3,7,59,Mechanical Engineering,Materials Engineering,Virtual laboratory; Video tutorials; Concrete...,This paper presents a teaching approach aiming...,0.915513,0.973879
8,3,3,55,Mechanical Engineering,Machine design,Iron ore; Magnetic separation; Process instru...,In this work an ultrasound-based measurement m...,0.655212,0.768476
9,2,3,36,Psychology,Nonverbal communication,Language; culture; intercultural competence; ...,The article focuses on development of intercul...,0.908942,


### Test set


In [None]:
# Read the test split; the first spreadsheet column becomes the dataframe index.
test_dataset = pd.read_excel('test.xlsx', index_col=0)
# Abstract texts of the test documents (the inputs to classify).
test_docs = test_dataset['Abstract']
# Ground-truth labels: the 'Domain' column passed through get_mapped_labels with REMAP_LEV1
# (both defined outside this section). Presumably this maps raw domain codes such as "CS"
# to the full names shown in the tables above — TODO confirm against the helper.
test_labels = get_mapped_labels(test_dataset['Domain'], REMAP_LEV1)

## Unsupervised

In [None]:
from torch import Tensor, sum, exp, mm, bmm
class NContrastiveLoss(nn.Module):
    '''
    Compute generalised Contrastive Loss, where there is 1 positive and N negative labels
    for each instance. The embedding of the instance gets pulled close to the positive
    label embedding while being pushed away from each of the N negative labels embeddings.
    https://papers.nips.cc/paper/2016/hash/6b180037abbebea991d8b1232f8a8ca9-Abstract.html

    The loss is evaluated with a log-softmax over the similarity scores rather than with
    explicit exponentials, so large dot products cannot overflow to inf/NaN.
    '''
    def __init__(self) -> None:
        super().__init__()

    def forward(self, anchor: Tensor, positive: Tensor, negatives: Tensor) -> Tensor:
        '''
        Pulls anchor and positive closer together and pushes anchor and negatives further
        apart.
        For each example in the batch, there is 1 anchor, 1 positive and N negatives.
        The loss formulated here optimizes the dot product.

        Parameters
        ----------
        anchor: 2D tensor, shape (batch, dim)
                batch of anchors embeddings
        positive: 2D tensor, shape (batch, dim)
                  batch of positive embeddings, one per anchor
        negatives : 3D tensor, shape (batch, N, dim)
                    batch of N negative embeddings per anchor

        Returns
        -------
        0-dim float tensor
            Mean over the batch of -log( exp(a.p) / (exp(a.p) + sum_n exp(a.n)) ).
        '''
        batch_size = len(anchor)
        # Similarity of each anchor with its own positive: (batch, 1).
        pos_logits = (anchor * positive).sum(dim=-1, keepdim=True)
        # Similarity of each anchor with each of its negatives: (batch, N).
        neg_logits = torch.bmm(negatives, anchor.unsqueeze(2)).squeeze(2)
        # Column 0 holds the positive, so the per-example loss is the negative
        # log-softmax probability of column 0. log_softmax subtracts the row maximum
        # internally, which keeps the computation finite for large scores.
        logits = torch.cat((pos_logits, neg_logits), dim=1)
        log_prob_positive = torch.log_softmax(logits, dim=1)[:, 0]
        return -log_prob_positive.sum(dim=-1) / batch_size

In [None]:
def floored_cosine_knn(x, y):
  """Cosine distance (1 - cosine similarity) between two vectors, clipped below at 0.

  Both inputs are scaled to unit Euclidean norm before the dot product is taken.
  Rounding can push the similarity of (almost) parallel vectors slightly above 1;
  the floor prevents a negative distance from being returned in that case.
  """
  unit_x = x / np.linalg.norm(x)
  unit_y = y / np.linalg.norm(y)
  distance = 1 - np.dot(unit_x, unit_y)
  return distance if distance > 0.0 else 0.0

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Model for the contrastive experiment. FreezeSTTrainClassifier is defined outside this section;
# judging by its name it presumably keeps the sentence-transformer frozen and trains a head on
# top — TODO confirm. The positional arguments are assumed to be
# (model name, input dim, output dim, device, dropout) — verify against the class definition.
few_shot_unsupervised_model = FreezeSTTrainClassifier('sentence-transformers/all-mpnet-base-v2', 768, 768, device, 0.3)

# Training hyper-parameters. NOTE: these global names (and criterion / optimizer /
# training_documents / training_dataloader below) are reassigned by the supervised section.
lr = 1e-2
epochs = 4
batch_size = 256

criterion = NContrastiveLoss()
# Only parameters that still require gradients are handed to Adam.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, few_shot_unsupervised_model.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

# Few-shot training set: selected abstracts paired with their 'Domain' value.
# No collate_fn is passed here, so the DataLoader's default collation is used.
training_documents = document_class(dataframes['Abstract'], dataframes['Domain'])
training_dataloader = DataLoader(training_documents, batch_size=batch_size, shuffle=True)

In [None]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf # not read by this loop; kept so the global name stays defined

# The test loader does not depend on the epoch: build it once instead of once per epoch.
test_dataloader = DataLoader(list(test_docs), batch_size=256, shuffle=False)

for epoch in range(epochs):

  # Running sum of the batch losses, kept as a plain float. Accumulating the loss tensor
  # itself would keep every batch's autograd graph alive until the end of the epoch.
  training_loss = 0.0

  few_shot_unsupervised_model.train()

  # NOTE(review): the batch is unpacked as (labels, docs), whereas the supervised loop unpacks
  # (docs, labels) from a loader built on the same document_class — confirm the order that
  # document_class.__getitem__ returns under default collation.
  for batch, (training_labels, docs) in enumerate(training_dataloader):
    document_embeddings = few_shot_unsupervised_model(docs)
    positives = few_shot_unsupervised_model(training_labels)
    # Negatives of example i: the positives of every other example in the batch.
    # NOTE(review): if several documents in a batch share a label, embeddings of the anchor's
    # own label presumably end up among its negatives — verify whether that is intended.
    negatives = torch.stack([torch.cat((positives[:idx], positives[idx+1:])) for idx in range(len(positives))])
    loss = criterion(document_embeddings, positives, negatives)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    training_loss += loss.item()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')
  writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  # Evaluation: embed the label names and every test document, then assign each document
  # the label whose embedding is nearest under the floored cosine distance.
  few_shot_unsupervised_model.eval()
  test_batches = []

  with torch.no_grad():
    label_embeddings = few_shot_unsupervised_model(labels)

  for docs in tqdm(test_dataloader):
    with torch.no_grad():
      embs = few_shot_unsupervised_model(docs)
    test_batches.append(embs.cpu()) # move off the device right away to bound accelerator memory

  test_document_embeddings = torch.cat(test_batches, dim=0)
  knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
  knn.fit(label_embeddings.cpu().numpy(), labels)
  y_pred = knn.predict(test_document_embeddings.numpy())
  F1 = f1_score(test_labels, y_pred, average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

Batch: 0/1, epoch: 0/4. Training loss: 0.001.


100%|██████████| 37/37 [06:31<00:00, 10.59s/it]


Epoch: 0, F1 macro: 0.5851738226839022
Batch: 0/1, epoch: 1/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 1, F1 macro: 0.5847957419510921
Batch: 0/1, epoch: 2/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 2, F1 macro: 0.5844989402337981
Batch: 0/1, epoch: 3/4. Training loss: 0.000.


100%|██████████| 37/37 [06:31<00:00, 10.58s/it]


Epoch: 3, F1 macro: 0.5834537498872712


In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs

## Supervised

In [None]:
from tqdm import tqdm
def predict(model, data, batch_size, device):
  """Run `model` over `data` in inference mode and collect its outputs.

  The model is moved to `device` and switched to eval mode. `data` is iterated in order
  (no shuffling) in batches of `batch_size` using `my_collate_fn`; only the first element
  of each collated batch is fed to the model, the second one is ignored.

  Returns a list with one numpy array per input example (the rows of the model output).
  """
  model.to(device)
  model.eval()
  loader = DataLoader(data, batch_size=batch_size, shuffle=False, collate_fn=my_collate_fn)
  outputs = []
  for batch_docs, _ in tqdm(loader):
    with torch.no_grad():
      batch_out = model(batch_docs)
    # extend() iterates over the first axis, so each row is stored as its own array
    outputs.extend(batch_out.cpu().numpy())
  return outputs

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Map label names to integer ids; the encoder is fitted on the full label list so that the
# same ids are used for the training and the test labels.
le = LabelEncoder()
integer_labels = le.fit_transform(labels)

# Integer ids of the labels of the selected few-shot documents.
int_top_1_labels = le.transform(dataframes['Domain'])

# Dense one-hot encoder over the integer ids. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back to the old one on releases that do not know it.
try:
  ohe = OneHotEncoder(sparse_output=False)
except TypeError:
  ohe = OneHotEncoder(sparse=False)
ohe.fit(integer_labels.reshape(-1,1))

ohe_top_1_labels = ohe.transform(int_top_1_labels.reshape(-1,1)) # (len(dataframes), number of labels)

In [None]:
# Encode the test labels with the encoders fitted above, so the ids match the training ids.
int_test_labels = le.transform(test_labels)
ohe_test_labels = ohe.transform(int_test_labels.reshape(-1,1)) # (9398, 7)

# Dataset pairing each test abstract with its one-hot label (document_class is defined
# outside this section).
test_documents = document_class(test_docs, ohe_test_labels)

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader

# Classifier with one output per label. FreezeSTTrainClassifier is defined outside this
# section; the positional arguments are assumed to be (model name, input dim, output dim,
# device) — TODO confirm against the class definition.
supervised_model = FreezeSTTrainClassifier('sentence-transformers/all-mpnet-base-v2', 768, len(labels), device)

# NOTE: lr, epochs, batch_size, criterion, optimizer, training_documents and
# training_dataloader overwrite the globals of the same name set up for the contrastive
# experiment; re-running that experiment's loop after this cell would pick up these values.
lr = 1e-1
epochs = 2
batch_size = 256

criterion = torch.nn.CrossEntropyLoss()
# Only parameters that still require gradients are handed to Adam.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, supervised_model.parameters()), lr=lr, betas = [0.9, 0.999], eps=1e-8)

# Few-shot training set: selected abstracts paired with their one-hot labels, batched
# through my_collate_fn (defined outside this section).
training_documents = document_class(dataframes['Abstract'], ohe_top_1_labels)
training_dataloader = DataLoader(training_documents, batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)

In [None]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from tqdm import tqdm
writer = SummaryWriter()
best_eval_loss = np.inf # not read by this loop; kept so the global name stays defined
for epoch in range(epochs):

  # Running sum of the batch losses, kept as a plain float. Accumulating the loss tensor
  # itself would keep every batch's autograd graph alive until the end of the epoch.
  training_loss = 0.0

  supervised_model.train()

  for batch, (docs, training_labels) in enumerate(training_dataloader):
    training_labels = training_labels.to(device)

    # NOTE(review): CrossEntropyLoss applies log-softmax itself, so the model is presumably
    # expected to return raw scores here despite the variable name — confirm.
    probabilities = supervised_model(docs)
    loss = criterion(probabilities, training_labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    training_loss += loss.item()

    print(f'Batch: {batch}/{len(training_dataloader)}, epoch: {epoch}/{epochs}. Training loss: {training_loss:.3f}.')

  # Log once per epoch (the tag is "Loss/epoch"); logging inside the batch loop wrote
  # several points at the same step.
  writer.add_scalar("Loss/epoch", training_loss, epoch+1)

  # Evaluation on the test set: predicted class = index of the largest model output.
  supervised_model.eval()
  y_pred = predict(supervised_model, test_documents, 256, device)
  F1 = f1_score(int_test_labels, np.argmax(np.asarray(y_pred), axis = 1), average='macro')
  print(f'Epoch: {epoch}, F1 macro: {F1}')

  writer.add_scalar("F1_macro/epoch", F1, epoch+1)

writer.flush()

In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs