<a href="https://colab.research.google.com/github/MorenoSara/Few-Shot_Text_Classification/blob/main/zero_shot_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers.util import cos_sim
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train_dataset = pd.read_excel('train.xlsx', index_col=0) # 32889 samples
train_dataset.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,5,43,115,Medical,Parkinson's Disease,deep brain stimulation; basal ganglia; parkin...,The subthalamic nucleus (STN) and globus palli...
1,0,1,1,CS,Machine learning,Supervised classification; Label ranking prob...,Preference learning is the branch of machine l...
2,0,7,7,CS,Parallel computing,Track fitting; Track reconstruction; Multiple...,Modern semiconductor detectors allow for charg...
3,6,6,131,biochemistry,Polymerase chain reaction,bacterial community; calves; probiotics; rume...,The objective of this study was to assess the ...
4,6,6,131,biochemistry,Polymerase chain reaction,Bladder cancer; glycoprotein nonmetastatic me...,Glycoprotein nonmetastatic melanoma protein B ...


In [4]:
REMAP_LEV1 = {'CS': 'Computer Science', 
              'Civil': 'Civil Engineering', 
              'ECE': 'Electrical Engineering', 
              'Psychology': 'Psychology', 
              'MAE': 'Mechanical Engineering', 
              'Medical': 'Medical Science', 
              'biochemistry': 'Biochemistry'}

In [5]:
def get_mapped_labels(data, mapping_dict):
  labels = set(data)
  labels = [l.strip() for l in labels]
  return list(map(lambda l: mapping_dict[l], labels))

In [6]:
labels = get_mapped_labels(train_dataset['Domain'], REMAP_LEV1)
abstracts = train_dataset['Abstract']

In [7]:
labels

['Psychology',
 'Computer Science',
 'Electrical Engineering',
 'Mechanical Engineering',
 'Civil Engineering',
 'Biochemistry',
 'Medical Science']

In [8]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

In [9]:
labels_embeddings = model.encode(labels)
doc_embeddings = model.encode(abstracts[:5]) # directly encode the entire documents 

In [10]:
def floored_cosine_knn(x, y):
  norm_x = x/np.linalg.norm(x)
  norm_y = y/np.linalg.norm(y)
  return max(0.0, 1 - np.dot(norm_x, norm_y))

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
knn.fit(labels_embeddings, labels)
knn.predict(doc_embeddings)

array(['Psychology', 'Computer Science', 'Electrical Engineering',
       'Biochemistry', 'Biochemistry'], dtype='<U22')

### Document embeddings with entropy calculation

In [12]:
import nltk
nltk.download('punkt')
import scipy
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def floored_cosine(X, Y):
  norm_x = X / np.linalg.norm(X, axis=1, keepdims=True)
  norm_y = Y / np.linalg.norm(Y, axis=1, keepdims=True)
  return np.maximum(0, np.matmul(norm_x, norm_y.T))

def floored_cosine_tensors(X, Y):
  sim = cos_sim(X, Y)
  return np.maximum(0, sim)

In [14]:
def get_entropies(sentences, labels_embeddings):
  # probs = [[floored_cosine_knn(s, y) for y in labels_embeddings] for s in sentences]
  # probs = floored_cosine(sentences, labels_embeddings)
  probs = floored_cosine_tensors(sentences, labels_embeddings)
  normalized_probs = probs/np.linalg.norm(probs)
  normalized_entropy = scipy.stats.entropy(normalized_probs, axis = 1)/np.log(labels_embeddings.shape[0])
  return 1 - normalized_entropy

In [15]:
docs = []
for abs in abstracts[:5]:
  sentences = nltk.tokenize.sent_tokenize(abs)
  sent_embs = model.encode(sentences)
  entropies = get_entropies(sent_embs, labels_embeddings)
  docs.append(np.dot(entropies, sent_embs)/sum(entropies))

docs_embeddings = np.array(docs) # shape (num_docs, 768)

In [16]:
knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
knn.fit(labels_embeddings, labels)
knn.predict(docs_embeddings) # almost the same results obtained without entropy 

array(['Psychology', 'Computer Science', 'Mechanical Engineering',
       'Biochemistry', 'Biochemistry'], dtype='<U22')