<a href="https://colab.research.google.com/github/MorenoSara/Few-Shot_Text_Classification/blob/main/zero_shot_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers

In [261]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers.util import cos_sim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_dataset = pd.read_excel('train.xlsx', index_col=0) # 32889 samples

In [29]:
REMAP_LEV1 = {'CS': 'Computer Science', 
              'Civil': 'Civil Engineering', 
              'ECE': 'Electrical Engineering', 
              'Psychology': 'Psychology', 
              'MAE': 'Mechanical Engineering', 
              'Medical': 'Medical Science', 
              'biochemistry': 'Biochemistry'}

In [31]:
def get_mapped_labels(data, mapping_dict):
  labels = set(data)
  labels = [l.strip() for l in labels]
  return list(map(lambda l: mapping_dict[l], labels))

In [46]:
labels = get_mapped_labels(train_dataset['Domain'], REMAP_LEV1)
abstracts = train_dataset['Abstract']

In [69]:
labels

['Medical Science',
 'Computer Science',
 'Electrical Engineering',
 'Biochemistry',
 'Psychology',
 'Mechanical Engineering',
 'Civil Engineering']

In [256]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device = device)

In [259]:
labels_embeddings = model.encode(labels)
doc_embeddings = model.encode(abstracts[:50]) # directly encode the entire documents 

In [253]:
from scipy import spatial
def floored_cosine_knn(x, y):
  norm_x = x/np.linalg.norm(x)
  norm_y = y/np.linalg.norm(y)
  return max(0.0, 1 - np.dot(norm_x, norm_y))

In [254]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
knn.fit(labels_embeddings, labels)
knn.predict(doc_embeddings)

array(['Psychology', 'Computer Science', 'Electrical Engineering',
       'Biochemistry'], dtype='<U22')

### Document embeddings with entropy calculation

In [None]:
import nltk
nltk.download('punkt')
import scipy
import numpy as np

In [262]:
def floored_cosine(X, Y):
  norm_x = X / np.linalg.norm(X, axis=1, keepdims=True)
  norm_y = Y / np.linalg.norm(Y, axis=1, keepdims=True)
  return np.maximum(0, np.matmul(norm_x, norm_y.T))

def floored_cosine_tensors(X, Y):
  sim = cos_sim(X, Y)
  return np.maximum(0, sim)

In [266]:
def get_entropies(sentences, labels_embeddings):
  # probs = [[floored_cosine_knn(s, y) for y in labels_embeddings] for s in sentences]
  # probs = floored_cosine(sentences, labels_embeddings)
  probs = floored_cosine_tensors(sentences, labels_embeddings)
  normalized_probs = probs/np.linalg.norm(probs)
  normalized_entropy = scipy.stats.entropy(normalized_probs, axis = 1)/np.log(labels_embeddings.shape[0])
  return 1 - normalized_entropy

In [267]:
docs = []
for abs in abstracts[:4]:
  sentences = nltk.tokenize.sent_tokenize(abs)
  sent_embs = model.encode(sentences)
  entropies = get_entropies(sent_embs, labels_embeddings)
  docs.append(np.dot(entropies, sent_embs)/sum(entropies))

docs_embeddings = np.array(docs) # shape (num_docs, 768)

In [268]:
knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', metric = floored_cosine_knn)
knn.fit(labels_embeddings, labels)
knn.predict(docs_embeddings) # same results obtained without entropy 

array(['Psychology', 'Computer Science', 'Mechanical Engineering',
       'Biochemistry'], dtype='<U22')