# imports

In [1]:
import collections.abc
collections.Iterable = collections.abc.Iterable

from sentence_transformers import SentenceTransformer

# import collections.abc
# collections.Iterable = collections.abc.Iterable
from domain_classifier.classifier import CorpusClassifier
# from domain_classifier.active_learner import ActiveLearner
# from domain_classifier.query_strategy import *

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# 
import pandas as pd
import numpy as np

import pickle
import os
from pathlib import Path
import json
import time
import sys

import matplotlib.pyplot as plt

#USE ENTIRE SCREEN
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# functions

In [2]:
def getLocation(name='',*args):
    return Path({ 'testcorpus': 'testdata/given/corpus.feather',
                  'embeddings': 'testdata/buffer/embeddings/embeddings.pkl',
                  'models': 'testdata/buffer/models'}[name])
def get_weak_soft_labels(df_dataset,keywords):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    #embeddings_fname = getPath('embeddings')
    embeddings_fname = getLocation('embeddings')
    embeddings_fname.parent.mkdir(parents=True, exist_ok=True)
    if embeddings_fname.exists():
        with open(embeddings_fname, "rb") as f_in:
            doc_embeddings = pickle.load(f_in)
    else:
        n_docs = len(df_dataset['text'].to_numpy())
        batch_size = 32
        doc_embeddings = model.encode(df_dataset['text'].values[0:n_docs],batch_size=batch_size,show_progress_bar=True)
        with open(embeddings_fname, 'wb') as f_Out:
            print('5')
            pickle.dump(doc_embeddings,f_Out)

    keyword_embeddings = model.encode(keywords)
    distances = cosine_similarity(doc_embeddings, keyword_embeddings)
    return np.mean(distances, axis=1)
def get_weak_labels(df_corpus,threshold):
    return (df_corpus['weak_soft_label'] > threshold)*1
def concatTitleAndText(df_corpus):
    return (df_corpus['title'] + '. '+ df_corpus['description'])
def printInfo(df_corpus):
    positive_labels = df_corpus['weak_label'].sum()
    negative_labels = len(df_corpus) - positive_labels
    info = { 'positiveLabels':positive_labels,'negativeLabels': negative_labels }
    print(info)
def oversample_minority_class(df_dataset,oversampling_rate=10,col_label='labels'):
    if oversampling_rate < 0:
        oversampling_rate = 10**10
    iPositiveCount = df_dataset.loc[:][col_label].sum()
    iNegativeCount = len(df_dataset)-iPositiveCount
    iClassCount = int(oversampling_rate * np.min([iPositiveCount,iNegativeCount]))
    iClassCount = int(np.min([iClassCount,np.max([iPositiveCount,iNegativeCount])]))
    oversampling_rate = iClassCount/np.min([iPositiveCount,iNegativeCount])
    
    condition_positive = df_dataset.loc[:][col_label]==1
    condition_negative = df_dataset.loc[:][col_label]==0
    df_positive = df_dataset[condition_positive]
    df_negative = df_dataset[condition_negative]
    
    n_repeat = iClassCount // len(df_positive)
    idx_positive = df_positive.loc[df_positive.index.repeat(n_repeat)].index
    n_sample = np.mod(iClassCount,len(df_positive))
    idx_positive = np.concatenate([idx_positive,df_positive[:n_sample].index])
    #idx_positive = np.concatenate([idx_positive,df_positive.sample(n_sample).index])
    
    n_repeat = iClassCount // len(df_negative)
    idx_negative = df_negative.loc[df_negative.index.repeat(n_repeat)].index
    n_sample = np.mod(iClassCount,len(df_negative))
    idx_negative = np.concatenate([idx_negative,df_negative[:n_sample].index])
    #idx_negative = np.concatenate([idx_negative,df_negative.sample(n_sample).index])
    
    indices = np.hstack([idx_positive,idx_negative])
    return [df_dataset.loc[indices].reset_index(drop=True),oversampling_rate] 
    

In [3]:
df_corpus = pd.read_feather(getLocation('testcorpus'))
df_corpus.loc[:,['text']] = concatTitleAndText(df_corpus)
df_corpus.drop(['acronym', 'title', 'description'], inplace=True, axis=1)
df_corpus.loc[:,['weak_soft_label']] = get_weak_soft_labels(df_corpus,['Deep Learning'])
df_corpus = df_corpus.sort_values(by=['weak_soft_label'], ascending = False)

In [4]:
#FIND GOOD THRESHOLD
threshold = 0.3
n_display = 1000
pd.options.display.max_rows = n_display
df_corpus[df_corpus['weak_soft_label']<threshold].head(n_display)

Unnamed: 0,id,text,weak_soft_label
41616,203187,Embedded Neuromorphic Sensory Processor. Neuro...,0.299942
57938,225924,The Nº1 Social Media Profiling Solution: under...,0.299916
17481,102004,Perception and Action in Accelerating Environm...,0.2997
45724,211018,Advanced Data Modeling and Analysis Applied to...,0.299694
55742,203135,Neurocomputational mechanisms underlying age-r...,0.299418
21533,89837,Task Specific Description of Visual Color Info...,0.299254
51839,232004,EXPLAINABLE AI PIPELINES FOR BIG COPERNICUS DA...,0.299243
50595,230890,Gravitational-wave data mining. Gravitational-...,0.29924
32132,232242,Acquiring assembly skills by robot learning. P...,0.298846
4904,110613,Design Principles in Encoding Complex Noisy En...,0.298769


In [5]:
df_corpus.loc[:,['weak_label']] = get_weak_labels(df_corpus,threshold)
printInfo(df_corpus)

{'positiveLabels': 520, 'negativeLabels': 60491}


In [6]:
df_classifier = pd.DataFrame(df_corpus[['id','text','weak_label']].to_numpy(),columns=['id','text','labels'])

In [7]:
#80/20
df_train, df_val = train_test_split( df_classifier, test_size=0.2, random_state=42, stratify = df_classifier['labels'].to_numpy())
#40/40
df_train, df_test = train_test_split( df_classifier, test_size=0.5, random_state=42, stratify = df_classifier['labels'].to_numpy())
df_train,_ = oversample_minority_class(df_train,oversampling_rate=1)
clf = CorpusClassifier(path2transformers=getLocation('models'))
clf.train_loop(df_train,df_val)
clf.predict_proba(df_test)


Train batch:   0%|                                                                             | 0/520 [00:00<?, ?it/s]

> [1;32mc:\users\tilma\tfm\git\domain_classification\src\mass_data\domain_classifier\custom_model.py[0m(358)[0;36mtrain_model[1;34m()[0m
[1;32m    356 [1;33m            [0mpdb[0m[1;33m.[0m[0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    357 [1;33m[1;33m[0m[0m
[0m[1;32m--> 358 [1;33m            [0mtext[0m [1;33m=[0m [0mdata[0m[1;33m.[0m[0mget[0m[1;33m([0m[1;34m"text"[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    359 [1;33m            sample_weight = data.get(
[0m[1;32m    360 [1;33m                "sample_weight", torch.tensor(1)).to(device)
[0m
ipdb> n
> [1;32mc:\users\tilma\tfm\git\domain_classification\src\mass_data\domain_classifier\custom_model.py[0m(359)[0;36mtrain_model[1;34m()[0m
[1;32m    357 [1;33m[1;33m[0m[0m
[0m[1;32m    358 [1;33m            [0mtext[0m [1;33m=[0m [0mdata[0m[1;33m.[0m[0mget[0m[1;33m([0m[1;34m"text"[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--

Train batch:   0%|                                                                             | 0/520 [08:07<?, ?it/s]


BdbQuit: 

In [9]:
#200532

#embs.shape: torch.Size([1, 512, 768])
#embs.sum() tensor(705.0136, device='cuda:0')
#attention_mask.shape torch.Size([1, 512])
#attention_mask.sum() tensor(389, device='cuda:0')

#         text = data.get("text")
#             sample_weight = data.get(
#                 "sample_weight", torch.tensor(1)).to(device)

#             # Tokenize
#             tokenized = self.tokenizer(
#                 text, padding="max_length", truncation=True)
#             # print(tokenized["input_ids"])
#             input_ids = torch.tensor(tokenized["input_ids"]).to(device)
#             attention_mask = torch.tensor(
#                 tokenized["attention_mask"]).to(device)
#             # Embeddings
#             embs = self.embeddings(input_ids)

#             # zero the parameter gradients
#             optimizer.zero_grad()

#             # forward + backward + optimize

#             outputs = self.forward(embs, attention_mask)

keywords = ['Deep Learning']
model = SentenceTransformer('all-MiniLM-L6-v2')
#embeddings_fname = getPath('embeddings')
embeddings_fname = getLocation('embeddings')
embeddings_fname.parent.mkdir(parents=True, exist_ok=True)
if embeddings_fname.exists():
    with open(embeddings_fname, "rb") as f_in:
        doc_embeddings = pickle.load(f_in)
else:
    n_docs = len(df_dataset['text'].to_numpy())
    batch_size = 32
    doc_embeddings = model.encode(df_dataset['text'].values[0:n_docs],batch_size=batch_size,show_progress_bar=True)
    with open(embeddings_fname, 'wb') as f_Out:
        print('5')
        pickle.dump(doc_embeddings,f_Out)

keyword_embeddings = model.encode(keywords)
distances = cosine_similarity(doc_embeddings, keyword_embeddings)

In [11]:
doc_embeddings.shape

(61011, 384)