In [1]:
from clustering_tool.embedders.bert import *
from clustering_tool.embedders.word2vec import *
from clustering_tool.embedders.common import *
from clustering_tool.embedders.tfidf import *
from clustering_tool.datasets import *

In [2]:
# Pick the compute device once for the whole notebook: use CUDA when
# torch reports it as available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Build the BERT tokenizer and model once at module level; every bert_*
# function below reuses these two objects instead of creating its own.
# NOTE(review): create_tokenizer / create_model arrive via the star imports
# at the top of the notebook (presumably clustering_tool.embedders.bert) — confirm.
bertTokenizer = create_tokenizer()
bertModel = create_model()

def get_file_paths(dataset_name, emb_name):
    """Build the input/output file paths for one dataset/embedding pair.

    Args:
        dataset_name: Name of the dataset; the input text file is
            ``data/<dataset_name>.txt``.
        emb_name: Embedding identifier used as the suffix of the output file.

    Returns:
        A ``(input_path, output_path)`` tuple, where ``output_path`` is
        ``data/<dataset_name>_<emb_name>.npy``.
    """
    return (
        'data/{}.txt'.format(dataset_name),
        'data/{}_{}.npy'.format(dataset_name, emb_name),
    )

def tf_idf(dataset_name):
    """Generate TF-IDF embeddings for a dataset.

    A vectorizer is built from the dataset's text file via
    ``tfidf_vectorizer``, wrapped in a ``TfIdfEmbedder``, and handed to
    ``read_n_save`` together with the ``data/<dataset_name>.txt`` input path
    and the ``data/<dataset_name>_tf_idf.npy`` output path.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'tf_idf')
    vectorizer = tfidf_vectorizer(src_path)
    read_n_save(src_path, dst_path, embedder=TfIdfEmbedder(vectorizer))
    
    
def word2vec_md_mean(dataset_name):
    """Generate word2vec embeddings with the ``en_core_web_md`` model.

    The embedder is created with its default embedding strategy and passed
    to ``read_n_save``; the output path is
    ``data/<dataset_name>_word2vec_md_avg.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'word2vec_md_avg')
    read_n_save(
        src_path,
        dst_path,
        embedder=Word2VecEmbedder('en_core_web_md'),
    )
    
def word2vec_lg_mean(dataset_name):
    """Generate word2vec embeddings with the ``en_core_web_lg`` model.

    The embedder is created with its default embedding strategy and passed
    to ``read_n_save``; the output path is
    ``data/<dataset_name>_word2vec_lg_avg.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'word2vec_lg_avg')
    read_n_save(
        src_path,
        dst_path,
        embedder=Word2VecEmbedder('en_core_web_lg'),
    )
    
def word2vec_md_weighted(dataset_name):
    """Generate weighted word2vec embeddings with ``en_core_web_md``.

    Unigram statistics are collected from the dataset's text file with the
    spaCy tokenizer of the same model and supplied to
    ``WeightedAverageEmbeddings``, which is used as the embedder's strategy.
    The output path is ``data/<dataset_name>_word2vec_md_weighted.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'word2vec_md_weighted')

    # Tokenize the corpus with the model's own tokenizer to gather unigrams.
    nlp = spacy.load('en_core_web_md')
    unigram_stats = get_unigrams(src_path, SpacyTokenizer(nlp.tokenizer))

    weighting = WeightedAverageEmbeddings(unigram_stats)
    embedder = Word2VecEmbedder('en_core_web_md', weighting)
    read_n_save(src_path, dst_path, embedder=embedder)
    
def word2vec_lg_weighted(dataset_name):
    """Generate weighted word2vec embeddings with ``en_core_web_lg``.

    Unigram statistics are collected from the dataset's text file with the
    spaCy tokenizer of the same model and supplied to
    ``WeightedAverageEmbeddings``, which is used as the embedder's strategy.
    The output path is ``data/<dataset_name>_word2vec_lg_weighted.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'word2vec_lg_weighted')

    # Tokenize the corpus with the model's own tokenizer to gather unigrams.
    nlp = spacy.load('en_core_web_lg')
    unigram_stats = get_unigrams(src_path, SpacyTokenizer(nlp.tokenizer))

    weighting = WeightedAverageEmbeddings(unigram_stats)
    embedder = Word2VecEmbedder('en_core_web_lg', weighting)
    read_n_save(src_path, dst_path, embedder=embedder)
    
def bert_mean(dataset_name):
    """Generate BERT embeddings using the ``bert_avg_embeddings`` strategy.

    Reuses the notebook-level ``bertTokenizer``, ``bertModel`` and ``device``.
    The output path is ``data/<dataset_name>_bert_avg.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'bert_avg')
    embedder = BertEmbedder(
        bertTokenizer,
        bertModel,
        device=device,
        embedding_strategy=bert_avg_embeddings,
    )
    read_n_save(src_path, dst_path, embedder=embedder)
    
def bert_cls(dataset_name):
    """Generate BERT embeddings using the ``bert_cls_embeddings`` strategy.

    Reuses the notebook-level ``bertTokenizer``, ``bertModel`` and ``device``.
    The output path is ``data/<dataset_name>_bert_cls.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'bert_cls')
    embedder = BertEmbedder(
        bertTokenizer,
        bertModel,
        device=device,
        embedding_strategy=bert_cls_embeddings,
    )
    read_n_save(src_path, dst_path, embedder=embedder)
    
def bert_pooler(dataset_name):
    """Generate BERT embeddings using the ``bert_pooler_embeddings`` strategy.

    Reuses the notebook-level ``bertTokenizer``, ``bertModel`` and ``device``.
    The output path is ``data/<dataset_name>_bert_pooler.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    src_path, dst_path = get_file_paths(dataset_name, 'bert_pooler')
    embedder = BertEmbedder(
        bertTokenizer,
        bertModel,
        device=device,
        embedding_strategy=bert_pooler_embeddings,
    )
    read_n_save(src_path, dst_path, embedder=embedder)
    
def bert_weighted(dataset_name):
    """Generate BERT embeddings using a ``BertWeightedEmbeddings`` strategy.

    Unigram statistics are collected from the dataset's text file with the
    notebook-level ``bertTokenizer`` and passed to ``BertWeightedEmbeddings``.
    The output path is ``data/<dataset_name>_bert_weighted.npy``.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    # Use a dedicated 'bert_weighted' suffix. This function previously used
    # 'bert_pooler', so it wrote to the same file as bert_pooler() and the
    # weighted embeddings replaced the pooler embeddings on disk.
    input_path, output_path = get_file_paths(dataset_name, 'bert_weighted')
    unigram = get_unigrams(input_path, bertTokenizer)
    strategy = BertWeightedEmbeddings(unigram, device=device)
    bertEmbedder = BertEmbedder(
        bertTokenizer,
        bertModel,
        device=device,
        embedding_strategy=strategy,
    )
    read_n_save(input_path, output_path, embedder=bertEmbedder)
    
def all_embeds(dataset_name):
    """Run every embedding generator in this notebook for one dataset.

    The generators are invoked one after another in a fixed order: TF-IDF,
    the four word2vec variants, then the four BERT variants.

    Args:
        dataset_name: Name of the dataset under ``data/``.
    """
    generators = (
        tf_idf,
        word2vec_md_mean,
        word2vec_lg_mean,
        word2vec_md_weighted,
        word2vec_lg_weighted,
        bert_mean,
        bert_cls,
        bert_pooler,
        bert_weighted,
    )
    for generate in generators:
        generate(dataset_name)
    

In [None]:
# For each benchmark dataset: generate all embeddings, then convert its
# ground-truth label file (one integer per line) into a NumPy array file.
datasets = ['SearchSnippets', 'Biomedical', 'StackOverflow']
for dataset in datasets:
    all_embeds(dataset)

    # The '{}' placeholders must be filled with the dataset name. They were
    # previously left unformatted, so the literal path 'data/{}_gnd.txt' was
    # opened and every dataset targeted the same literal 'data/{}_label.npy'.
    with open('data/{}_gnd.txt'.format(dataset)) as label_file:
        labels = [int(line.strip()) for line in label_file]

    np.save('data/{}_label.npy'.format(dataset), np.array(labels))