In [1]:
#from rouge import Rouge
from nltk.metrics.distance import jaccard_distance
from rouge_score import rouge_scorer

In [2]:
import os
import re
import sys
import pandas as pd
import numpy as np
import multiprocessing as mp
import tqdm as progressbar

In [3]:
# Load the two-column (name, text) attribute table as plain objects and drop
# every row where either column is missing.
df = pd.read_csv(
    '../preprocessed/attribute.seqs.tsv',
    sep='\t',
    names=['name', 'text'],
    dtype=object,
    encoding='utf-8',
).dropna()

In [11]:
# Count the rows available for each attribute name ...
classes = df.groupby(by=["name"]).count().reset_index()
# ... and list the names from the most populated attribute to the least.
sorted_classes = classes.sort_values(by='text', ascending=False)['name'].tolist()

In [5]:
# Create the output directories for the per-attribute dumps and the score
# matrices; `-p` makes this a no-op when they already exist.
!mkdir -p ../attributes/ ../scores

In [6]:
# Shared ROUGE-1 scorer (with stemming enabled) used by `r_scorer`.
rouger1 = rouge_scorer.RougeScorer(rouge_types=['rouge1'], use_stemmer=True)

In [7]:
def dump_attributes():
    """Write one headerless, single-column TSV of `text` values per attribute.

    Reads the module-level `df` (columns `name` and `text`) and the
    module-level `sorted_classes` list. For every name in `sorted_classes`
    the matching `text` rows are written, in their original row order, to
    `../attributes/<name>.entries.tsv` without index or header.

    Names that have no rows in `df` are reported with a warning on stdout and
    skipped.
    """
    # Group once up front instead of re-scanning the whole frame with a
    # boolean mask for every attribute name.
    text_by_name = df.groupby('name', sort=False)['text']
    known_names = text_by_name.groups

    for a in sorted_classes:
        if a not in known_names:
            print(f'WARNING: {a} has zero elements?!')
            continue

        data = text_by_name.get_group(a).to_frame()

        # NOTE(review): `a` is used verbatim as the file name — presumably
        # attribute names never contain path separators; verify upstream.
        data.to_csv(f'../attributes/{a}.entries.tsv', sep = '\t', index=False, header=False)
dump_attributes()

In [8]:
!ls -altr ../attributes/*.entries.tsv|head -3

-rw-r--r-- 1 juno juno 780776 Feb 29 04:09 ../attributes/description_sam.entries.tsv
-rw-r--r-- 1 juno juno 331733 Feb 29 04:09 ../attributes/misc_param_sam.entries.tsv
-rw-r--r-- 1 juno juno 115265 Feb 29 04:09 ../attributes/bio_material_sam.entries.tsv


In [9]:
def r_scorer(a):
    """Compute pairwise ROUGE-1 precision for the entries of attribute `a`.

    Reads `../attributes/{a}.entries.tsv` (one text entry per row, no header)
    and fills the strict lower triangle of a square float32 matrix: cell
    `[i, j]` (with `j < i`) holds the ROUGE-1 precision returned by the
    module-level `rouger1` scorer when entry `j` is passed as the target and
    entry `i` as the prediction. The diagonal and upper triangle stay 0.

    The matrix is saved with `np.save` to `../scores/{a}.rouge1.scores`
    (numpy appends the `.npy` extension).

    Parameters
    ----------
    a : str
        Attribute name; selects the input file and names the output file.
    """
    # Force every entry to be read as text: without `dtype=str` pandas infers
    # numeric dtypes for number-like entries, and NA-like tokens become float
    # NaN, both of which break string-based scoring.
    chunk = pd.read_csv(
        f'../attributes/{a}.entries.tsv',
        sep='\t',
        names=['text'],
        dtype=str,
        keep_default_na=False,
    )

    sentences = chunk['text'].tolist()
    dim = len(sentences)

    # Allocate float32 directly rather than building a float64 matrix and
    # copying it.
    rou = np.zeros((dim, dim), dtype=np.float32)

    # Walk the strict lower triangle in row-major order without materialising
    # the two n*(n-1)/2-element index arrays that np.tril_indices would build.
    for i in range(1, dim):
        for j in range(i):
            rou[i, j] = rouger1.score(sentences[j], sentences[i])['rouge1'].precision

    np.save(f'../scores/{a}.rouge1.scores', rou, allow_pickle = False)

In [21]:
def j_scorer(a):
    """Compute pairwise Jaccard similarity for the entries of attribute `a`.

    Reads `../attributes/{a}.entries.tsv` (one text entry per row, no header),
    turns every entry into the set of its whitespace-separated tokens, and
    fills the strict lower triangle of a square float32 matrix: cell `[i, j]`
    (with `j < i`) holds `|A & B| / |A | B|` for the token sets of entries
    `j` and `i`. The diagonal and upper triangle stay 0.

    The matrix is saved with `np.save` to `../scores/{a}.jaccard.scores.tsv`;
    numpy appends `.npy`, so despite the `.tsv` in its name the file on disk
    is a binary `.npy` array.

    Parameters
    ----------
    a : str
        Attribute name; selects the input file and names the output file.
    """
    # Force every entry to be read as text: without `dtype=str` pandas infers
    # numeric dtypes for number-like entries, which have no `.split()`.
    chunk = pd.read_csv(
        f'../attributes/{a}.entries.tsv',
        sep='\t',
        names=['text'],
        dtype=str,
        keep_default_na=False,
    )

    sentences = chunk['text'].tolist()
    dim = len(sentences)

    # Allocate float32 directly rather than building a float64 matrix and
    # copying it.
    jac = np.zeros((dim, dim), dtype=np.float32)

    tokens = [set(s.split()) for s in sentences]

    # Walk the strict lower triangle in row-major order without materialising
    # the index arrays that np.tril_indices would build.
    for i in range(1, dim):
        for j in range(i):
            union_size = len(tokens[j] | tokens[i])
            if union_size == 0:
                # Both entries have no tokens: the Jaccard ratio is undefined
                # (dividing by the empty union would raise ZeroDivisionError),
                # so the cell is left at 0.
                continue
            shared = len(tokens[j] & tokens[i])
            # Same arithmetic as `1 - jaccard_distance(tokens[j], tokens[i])`.
            jac[i, j] = 1 - (union_size - shared) / union_size

    np.save(f'../scores/{a}.jaccard.scores.tsv', jac, allow_pickle = False)