### Libraries

In [1]:
import sys
sys.path.append(r"C:\Pro\Stages\A4 - DVRC\Work\Supervised-Learning-using-Unsupervised-Learning-Metrics-in-the-absence-of-Annotated-Data\myLibraries")

In [34]:
from transformers import BertTokenizer, BertModel
import torch
from umap import UMAP
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from MARScore.utils import *
import hdbscan
from custom_score.utils import cleanString 

### Datasets

In [3]:
dataset_url="https://drive.google.com/file/d/1Wd0M3qepNF6B4YwFYrpo7CaSERpudAG_/view?usp=share_link"
dataset_url='https://drive.google.com/uc?export=download&id=' + dataset_url.split('/')[-2]
dataset = pd.read_json(dataset_url, lines=True)
dataset = dataset.loc[:, ["text", "summary"]]

### Corpus embedding

In [None]:
def tokenizeCorpus(corpus, model=BertModel.from_pretrained('bert-base-uncased', 
                                                           output_hidden_states=True), 
                           tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), 
                           model_input_size=512):
    def flatten(l):
        return [item for sublist in l for item in sublist]
    input_size = model_input_size - 1
    corpusWords = corpus.split(" ")
    splited = [" ".join(corpusWords[i:i+input_size]) for i in range(0, len(corpusWords), input_size)]

    input_ids = []
    attention_masks = []
    for sentence in splited:
        encoded = tokenizer.encode_plus(sentence, 
                                        add_special_tokens=True,
                                        max_length=input_size+1,
                                        padding="max_length",
                                        return_attention_mask=True,
                                        return_tensors='pt',
                                        truncation=True)
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

    #inputs_ids = torch.Tensor(len(input_ids),1, max_len+1)
    #torch.cat(input_ids, out=inputs_ids)
    inputs_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    temp = flatten([batch.tolist() for batch in input_ids])
    labels = np.array(temp)
    labels = labels.reshape((labels.shape[0]*labels.shape[1]))
    labels = tokenizer.convert_ids_to_tokens(labels)
    with torch.no_grad():
        output = model(inputs_ids, attention_mask=attention_masks)
    return output, labels

def vectorizeCorpus(model_output, allStates=True):
    if allStates==True:
        hidden_states = model_output.hidden_states
    else:
        hidden_states = [model_output.last_hidden_state]
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = token_embeddings.permute(1,2,0,3)
    embs = []
    for batch in token_embeddings:
        for token in batch:
            emb = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
            embs.append(emb)
    return embs

def visualizeCorpus(embs, labels, embs_gold=None, labels_gold=None, dim=2):
    comp_gold = True if embs_gold != None and labels_gold != None else False

    formated_embs = [token.tolist() for token in embs]
    formated_embs = np.array(formated_embs)
    formated_embs_gold = [token.tolist() for token in embs_gold]
    formated_embs_gold = np.array(formated_embs_gold)
    token_indexes = [i for i in range(len(labels)) if labels[i] != "[PAD]" and labels[i] != "[CLS]" and labels[i] != "[SEP]" and len(labels[i])>2]

    if dim == 1:
        umap1D = UMAP(n_components=1, init='random', random_state=0)
        proj1D = umap1D.fit_transform(formated_embs).T

        data = {"x": proj1D[0],
                "labels": labels}
        
        for k in data.keys():
            data[k] = [data[k][i] for i in range(len(data[k])) if i in token_indexes]

        if comp_gold:
            token_indexes_gold = [i for i in range(len(labels_gold)) if labels_gold[i] != "[PAD]" and labels_gold[i] != "[CLS]" and labels_gold[i] != "[SEP]" and len(labels_gold[i])>2]
            proj1D_gold = umap1D.fit_transform(formated_embs_gold).T
            data_gold = {"x": proj1D_gold[0],
                        "labels": labels_gold}
            for k in data_gold.keys():
                data_gold[k] = [data_gold[k][i] for i in range(len(data_gold[k])) if i in token_indexes_gold]

        traces = []
        for i in range(len(data['x'])):
            if comp_gold:
                color = 'green' if data["labels"][i] in data_gold["labels"] else 'red'
            else:
                color = 'red'
            trace = go.Scatter(
                x=[data['x'][i]],
                mode='markers',
                marker=dict(size=6, color=color),
                text=[data['labels'][i]],
                name=data['labels'][i]
            )
            traces.append(trace)
        if comp_gold:
            for i in range(len(data_gold['x'])):
                trace = go.Scatter(
                    x=[data_gold['x'][i]],
                    mode='markers',
                    marker=dict(size=6, color='gold'),
                    text=[data_gold['labels'][i]],
                    name=data_gold['labels'][i]
                )
                traces.append(trace)

        layout = go.Layout(
            title='1D Scatter Plot',
            scene=dict(
                xaxis=dict(title='X')
            )
        )
        fig = go.Figure(data=traces, layout=layout)
        fig.show()

    elif dim == 2:
        umap2D = UMAP(n_components=2, init='random', random_state=0)
        proj2D = umap2D.fit_transform(formated_embs).T

        data = {"x": proj2D[0],
                "y": proj2D[1],
                "labels": labels}
        
        for k in data.keys():
            data[k] = [data[k][i] for i in range(len(data[k])) if i in token_indexes]

        if comp_gold:
            token_indexes_gold = [i for i in range(len(labels_gold)) if labels_gold[i] != "[PAD]" and labels_gold[i] != "[CLS]" and labels_gold[i] != "[SEP]" and len(labels_gold[i])>2]
            proj2D_gold = umap2D.fit_transform(formated_embs_gold).T
            data_gold = {"x": proj2D_gold[0],
                        "y": proj2D_gold[1],
                        "labels": labels_gold}
            for k in data_gold.keys():
                data_gold[k] = [data_gold[k][i] for i in range(len(data_gold[k])) if i in token_indexes_gold]

        traces = []
        for i in range(len(data['x'])):
            if comp_gold:
                color = 'green' if data["labels"][i] in data_gold["labels"] else 'red'
            else:
                color = 'red'
            trace = go.Scatter(
                x=[data['x'][i]],
                y=[data['y'][i]],
                mode='markers',
                marker=dict(size=6, color=color),
                text=[data['labels'][i]],
                name=data['labels'][i]
            )
            traces.append(trace)
        if comp_gold:
            for i in range(len(data_gold['x'])):
                trace = go.Scatter(
                    x=[data_gold['x'][i]],
                    y=[data_gold['y'][i]],
                    mode='markers',
                    marker=dict(size=6, color='gold'),
                    text=[data_gold['labels'][i]],
                    name=data_gold['labels'][i]
                )
                traces.append(trace)

        layout = go.Layout(
            title='2D Scatter Plot',
            scene=dict(
                xaxis=dict(title='X'),
                yaxis=dict(title='Y')
            )
        )
        fig = go.Figure(data=traces, layout=layout)
        fig.show()

In [None]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
dataset_url="https://drive.google.com/file/d/1Wd0M3qepNF6B4YwFYrpo7CaSERpudAG_/view?usp=share_link"
dataset_url='https://drive.google.com/uc?export=download&id=' + dataset_url.split('/')[-2]
dataset = pd.read_json(dataset_url, lines=True)
dataset = dataset.loc[:, ["text", "summary"]]

elem0 = dataset.iloc[0, 0]
print(elem0)

In [None]:
gold0 = dataset.iloc[0, 1]
print(gold0)

In [None]:
o, l = tokenizeCorpus(elem0)
ogold, lgold = tokenizeCorpus(gold0)
v = vectorizeCorpus(o)
vgold = vectorizeCorpus(ogold)
visualizeCorpus(v, l, vgold, lgold, dim=1)

### Clustering with HDBScan

In [3]:
dataset_url="https://drive.google.com/file/d/1Wd0M3qepNF6B4YwFYrpo7CaSERpudAG_/view?usp=share_link"
dataset_url='https://drive.google.com/uc?export=download&id=' + dataset_url.split('/')[-2]
dataset = pd.read_json(dataset_url, lines=True)
dataset = dataset.loc[:, ["text", "summary"]]

In [4]:
elem0 = dataset.iloc[0, 0]
gold0 = dataset.iloc[0, 1]

In [5]:
o, l = tokenizeCorpus(elem0)
ogold, lgold = tokenizeCorpus(gold0)
v = vectorizeCorpus(o)
vgold = vectorizeCorpus(ogold)

In [6]:
v = cleanVectors(v, l)
vgold = cleanVectors(vgold, lgold)

In [7]:
tf_values = tf(l)



In [9]:
clusterer = hdbscan.HDBSCAN()
clusterer.fit(v)
clabels = clusterer.labels_

In [15]:
visualizeCorpus(v, l, vgold, lgold, clabels, tf_values, dim=1)

### Compute TF of words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def tf(words):
    text = ' '.join(words)
    vectorizer = TfidfVectorizer(use_idf=False, norm=None)
    tf_values = vectorizer.fit_transform([text]).toarray()[0]
    tf_dict = {word: tf_values[index] for word, index in vectorizer.vocabulary_.items()}
    return tf_dict

In [None]:
tf(l)

In [None]:
def create_word_dictionary(words):
    word_dict = {}
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
    return word_dict

In [None]:
create_word_dictionary(["Banana", "Banana", "Apple", "Mango"])

### TF des clusters

In [4]:
elem0 = dataset.iloc[0, 0]
gold0 = dataset.iloc[0, 1]
o, l = tokenizeCorpus(elem0)
ogold, lgold = tokenizeCorpus(gold0)
v = vectorizeCorpus(o)
vgold = vectorizeCorpus(ogold)

In [5]:
tf_values = tf(l)
clusterer = hdbscan.HDBSCAN()
clusterer.fit(v)
clabels = clusterer.labels_



In [6]:
def clusters_tf(tf_values, labels, clabels):
    clusters_tf_values = {}
    for label, clabel in zip(labels, clabels):
        if clabel in clusters_tf_values.keys():
            clusters_tf_values[clabel] += tf_values[label]
        else:
            clusters_tf_values[clabel] = tf_values[label]
    return clusters_tf_values

In [8]:
clusters_tf_values = clusters_tf(tf_values, l, clabels)

### ILP

In [17]:
from cvxopt.glpk import ilp

In [31]:
print(l)

['[CLS]', 'section', '1', '.', 'short', 'title', '.', 'this', 'act', 'may', 'be', 'cited', 'as', 'the', '`', '`', 'national', 'science', 'education', 'tax', 'incentive', 'for', 'businesses', 'act', 'of', '2007', "'", "'", '.', 'sec', '.', '2', '.', 'credits', 'for', 'certain', 'contributions', 'benefit', '##ing', 'science', ',', 'technology', ',', 'engineering', ',', 'and', 'mathematics', 'education', 'at', 'the', 'elementary', 'and', 'secondary', 'school', 'level', '.', '(', 'a', ')', 'in', 'general', '.', '-', '-', 'sub', '##par', '##t', 'd', 'of', 'part', 'iv', 'of', 'sub', '##cha', '##pt', '##er', 'a', 'of', 'chapter', '1', 'of', 'the', 'internal', 'revenue', 'code', 'of', '1986', '(', 'relating', 'to', 'business', 'related', 'credits', ')', 'is', 'amended', 'by', 'adding', 'at', 'the', 'end', 'the', 'following', 'new', 'section', ':', '`', '`', 'sec', '.', '45', '##o', '.', 'contributions', 'benefit', '##ing', 'science', ',', 'technology', ',', 'engineering', ',', 'and', 'mathemat

In [38]:
sentences = list(map(str.strip, list(map(cleanString, " ".join(l).split(".")))))

In [46]:
len(clabels)

2560

In [None]:
len(clusters_tf_values)

In [68]:
def to_ilp_format(labels, clabels, clusters_tf_values):
    #define scoring function
    output = "Maximize\nscore:"
    for i, k in enumerate(sorted(clusters_tf_values.keys())):
        if int(clusters_tf_values[k]) < 0:
            output += f" - {-int(clusters_tf_values[k])} c{i}"
        else:
            output += f" + {int(clusters_tf_values[k])} c{i}"

    #create sentences and sentence dictionnary
    sentence_index = 0
    sentences_map = {0: set()}
    for cluster_index, token in zip(clabels, labels):
        if cluster_index in sentences_map.keys():
            sentences_map[cluster_index].add(sentence_index)
        else:
            sentences_map[cluster_index] = {sentence_index}
        
        if token == ".":
            sentence_index += 1
            

    #define constraints
    output += "\n\nSubject To\n"
    for i, k in enumerate(sorted(sentences_map.keys())):
        output += f"index_{i}:"
        for cluster_index in sorted(sentences_map[k]):
            output += f" s{cluster_index} +"
        output = output[:-2] + f" - c{k}" + "\n"
        
    #define sentence length length
    
    #define variables
    output += "\n\nBinary\n"
    for i in range(len(clusters_tf_values.keys())):
        output += f"c{i}\n"
    return output

In [69]:
res = to_ilp_format(l, clabels, clusters_tf_values)
print(res)

Maximize
score: + 49209 c0 + 49 c1 + 36 c2 + 42 c3 + 1225 c4 + 144 c5 + 144 c6 + 156 c7 + 144 c8 + 40 c9 + 121 c10 + 121 c11 + 49 c12 + 19 c13 + 258 c14 + 36 c15 + 120 c16 + 196 c17 + 48 c18 + 829 c19 + 69 c20 + 91 c21 + 638 c22 + 89 c23 + 4485 c24 + 6825 c25 + 1092 c26 + 320 c27 + 352 c28 + 65 c29 + 225 c30 + 133 c31 + 205 c32 + 121 c33 + 1932 c34 + 759 c35 + 45 c36 + 334 c37 + 95 c38 + 510 c39 + 306 c40 + 75 c41 + 500 c42 + 996 c43 + 357 c44 + 2060 c45 + 345 c46 + 288 c47 + 867470 c48 + 3132 c49

Subject To
index_0: s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10 + s11 + s12 + s13 + s15 + s16 + s18 + s19 + s20 + s21 + s22 + s23 + s24 + s25 + s26 + s27 + s28 + s29 + s30 + s31 + s33 + s34 + s36 + s37 + s38 + s39 + s40 + s41 + s42 + s44 + s45 + s47 + s48 + s49 + s51 + s52 + s53 - c-1
index_1: s18 - c0
index_2: s11 + s21 + s34 + s36 + s40 - c1
index_3: s11 + s13 + s15 + s19 + s27 + s33 - c2
index_4: s11 + s12 + s13 + s14 + s16 + s17 + s18 + s19 + s20 + s21 + s22 + s23 + s24 + s25 +