In [None]:
import os
import random
import re

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, concatenate_datasets
from nltk import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Paths are resolved relative to the kernel's working directory:
# BASEPATH is its parent directory, and the CSV splits are read from
# the "datasets" folder directly underneath BASEPATH.
BASEPATH = os.path.dirname(os.getcwd())
DATASETPATH = os.path.join(BASEPATH, "datasets")

In [83]:
# Number of rows kept from each split after shuffling.
TRAIN_SAMPLE_SIZE = 5000
TEST_SAMPLE_SIZE = 1100

# Read the three CSV splits and wrap each one as a Hugging Face `Dataset`.
datasetTrain = Dataset.from_pandas(pd.read_csv(os.path.join(DATASETPATH, "train.csv")))
datasetTest = Dataset.from_pandas(pd.read_csv(os.path.join(DATASETPATH, "test.csv")))
datasetValidation = Dataset.from_pandas(pd.read_csv(os.path.join(DATASETPATH, "validation.csv")))

# NOTE(review): the *test* split is appended to the training split here while
# also being kept as the "test" split below, and `datasetValidation` is never
# used afterwards. This looks like unintended train/test leakage -- confirm
# whether `datasetValidation` was meant to be concatenated instead. Behaviour
# is left unchanged so the sampled rows stay the same.
datasetTrain = concatenate_datasets([datasetTrain, datasetTest])
datasetFull = DatasetDict(
    {
        "train": datasetTrain,
        "test": datasetTest,
    }
)

# Build the index lists from the actual split sizes instead of hard-coded row
# counts, so the cell keeps working (and samples from the whole split) when the
# CSV files change.
shuffled_train = list(range(len(datasetFull["train"])))
shuffled_test = list(range(len(datasetFull["test"])))

# Each list is shuffled twice, in this exact order, with a fixed seed. The
# second pass adds no extra randomness, but it is kept so that the selected
# rows are identical to those of earlier runs on the same data.
random.seed(42)
random.shuffle(shuffled_train)
random.shuffle(shuffled_train)
random.shuffle(shuffled_test)
random.shuffle(shuffled_test)

# Keep only a random subset of each split.
datasetFull["train"] = datasetFull["train"].select(shuffled_train[:TRAIN_SAMPLE_SIZE])
datasetFull["test"] = datasetFull["test"].select(shuffled_test[:TEST_SAMPLE_SIZE])

In [84]:
# Sentence-embedding checkpoint used to embed article sentences.
# The model is placed on the CUDA device, so this cell needs a GPU.
model_name = "BAAI/bge-small-en-v1.5"
model_embeddings = SentenceTransformer(
    model_name,
    device="cuda",
)

In [85]:
# Switch the embedding model to half precision. `half()` returns the model,
# so its module summary is what this cell displays as output.
model_embeddings.half()

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [86]:
# How many articles to summarise and how many sentences each summary keeps.
N_EVAL_SAMPLES = 100
N_SUMMARY_SENTENCES = 3


def summarize_article(article, n_sentences=N_SUMMARY_SENTENCES):
    """Build an extractive summary of `article` by clustering its sentences.

    The article is split into sentences, every sentence is embedded with
    `model_embeddings`, the embeddings are grouped into `n_sentences` K-Means
    clusters, and the sentence closest (Euclidean distance) to each cluster
    centre is selected. Selected sentences are returned in their original
    order, separated by a single space.

    Args:
        article: Text of the article to summarise.
        n_sentences: Number of clusters, i.e. the maximum number of sentences
            in the summary.

    Returns:
        The summary string. If the article has `n_sentences` sentences or
        fewer, all of its sentences are returned (an empty string when there
        are none).
    """
    sentences = sent_tokenize(article, language="english")

    # K-Means cannot form more clusters than there are samples; a short
    # article is already its own summary.
    if len(sentences) <= n_sentences:
        return " ".join(sentences)

    embeds = model_embeddings.encode_document(sentences)
    kmeans = KMeans(n_clusters=n_sentences, random_state=42).fit(embeds)

    # A set is used because two centres can share the same nearest sentence,
    # which would otherwise be repeated in the summary.
    picked = {
        int(np.argmin(np.linalg.norm(embeds - center, axis=1)))
        for center in kmeans.cluster_centers_
    }

    # Join with a space so consecutive sentences are not glued together.
    return " ".join(sentences[i] for i in sorted(picked))


a = []     # reference summaries (the dataset's "highlights" column)
resp = []  # generated extractive summaries, aligned with `a`

for sample_idx in range(N_EVAL_SAMPLES):
    row = datasetFull["train"][sample_idx]  # fetch the row once per sample
    a.append(row["highlights"])
    resp.append(summarize_article(row["article"]))

In [87]:
# Compare the generated summaries (`resp`) with the reference highlights (`a`)
# using the BERTScore metric. `evaluate` comes from the notebook's import cell.
bertscore = evaluate.load("bertscore")
score = bertscore.compute(predictions=resp, references=a, lang="en")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [88]:
# Report the mean BERTScore F1 as a whole-number percentage.
# The value is scaled to percent *before* rounding: rounding to two decimals
# first and then multiplying by 100 can expose binary floating-point noise
# (for example 0.57 * 100 evaluates to 56.99999999999999).
bert_f1_percent = round(float(np.mean(score["f1"])) * 100)
print(f"BertScore : {bert_f1_percent:.1f}%")

BertScore : 85.0%


In [89]:
# Compare the generated summaries (`resp`) with the reference highlights (`a`)
# using the ROUGE metric. `evaluate` comes from the notebook's import cell.
# Note: this rebinds `score`, which held the BERTScore result before this cell.
rouge_score = evaluate.load("rouge")
score = rouge_score.compute(predictions=resp, references=a)
score

{'rouge1': np.float64(0.2908578517600332),
 'rouge2': np.float64(0.09500590682535162),
 'rougeL': np.float64(0.19061431652923047),
 'rougeLsum': np.float64(0.23987567687682892)}

In [92]:
# Persist the embedding model to <BASEPATH>/modelExtractive. The path is built
# with os.path.join, matching how DATASETPATH is constructed.
# NOTE(review): `.half()` was called on this model earlier in the notebook, so
# the saved weights are presumably half precision -- confirm this is intended.
model_embeddings.save_pretrained(os.path.join(BASEPATH, "modelExtractive"))