In [1]:
from datasets import load_from_disk, concatenate_datasets, Dataset, DatasetDict

#textile patent documents
dataset = load_from_disk('../../Data/Textile_Patents_(70-20-10)')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 7905
Validation dataset size: 1130
Test dataset size: 2259


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'abstract'],
        num_rows: 7905
    })
    validation: Dataset({
        features: ['description', 'abstract'],
        num_rows: 1130
    })
    test: Dataset({
        features: ['description', 'abstract'],
        num_rows: 2259
    })
})

In [4]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nevidujayatilleke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nevidujayatilleke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def preprocess_text(text):
    sentences = sent_tokenize(text)
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    return tokenized_sentences

In [7]:
def sentence_similarity(sentence1, sentence2):
    stop_words = set(stopwords.words('english'))
    filtered_sentence1 = [w for w in sentence1 if w not in stop_words]
    filtered_sentence2 = [w for w in sentence2 if w not in stop_words]
    all_words = list(set(filtered_sentence1 + filtered_sentence2))
    vector1 = [filtered_sentence1.count(word) for word in all_words]
    vector2 = [filtered_sentence2.count(word) for word in all_words]
    return 1 - cosine_distance(vector1, vector2)

In [8]:
def build_similarity_matrix(sentences):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])
    return similarity_matrix

In [9]:
def apply_lexrank(similarity_matrix, damping=0.85, threshold=0.2, max_iter=100):
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph, alpha=damping, tol=threshold, max_iter=max_iter)
    return scores

In [10]:
def get_top_sentences(sentences, scores):
    ranked_sentences = sorted(((scores[i], sentence) for i, sentence in enumerate(sentences)), reverse=True)
    top_sentences = [sentence for score, sentence in ranked_sentences]
    return top_sentences

In [11]:
def extract_important_sentences(text):
    preprocessed_sentences = preprocess_text(text)
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    scores = apply_lexrank(similarity_matrix)
    top_sentences = get_top_sentences(preprocessed_sentences, scores)
    paragraph = ' '.join([' '.join(sentence) for sentence in top_sentences])
    return paragraph

In [12]:
def update_text_lexrank(row):
    #inputs = ["summarize: " + extract_important_sentences(item) for item in sample["description"]]
    row['description'] = extract_important_sentences(row['description'])
    return row

In [13]:
processed_dataset = DatasetDict({
    split_name: dataset.map(update_text_lexrank)
    for split_name, dataset in dataset.items()
})

Map: 100%|██████████| 7905/7905 [16:21:26<00:00,  7.45s/ examples]      
Map: 100%|██████████| 1130/1130 [44:08<00:00,  2.34s/ examples] 
Map: 100%|██████████| 2259/2259 [5:54:56<00:00,  9.43s/ examples]     


In [14]:
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['description', 'abstract'],
        num_rows: 7905
    })
    validation: Dataset({
        features: ['description', 'abstract'],
        num_rows: 1130
    })
    test: Dataset({
        features: ['description', 'abstract'],
        num_rows: 2259
    })
})

In [15]:
# save datasets to disk for later easy loading
processed_dataset["train"].save_to_disk("../../Data/Textile_Patent_(70-20-10)_LexRank_thres_3/train")
processed_dataset["validation"].save_to_disk("../../Data/Textile_Patent_(70-20-10)_LexRank_thres_3/validation")
processed_dataset["test"].save_to_disk("../../Data/Textile_Patent_(70-20-10)_LexRank_thres_3/test")

Saving the dataset (1/1 shards): 100%|██████████| 7905/7905 [00:00<00:00, 19110.27 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1130/1130 [00:00<00:00, 23759.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2259/2259 [00:00<00:00, 24438.51 examples/s]


In [16]:
processed_dataset["test"].to_pandas()

Unnamed: 0,description,abstract
0,"if desired , part of the hydrolysate can be re...",Processes for preparing pulp from lignin-conta...
1,no . in one embodiment of the present inventio...,A durable erosion control blanket featuring a ...
2,no . [ 0025 ] in a further preferred embodimen...,A method for spinning a multifilament yarn fro...
3,no . no . no . no . a . a . repair . fig . fig...,A surgical repair device having a length to wi...
4,"fig . fig . in fig . fig . fig . thereafter , ...",A transporting carriage for conveying a coiler...
...,...,...
2254,"as seen in the foregoing , according to the fu...",A clothes washing machine includes a main wash...
2255,viewing the arrangements of connecting webs in...,A compressible cheese center for dyeing purpos...
2256,"1 and which is , therefore , the yarn which wi...",A yarn feeder for a circular knitting machine ...
2257,"1 , in the embodiment of fig . at the outlet e...",A method and an apparatus for stuffer box crim...
