# Пишем токенайзер

In [None]:
import nltk
import re
import pandas as pd
import numpy as np
from string import punctuation
punctuation = punctuation + '\n'
import requests
from bs4 import BeautifulSoup

In [None]:
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(["ga_dataset.txt"], min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<mask>",
])


In [None]:
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(["gd.txt"], min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<mask>",
])


In [None]:
!mkdir gdwikimodel
tokenizer.save_model("gdwikimodel")

mkdir: cannot create directory ‘gdwikimodel’: File exists


['gdwikimodel/vocab.json', 'gdwikimodel/merges.txt']

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./gdwikimodel/vocab.json",
    "./gdwikimodel/merges.txt",
)

In [None]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [None]:
import torch

In [None]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=30_000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./gdwikimodel")

In [None]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="gd.txt",
    block_size=64,
)



CPU times: user 16.3 s, sys: 1.23 s, total: 17.5 s
Wall time: 15.4 s


In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gdwikimodel",
    overwrite_output_dir=True,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=5,
    prediction_loss_only=True,
    no_cuda=True,
)




In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
trainer.train()

Step,Training Loss
500,7.4932
1000,6.8832
1500,6.5742
2000,6.3529
2500,6.3839


KeyboardInterrupt: 

In [None]:
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./gdwikimodel", max_len=512)
model = RobertaForMaskedLM.from_pretrained('./gdwikimodel/checkpoint-2500')

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Text Rank

In [None]:
from nltk.stem import WordNetLemmatizer
import re
lem = WordNetLemmatizer()

In [None]:
def clean_tr(sentence):
  sentence = sentence.lower()
  sentence = re.sub(r'http\S+',' ',sentence)
  sentence = re.sub(r'[^a-zA-Z]',' ',sentence)
  sentence = sentence.split()
  sentence = [lem.lemmatize(word) for word in sentence]
  sentence = ' '.join(sentence)
  return sentence

In [128]:
from nltk.probability import FreqDist
def textrank(text, num_sentences=3):
    # Text into sentences
    sentences = text.split('.')

    # Text into words
    prepared_sentences = [clean_tr(sentence) for sentence in sentences]
    words = [sentence.split() for sentence in prepared_sentences]
    words = sum(words, []) #flatten the list
    # calculate word frequencies
    fdist = FreqDist(words)

    # Assign scores to sentences based on word frequencies
    sentence_scores = {}
    for i, sentence in enumerate(prepared_sentences):
        for word in sentence.split():
            if word in fdist:
                if i in sentence_scores:
                    sentence_scores[i] += fdist[word]
                else:
                    sentence_scores[i] = fdist[word]

    # Sort sentences by scores in descending order
    sorted_sentences = sorted(sentence_scores, key=lambda x: sentence_scores[x], reverse=True)

    # Select the top `num_sentences` sentences for the summary
    summary_sentences = sorted(sorted_sentences[:num_sentences])

    # Create the summary
    summary = ' '.join([sentences[i] for i in summary_sentences])

    return summary

In [None]:
def extract_text_from_url(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    paragraphs = soup.find_all("p")
    text = " ".join([p.get_text() for p in paragraphs])

    return text

In [None]:
url = 'https://gd.wikipedia.org/wiki/B%C3%A0nrigh_Ealasaid_II'
text = extract_text_from_url(url)
summary = textrank(text, 5)
print(summary)

In [None]:
text = "'S i Ealasaid II (Ealasaid Alexandra Màiri; breith 21 an Giblean, 1926–8 an t-Sultain, 2022) Bànrigh an Rìoghachd Aonaichte agus iomadach dhùthcha eile mun cuairt an t-saoghal ris an canar na Riaghaltan Co-Fhlaitheas; mar eisimpleir Canada. Gu lèir, bha i na monarc tarsainn air Antigua agus Barbuda, Astràilia, Na h-Eileanan Bhathama, Belize, Canada, Grenada, Iaimeuca, Sealainn Nuadh, Papua Gini-Nuadh, Naomh Crìstean agus Nibheis , Naomh Lùisia, Naomh Bhionsant agus Eileanan Greanadach, Na h-Eileanan Sholaimh, Tuvalu is an Rìoghachd Aonaichte, am Breatainn Mòr agus Èirinn a Tuath. Chaidh a h-athair, Seòras VI, a chrùnadh an deidh do a bhràthair, Eideard VIII, leigeil an cathair rìoghail seachad ann an 1936. Riaghail Seòras VI gu 1952 agus, an deidh do bhàsachadh (an 6mh den Gearran), chaidh a chrùn gu an nighean is sine aige; Ealasaid. Chaidh a crùnadh mar banrigh air 2 an t-Ògmhios 1953. 'S e an duine aice, a phòs i ann an 1947, am Prionnsa Philip, Diùc Dhùn Èideann agus 's ann 'on a' Ghrèig a tha e. Tha ceithir chlann aca le Teàrlach III as sine. Tha dà mhac eile aca, Anndra agus Èideard, agus aon nighean, Anna. Chaochail i 8 an t-Sultain 2022 ann am Baile Mhorail. 'S i a' chiad leanabh a bh' aig Prionnsa Albert, Diùc York agus a bhean, Ealasaid. 'S e a h-athair an dàrna mac aig Rìgh Seòras V agus a' Bhanrigh Màiri. Rugadh i aig 2.40 sa mhadainn (GMT) air 21 dhan Ghiblean 1926 aig taigh a seanair: 17 Sràid Bruton, Mayfair, Lunnainn. Bha aon phiuthar aice, Mairead, a bha ceithir bliadhna nas òige na i. Thachair Ealasaid ri Prionnsa Philip ann an 1934. Phòs iad air 20 dhan t-Samhain 1947 ann an Westminster Abbey. Fhuair iad thairis air 2,500 prèasantan bho air feadh an t-saoghal. Rugadh Teàrlach air 14 dhan t-Samhain 1948, agus a Bhana-phrionnsa Anna ann an 1950."
summary = textrank(text, 5)
print(summary)

# SumBasic

In [None]:
def clean_sb(sentence):
  sentence = sentence.lower()
  sentence = re.sub(r'http\S+',' ',sentence)
  sentence = re.sub(r'[^a-zA-Z]',' ',sentence)
  sentence = sentence.split()
  sentence = [lem.lemmatize(word) for word in sentence]
  sentence = ' '.join(sentence)
  return sentence

In [None]:
def init_probability(sentences):
    probability_dict = {}
    #words = '. '.join(sentences)
    words = [sentence.split() for sentence in sentences]
    words = sum(words, []) #flatten the list
    total_words = len(set(words))
    for word in words:
        if word!='.':
            if not probability_dict.get(word):
                probability_dict[word] = 1
            else:
                probability_dict[word] += 1

    for word,count in probability_dict.items():
        probability_dict[word] = count/total_words

    return probability_dict

In [None]:
def update_probability(probability_dict,word):
	if probability_dict.get(word):
		probability_dict[word] = probability_dict[word]**2
	return probability_dict

In [None]:
def average_sentence_weights(sentences,probability_dict):
	sentence_weights = {}
	for index,sentence in enumerate(sentences):
		if len(sentence) != 0:
			average_proba = sum([probability_dict[word] for word in sentence if word in probability_dict.keys()])
			average_proba /= len(sentence)
			sentence_weights[index] = average_proba
	return sentence_weights

In [None]:
def generate_summary(sentence_weights,probability_dict,cleaned_article,tokenized_article,summary_length = 30):
    summary = ""
    current_length = 0
    prev_sentence = []
    while current_length < summary_length :

        highest_probability_word = max(probability_dict)
        #print(highest_probability_word)
        sentences_with_max_word= [index for index,sentence in enumerate(cleaned_article) if highest_probability_word in sentence.split(' ')]
        sentence_list = sorted([[index,sentence_weights[index]] for index in sentences_with_max_word],key=lambda x:x[1],reverse=True)
        #while ((sentence_list[0][0]) not in prev_sentence):
        summary += cleaned_article[sentence_list[0][0]] + ". "
            #prev_sentence.append(sentence_list[0][0])
            #sentence_list[0].pop(0)
        for word in cleaned_article[sentence_list[0][0]]:
            probability_dict = update_probability(probability_dict,word)
        current_length+=1
    return summary

In [127]:
def sumbasic(article, required_length):
    cleaned_article = []
    for i in article.split('.'):
        cleaned_article.append(clean_sb(i))
    tokenized_article = tokenizer.encode(article)
    #cleaned_article = clean(tokenized_article)
    probability_dict = init_probability(cleaned_article)
    #print(probability_dict.get('b'))
    sentence_weights = average_sentence_weights(cleaned_article,probability_dict)
    summary = generate_summary(sentence_weights,probability_dict,cleaned_article,tokenized_article,required_length)
    return summary

# luhn sum

In [None]:
def clean_lh(article):
	lem = WordNetLemmatizer()
	article =  re.sub(r'\[[^\]]*\]','',article)
	article = article.split('.')
	cleaned_list=[]
	for sent in article:
		sent  = sent.lower()
		word_list = []
		words = sent.split()
		for word in words:
			word_list.append(lem.lemmatize(word.lower()))
		cleaned_list.append(' '.join(word_list))
	return cleaned_list

In [None]:
def get_frequency_dictionary(content):
	frequency = {}
	for sentence in content:
		word_list = sentence.split()#word_tokenize(sentence)
		for word in word_list:
			if word not in [',','.',';','%',')','(','``']:
				if frequency.get(word) is None:
					frequency[word]=1
				else:
					frequency[word]+=1
	return frequency

In [None]:
def get_score(content,frequency_dictionary):
    sentence_score={}
    for sentence in content:
        score=0
        word_list = sentence.split()
        start_idx,end_idx = -1,len(word_list)+1
        index_list=[]
        for word in word_list:
            if word not in [',','.',';','%',')','(','``'] and word in frequency_dictionary.keys():
                index_list.append(word_list.index(word)+1)
            if index_list:
                if max(index_list) != min(index_list):
                    score = len(index_list)**2/(max(index_list) - min(index_list))
                else:
                    score = len(index_list)**2/max(index_list)
        sentence_score[content.index(sentence)] = score
    return sentence_score

In [None]:
def get_summary_luhn(sentence_scores,content,threshold):
    summary = ""
    sentence_indexes = sorted(sentence_scores,key=sentence_scores.get,reverse=True)[:threshold-1]
    for index in sentence_indexes:
        summary+=content[index]+" "
    return summary

In [126]:
def luhn(content, word_limit):
    cleaned_content = clean_lh(content)
    threshold = len(cleaned_content)//40
    frequency_dictionary = get_frequency_dictionary(cleaned_content)
    sorted_dictionary = {key: frequency_dictionary[key] for key in sorted(frequency_dictionary,key=frequency_dictionary.get,reverse=True)[:word_limit]}
    sentence_scores = get_score(cleaned_content,sorted_dictionary)
    summary = get_summary_luhn(sentence_scores,cleaned_content,threshold)
    return summary

# Сравнение

In [115]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])

In [124]:
import bz2
import json
original_list = []
test_summary_list = []
length_list = []
with bz2.open('gd_test.tar.bz2', 'rt', encoding='UTF-8') as f:
  for i in range(10):
    a = f.readline()
    b = a[a.find('{'):]
    c = json.loads(b)
    original_list.append(c['text'])
    test_summary_list.append(c['summary'])
    length_list.append(len(c['summary'].split('.')))

In [129]:
rogue1_precision_textrank = []
rogueL_precision_textrank = []
rogue1_recall_textrank = []
rogueL_recall_textrank = []
rogue1_precision_sumbasic = []
rogueL_precision_sumbasic = []
rogueL_recall_sumbasic = []
rogue1_recall_sumbasic = []
rogue1_precision_luhn = []
rogueL_precision_luhn = []
rogue1_recall_luhn = []
rogueL_recall_luhn = []
for i in range(10):
    original = original_list[i]
    test_summary = test_summary_list[i]
    length = length_list[i]
    summary_luhn = luhn(original, length)
    summary_sumbasic = sumbasic(original, length)
    summary_textrank = textrank(original, length)
    scores_luhn = scorer.score(test_summary, summary_luhn)
    print(f'Luhn in text {i}')
    print(scores_luhn)
    scores_sumbasic = scorer.score(test_summary, summary_sumbasic)
    print(f'Sumbasic in text {i}')
    print(scores_sumbasic)
    scores_textrank = scorer.score(test_summary, summary_textrank)
    print(f'Textrank in text {i}')
    print(scores_textrank)
    rogue1_precision_textrank.append(scores_textrank['rouge1'].precision)
    rogueL_precision_textrank.append(scores_textrank['rougeL'].precision)
    rogue1_recall_textrank.append(scores_textrank['rouge1'].recall)
    rogueL_recall_textrank.append(scores_textrank['rougeL'].recall)
    rogue1_precision_sumbasic.append(scores_sumbasic['rouge1'].precision)
    rogueL_precision_sumbasic.append(scores_sumbasic['rougeL'].precision)
    rogue1_recall_sumbasic.append(scores_sumbasic['rouge1'].recall)
    rogueL_recall_sumbasic.append(scores_sumbasic['rougeL'].recall)
    rogue1_precision_luhn.append(scores_luhn['rouge1'].precision)
    rogueL_precision_luhn.append(scores_luhn['rougeL'].precision)
    rogue1_recall_luhn.append(scores_luhn['rouge1'].recall)
    rogueL_recall_luhn.append(scores_luhn['rougeL'].recall)

Luhn in text 0
{'rouge1': Score(precision=0.06936416184971098, recall=0.5217391304347826, fmeasure=0.12244897959183672), 'rougeL': Score(precision=0.05202312138728324, recall=0.391304347826087, fmeasure=0.09183673469387756)}
Sumbasic in text 0
{'rouge1': Score(precision=0.2222222222222222, recall=0.17391304347826086, fmeasure=0.1951219512195122), 'rougeL': Score(precision=0.2222222222222222, recall=0.17391304347826086, fmeasure=0.1951219512195122)}
Textrank in text 0
{'rouge1': Score(precision=0.125, recall=0.391304347826087, fmeasure=0.18947368421052635), 'rougeL': Score(precision=0.06944444444444445, recall=0.21739130434782608, fmeasure=0.10526315789473684)}
Luhn in text 1
{'rouge1': Score(precision=0.09202453987730061, recall=0.5769230769230769, fmeasure=0.15873015873015872), 'rougeL': Score(precision=0.06748466257668712, recall=0.4230769230769231, fmeasure=0.1164021164021164)}
Sumbasic in text 1
{'rouge1': Score(precision=0.11290322580645161, recall=0.2692307692307692, fmeasure=0.1