In [19]:
from keybert import KeyBERT
from transformers import RobertaTokenizer, RobertaModel
import transformers
import nltk
from keyphrase_vectorizers import KeyphraseCountVectorizer
from pdfminer.high_level import extract_text
import pandas as pd
import re
import spacy
from nltk.stem import PorterStemmer
from collections import Counter

In [20]:
def extract_pdf(filename):
    """Return the text that pdfminer's ``extract_text`` produces for ``filename``.

    Args:
        filename: Value handed unchanged to ``extract_text``.

    Returns:
        Whatever ``extract_text(filename)`` returns.
    """
    return extract_text(filename)

In [21]:
def filter_gutenberg_parts(paragraphs):
    """Drop every paragraph that mentions "gutenberg" in any letter case.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A new list with the paragraphs that do not match, in their
        original order. The input is not modified.
    """
    # Spelled-out character classes make the match case-insensitive.
    mentions_gutenberg = re.compile(r'[Gg][Uu][Tt][Ee][Nn][Bb][Ee][Rr][Gg]').search
    return [block for block in paragraphs if mentions_gutenberg(block) is None]

In [22]:
def foreword_split(text):
    """Return the part of ``text`` that follows the first "foreword" marker.

    The marker is matched case-insensitively. Everything after its first
    occurrence is kept, including any later occurrences of the word, so a
    book that mentions "Foreword" twice is not truncated at the second one.

    Args:
        text: Full document text.

    Returns:
        The text after the first marker, or ``text`` unchanged when the
        marker does not occur.
    """
    marker = re.search(r'[Ff][Oo][Rr][Ee][Ww][Oo][Rr][Dd]', text)
    if marker is None:
        return text
    # Slice at the end of the match instead of str.split(...)[1], which would
    # also cut the result off at the next identically-cased occurrence.
    return text[marker.end():]

In [23]:
def splitParagraph(paragraphs, min_words=50):
    """Merge short paragraphs into a neighbour, modifying ``paragraphs`` in place.

    A paragraph is "short" when it has at most ``min_words``
    whitespace-separated words. Short paragraphs are merged as follows:

    * first paragraph  -> prepended to the following paragraph;
    * last paragraph   -> appended to the preceding paragraph;
    * anything between -> joined to whichever neighbour has fewer words
      (the preceding one on a tie).

    Merged pieces are joined with a single ``"\\n"``. A list holding a single
    paragraph is left untouched because there is nothing to merge it with.

    Args:
        paragraphs: Mutable list of paragraph strings; edited in place.
        min_words: Largest word count that still counts as "short".

    Returns:
        None. The result is the mutated ``paragraphs`` list.
    """
    i = 0
    while i < len(paragraphs):
        # A lone paragraph has no neighbour; indexing paragraphs[1] would fail.
        if len(paragraphs) < 2:
            break

        current = paragraphs[i]
        if len(current.split()) > min_words:
            i += 1
            continue

        last = len(paragraphs) - 1
        if i == 0:
            merge_into_previous = False
        elif i == last:
            merge_into_previous = True
        else:
            # Prefer the shorter neighbour so paragraph sizes even out.
            merge_into_previous = (
                len(paragraphs[i - 1].split()) <= len(paragraphs[i + 1].split())
            )

        if merge_into_previous:
            paragraphs[i - 1] = paragraphs[i - 1] + "\n" + current
        else:
            paragraphs[i + 1] = current + "\n" + paragraphs[i + 1]

        # i is deliberately not advanced: after the deletion it already points
        # at the next paragraph to examine (possibly the freshly merged one).
        del paragraphs[i]

In [24]:
def findRelevantParagraphs(paragraphs, keywords, text, model, stop_words, n=10, relevant_paragraphs=None, min_paragraphs=10):
    """Collect paragraphs that contain at least two of the given keywords.

    A keyword counts as present when its string (``keyword[0]``) occurs in the
    paragraph as a case-sensitive substring. While fewer than
    ``min_paragraphs`` paragraphs have been collected, a larger keyword list is
    requested from ``model`` (``top_n`` grows by 5 per round, starting at
    ``n + 5``) and the paragraphs are scanned again.

    The search stops as soon as one of these holds:

    * at least ``min_paragraphs`` paragraphs were collected;
    * every paragraph has already been collected;
    * the model returned fewer keywords than requested, so asking for more
      cannot add anything (that last batch is still scanned once).

    Args:
        paragraphs: Paragraph strings to scan.
        keywords: Sequence of ``(keyword, score)`` pairs; only index 0 is read.
        text: Document passed to ``model.extract_keywords`` for more keywords.
        model: Object exposing a KeyBERT-style ``extract_keywords`` method.
        stop_words: Forwarded to ``model.extract_keywords``.
        n: Keyword count of the current round; the next request uses ``n + 5``.
        relevant_paragraphs: Optional list to append to. A fresh list is
            created when omitted, so results never leak between calls.
        min_paragraphs: Number of paragraphs that ends the search.

    Returns:
        The list of selected paragraphs, in the order they were found. When
        ``relevant_paragraphs`` was supplied, that same list object is returned.
    """
    if relevant_paragraphs is None:
        relevant_paragraphs = []

    keywords_exhausted = False
    while True:
        for paragraph in paragraphs:
            hits = sum(1 for keyword in keywords if keyword[0] in paragraph)
            if hits >= 2 and paragraph not in relevant_paragraphs:
                relevant_paragraphs.append(paragraph)

        if len(relevant_paragraphs) >= min_paragraphs or keywords_exhausted:
            break
        # Nothing left to find; asking for more keywords would loop forever.
        if all(paragraph in relevant_paragraphs for paragraph in paragraphs):
            break

        n += 5
        keywords = model.extract_keywords(text, keyphrase_ngram_range=(1,1), diversity=0.2,top_n=n,stop_words=stop_words,use_maxsum=True,use_mmr=True)
        keywords_exhausted = len(keywords) < n

    return relevant_paragraphs
        

In [25]:
def process_text(text):
    """Reduce ``text`` to a flat list of normalised word forms.

    The text is parsed with spaCy's ``en_core_web_sm`` pipeline. The returned
    list is the concatenation of three groups, in this order:

    1. lower-cased lemmas of every non-verb, non-punctuation token, where
       proper nouns are dropped unless tagged as ``GPE`` (kept as lower-cased
       surface text);
    2. lemmas of the verbs that were not classified as irregular;
    3. the Porter stems of the verbs classified as irregular.

    Entries containing a newline are removed from the result.

    Args:
        text: Raw text to analyse.

    Returns:
        List of strings as described above.
    """
    stops = ["zi", "zz"]  # NOTE(review): never read anywhere in this function

    # Required components
    nlp = spacy.load("en_core_web_sm")
    stemmer = PorterStemmer()
    doc = nlp(text)

    # Three parallel lists are needed for irregular-verb detection: the surface
    # form, the Porter stem and the spaCy lemma of every token tagged VERB.
    verbs, stemmed_verbs, lemmatized_verbs = [], [], []
    # Everything that is neither a verb nor punctuation ends up in ``lemmas``.
    lemmas = []

    for token in doc:
        if token.pos_ == 'VERB':
            verbs.append(token.text)
            stemmed_verbs.append(stemmer.stem(token.text))
            lemmatized_verbs.append(token.lemma_)
        elif not token.is_punct:
            if token.pos_ != 'PROPN':
                lemmas.append(token.lemma_.lower())
            elif token.ent_type_ == 'GPE':
                # Proper nouns are discarded unless tagged GPE.
                lemmas.append(token.text.lower())

    # A verb is treated as irregular when its stem differs from its lemma.
    irregular_verbs = [
        stem for stem, lemma in zip(stemmed_verbs, lemmatized_verbs) if stem != lemma
    ]

    # Remove the irregular verbs from the plain verb list.
    # NOTE(review): this compares surface forms against *stems*, so a verb is
    # only excluded when its surface text equals an irregular stem - confirm
    # that this is the intended filter.
    normal_verbs = [verb for verb in verbs if verb not in irregular_verbs]

    # Re-parse each remaining verb on its own and keep its lemma.
    lemma_verbs = [nlp(verb)[0].lemma_ for verb in normal_verbs]

    # Merge the three groups and drop entries that contain a newline.
    combined = lemmas + lemma_verbs + irregular_verbs
    return [word for word in combined if "\n" not in word]

In [26]:
# Load the book and strip the Project Gutenberg boilerplate.
# The path uses a forward slash so the cell runs on Windows and POSIX alike;
# a hard-coded backslash only resolves on Windows.
text = extract_pdf('pdfs/alchemist.pdf')
paragraphs = text.split("\n\n")
# Merges short paragraphs into a neighbour; mutates the list in place.
splitParagraph(paragraphs)
paragraphs = filter_gutenberg_parts(paragraphs)
text = '\n\n'.join(paragraphs)
# Drop the front matter that precedes the "foreword" marker (if present).
text = foreword_split(text)
paragraphs = text.split("\n\n")

In [27]:
# Words excluded from keyword extraction. Each entry appears exactly once;
# the earlier repeats of "edition", "contents", "table" and "punctuation"
# added nothing.
stop_words = [
    # General English function words and contraction fragments
    "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any",
    "are", "aren", "as", "at", "be", "because", "been", "before", "being", "below", "between",
    "both", "but", "by", "can", "couldn", "d", "did", "didn", "do", "does", "doesn", "doing", "don",
    "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "has", "hasn", "have",
    "haven", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "i", "if", "in", "into", "is", "isn", "it", "itself", "just", "ll", "m", "ma", "me", "mightn",
    "more", "most", "mustn", "my", "myself", "needn", "no", "nor", "not", "now", "o", "of", "off",
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re",
    "s", "same", "shan", "she", "should", "shouldn", "so", "some", "such", "t", "than", "that",
    "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this",
    "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "we",
    "were", "weren", "what", "when", "where", "which", "while", "who", "whom", "why", "will",
    "with", "wouldn", "y", "you", "your", "yours", "yourself", "yourselves",
    # E-book / publishing boilerplate vocabulary
    "gutenberg", "ebook", "author", "ebooks", "illustrated", "manuscript", "literature",
    "book", "proofreading", "books", "illustrations", "project", "online", "edition",
    "title", "release", "rights", "reserved", "editions", "chapter", "chapters",
    "contents", "table", "proofreaders", "_italic", "italic", "punctuation",
    "transcriber", "pretext",
]

In [28]:
# Default KeyBERT model; used here for the document-level seed keywords and
# later as the fallback extractor for individual paragraphs.
kw_model = KeyBERT()
# Top five single-word keywords for the cleaned book text.
# NOTE(review): use_maxsum and use_mmr are both enabled; KeyBERT presumably
# applies only one diversification strategy per call - confirm which one wins.
keywordsKeybert = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,1), diversity=0.2,top_n=5,stop_words=stop_words,use_maxsum=True,use_mmr=True)
print(keywordsKeybert)

[('alchemist', 0.4243), ('nocturnal', 0.3706), ('moors', 0.3617), ('recounted', 0.2781), ('wassantiago', 0.2486)]


In [29]:
# Select the paragraphs that contain at least two keywords, starting from the
# five seed keywords; the helper asks kw_model for more keywords while fewer
# than ten paragraphs have been selected.
relevant_paragraphs = findRelevantParagraphs(paragraphs,keywordsKeybert,text,kw_model,stop_words)

In [30]:
# Order the selected paragraphs by how many of the seed keywords
# (keywordsKeybert) occur in them, most matches first. sorted() is stable, so
# paragraphs with equal counts keep their relative order.
relevant_paragraphs = sorted(
    relevant_paragraphs,
    key=lambda p: sum(keyword[0] in p for keyword in keywordsKeybert),
    reverse=True,
)

In [31]:
# Second KeyBERT instance backed by 'roberta-base', used for paragraph-level
# keywords. Per the log emitted when this cell runs, no sentence-transformers
# model exists under that name, so one is created with MEAN pooling.
model_name = 'roberta-base'
keybert_model = KeyBERT(model=model_name)

No sentence-transformers model found with name C:\Users\paris/.cache\torch\sentence_transformers\roberta-base. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\paris/.cache\torch\sentence_transformers\roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Maps a tuple of extracted keyphrases -> the paragraph they came from.
# NOTE(review): paragraphs that yield the same keyword tuple (including the
# empty tuple) overwrite each other; only the last one is kept.
keyword_dict = {}

# Arguments shared by the primary and the fallback extraction call.
# NOTE(review): the printed keywords include multi-word phrases although
# keyphrase_ngram_range=(1, 1); KeyBERT appears to ignore keyphrase_ngram_range
# and stop_words when a custom vectorizer is supplied - confirm in its docs.
_extract_kwargs = dict(keyphrase_ngram_range=(1, 1), stop_words=stop_words, use_maxsum=True, top_n=5, use_mmr=True, diversity=0.4)

for paragraph in relevant_paragraphs:
    try:
        keywordsRoberta = keybert_model.extract_keywords(paragraph, vectorizer=KeyphraseCountVectorizer(), **_extract_kwargs)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate, and report the failure instead of
        # hiding it before falling back to the default model.
        print(f"roberta-base extraction failed ({exc!r}); falling back to the default KeyBERT model")
        keywordsRoberta = kw_model.extract_keywords(paragraph, vectorizer=KeyphraseCountVectorizer(), **_extract_kwargs)

    keywords =  [keyword[0] for keyword in keywordsRoberta]

    keyword_dict[tuple(keywords)] = paragraph

# Print the keywords and corresponding paragraphs
for keywords, paragraph in keyword_dict.items():
    print("Keywords:", keywords)
    print("Paragraph:", paragraph)
    print()


Keywords: ('strange people', 'philosopher', 'religions', 'major religions', 'esperanto')
Paragraph:  But he had to move on. He believed in omens. All his life and all his studies were aimed at finding the
one true language of the universe. First he had studied Esperanto, then the world's religions, and now it
was alchemy. He knew how to speak Esperanto, he understood all the major religions well, but he wasn't
yet an alchemist. He had unraveled the truths behind important questions, but his studies had taken him to
a point beyond which he could not seem to go. He had tried in vain to establish a relationship with an
alchemist. But the alchemists were strange people, who thought only about themselves, and almost
always refused to help him. Who knows, maybe they had failed to discover the secret of the Master
Work—the Philosopher's Stone—and for this reason kept their knowledge to themselves.

Keywords: ('arabian alchemist', 'enormous amounts', 'archaeological expedition', 'exceptional p

In [33]:
# Module-level spaCy pipeline, loaded once when this cell runs and read by
# tokenize().
nlp = spacy.load('en_core_web_sm')  # or another model, depending on your language

def tokenize(text):
    """Split ``text`` into token strings using the module-level ``nlp`` pipeline.

    Args:
        text: String to tokenise.

    Returns:
        List with the ``.text`` of every token, in document order.
    """
    pieces = []
    for tok in nlp(text):
        pieces.append(tok.text)
    return pieces

In [63]:
def score_paragraph(paragraph, score_data,df, discard_fraction=0.4):
    """Score a paragraph from the per-word scores of its tokens.

    The paragraph is tokenised with ``tokenize``. Each distinct token that has
    an entry in ``score_data`` contributes one score. The scored tokens are
    sorted ascending and the lowest ``discard_fraction`` of them is dropped
    before aggregating.

    Args:
        paragraph: Text to score.
        score_data: Mapping ``word -> score``; looked up with ``.get``.
        df: Unused; kept so existing callers keep working.
        discard_fraction: Share (0..1) of the lowest-scored tokens to ignore.
            Defaults to 0.4, the cut-off this function has always applied.

    Returns:
        Tuple ``(total_score, average_score, top_5_words)`` where

        * ``total_score`` is the sum of the retained scores,
        * ``average_score`` is ``total_score`` divided by the number of
          retained tokens (``0.0`` when nothing was retained),
        * ``top_5_words`` holds up to five retained tokens with the highest
          scores, in ascending score order.
    """
    # One entry per distinct token that has a score.
    scores = {}
    for word in tokenize(paragraph):
        score = score_data.get(word)
        if score is not None:
            scores[word] = score

    ranked = sorted(scores.items(), key=lambda item: item[1])

    # Drop the lowest-scored share of the tokens.
    retained = ranked[int(len(ranked) * discard_fraction):]

    top_5_words = [word for word, _ in retained[-5:]]
    total_score = sum(score for _, score in retained)

    # A paragraph without any scored token would otherwise divide by zero.
    average_score = total_score / len(retained) if retained else 0.0

    return total_score, average_score, top_5_words

In [64]:
# Word-score table; the 'word' and 'final_score' columns are read below.
df = pd.read_csv('output.csv')

# Lookup table word -> final_score (for a repeated word the last row wins).
score_data = dict(zip(df['word'], df['final_score']))

# Score every paragraph that has extracted keywords and print a report for it.
for keywords, paragraph in keyword_dict.items():
    total_score, average_score, top_5 = score_paragraph(paragraph, score_data,df)
    print("Keywords:", keywords)
    print("Paragraph:", paragraph)
    print("Total score:", total_score)
    print("Average score:", average_score)
    print("Words you probably do not know:", top_5)
    print()

Keywords: ('strange people', 'philosopher', 'religions', 'major religions', 'esperanto')
Paragraph:  But he had to move on. He believed in omens. All his life and all his studies were aimed at finding the
one true language of the universe. First he had studied Esperanto, then the world's religions, and now it
was alchemy. He knew how to speak Esperanto, he understood all the major religions well, but he wasn't
yet an alchemist. He had unraveled the truths behind important questions, but his studies had taken him to
a point beyond which he could not seem to go. He had tried in vain to establish a relationship with an
alchemist. But the alchemists were strange people, who thought only about themselves, and almost
always refused to help him. Who knows, maybe they had failed to discover the secret of the Master
Work—the Philosopher's Stone—and for this reason kept their knowledge to themselves.
Total score: 66824.86336195069
Average score: 1670.621584048767
Words you probably do not know: 

In [None]:
# Tokenize the paragraphs into sentences (requires the NLTK 'punkt' data)
nltk.download('punkt')

# One list of sentences per relevant paragraph.
sentences = []
for para in relevant_paragraphs:
    para_sentences = nltk.sent_tokenize(para)
    sentences.append(para_sentences)

# Tokenize the sentences into words.
# Each `sentence` here is actually one paragraph's list of sentences; they are
# re-joined with spaces and word-tokenized, giving one token list per paragraph.
words = []
for sentence in sentences:
    sentence_words = nltk.word_tokenize(" ".join(sentence))
    words.append(sentence_words)
    print(sentence_words)

# Output the words (including punctuation) in an array.
# output_array refers to the same list object as `words`, not a copy.
output_array = words

['[', '5', ']', 'A', 'refinement', 'and', 'modification', 'of', 'these', 'views', 'does', 'not', 'become', 'necessary', 'until', 'we', 'come', 'to', 'deal', 'with', 'the', 'general', 'theory', 'of', 'relativity', ',', 'treated', 'in', 'the', 'second', 'part', 'of', 'this', 'book', '.', 'We', 'thus', 'obtain', 'the', 'following', 'result', ':', 'Every', 'description', 'of', 'events', 'in', 'space', 'involves', 'the', 'use', 'of', 'a', 'rigid', 'body', 'to', 'which', 'such', 'events', 'have', 'to', 'be', 'referred', '.', 'The', 'resulting', 'relationship', 'takes', 'for', 'granted', 'that', 'the', 'laws', 'of', 'Euclidean', 'geometry', 'hold', 'for', '“', 'distances', ';', '”', 'the', '“', 'distance', '”', 'being', 'represented', 'physically', 'by', 'means', 'of', 'the', 'convention', 'of', 'two', 'marks', 'on', 'a', 'rigid', 'body', '.']
['If', '_K_', 'is', 'a', 'Galileian', 'co-ordinate', 'system', '.', 'then', 'every', 'other', 'co-ordinate', 'system', '_Kʹ_', 'is', 'a', 'Galileian', 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
