In [1]:
!pip install tqdm pandas



In [2]:
from abc import ABC, abstractmethod
from copy import deepcopy
from tqdm import tqdm
import pandas as pd
import random
import time
import json
import os
import re

# Crawler

In [3]:
!pip install selenium



In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common import NoSuchElementException

## Crawler
For crawling, I used [Ganjoor](https://ganjoor.net/). I crawled the poems of Ferdowsi and Khayyam.
For Khayyam, I just crawled the Robaee poems.
For Ferdowsi, I crawled all the poems in Shahname. I also stored the chapters of the book. So now I know which stanza belongs to which chapter.
I chose these two poets by my own interest.

**Note:** You may need to change the Safari driver to your desired web driver.

In [5]:
class Crawler:
    def __init__(self, main_url, starting_urls=None):
        """
        Parameters
        ----------
        main_url: str
            The main URL of the poet page. In this page there are chapters of Shahname for the Ferdowsi and Robaee, Statistics, ... for the Khayyam.
        starting_urls: list
            The starting URLs for the poet. For Ferdowsi, it is the chapters of Shahname which will be crawled automatically from the main_url. For Khayyam, it is the Robaee poems. For Khayyam, it should be provided manually and it is just one URL.
        """

        self.driver = None
        # it is a 2d list for Khayyam -> all_poems[0] = a Robaee poem, all_poems(list[0]) = 4, all_poems[0][0] = a stanza
        # it is a 3d list for Ferdowsi -> all_poems[0] = a chapter, all_poems[0][0] = a poem, all_poems[0][0][0] = a stanza
        self.all_poems = []
        # It is empty for Khayyam and it is filled for Ferdowsi
        self.chapters = []
        self.starting_urls = starting_urls
        self.main_url = main_url

    def start_driver(self):
        driver = webdriver.Safari()
        driver.maximize_window()
        self.driver = driver

    def close_driver(self):
        self.driver.close()

    def get_urls_from_poet_page(self, is_start_driver=True):
        """
        Get all the chapters (starting URLs) of the book from the main page of the poet.
        This method won't call for Khayyam because the starting URLs are provided manually.
        """

        if is_start_driver:
            self.start_driver()
        self.driver.get(self.main_url)

        all_subjects = self.driver.find_elements(By.CSS_SELECTOR, '.part-title-block a')
        all_subjects_urls = [(subject.text, subject.get_attribute('href')) for subject in all_subjects]

        if is_start_driver:
            self.close_driver()

        print(f'Number of subjects: {len(all_subjects_urls)}')
        return all_subjects_urls

    def add_chapters_and_starting_urls(self, chapter_starting_urls):
        """
        Add chapters and starting URLs for the poet (Add output of get_urls_from_poet_page method to the object).
        """

        self.chapters = []
        self.starting_urls = []

        for chapter, starting_url in chapter_starting_urls:
            self.chapters.append(chapter)
            self.starting_urls.append(starting_url)

    def get_current_poem(self):
        """
        It gets the current poem from the page.
        """

        current_poem = []
        verse_number = 1

        while True:
            try:
                current_verse = self.driver.find_element(By.ID, f'bn{verse_number}')
            except NoSuchElementException:
                break

            current_poem.append(current_verse.find_element(By.CLASS_NAME, 'm1').text)
            current_poem.append(current_verse.find_element(By.CLASS_NAME, 'm2').text)

            verse_number += 1

        return current_poem

    def get_all_poems(self, starting_url, is_start_driver=True):
        """
        Get all the poems from the starting URL. Starting URL is a chapter's URL. The method retrieves all the poems in the chapter.
        """

        if is_start_driver:
            self.start_driver()
        self.driver.get(starting_url)

        first_poem = self.driver.find_element(By.CLASS_NAME, 'poem-excerpt').find_element(By.TAG_NAME, 'a')
        next_navigation_href = first_poem.get_attribute('href')
        poet_poems = []

        progress_bar = tqdm()
        while re.search('/sh\d+', next_navigation_href):
            self.driver.get(next_navigation_href)

            current_poem = self.get_current_poem()
            poet_poems.append(current_poem)

            try:
                next_poem_button = self.driver.find_element(By.CSS_SELECTOR, '.navleft a')
            except NoSuchElementException:
                break

            next_navigation_href = next_poem_button.get_attribute('href')
            progress_bar.update(1)

        progress_bar.close()
        if is_start_driver:
            self.close_driver()
        return poet_poems

    def crawl_all_starting_urls(self, is_get_starting_urls=True):
        """
        Crawl all the poems from all starting URLs. It is the main method for crawling.
        """
        
        self.start_driver()
        self.chapters = []
        all_poems = []

        if is_get_starting_urls:
            chapter_starting_urls = self.get_urls_from_poet_page(False)
            self.add_chapters_and_starting_urls(chapter_starting_urls)

        for starting_url in self.starting_urls:
            all_poems.append(self.get_all_poems(starting_url, False))
            time.sleep(2)

        self.all_poems = all_poems[0] if len(all_poems) == 1 else all_poems
        self.close_driver()

## Poet
I created a class for the poet. It has two abstract methods. One for getting the poems from the crawler and the other for concatenating the poems for simple processing.
I also added two methods for dumping the poems to a file and loading them from a file.

In [6]:
def print_file_size(file_path):
    print(f'File size: {os.path.getsize(file_path) / 1024 / 1024} MB')


class Poet(ABC):
    def __init__(self):
        self.crawler: Crawler = None
        self.all_poems = []
        # It is a 1d list for Khayyam.
        # It is a 2d list for Ferdowsi. -> concatenated_poems[0] = a chapter, concatenated_poems[0][0] = poems
        self.concatenated_poems = []
        self.chapters = []

    @abstractmethod
    def get_poems(self):
        pass

    @abstractmethod
    def concatenate_poems(self):
        pass

    def dump_poems(self, file_path):
        json.dump(self.all_poems, open(file_path, 'w'), ensure_ascii=False)
        print_file_size(file_path)

    def load_poems(self, file_path):
        self.all_poems = json.load(open(file_path, 'r'))
        print_file_size(file_path)

    @abstractmethod
    def load_all(self, file_path):
        pass

In [7]:
class Ferdowsi(Poet):
    def __init__(self):
        super().__init__()
        self.crawler = Crawler('https://ganjoor.net/ferdousi/shahname')

    def get_poems(self):
        self.crawler.crawl_all_starting_urls()
        self.all_poems = deepcopy(self.crawler.all_poems)
        self.chapters = deepcopy(self.crawler.chapters)

    def concatenate_poems(self):
        self.concatenated_poems = []

        for book_chapter in self.all_poems:
            book_chapter_stanzas = []

            for poem in book_chapter:
                for stanza in poem:
                    book_chapter_stanzas.append(stanza)

            self.concatenated_poems.append(book_chapter_stanzas)

    def load_all(self, file_path):
        chapter_starting_urls = self.crawler.get_urls_from_poet_page()
        self.crawler.add_chapters_and_starting_urls(chapter_starting_urls)
        self.chapters = deepcopy(self.crawler.chapters)
        self.load_poems(file_path)
        self.concatenate_poems()

In [8]:
class Khayyam(Poet):
    def __init__(self):
        super().__init__()
        self.crawler = Crawler('https://ganjoor.net/khayyam', ['https://ganjoor.net/khayyam/robaee'])

    def get_poems(self):
        self.crawler.crawl_all_starting_urls(False)
        self.all_poems = deepcopy(self.crawler.all_poems)
        self.chapters = deepcopy(self.crawler.chapters)

    def concatenate_poems(self):
        self.concatenated_poems = []

        for poem in self.all_poems:
            for stanza in poem:
                self.concatenated_poems.append(stanza)

    def load_all(self, file_path):
        self.crawler.chapters = []
        self.load_poems(file_path)
        self.concatenate_poems()

As you can see, There are 61 chapters in Shahname. Each chapter has a name. I crawled all the poems in each chapter. I stored the poems in a 3d list. The first dimension is for the chapters. The second dimension is for the poems in the chapter. The third dimension is for the stanzas in the poem.

In [101]:
ferdowsi = Ferdowsi()
ferdowsi.get_poems()

Number of subjects: 61


12it [00:13,  1.14s/it]
2it [00:02,  1.07s/it]
3it [00:02,  1.23it/s]
1it [00:01,  1.30s/it]
4it [00:04,  1.14s/it]
12it [00:12,  1.05s/it]
20it [00:21,  1.08s/it]
28it [00:34,  1.24s/it]
13it [00:13,  1.00s/it]
1it [00:01,  1.01s/it]
5it [00:05,  1.00s/it]
5it [00:05,  1.03s/it]
17it [00:19,  1.12s/it]
12it [00:13,  1.15s/it]
21it [00:24,  1.18s/it]
22it [01:05,  2.96s/it]
3it [00:06,  2.02s/it]
25it [00:26,  1.07s/it]
16it [00:29,  1.83s/it]
17it [00:26,  1.56s/it]
1it [00:02,  2.86s/it]
14it [00:22,  1.59s/it]
38it [00:50,  1.32s/it]
47it [01:02,  1.34s/it]
17it [00:20,  1.18s/it]
33it [00:37,  1.12s/it]
15it [00:17,  1.17s/it]
31it [00:35,  1.16s/it]
8it [00:08,  1.03s/it]
5it [00:04,  1.10it/s]
7it [00:07,  1.01s/it]
4it [00:03,  1.20it/s]
10it [00:09,  1.02it/s]
47it [00:46,  1.00it/s]
21it [00:19,  1.07it/s]
14it [00:15,  1.11s/it]
3it [00:02,  1.09it/s]
2it [00:02,  1.05s/it]
2it [00:01,  1.60it/s]
1it [00:00,  1.32it/s]
1it [00:00,  1.78it/s]
1it [00:00,  1.35it/s]
1it [00:00,

In [105]:
ferdowsi.concatenate_poems()

I dumped the poems to a file. It is JSON format. Its size in a TXT file is surely more than 1 MB, the minimum size for the crawled data.

In [106]:
ferdowsi.dump_poems('ferdowsi.json')

File size: 4.643608093261719 MB


You can load the Ferdowsi poems from the file by running the following cell.

In [9]:
ferdowsi = Ferdowsi()
ferdowsi.load_all('ferdowsi.json')

Number of subjects: 61
File size: 4.643608093261719 MB


In [107]:
khayyam = Khayyam()
khayyam.get_poems()

178it [02:41,  1.10it/s]


In [108]:
khayyam.concatenate_poems()

In [109]:
khayyam.dump_poems('khayyam.json')

File size: 0.038249969482421875 MB


You can load the Khayyam poems from the file by running the following cell.

In [10]:
khayyam = Khayyam()
khayyam.load_all('khayyam.json')

File size: 0.038249969482421875 MB


## See some poems

In [11]:
random.seed(0)

In the following cell, each poem is shown with its chapter

In [12]:
ferdowsi_samples = []
for i, chapter in enumerate(ferdowsi.concatenated_poems):
    ferdowsi_samples.append(random.sample(chapter, 1)[0] + ' -> ' + ferdowsi.chapters[i])

print(len(ferdowsi_samples))
ferdowsi_samples

61


['نخستین برادرش که\u200cتر به سال -> آغاز کتاب',
 'بر آن برترین نام یزدانش را -> کیومرث',
 'پراگند پس تخم و کشت و درود -> هوشنگ',
 'به خوبی چه مایه سخن\u200cها براند -> طهمورث',
 'خور و خواب و آرامتان از من است -> جمشید',
 'وزان گرز پیکر بدیشان نمود -> ضحاک',
 'سپه نیز با او هم آواز شد -> فریدون',
 'کزین آگهی یافت سام سوار -> منوچهر',
 'سرانجام نوذر گرفتار شد -> پادشاهی نوذر',
 'ز کار گذشته نیارند یاد -> پادشاهی زوطهماسپ',
 'از امروز و فردا نیامدش یاد -> پادشاهی گرشاسپ',
 'فرستاده آمد بسان پلنگ -> کیقباد',
 'جهان چون بهشتی شد آراسته -> پادشاهی کی\u200cکاووس و رفتن او به مازندران',
 'نبد روزگار سکون و درنگ -> رزم کاووس با شاه هاماوران',
 'به رنج و به درد و گداز آمدند -> سهراب',
 'دل شاه کاووس ازان تنگ شد -> داستان سیاوش',
 'سزاوار بنوشت نام گوان -> پادشاهی کیخسرو شصت سال بود',
 'بیفتاد و برگشت زو اسپ تیز -> گفتار اندر داستان فرود سیاوش',
 'گر ایدونک بیچاره\u200cای را زمان -> داستان کاموس کشانی',
 'عمودی بزد بر سرش پیلتن -> داستان خاقان چین',
 'سوی آبت اندازم ار سوی کوه -> داستان اکوان د

In [13]:
Khayyam_samples = random.sample(khayyam.concatenated_poems, len(ferdowsi_samples))
print(len(Khayyam_samples))
Khayyam_samples

61


['در طبل زمین و حقهٔ خاک نهاد',
 'افسوس که سرمایه ز کف بیرون شد',
 'دریاب که هفتهٔ\xa0دگر خاک شده\u200cست',
 'رفتم که در این منزلِ بیداد بُدَن',
 'این کوزه\u200cگر دهر چنین جام لطیف',
 'برخیز و به جام باده کن عزم درست',
 'خیام ، که گفت دوزخی خواهد بود',
 'عجز است به دست هرکه از مادر زاد',
 'من می نه ز بهر تنگدستی نخورم',
 'در دهر چه صد ساله چه یکروزه شویم',
 'در بی\u200cخبری مرد چه هشیار و چه مست',
 'دهقان قضا بسی چو ما کشت و درود',
 'قانع به یک استخوان چو کرکس بودن',
 'گویند قرابه\u200cگر مسلمان نبود',
 'می خور که چنین فسانه\u200cها کوته نیست',
 'رندی دیدم نشسته بر خنگ زمین',
 'گر یک نفست ز زندگانی گذرد',
 'جز خوردن غصه نیست تا کندن جان',
 'چون عمر به سر رسد چه شیرین و چه تلخ',
 'وین عمر به خوشدلی گذارم یا نه',
 'خشتی دو نهند بر مغاک من و تو',
 'کار من و تو چنان\u200cکه رای من و توست',
 'کو بانگ جرس\u200cها و کجا ناله کوس\u200c\u200c؟',
 'مستی و قلندری و گمراهی به',
 'چون می\u200cدانی که مدت عالم خاک',
 'کس مشکل اسرار اجل را نگشاد',
 'وقت خوش خود بسنگ محنت سودن',
 'کاحوال مسافران دنیا

# Preprocessing

In [14]:
!pip install hazm && pip install hazm --upgrade



In [15]:
from hazm import *

## Preprocessor
I created a preprocessor class for preprocessing the poems. It has the following methods:
- normalize: It normalizes the text.
- lemmatize: It lemmatizes the word.
- tokenize: It tokenizes the text.
- preprocess: It normalizes, tokenizes, and lemmatizes the text.
- find_stopwords: It finds the stopwords in the text. It sets stopwords_count which is a dictionary of words and their counts. It also sets stopwords which is a list of stopwords. It runs on output of preprocess method.
- remove_stopwords: It removes the stopwords from the text.

In [16]:
class Preprocessor:
    def __init__(self):
        self.normalizer = Normalizer()
        self.lemmatizer = Lemmatizer()
        self.word_tokenizer = WordTokenizer()

        self.stopwords_count = {}
        self.stopwords = []

    def normalize(self, text):
        return self.normalizer.normalize(text)

    def lemmatize(self, word):
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, text):
        return self.word_tokenizer.tokenize(text)

    def preprocess(self, poem: list):
        preprocessed_text = []

        for stanza in poem:
            normalized_stanza = self.normalize(stanza)
            tokenized_stanza = self.tokenize(normalized_stanza)
            preprocessed_stanza = [self.lemmatize(word) for word in tokenized_stanza]

            preprocessed_text.append(preprocessed_stanza)

        return preprocessed_text

    def find_stopwords(self, poem: list):
        self.stopwords_count = {}

        for stanza in poem:
            for word in stanza:
                if word in self.stopwords_count:
                    self.stopwords_count[word] += 1
                else:
                    self.stopwords_count[word] = 1

        self.stopwords_count = list(sorted(self.stopwords_count.items(), key=lambda x: x[1], reverse=True))
        self.stopwords = [word for word, _ in self.stopwords_count]

    def remove_stopwords(self, text: list, number_of_stopwords=10):
        return [[word for word in stanza if word not in self.stopwords[:number_of_stopwords]] for stanza in text]


def run_preprocess(poem, is_print=True):
    """
    It runs a preprocess the poem. The poem should be a 1d list. It returns the preprocessed dataframe and the preprocessor object.
    The dataframe contains the original, preprocessed, and preprocessed_no_stopwords columns.
    And we need the preprocessor object to access the stopwords and stopwords_count.
    """
    
    dataframe = pd.DataFrame(poem, columns=['original'])
    preprocessor = Preprocessor()

    preprocessed = preprocessor.preprocess(poem)
    dataframe['preprocessed'] = preprocessed

    preprocessor.find_stopwords(preprocessed)
    preprocessed_no_stopwords = preprocessor.remove_stopwords(preprocessed)
    dataframe['preprocessed_no_stopwords'] = preprocessed_no_stopwords

    if is_print:
        print(preprocessor.stopwords_count[:200])
        print(preprocessor.stopwords[:10])
    return dataframe, preprocessor

## Preprocess the poems of Khayyam
In the output of the following cell, you can see the stopwords_count and the top 10 stopwords.
Then, there's a dataframe for comparing the original, preprocessed, and preprocessed_no_stopwords.

As you can see in the first row, the word 'ز' is removed in the preprocessed_no_stopwords column. Or in the third row, we can see the word 'کنیم' is lemmitized.

In [17]:
khayyam_dataframe, khayyam_preprocessor = run_preprocess(khayyam.concatenated_poems)
khayyam_dataframe.head(10)

[('و', 230), ('که', 185), ('به', 122), ('از', 95), ('در', 84), ('ز', 69), ('تو', 55), ('است', 54), ('این', 51), ('بر', 51), ('من', 50), ('را', 48), ('شد#شو', 45), ('آن', 36), ('چه', 36), ('تا', 33), ('هر', 32), ('چو', 32), ('خاک', 32), ('چون', 31), ('بود#است', 31), ('با', 31), ('یک', 29), ('کرد#کن', 29), ('نه', 29), ('ما', 27), ('دل', 26), ('خوش', 24), ('گل', 22), ('آمد#آ', 21), ('نیست', 21), ('گفت#گو', 21), ('دست', 20), ('خورد#خور', 19), ('عمر', 19), ('سر', 18), ('جهان', 18), ('همه', 17), ('باده', 17), ('او', 16), ('سبزه', 16), ('دانست#دان', 16), ('کوزه', 15), ('گر', 15), ('دید#بین', 15), ('ای', 15), ('کس', 15), ('غم', 15), ('رفت#رو', 14), ('اگر', 14), ('دو', 14), ('باشید#باش', 14), ('پیش', 13), ('مکن', 13), ('چند', 13), ('مرا', 13), ('جام', 13), ('باش', 13), ('چرخ', 13), ('دی', 13), ('بهشت', 13), ('داشت#دار', 13), ('یا', 13), ('صد', 12), ('فلک', 12), ('زمین', 12), ('خود', 12), ('بوده\u200cست', 11), ('؟', 11), ('گدشت#گذر', 11), ('هم', 10), ('پر', 10), ('می\u200cنوش', 10), ('باز', 10),

Unnamed: 0,original,preprocessed,preprocessed_no_stopwords
0,برخیز بتا بیا ز بهر دل ما,"[برخیز, بتا, بیا, ز, بهر, دل, ما]","[برخیز, بتا, بیا, بهر, دل, ما]"
1,حل کن به جمال خویشتن مشکل ما,"[حل, کن, به, جمال, خویشتن, مشکل, ما]","[حل, کن, جمال, خویشتن, مشکل, ما]"
2,یک کوزه شراب تا به هم نوش کنیم,"[یک, کوزه, شراب, تا, به, هم, نوش, کرد#کن]","[یک, کوزه, شراب, تا, هم, نوش, کرد#کن]"
3,زآن پیش که کوزه‌ها کنند از گل ما,"[زآن, پیش, که, کوزه, کرد#کن, از, گل, ما]","[زآن, پیش, کوزه, کرد#کن, گل, ما]"
4,چون عهده نمی‌شود کسی فردا را,"[چون, عهده, شد#شو, کسی, فردا, را]","[چون, عهده, شد#شو, کسی, فردا, را]"
5,حالی خوش دار این دل پر سودا را,"[حالی, خوش, دار, این, دل, پر, سودا, را]","[حالی, خوش, دار, دل, پر, سودا, را]"
6,می نوش به ماهتاب ای ماه که ماه,"[می‌نوش, به, ماهتاب, ماه, که, ماه]","[می‌نوش, ماهتاب, ماه, ماه]"
7,بسیار بتابد و نیابد ما را,"[بسیار, تافت#تاب, و, یابید#یاب, ما, را]","[بسیار, تافت#تاب, یابید#یاب, ما, را]"
8,قرآن که مهین کلام خوانند آن را,"[قرآن, که, مهین, کلام, خواند#خوان, آن, را]","[قرآن, مهین, کلام, خواند#خوان, آن, را]"
9,گهگاه نه بر دوام خوانند آن را,"[گهگاه, نه, بر, دوام, خواند#خوان, آن, را]","[گهگاه, نه, دوام, خواند#خوان, آن, را]"


## Preprocess the poems of Ferdowsi (All)
First, I concatenated all the poems of Ferdowsi into a single 1d list.
In the output of the following cell, you can see the stopwords_count and the top 10 stopwords.
Then, there's a dataframe for comparing the original, preprocessed, and preprocessed_no_stopwords.

As you can see in the first row, the word 'و' is removed in the preprocessed_no_stopwords column. Or in the second row, we can see the word 'نگذرد' is lemmitized.

In [18]:
all_ferdowsi = []
for chapter in ferdowsi.concatenated_poems:
    all_ferdowsi.extend(chapter)

all_ferdowsi_dataframe, all_ferdowsi_preprocessor = run_preprocess(all_ferdowsi)
all_ferdowsi_dataframe.head(10)

[('و', 24138), ('به', 17809), ('که', 12569), ('ز', 12017), ('از', 10540), ('بر', 8330), ('را', 7676), ('چو', 6423), ('شد#شو', 5475), ('گفت#گو', 5471), ('با', 4663), ('شاه', 4302), ('بود#است', 4233), ('کرد#کن', 4174), ('تو', 4072), ('همی', 3943), ('او', 3812), ('آمد#آ', 3770), ('آن', 3656), ('یکی', 3389), ('همه', 3253), ('اندر', 2960), ('من', 2876), ('در', 2739), ('سر', 2727), ('تا', 2651), ('پیش', 2554), ('این', 2543), ('چنین', 2512), ('دل', 2482), ('بد', 2154), ('پر', 2059), ('بدو', 2031), ('هر', 2017), ('جهان', 1939), ('سپاه', 1837), ('داد', 1794), ('چون', 1791), ('سخن', 1766), ('پس', 1723), ('دو', 1695), ('نه', 1645), ('لشکر', 1637), ('دید#بین', 1572), ('راه', 1553), ('سوی', 1534), ('کار', 1525), ('روی', 1515), ('تخت', 1482), ('داشت#دار', 1460), ('چه', 1456), ('ما', 1404), ('باشید#باش', 1382), ('جنگ', 1374), ('جای', 1315), ('مرد', 1301), ('مرا', 1267), ('گرد', 1262), ('دست', 1257), ('گشت', 1207), ('ماند#مان', 1200), ('بیامد', 1199), ('هم', 1197), ('همان', 1192), ('زمین', 1153), ('آو

Unnamed: 0,original,preprocessed,preprocessed_no_stopwords
0,به نام خداوند جان و خرد,"[به, نام, خداوند, جان, و, خرد]","[نام, خداوند, جان, خرد]"
1,کز این برتر اندیشه بر نگذرد,"[کز, این, برتر, اندیشه, بر, گدشت#گذر]","[کز, این, برتر, اندیشه, گدشت#گذر]"
2,خداوند نام و خداوند جای,"[خداوند, نام, و, خداوند, جای]","[خداوند, نام, خداوند, جای]"
3,خداوند روزی ده رهنمای,"[خداوند, روزی‌ده, رهنمای]","[خداوند, روزی‌ده, رهنمای]"
4,خداوند کیوان و گَردان سپهر,"[خداوند, کیوان, و, گردان, سپهر]","[خداوند, کیوان, گردان, سپهر]"
5,فروزندهٔ ماه و ناهید و مهر,"[فروزنده, ماه, و, ناهید, و, مهر]","[فروزنده, ماه, ناهید, مهر]"
6,ز نام و نشان و گمان برتر است,"[ز, نام, و, نشان, و, گمان, برتر, است]","[نام, نشان, گمان, برتر, است]"
7,نگارندهٔ بر شده پیکر است,"[نگارنده, بر, شده, پیکر, است]","[نگارنده, شده, پیکر, است]"
8,به بینندگان آفریننده را,"[به, بینندگان, آفریننده, را]","[بینندگان, آفریننده]"
9,نبینی مرنجان دو بیننده را,"[دید#بین, مرنج, دو, بیننده, را]","[دید#بین, مرنج, دو, بیننده]"


## Preprocess the poems of Ferdowsi (Chapter Base)
First, I preprocessed the poems of each chapter separately.
In the output of the following cell, you can see the stopwords (count more than 10) of each chapter.
Then, there's a dataframe for comparing the stopwords of each chapter (all stopwords and top 20).

Finally finding in common stopwords of all chapters (more accurately, chapters with more than 10 stopwords).

In [19]:
ferdowsi_dataframes = []
ferdowsi_preprocessors = []

progress_bar = tqdm(ferdowsi.concatenated_poems, desc='Ferdowsi, Shahname Chapters Preprocessing')
for chapter in progress_bar:
    ferdowsi_dataframe, ferdowsi_preprocessor = run_preprocess(chapter, False)
    ferdowsi_dataframes.append(ferdowsi_dataframe)
    ferdowsi_preprocessors.append(ferdowsi_preprocessor)
    
    
ferdowsi_stopwords_chapter_based = []
for ferdowsi_preprocessor in ferdowsi_preprocessors:
    ferdowsi_stopwords_chapter_based.append([ferdowsi_preprocessor.stopwords for ferdowsi_preprocessor.stopwords, count in ferdowsi_preprocessor.stopwords_count if count > 10])

Ferdowsi, Shahname Chapters Preprocessing: 100%|██████████| 61/61 [03:24<00:00,  3.36s/it]


In the output of the following cell, as we can see, there are some common stopwords such as 'و' and 'به' and 'از'.

In [20]:
dataframe = pd.DataFrame()
for i in range(len(ferdowsi_stopwords_chapter_based)):
    dataframe[f'chapter_{i + 1}, {ferdowsi.chapters[i]}'] = pd.Series(ferdowsi_stopwords_chapter_based[i])

dataframe

Unnamed: 0,"chapter_1, آغاز کتاب","chapter_2, کیومرث","chapter_3, هوشنگ","chapter_4, طهمورث","chapter_5, جمشید","chapter_6, ضحاک","chapter_7, فریدون","chapter_8, منوچهر","chapter_9, پادشاهی نوذر","chapter_10, پادشاهی زوطهماسپ",...,"chapter_52, پادشاهی کسری نوشین روان چهل و هشت سال بود","chapter_53, پادشاهی هرمزد دوازده سال بود","chapter_54, پادشاهی خسرو پرویز","chapter_55, پادشاهی شیرویه","chapter_56, پادشاهی اردشیر شیروی","chapter_57, پادشاهی فرایین","chapter_58, پادشاهی پوران دخت","chapter_59, پادشاهی آزرم دخت","chapter_60, پادشاهی فرخ زاد","chapter_61, پادشاهی یزدگرد"
0,و,و,و,به,و,به,و,و,به,و,...,و,که,و,و,و,به,به,,,و
1,به,به,کرد#کن,و,به,و,به,به,و,ز,...,که,و,که,به,به,از,,,,به
2,از,بر,به,را,از,که,از,از,که,به,...,به,به,به,که,که,که,,,,که
3,بر,را,از,او,ز,از,ز,که,ز,,...,ز,را,ز,ز,ز,را,,,,ز
4,ز,که,,بر,را,ز,که,بر,از,,...,از,ز,از,از,شد#شو,و,,,,از
5,که,از,,از,چو,بر,را,ز,شد#شو,,...,را,از,را,را,,,,,,بر
6,را,او,,,بر,را,چو,را,چو,,...,شاه,شاه,گفت#گو,تو,,,,,,را
7,او,بود#است,,,او,آن,بر,چو,را,,...,بر,گفت#گو,بر,بر,,,,,,شد#شو
8,چو,آن,,,کرد#کن,چو,شاه,شد#شو,بر,,...,بود#است,شد#شو,شد#شو,چو,,,,,,چو
9,این,ز,,,که,بود#است,شد#شو,گفت#گو,با,,...,با,بر,چو,آن,,,,,,آمد#آ


In [21]:
dataframe_top_20 = pd.DataFrame()
for i in range(len(ferdowsi_stopwords_chapter_based)):
    dataframe_top_20[f'chapter_{i + 1}, {ferdowsi.chapters[i]}'] = pd.Series(ferdowsi_stopwords_chapter_based[i][:20])
    
dataframe_top_20

Unnamed: 0,"chapter_1, آغاز کتاب","chapter_2, کیومرث","chapter_3, هوشنگ","chapter_4, طهمورث","chapter_5, جمشید","chapter_6, ضحاک","chapter_7, فریدون","chapter_8, منوچهر","chapter_9, پادشاهی نوذر","chapter_10, پادشاهی زوطهماسپ",...,"chapter_52, پادشاهی کسری نوشین روان چهل و هشت سال بود","chapter_53, پادشاهی هرمزد دوازده سال بود","chapter_54, پادشاهی خسرو پرویز","chapter_55, پادشاهی شیرویه","chapter_56, پادشاهی اردشیر شیروی","chapter_57, پادشاهی فرایین","chapter_58, پادشاهی پوران دخت","chapter_59, پادشاهی آزرم دخت","chapter_60, پادشاهی فرخ زاد","chapter_61, پادشاهی یزدگرد"
0,و,و,و,به,و,به,و,و,به,و,...,و,که,و,و,و,به,به,,,و
1,به,به,کرد#کن,و,به,و,به,به,و,ز,...,که,و,که,به,به,از,,,,به
2,از,بر,به,را,از,که,از,از,که,به,...,به,به,به,که,که,که,,,,که
3,بر,را,از,او,ز,از,ز,که,ز,,...,ز,را,ز,ز,ز,را,,,,ز
4,ز,که,,بر,را,ز,که,بر,از,,...,از,ز,از,از,شد#شو,و,,,,از
5,که,از,,از,چو,بر,را,ز,شد#شو,,...,را,از,را,را,,,,,,بر
6,را,او,,,بر,را,چو,را,چو,,...,شاه,شاه,گفت#گو,تو,,,,,,را
7,او,بود#است,,,او,آن,بر,چو,را,,...,بر,گفت#گو,بر,بر,,,,,,شد#شو
8,چو,آن,,,کرد#کن,چو,شاه,شد#شو,بر,,...,بود#است,شد#شو,شد#شو,چو,,,,,,چو
9,این,ز,,,که,بود#است,شد#شو,گفت#گو,با,,...,با,بر,چو,آن,,,,,,آمد#آ


Here there is common stopwords in all chapters.

In [22]:
all_common = set(ferdowsi_stopwords_chapter_based[0])
for i in range(1, len(ferdowsi_stopwords_chapter_based)):
    if len(ferdowsi_stopwords_chapter_based[i]) <= 10:
        continue
    
    all_common &= set(ferdowsi_stopwords_chapter_based[i])
    
print(f'Number of stopwords in common: {len(all_common)}')
all_common

Number of stopwords in common: 8


{'از', 'بر', 'به', 'ز', 'شد#شو', 'و', 'چو', 'که'}

## Compare the stopwords

### Khayyam and Ferdowsi All

In the following cells, I compared the stopwords of Khayyam and Ferdowsi. I showed a dataframe, common stopwords, and common stopwords in the top 20.

As we can see, there are many common stopwords and somehow, there are many many stopwords in Khayyam poems.

In [23]:
stopwords_dataframe = pd.DataFrame(khayyam_preprocessor.stopwords_count, columns=['khayyam stopwords', 'khayyam count'])
stopwords_dataframe[['ferdowsi stopwords', 'ferdowsi count']] = all_ferdowsi_preprocessor.stopwords_count[
                                                                :len(khayyam_preprocessor.stopwords_count)]
stopwords_dataframe.head(20)

Unnamed: 0,khayyam stopwords,khayyam count,ferdowsi stopwords,ferdowsi count
0,و,230,و,24138
1,که,185,به,17809
2,به,122,که,12569
3,از,95,ز,12017
4,در,84,از,10540
5,ز,69,بر,8330
6,تو,55,را,7676
7,است,54,چو,6423
8,این,51,شد#شو,5475
9,بر,51,گفت#گو,5471


In [24]:
# I choose 10, by sense. When I changed it to 30, Number of stopwords in Khayyam = Number of stopwords in common = 22!

khayyam_stop_words = [khayyam_preprocessor.stopwords for khayyam_preprocessor.stopwords, count in khayyam_preprocessor.stopwords_count if count > 10]
ferdowsi_stopwords = [current_stopwords for current_stopwords, count in all_ferdowsi_preprocessor.stopwords_count if count > 10]

In [25]:
common_stopwords = set(khayyam_stop_words) & set(ferdowsi_stopwords)
print(f'Number of stopwords in Khayyam: {len(khayyam_stop_words)}')
print(f'Number of stopwords in Ferdowsi: {len(ferdowsi_stopwords)}')
print(f'Number of stopwords in common: {len(common_stopwords)}')
print(f'Stopwords in common: {set(khayyam_stop_words) & set(ferdowsi_stopwords)}')

Number of stopwords in Khayyam: 70
Number of stopwords in Ferdowsi: 3361
Number of stopwords in common: 67
Stopwords in common: {'فلک', 'دانست#دان', 'همه', 'نیست', 'داشت#دار', 'و', 'که', 'خوش', 'با', 'بر', 'کرد#کن', 'غم', 'ز', 'یک', 'جهان', 'شد#شو', '؟', 'خورد#خور', 'در', 'چرخ', 'گفت#گو', 'از', 'هر', 'دو', 'گل', 'خود', 'کس', 'یا', 'چه', 'چو', 'پیش', 'آمد#آ', 'آن', 'بهشت', 'مکن', 'سبزه', 'دست', 'این', 'بود#است', 'اگر', 'باده', 'چند', 'جام', 'صد', 'سر', 'است', 'را', 'نه', 'ما', 'دل', 'رفت#رو', 'باش', 'گدشت#گذر', 'گر', 'تا', 'به', 'چون', 'خاک', 'من', 'او', 'دید#بین', 'باشید#باش', 'دی', 'مرا', 'ای', 'زمین', 'تو'}


In [26]:
common_top_20_stopwords = set(khayyam_stop_words[:20]) & set(ferdowsi_stopwords[:20])
print(f'Number of stopwords in common in top 20: {len(common_top_20_stopwords)}')
print(f'Stopwords in common in top 20: {set(khayyam_stop_words[:20]) & set(ferdowsi_stopwords[:20])}')

Number of stopwords in common in top 20: 11
Stopwords in common in top 20: {'آن', 'به', 'از', 'و', 'که', 'بر', 'را', 'چو', 'ز', 'شد#شو', 'تو'}


### Ferdowsi Chapter Based and Ferdowsi All

Here, I compared the stopwords of Ferdowsi chapter-based and all Ferdowsi in top 20 stopwords.
The result is a dataframe that shows the common stopwords between each chapter and all Ferdowsi, number of them and their percentage.

In [27]:
print('Ferdowsi all stopwords:')
all_ferdowsi_stopwords_20 = all_ferdowsi_preprocessor.stopwords[:20]
all_ferdowsi_stopwords_20

Ferdowsi all stopwords:


['و',
 'به',
 'که',
 'ز',
 'از',
 'بر',
 'را',
 'چو',
 'شد#شو',
 'گفت#گو',
 'با',
 'شاه',
 'بود#است',
 'کرد#کن',
 'تو',
 'همی',
 'او',
 'آمد#آ',
 'آن',
 'یکی']

In [28]:
print('Ferdowsi chapter based stopwords:')
dataframe_top_20

Ferdowsi chapter based stopwords:


Unnamed: 0,"chapter_1, آغاز کتاب","chapter_2, کیومرث","chapter_3, هوشنگ","chapter_4, طهمورث","chapter_5, جمشید","chapter_6, ضحاک","chapter_7, فریدون","chapter_8, منوچهر","chapter_9, پادشاهی نوذر","chapter_10, پادشاهی زوطهماسپ",...,"chapter_52, پادشاهی کسری نوشین روان چهل و هشت سال بود","chapter_53, پادشاهی هرمزد دوازده سال بود","chapter_54, پادشاهی خسرو پرویز","chapter_55, پادشاهی شیرویه","chapter_56, پادشاهی اردشیر شیروی","chapter_57, پادشاهی فرایین","chapter_58, پادشاهی پوران دخت","chapter_59, پادشاهی آزرم دخت","chapter_60, پادشاهی فرخ زاد","chapter_61, پادشاهی یزدگرد"
0,و,و,و,به,و,به,و,و,به,و,...,و,که,و,و,و,به,به,,,و
1,به,به,کرد#کن,و,به,و,به,به,و,ز,...,که,و,که,به,به,از,,,,به
2,از,بر,به,را,از,که,از,از,که,به,...,به,به,به,که,که,که,,,,که
3,بر,را,از,او,ز,از,ز,که,ز,,...,ز,را,ز,ز,ز,را,,,,ز
4,ز,که,,بر,را,ز,که,بر,از,,...,از,ز,از,از,شد#شو,و,,,,از
5,که,از,,از,چو,بر,را,ز,شد#شو,,...,را,از,را,را,,,,,,بر
6,را,او,,,بر,را,چو,را,چو,,...,شاه,شاه,گفت#گو,تو,,,,,,را
7,او,بود#است,,,او,آن,بر,چو,را,,...,بر,گفت#گو,بر,بر,,,,,,شد#شو
8,چو,آن,,,کرد#کن,چو,شاه,شد#شو,بر,,...,بود#است,شد#شو,شد#شو,چو,,,,,,چو
9,این,ز,,,که,بود#است,شد#شو,گفت#گو,با,,...,با,بر,چو,آن,,,,,,آمد#آ


In [29]:
in_common_dataframe = pd.DataFrame()
number_of_in_common = []
for i in range(len(ferdowsi_stopwords_chapter_based)):
    in_common_series = pd.Series(list(set(dataframe_top_20.iloc[:, i]) & set(all_ferdowsi_preprocessor.stopwords)))
    in_common_dataframe[f'chapter_{i + 1}, {ferdowsi.chapters[i]}'] = in_common_series
    number_of_in_common.append(len(in_common_series))
    

first_row = []
for i in range(len(ferdowsi_stopwords_chapter_based)):
    first_row.append(f'{number_of_in_common[i]} -> {number_of_in_common[i] / len(all_ferdowsi_stopwords_20) * 100}%')
in_common_dataframe.loc[-1] = first_row
in_common_dataframe.index = in_common_dataframe.index + 1
in_common_dataframe.sort_index(inplace=True)
in_common_dataframe.rename(index={-1: 'Number of stopwords in common'}, inplace=True)
in_common_dataframe

Unnamed: 0,"chapter_1, آغاز کتاب","chapter_2, کیومرث","chapter_3, هوشنگ","chapter_4, طهمورث","chapter_5, جمشید","chapter_6, ضحاک","chapter_7, فریدون","chapter_8, منوچهر","chapter_9, پادشاهی نوذر","chapter_10, پادشاهی زوطهماسپ",...,"chapter_52, پادشاهی کسری نوشین روان چهل و هشت سال بود","chapter_53, پادشاهی هرمزد دوازده سال بود","chapter_54, پادشاهی خسرو پرویز","chapter_55, پادشاهی شیرویه","chapter_56, پادشاهی اردشیر شیروی","chapter_57, پادشاهی فرایین","chapter_58, پادشاهی پوران دخت","chapter_59, پادشاهی آزرم دخت","chapter_60, پادشاهی فرخ زاد","chapter_61, پادشاهی یزدگرد"
0,20 -> 100.0%,10 -> 50.0%,4 -> 20.0%,6 -> 30.0%,20 -> 100.0%,20 -> 100.0%,20 -> 100.0%,20 -> 100.0%,20 -> 100.0%,3 -> 15.0%,...,20 -> 100.0%,20 -> 100.0%,20 -> 100.0%,20 -> 100.0%,5 -> 25.0%,5 -> 25.0%,1 -> 5.0%,0 -> 0.0%,0 -> 0.0%,20 -> 100.0%
1,و,او,به,او,و,و,همه,و,روی,به,...,همه,و,و,همه,به,به,به,,,و
2,که,به,از,به,که,که,و,که,و,و,...,داد,که,که,و,و,از,,,,که
3,بر,از,کرد#کن,از,بر,بر,که,با,که,ز,...,و,با,با,که,که,و,,,,با
4,ز,بود#است,و,و,کرد#کن,ز,با,بر,با,,...,که,بر,بر,بر,ز,که,,,,بر
5,جهان,و,,بر,ز,همی,بر,کرد#کن,بر,,...,با,کرد#کن,کرد#کن,کرد#کن,شد#شو,را,,,,کرد#کن
6,همی,که,,را,جهان,شد#شو,کرد#کن,ز,کرد#کن,,...,بر,ز,ز,ز,,,,,,ز
7,شد#شو,بر,,,شد#شو,گفت#گو,ز,همی,ز,,...,چنین,شد#شو,شد#شو,شد#شو,,,,,,شد#شو
8,از,را,,,گفت#گو,از,شد#شو,شد#شو,همی,,...,کرد#کن,گفت#گو,گفت#گو,در,,,,,,گفت#گو
9,هر,ز,,,از,چو,گفت#گو,گفت#گو,شد#شو,,...,ز,از,از,بد,,,,,,از


# Task 1: Keyword Extraction

In [30]:
!pip install yake



In [31]:
import yake

In [32]:
class KeywordExtractor:
    def __init__(self, n=3, top=20):
        self.yake_extractor = yake.KeywordExtractor(lan='fa', n=n, top=top)

    def extract_keywords(self, chapter: list):
        return self.yake_extractor.extract_keywords(' '.join([' '.join(stanza) for stanza in chapter]))


def run_keyword_extraction(poem, poem_no_stopwords):
    """
    It runs the keyword extraction on the poem and poem_no_stopwords. It returns a dataframe of keywords and their scores both for the poem and poem_no_stopwords.
    """
    
    keyword_extractor = KeywordExtractor()
    keywords = keyword_extractor.extract_keywords(poem)
    keywords_no_stopwords = keyword_extractor.extract_keywords(poem_no_stopwords)

    dataframe = pd.DataFrame(keywords, columns=['keywords', 'score'])
    dataframe[['keywords_no_stopwords', 'score_no_stopwords']] = keywords_no_stopwords
    return dataframe

In [33]:
khayyam_preprocessed = khayyam_dataframe['preprocessed']
khayyam_preprocessed_no_stopwords = khayyam_dataframe['preprocessed_no_stopwords']
khayyam_keyword_dataframe = run_keyword_extraction(khayyam_preprocessed, khayyam_preprocessed_no_stopwords)

## Keywords of Khayyam

In the following cell, you can see the top 20 keywords of Khayyam. I also print the stopwords to see their effect on the keywords.

As we can see, the keywords are different in the preprocessed and preprocessed_no_stopwords columns because of removing stopwords and also their score, change.
Scores get higher when we remove the stopwords.

In [34]:
print(khayyam_preprocessor.stopwords[:10])
khayyam_keyword_dataframe

تیرمه


Unnamed: 0,keywords,score,keywords_no_stopwords,score_no_stopwords
0,آمد#آ خوش باش,9e-06,خوش باش دانست#دان,5.3e-05
1,خوش باش دانست#دان,1e-05,آمد#آ خوش باش,5.9e-05
2,دانست#دان گفت#گو این,1.3e-05,خوش باش عمر,0.00012
3,خوش نیست خوش,1.4e-05,نیست خوش باش,0.000121
4,گفت#گو این کوزه,1.5e-05,خوش باش دمی,0.000135
5,نیست خوش باش,1.8e-05,خوش نیست خوش,0.000136
6,بوده‌ست این کوزه,2e-05,یافت#یاب خوش باش,0.000139
7,کرد#کن این عقل,2e-05,مستی خوش باش,0.000139
8,بود#است این عقل,2e-05,خوش باش کارگه,0.000139
9,این چرخ فلک,2e-05,باده لعل باش,0.00014


## Keywords of Ferdowsi (All)

In the following cell, you can see the top 20 keywords of Ferdowsi. I also print the stopwords to see their effect on the keywords.

As we can see, the keywords are different in the preprocessed and preprocessed_no_stopwords columns because of removing stopwords and also their score, change.
Here, because of the length of the document, the scores are much lower than the Khayyam, and after removing the stopwords, the scores get wore, so maybe here we need to change the number of stopwords (increase 10 to 20 or 30).

In [35]:
all_ferdowsi_preprocessed = all_ferdowsi_dataframe['preprocessed']
all_ferdowsi_preprocessed_no_stopwords = all_ferdowsi_dataframe['preprocessed_no_stopwords']
all_ferdowsi_keyword_dataframe = run_keyword_extraction(all_ferdowsi_preprocessed,
                                                        all_ferdowsi_preprocessed_no_stopwords)

In [36]:
print(all_ferdowsi_preprocessor.stopwords[:10])
all_ferdowsi_keyword_dataframe

['و', 'به', 'که', 'ز', 'از', 'بر', 'را', 'چو', 'شد#شو', 'گفت#گو']


Unnamed: 0,keywords,score,keywords_no_stopwords,score_no_stopwords
0,بدو گفت#گو شاه,1.544084e-09,سپاه اندر آمد#آ,0.0
1,چنین گفت#گو کای,2.177486e-09,اندر آمد#آ سپاه,0.0
2,چنین گفت#گو شاه,2.578779e-09,اندر آمد#آ جای,0.0
3,بدو گفت#گو کای,3.112367e-09,اندر آمد#آ یکی,0.0
4,سپاه اندر آمد#آ,3.917133e-09,اندر آمد#آ همی,0.0
5,شد#شو بدو گفت#گو,4.35011e-09,جنگ اندر آمد#آ,0.0
6,گفت#گو کای شاه,4.628183e-09,آمد#آ نزدیک شاه,0.0
7,کرد#کن بدو گفت#گو,4.718772e-09,برو آفرین کرد#کن,0.0
8,کرد#کن چنین گفت#گو,4.816064e-09,خاک اندر آمد#آ,0.0
9,شاه بدو گفت#گو,4.941068e-09,بدو شاه این,0.0


# Task 2: Convert the Poems to Clear Sentence

I run this just on Khayyam to see its performance. It doesn't seem so good, just not bad :). I just sort them based on a common tag occurrences in a persian sentences. I may use som better heuristics in the future!

In [105]:
!mkdir resources
!wget -O resources/pos_tagger.model https://drive.usercontent.google.com/download\?id\=1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3\&export\=download\&authuser\=0

--2024-04-02 07:15:03--  https://drive.usercontent.google.com/download?id=1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3&export=download&authuser=0
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 198.18.0.182
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|198.18.0.182|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19246648 (18M) [application/octet-stream]
Saving to: ‘resources/pos_tagger.model’


2024-04-02 07:15:26 (925 KB/s) - ‘resources/pos_tagger.model’ saved [19246648/19246648]



First, let's find all tags in Khayyam poems.
I don't use the preprocessed texts because I want to convert the original poem to a sentence.

In [108]:
tagger = POSTagger(model='resources/pos_tagger.model')

In [109]:
all_tags = set()

for stanza in khayyam_dataframe['original']:
    current_pos_tagger = tagger.tag(word_tokenize(stanza))
    all_tags = all_tags.union(set([tag for _, tag in current_pos_tagger]))
    
all_tags

{'ADJ',
 'ADJ,EZ',
 'ADP',
 'ADP,EZ',
 'ADV',
 'ADV,EZ',
 'CCONJ',
 'DET',
 'DET,EZ',
 'INTJ',
 'NOUN',
 'NOUN,EZ',
 'NUM',
 'NUM,EZ',
 'PRON',
 'PUNCT',
 'SCONJ',
 'VERB'}

Now, we use the output and sort it based on common tag occurences in a persian sentence.

In [93]:
tag_order = [
    'NOUN',
    'NOUN,EZ',
    'ADJ',
    'ADJ,EZ',
    'ADV',
    'ADV,EZ',
    'PRON',
    'DET',
    'DET,EZ',
    'NUM',
    'NUM,EZ',
    'VERB',
    'ADP',
    'ADP,EZ',
    'CCONJ',
    'SCONJ',
    'INTJ',
    'PUNCT'
]

def run_pos_tagger(poem):
    tagged = tagger.tag(word_tokenize(poem))
    
    # I removed some tags because I think they are not useful for this task and they need some better heuristics.
    tagged_clean = [(word, tag) for word, tag in tagged if tag != 'ADP' and tag != 'ADP,EZ' and
                    tag != 'CCONJ' and tag != 'SCONJ' and tag != 'INTJ' and tag != 'PUNCT']
    dataframe = pd.DataFrame(tagged_clean, columns=['word', 'tag'])
    dataframe['ordered sentence'] = dataframe['tag'].apply(lambda x: tag_order.index(x))
    dataframe = dataframe.sort_values(by='ordered sentence')
    return dataframe

The indexes are the true order of the poem. The visualized order is the order of the tags in the tag_order list.

In [94]:
run_pos_tagger(khayyam_dataframe['original'][0])

Unnamed: 0,word,tag,ordered sentence
3,دل,"NOUN,EZ",1
1,بتا,ADV,4
4,ما,PRON,6
0,برخیز,VERB,11
2,بیا,VERB,11


In [95]:
for stanza in khayyam_dataframe['original']:
    print(run_pos_tagger(stanza))

    word      tag  ordered sentence
3     دل  NOUN,EZ                 1
1    بتا      ADV                 4
4     ما     PRON                 6
0  برخیز     VERB                11
2    بیا     VERB                11
     word      tag  ordered sentence
1      کن     NOUN                 0
0      حل  NOUN,EZ                 1
2    جمال  NOUN,EZ                 1
4    مشکل  NOUN,EZ                 1
3  خویشتن     PRON                 6
5      ما     PRON                 6
   word      tag  ordered sentence
2  شراب     NOUN                 0
1  کوزه  NOUN,EZ                 1
4   نوش      ADJ                 2
3    هم     PRON                 6
0    یک      NUM                 9
5  کنیم     VERB                11
      word      tag  ordered sentence
0      زآن     NOUN                 0
2  کوزه‌ها     NOUN                 0
4       گل  NOUN,EZ                 1
1      پیش      ADV                 4
5       ما     PRON                 6
3     کنند     VERB                11
      word   t