# Novels

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm
import re

## Novel links

In [2]:
# Base address of the site; site-relative links are appended to it.
url = 'https://papadiamantis.net'

# Fetch the index page that lists the novels. The timeout keeps a stalled
# connection from blocking the kernel indefinitely (requests waits forever
# by default).
page = requests.get(url + '/aleksandros-papadiamantis/syggrafiko-ergo/mythistorimata', timeout=30)
page.status_code

200

In [3]:
# Parse the raw bytes of the index page with Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [4]:
# Collect the href of the first <a> found inside every <div class="el-item">
links = []
for item in soup.find_all('div', class_='el-item'):
    anchor = item.find('a')
    links.append(anchor['href'])
links

['/aleksandros-papadiamantis/syggrafiko-ergo/mythistorimata/389-metanastis-1879',
 '/aleksandros-papadiamantis/syggrafiko-ergo/mythistorimata/390-o-mporoi-t-n-thn-n-1882',
 '/aleksandros-papadiamantis/syggrafiko-ergo/mythistorimata/387-gyftopoyla-meros-i-1884',
 '/aleksandros-papadiamantis/syggrafiko-ergo/mythistorimata/388-gyftopoyla-meros-ii-1884']

## Get the content

In [5]:
def get_content(link, delay=5, timeout=30):
    """Download one novel page and return its title and cleaned-up text.

    Parameters
    ----------
    link : str
        Site-relative path of the page; it is appended to the module-level
        ``url``.
    delay : float, optional
        Seconds to sleep before issuing the request (default 5).
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without it a stalled connection would block forever.

    Returns
    -------
    tuple of (str, str) or None
        ``(title, content)`` on success. ``None`` when the server does not
        answer with HTTP 200 or when the expected title/content elements are
        missing from the page; a short diagnostic and the link are printed in
        both cases.

    Raises
    ------
    requests.exceptions.RequestException
        Connection problems and timeouts are not caught here.
    """
    # Pause before every request (presumably to go easy on the server).
    time.sleep(delay)
    page = requests.get(url + link, timeout=timeout)
    if page.status_code != 200:
        print(page.status_code)
        print(link)
        return None

    soup = BeautifulSoup(page.content, 'html.parser')

    # Look the elements up defensively: find() returns None when nothing
    # matches, which would otherwise surface as an opaque AttributeError.
    title_box = soup.find('div', class_='uk-width-3-4@m')
    heading = title_box.find('h1') if title_box is not None else None
    body = soup.find('div', id='template-nUDsG6AM#0')
    if heading is None or body is None:
        print("unexpected page layout")
        print(link)
        return None

    title = heading.text.strip()

    # The text is made of the paragraphs and second-level headings, one per line.
    content = "\n".join(item.text for item in body.find_all(['p', 'h2']))
    # Keep only what precedes "ΣΗΜΕΙΩΣΕΙΣ" ("NOTES") and "Συνέχεια..."
    # ("Continued...") -- presumably the notes section and the link to the
    # next part.
    content = content.split("ΣΗΜΕΙΩΣΕΙΣ")[0].split('Συνέχεια...')[0].strip()
    # Collapse runs of three or more newlines into exactly two.
    content = re.sub(r'\n{3,}', '\n\n', content)
    # Remove every Unicode thin space (U+2009).
    content = content.replace("\u2009", "")
    return title, content

In [6]:
# Map each novel title to its text.
novels = {}
# Links whose download failed, kept so they can be inspected or retried.
failed_links = []
for link in tqdm(links):
    result = get_content(link)
    if result is None:
        # get_content returns None (after printing a diagnostic) when a page
        # cannot be fetched; unpacking that directly would raise TypeError
        # and abort the whole run, so record the link and carry on.
        failed_links.append(link)
        continue
    title, content = result
    novels[title] = content

100%|██████████| 4/4 [00:25<00:00,  6.43s/it]


In [7]:
# Show the titles that were scraped
novels.keys()

dict_keys(['Ἡ Μετανάστις (1879)', 'Οἱ Ἔμποροι τῶν ἐθνῶν (1882)', 'Ἡ Γυφτοπούλα [μέρος I] (1884)', 'Ἡ Γυφτοπούλα [μέρος ΙΙ] (1884)'])

In [8]:
# Join the two parts into a single entry. Each part was stripped of surrounding
# whitespace when scraped, so a blank line is inserted between them; with a
# plain "+" the last line of part I runs straight into the first line of part II.
novels['Ἡ Γυφτοπούλα (1884)'] = "\n\n".join([
    novels['Ἡ Γυφτοπούλα [μέρος I] (1884)'],
    novels['Ἡ Γυφτοπούλα [μέρος ΙΙ] (1884)'],
])

In [9]:
# Show the titles again: the combined entry is listed alongside the two parts
novels.keys()

dict_keys(['Ἡ Μετανάστις (1879)', 'Οἱ Ἔμποροι τῶν ἐθνῶν (1882)', 'Ἡ Γυφτοπούλα [μέρος I] (1884)', 'Ἡ Γυφτοπούλα [μέρος ΙΙ] (1884)', 'Ἡ Γυφτοπούλα (1884)'])

In [10]:
# The two separate parts are no longer needed; remove them from the dict
for part_title in ('Ἡ Γυφτοπούλα [μέρος I] (1884)', 'Ἡ Γυφτοπούλα [μέρος ΙΙ] (1884)'):
    del novels[part_title]

In [11]:
# Show the final set of titles after removing the separate parts
novels.keys()

dict_keys(['Ἡ Μετανάστις (1879)', 'Οἱ Ἔμποροι τῶν ἐθνῶν (1882)', 'Ἡ Γυφτοπούλα (1884)'])

In [12]:
# Write every novel to its own UTF-8 text file in the current directory
for title, text in novels.items():
    # Derive a filename-safe name: letters and digits are kept,
    # every other character becomes an underscore
    safe_chars = [ch if ch.isalnum() else "_" for ch in title]
    safe_title = "".join(safe_chars)

    with open(safe_title + ".txt", mode="w", encoding="utf-8") as out_file:
        # Title first, followed by one empty line
        out_file.write(f"{title}\n\n")
        # Then the body of the novel
        out_file.write(text)