# Parse and analyze texts on Samlib


## Import tools and implement basic classes


In [4]:
!pip install jsons
!pip install jsonlines


Collecting jsons
[?25l  Downloading https://files.pythonhosted.org/packages/d9/a3/50fb6dbe6bef19c346d779ae04618d9bf7c215f5d065e90ca0f05588b089/jsons-1.1.2-py3-none-any.whl (53kB)
[K     |██████▏                         | 10kB 19.6MB/s eta 0:00:01[K     |████████████▎                   | 20kB 3.2MB/s eta 0:00:01[K     |██████████████████▍             | 30kB 4.0MB/s eta 0:00:01[K     |████████████████████████▌       | 40kB 3.1MB/s eta 0:00:01[K     |██████████████████████████████▊ | 51kB 3.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
[?25hCollecting typish>=1.3.1
  Downloading https://files.pythonhosted.org/packages/6d/4d/28aa60f0f56e7a707c17900bbafdb09c3b9054bd2ac18f1611d68342cf0b/typish-1.4.0-py3-none-any.whl
Installing collected packages: typish, jsons
Successfully installed jsons-1.1.2 typish-1.4.0
Collecting jsonlines
  Downloading https://files.pythonhosted.org/packages/4f/9a/ab96291470e305504aa4b7a2e0ec132e930da89eb3ca7a82fbe03167c131/json

In [0]:
import json
import os
import re
from urllib.parse import urljoin

import jsonlines
import jsons
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html


In [0]:
class ConsoleLogger:
    """Minimal stdout logger that can be silenced with a single flag."""

    def __init__(self, enable_logging=True):
        # When False, every logging method becomes a no-op.
        self.enable_logging = enable_logging

    def info(self, message):
        """Print ``message`` if logging is enabled."""
        if self.enable_logging:
            print(message)

    def empty_line(self):
        """Print a blank separator line if logging is enabled."""
        if self.enable_logging:
            print()


In [0]:
class UrlWrapper:
    """Pair a URL with the encoding used to decode pages fetched from it."""

    def __init__(self, url, encoding):
        self.url = url
        self.encoding = encoding

    def __str__(self):
        return f"url: [{self.url}], encoding: [{self.encoding}]"

    def __repr__(self):
        return self.__str__()


def combine_url_and_href(url_wrapper, href):
    """Resolve ``href`` against ``url_wrapper.url`` and wrap the result.

    Resolution follows the standard relative-reference rules implemented by
    ``urllib.parse.urljoin`` instead of plain string concatenation, so that:

    * a root-relative href (``/a/b``) is resolved against the site root even
      when the base URL is a nested page;
    * an absolute href (``http://...``) is returned as is;
    * a relative href on a base that ends in a file name replaces that file
      name instead of being glued to it.

    Args:
        url_wrapper: object exposing ``url`` (the base URL) and ``encoding``.
        href: link target taken from the page located at ``url_wrapper.url``.

    Returns:
        A new ``UrlWrapper`` with the resolved URL and the same encoding as
        ``url_wrapper``.
    """
    full_url = urljoin(url_wrapper.url, href)
    return UrlWrapper(full_url, url_wrapper.encoding)


## Parsing the main page to collect authors


In [0]:
# Notebook-wide defaults.
# default_logger: console logger created with logging switched off.
# default_url:    the Samlib start page; pages fetched through it are decoded as cp1251.
default_logger = ConsoleLogger(False)
default_url = UrlWrapper(url="http://samlib.ru/", encoding="cp1251")


In [0]:
# HTTP session pre-configured with a Referer pointing at the start page and a
# desktop-browser User-Agent string.
session = requests.Session()
session.headers["Referer"] = default_url.url
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"


In [0]:
class Author:
    """An author link found on a listing page, with a lazily loaded list of works."""

    def __init__(self, author_name, href, url_wrapper):
        self.author_name = author_name
        self.href = href
        # URL of the author's own page: ``href`` combined with the parent page URL.
        self.url_wrapper = combine_url_and_href(url_wrapper, href)

        # Cache filled by get_parsed_pieces_of_literature(); None means "not loaded yet".
        self.pieces_of_literature = None

    def __str__(self):
        return f"\"{self.author_name}\": {self.url_wrapper.url}"

    def __repr__(self):
        return str(self)

    def get_parsed_author_page(self):
        """Download and parse the author's page."""
        return get_parsed_page(self.url_wrapper)

    def get_parsed_pieces_of_literature(self, use_cache=True):
        """Return the works linked from the author's page.

        A work is every ``//dl/dl/dt/li/a`` link that has both a non-empty
        ``<b>`` title and an ``href``. The result is stored on the instance;
        with ``use_cache=True`` a previously stored list is returned without
        fetching the page again.
        """
        cached = self.pieces_of_literature
        if use_cache and cached is not None:
            return cached

        author_page = self.get_parsed_author_page()

        works = []
        for link in author_page.xpath("//dl/dl/dt/li/a"):
            titles = link.xpath("b/text()")
            href = link.get("href")
            # Skip links that lack either a title or a target.
            if not titles or not titles[0] or not href:
                continue
            works.append(PieceOfLiterature(titles[0], href, self.url_wrapper))

        self.pieces_of_literature = works
        return works

    def parse_author_data_element(self):
        """Bundle this author with their (possibly cached) works into an AuthorData."""
        return AuthorData(self, self.get_parsed_pieces_of_literature())


def try_parse_author_element(link_element, url_wrapper, remove_full_links=True):
    """Build an ``Author`` from an ``<a>`` element, or return ``None``.

    ``None`` is returned when the link has no text node or no ``href``, and
    also when ``remove_full_links`` is true and the ``href`` starts with
    ``"http"``.
    """
    names = link_element.xpath("text()")
    href = link_element.get("href")

    if not (names and href):
        return None

    if remove_full_links and href.startswith("http"):
        return None

    return Author(names[0], href, url_wrapper)


In [0]:
def get_parsed_page(url_wrapper):
    """Download a page and parse it into an lxml HTML document.

    The request is sent through the notebook-level ``session`` so that the
    Referer and User-Agent headers configured on it are actually used and the
    connection can be reused between calls.

    Args:
        url_wrapper: object exposing ``url`` (address to fetch) and
            ``encoding`` (codec used to decode the response body).

    Returns:
        The document root returned by ``lxml.html.document_fromstring``.
    """
    response = session.get(url_wrapper.url)
    # errors="replace": a byte with no mapping in the declared code page
    # (cp1251 leaves 0x98 undefined) must not abort the whole crawl.
    page_text = response.content.decode(url_wrapper.encoding, errors="replace")
    return html.document_fromstring(page_text)

def get_converted_authors_collection(url_wrapper):
    """Return an ``Author`` for every usable ``//p/a`` link on the given page.

    Links that ``try_parse_author_element`` rejects (it returns ``None``) are
    left out of the result.
    """
    main_page = get_parsed_page(url_wrapper)
    candidates = (
        try_parse_author_element(link, url_wrapper)
        for link in main_page.xpath("//p/a")
    )
    return [author for author in candidates if author]


In [0]:
# Smoke test: fetch the start page and show the first ten parsed authors.
get_converted_authors_collection(default_url)[:10]


## Parsing every author's page to collect information about their texts


In [0]:
class PieceOfLiterature:
    """A single work of an author, with lazily downloaded and cached text."""

    def __init__(self, name, href, url_wrapper):
        self.name = name
        self.href = href
        # URL of the work: ``href`` combined with the author's page URL.
        self.url_wrapper = combine_url_and_href(url_wrapper, href)

        # Cache filled by get_parsed_text(); None means "not downloaded yet".
        self.parsed_text = None

    def __str__(self):
        return f"\"{self.name}\": {self.url_wrapper.url}"

    def __repr__(self):
        return str(self)

    def get_parsed_text_page(self):
        """Download and parse the page of this work."""
        return get_parsed_page(self.url_wrapper)

    def get_novel(self, parsed_page):
        """Extract text laid out in ``<dd>`` elements."""
        return self._get_content_from_page(parsed_page, "//dd")

    def get_poetry(self, parsed_page):
        """Extract text laid out in ``<pre>`` elements."""
        return self._get_content_from_page(parsed_page, "//pre")

    def get_poetry_bold(self, parsed_page):
        """Extract text from ``//b/xxx7`` and, if present, from its ``<p>`` children."""
        head = self._get_content_from_page(parsed_page, "//b/xxx7")
        tail = self._get_content_from_page(parsed_page, "//b/xxx7/p")
        return head + " \n " + tail if tail else head

    def get_parsed_text(self, use_cache=True):
        """Return the text of the work, downloading it on first use.

        The extractors are tried in order (novel, poetry, bold poetry); the
        first non-empty result wins, otherwise the last extractor's result is
        used. The value is stored in ``self.parsed_text``.
        """
        if use_cache and self.parsed_text is not None:
            return self.parsed_text

        page = self.get_parsed_text_page()
        self.parsed_text = (
            self.get_novel(page)
            or self.get_poetry(page)
            or self.get_poetry_bold(page)
        )
        return self.parsed_text

    def _get_content_from_page(self, parsed_page, xpath):
        """Join the first text node of every element matching ``xpath``.

        Elements without a (non-empty) first text node are skipped; the joined
        string is stripped of surrounding whitespace.
        """
        chunks = []
        for node in parsed_page.xpath(xpath):
            texts = node.xpath("text()")
            if texts and texts[0]:
                chunks.append(texts[0])

        return "".join(chunks).strip()


class AuthorData:
    """Plain container tying an author to the list of their works."""

    def __init__(self, author, pieces_of_literature):
        self.author = author
        self.pieces_of_literature = pieces_of_literature

    def __str__(self):
        return f"author: [{self.author}], works: {self.pieces_of_literature}"

    def __repr__(self):
        return str(self)


In [0]:
def get_converted_authors_data_collection(url_wrapper, logger, split=None):
    """Collect an ``AuthorData`` for every author listed on the given page.

    Args:
        url_wrapper: location of the page that lists the authors.
        logger: object with ``info(message)`` and ``empty_line()`` methods,
            used to report progress.
        split: optional ``(lower_bound, upper_bound)`` pair; when truthy, only
            that slice of the author list is processed.

    Returns:
        A list with one ``AuthorData`` per processed author, in page order.
    """
    authors = get_converted_authors_collection(url_wrapper)
    if split:
        lower_bound, upper_bound = split
        authors = authors[lower_bound:upper_bound]

    def _load(author):
        # Report the author, load their works, then close the log entry.
        logger.info(f"Get works of author {author}")
        logger.info(author.url_wrapper)
        author_data = author.parse_author_data_element()
        logger.empty_line()
        return author_data

    return [_load(author) for author in authors]


In [15]:
# Crawl the page of every author listed on the start page. No `split` is given,
# so the whole author list is processed (one page download per author).
authors_data_collection = get_converted_authors_data_collection(default_url, default_logger)
authors_data_collection


[author: ["Список известности России": http://samlib.ru/r/ru1000/ru.shtml], works: [],
 author: ["Интервью СИ": http://samlib.ru/a/aktualxnoe_i/], works: ["Вступительное слово": http://samlib.ru/a/aktualxnoe_i/1slovo.shtml, "Доска объявлений": http://samlib.ru/a/aktualxnoe_i/2doska.shtml],
 author: ["Звезды Самиздата": http://samlib.ru/z/zwezdy_s/], works: ["Супертоп фантастики и фэнтези": http://samlib.ru/z/zwezdy_s/c-2.shtml, "Они оторвались от коллектива": http://samlib.ru/z/zwezdy_s/c-3.shtml, "Хулиганы и просто редиски": http://samlib.ru/z/zwezdy_s/c-4.shtml, "В желтой майке лидера": http://samlib.ru/z/zwezdy_s/c-5.shtml, "Как попасть в рекламный подвал Самиздата "Новые книги авторов Си, вышедшие из печати"": http://samlib.ru/z/zwezdy_s/l-2.shtml, "Расчет супертопа": http://samlib.ru/z/zwezdy_s/n-1.shtml],
 author: ["Фильм про "Самиздат"": http://samlib.ru/s/samizdat10/samizdat10.shtml], works: [],
 author: ["А З.": http://samlib.ru/a/a_z/], works: ["куранты": http://samlib.ru/a/a

In [0]:
# Download the text of the first three works of every crawled author.
texts = []
for author_data in authors_data_collection:
    first_works = author_data.pieces_of_literature[:3]
    texts.extend(work.get_parsed_text() for work in first_works)


## Looking at the results


In [21]:
# Disabled preview: the statements below are wrapped in a string literal, so
# none of them run and the cell merely evaluates to that string.
# NOTE(review): the saved output under this cell looks like it comes from an
# earlier run with the code enabled — confirm before relying on it.
'''print(texts[0])
print("--------------------")
print("You can compare this with pure call:")
print("--------------------\n")
texts[0]
'''


--------------------
You can compare this with pure call:
--------------------



''

In [0]:
# Disabled experiment: the jsons.dump call is wrapped in a string literal, so it
# does not run and the cell merely evaluates to that string.
'''json_dict = jsons.dump(authors_data_collection[0])
json_dict
'''

In [0]:
# Build one "<url>: <text>" record for each of the first three works of every author.
selected_works = [
    piece_of_work
    for author_data in authors_data_collection
    for piece_of_work in author_data.pieces_of_literature[:3]
]

my_json = []
for piece_of_work in selected_works:
    # The call stores the text in piece_of_work.parsed_text, which is read below.
    piece_of_work.get_parsed_text()
    my_json.append(f"{piece_of_work.url_wrapper.url}: {piece_of_work.parsed_text}")


In [0]:
# Append every record to the output file as one JSON-encoded string per line.
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them.
with open('texts_data.jsonlines', 'a', encoding='utf-8') as file:
    for item in my_json:
        file.write(json.dumps(item, ensure_ascii=False) + '\n')
