In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
import re
from clean_text import clean_text
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Single requests.Session shared by every scraping function below
# (section pages, book/category pages and writer index pages).
session_obj = requests.Session()

In [None]:
def extract_text_from_section(text_link, timeout=30):
    """Download one section page and return the text of its content container.

    The container is looked up in this order of preference:
      1. ``<div id="ftwp-postcontent">``
      2. an element with class ``entry-content entry-content-single``
      3. an element with class ``ld-tab-content ld-visible``

    Args:
        text_link: URL of the section page.
        timeout: Seconds passed to ``requests`` as the socket timeout, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        The ``.text`` of the first container that was found.

    Raises:
        ValueError: if none of the known containers exists on the page.
        requests.RequestException: if the HTTP request fails or times out.
    """
    src = session_obj.get(text_link, timeout=timeout).text
    soup = BeautifulSoup(src, 'lxml')

    # `find` returns None on a miss, so `or` falls through to the next layout.
    txt = (
        soup.find('div', id='ftwp-postcontent')
        or soup.find(class_='entry-content entry-content-single')
        or soup.find(class_="ld-tab-content ld-visible")
    )

    if txt is None:
        # Fail with a message that names the page instead of the opaque
        # "'NoneType' object has no attribute 'text'".
        raise ValueError(f"No known content container found at {text_link}")

    # Alternative if line breaks between elements are wanted:
    #   txt.get_text(separator="\n", strip=True)
    return txt.text


def scrape_all_sections_and_write_file(book_name,sections,parent_directory):
    """Scrape every section of a book and save the combined text as JSON.

    The output file is ``<parent_directory>/<safe book name>.json`` and holds a
    single-key object ``{original book name: full text}``. Sections are fetched
    concurrently and joined, in the order of ``sections``, with blank lines.

    Args:
        book_name: Book title as scraped, e.g. 'ময়ূরকণ্ঠী (১৯৫২)'. Everything
            from the first "(" on is dropped when building the file name.
        sections: Mapping of section name -> section URL.
        parent_directory: Existing directory in which the file is created.

    Returns:
        None. Errors raised while writing are printed and the partially
        written file is removed; errors raised while scraping propagate.
    """
    book_name_original = book_name
    book_name = book_name.split("(")[0].strip()

    # Characters that are not allowed in Windows file names. Plain str.replace
    # is used on purpose: as regex patterns "\\", "*" and "?" are invalid and
    # "|" matches the empty string (inserting "-" between every character).
    for illegal in ("\\", "/", ":", "*", "?", "<", ">", "|", '"'):
        book_name = book_name.replace(illegal, "-")
    file_name = os.path.join(parent_directory, f'{book_name}.json')

    if os.path.isfile(file_name):
        # Pick the first free "_N" suffix so an earlier "_2" is not overwritten.
        base_name = book_name
        suffix = 2
        while os.path.isfile(file_name):
            book_name = f"{base_name}_{suffix}"
            file_name = os.path.join(parent_directory, f'{book_name}.json')
            suffix += 1
        print(f"Book with same name already exists! Renaming book to '{book_name}'")

    # Fetch all sections in parallel; executor.map preserves input order.
    text_links = list(sections.values())
    with ThreadPoolExecutor() as executor:
        book_text = '\n\n'.join(executor.map(extract_text_from_section, text_links))

    # To store a cleaned word list instead of raw text, split `book_text` into
    # words and run it through clean_text() before dumping.
    try:
        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump({book_name_original: book_text}, outfile, ensure_ascii=False)
        print(book_name, "successfully saved", end='\n\n')
    except Exception as e:
        print(e)
        # Do not leave a truncated/corrupt JSON file behind.
        if os.path.exists(file_name):
            os.remove(file_name)


def scrape_other_way(lins,book_name,path):
    """Scrape a book whose sections are listed as ``ld-item-name`` anchors.

    Builds a ``{section title: href}`` mapping from the given anchor tags,
    prints an overview (with the URL masked) and hands the mapping to
    ``scrape_all_sections_and_write_file``.

    Args:
        lins: Iterable of anchor tags, each containing an ``ld-item-title`` element.
        book_name: Title of the book being scraped.
        path: Directory the book file is written to.
    """
    # Title text (stripped) -> link target; a repeated title keeps the last link.
    parts = {
        anchor.find(class_="ld-item-title").text.strip(): anchor.attrs["href"]
        for anchor in lins
    }

    for title in parts:
        # The real URL is masked; print parts[title] instead to see it.
        print(title, '\t', "#_url_of_the_book")

    print(len(parts), "Sections ...")

    scrape_all_sections_and_write_file(book_name, parts, path)


def _unique_section_name(name, existing):
    """Return `name`, or `name-N` for the smallest N >= 2 not yet in `existing`."""
    if name not in existing:
        return name
    suffix = 2
    while f"{name}-{suffix}" in existing:
        suffix += 1
    return f"{name}-{suffix}"


def go_inside_category(book_name,book_link,flag=0, writer=None):
    """Collect all section links of one book and scrape them into a file.

    Two page layouts are handled:
      * anchors with class ``ld-item-name ld-primary-color-hover`` are passed
        straight to ``scrape_other_way``;
      * otherwise ``entry-title-link`` elements are collected, following the
        ``pagination-next`` link recursively to gather later pages.

    Args:
        book_name: Title of the book.
        book_link: URL of the book (or of one of its paginated pages).
        flag: 0 for the top-level call; 1 for the internal recursive call,
            which only returns the sections it found instead of scraping.
        writer: Writer slug used to build the output directory name.

    Returns:
        The ``{section name: url}`` dict when ``flag == 1``; otherwise None.
        None is also returned when the page uses the ``ld-item-name`` layout.
    """
    # Raw f-string: the backslashes are path separators, not escape sequences.
    parent_directory = rf"G:\_Somikoron\Web Scraping\{writer}-TEXT"
    directory = book_name
    # delete the following line to have subfolders for each file
    directory=""
    path = os.path.join(parent_directory,directory)
    print(book_name) #path
    # Creates missing parents too and is a no-op when the folder exists.
    os.makedirs(path, exist_ok=True)

    src = session_obj.get(book_link).text
    soup2 = BeautifulSoup(src,'lxml')

    # Alternative layout: the sections are listed as ld-item-name anchors.
    lins = soup2.find_all('a',class_="ld-item-name ld-primary-color-hover")
    if lins:
        scrape_other_way(lins,book_name,path)
        return

    sections = {}
    for item in soup2.find_all(class_='entry-title-link'):
        try:
            # Repeated titles get "-2", "-3", ... so no link is overwritten,
            # even when the duplicates are not adjacent.
            sections[_unique_section_name(item.text, sections)] = item['href']
        except Exception as e:
            print(e)

    pagination = soup2.find(attrs ={"role" : "navigation"})
    if pagination and pagination.find(class_="active") and pagination.find(class_="pagination-next"):
        next_url=pagination.find(class_="pagination-next").find('a')['href']
        # The recursive call returns None if the next page uses the other layout.
        new_sections=go_inside_category(book_name,next_url,flag=1, writer=writer) or {}
        for key, link in new_sections.items():
            # Keep same-named sections from different pages apart.
            sections[_unique_section_name(key, sections)] = link

    if flag==1:
        return sections

    for key in sections:
        # The real URL is masked; print sections[key] instead to see it.
        print(key,'\t',"#_url_of_the_book")

    print(len(sections),"Sections ...")

    scrape_all_sections_and_write_file(book_name,sections,path)


def scrape_all_writers(writers):
    """Scrape every book of every writer in `writers`.

    For each writer, both index URLs are fetched. Books are taken from the
    ``<ol>`` inside the archive description when present, otherwise from the
    ``entry-title-link`` anchors. Each book found is passed to
    ``go_inside_category``; a failure on one book is printed and the loop
    continues with the next.

    Args:
        writers: Iterable of writer slugs as used in the site URLs.

    Returns:
        None.
    """
    for writer in writers:
        base_urls = [f"https://#website/link1/{writer}", f"https://#website/link2/{writer}"]
        # base_urls=[base_urls[1]]
        main_category = {}

        not_found = 0
        for cnt,base_url in enumerate(base_urls,1):
            source = session_obj.get(base_url).text
            soup = BeautifulSoup(source,'lxml')

            ol = soup.find(class_='archive-description taxonomy-archive-description')
            if ol:
                ol = ol.ol
            etls = soup.find_all("a", class_="entry-title-link")

            if not ol:
                if etls:
                    # No ordered list: fall back to the title links.
                    for etl in etls:
                        main_category[etl.text] = etl.attrs["href"]
                else:
                    not_found += 1
                    print("link", cnt, "doesn't exist")
                continue

            for child in ol:
                # Skip bare text nodes (whitespace etc.); NavigableString is a
                # str subclass, so this also covers its subclasses.
                if isinstance(child, str):
                    continue
                try:
                    item = child.find_all()[0]
                    main_category[item.text] = item['href']
                except Exception as e:
                    print(e)

        # Compare against the number of URLs tried, not a hard-coded 2.
        if not_found == len(base_urls):
            print(writer, "was not scraped, find correct link")

        for key in main_category:
            # The real URL is masked; print main_category[key] instead to see it.
            print(key,'\t',"#_url_of_the_book")

        for category_name, category_link in main_category.items():
            try:
                go_inside_category(category_name,category_link, writer=writer)
            except Exception as e:
                print(f"{category_name} was not scraped, {e}")

            # break #break after 1 iteration

In [None]:
# # scrape only one book
# book_name = ""
# book_link = ""
# writer = ""
# try:
#     go_inside_category(book_name,book_link=book_link, writer=writer)
#     # scrape_all_writers([writer])
# except Exception as e:
#     print(f"{writer} was not scraped, {e}")

In [None]:
# Load the scraped writers file and turn each key into a URL slug by
# replacing spaces with hyphens.
d = {}
with open("./kallol_writers_scraped.json", encoding="UTF-8") as fp:
    d = json.load(fp)
writers = [name.replace(" ", "-") for name in d]
# writers

In [None]:
# Scrape a single writer per run, selected by index `i` into `writers`.
# already scraped from 0 to 260, start from 261
# NOTE(review): the note above says to start from 261 but `i` is 260 — confirm
# which one is current before running.
i= 260
writers_list = writers[i:i+1]  # one-element slice; empty if i is out of range
# writers_list = [""]
scrape_all_writers(writers_list)
writers[i]  # last expression: shows the selected writer as the cell output