In [5]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import requests
import json
import uuid
import os
import time
from playsound import playsound
import wave
import sys
from pydub import AudioSegment
import soundfile as sf
import pyrubberband as pyrb

# Getting Data From the Web and Parsing HTML to Text

In [2]:
class GetWebPage:
    """Download an article page, flatten it to plain text and convert it to speech.

    The methods build on each other through instance state, so the intended
    order of use is:

    1. ``create_dir()``          - creates the working directory (``self.path``)
    2. ``get_data()``            - downloads the page, fills ``title``/``author``
    3. ``medium(soup)``          - locates the article container (``self.article``)
    4. ``create_parsed_text()``  - writes ``textparsed.txt`` into ``self.path``
    5. ``download_all_sound()``  - saves one ``<line number>.mp3`` per text line

    Attributes:
        url:     address of the article to fetch.
        title:   article title parsed from the page ``<title>``.
        author:  author string (``"by ..."``) parsed from the page ``<title>``.
        path:    working directory for the text file and the mp3 files.
        article: tag holding the article content, set by ``medium``.
    """

    # Line written for images without alt text. Its audio is requested once
    # (as 0.mp3) and copied for every later occurrence.
    IMAGE_DES = "this image doesn't have any description"

    # Upper bound on characters per line; post_req's docstring states that a
    # single query must not exceed 200 characters.
    MAX_LINE_LEN = 200

    def __init__(self, url):
        """Remember the article URL and initialise the empty state."""
        self.url = url
        self.title = ''
        self.author = ''
        self.path = ''
        self.article = None

    def create_dir(self):
        """Create a uniquely named directory for the parsed text and mp3 files.

        The directory is created under the current working directory and its
        path is stored in ``self.path``.

        Raises:
            OSError: if the directory cannot be created.
        """
        dir_name = str(uuid.uuid1())
        path = os.path.join(os.getcwd(), dir_name)
        # exist_ok covers the "already there" case without hiding real
        # failures such as missing permissions.
        os.makedirs(path, exist_ok=True)
        self.path = path

    def get_data(self):
        """Request ``self.url``, parse it and extract the title and author.

        The page ``<title>`` is expected to look like ``"<title> | by <author> | ..."``.
        When it does not, the whole title text is kept as ``self.title`` and
        ``self.author`` is left empty instead of failing.

        Returns:
            The parsed page as a ``BeautifulSoup`` object.
        """
        req = Request(self.url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:
            webpage = response.read().decode('utf-8')
        soup = BeautifulSoup(webpage, 'html.parser')

        # find title and author of article
        page_title = ''
        if soup.title is not None and soup.title.string:
            page_title = str(soup.title.string)
        pattern = re.compile(r'([\w ]+) \| (by [\w ]+) \|')
        match = pattern.search(page_title)
        if match:
            self.title = match.group(1)
            self.author = match.group(2)
        else:
            self.title = page_title.strip()
            self.author = ''

        return soup

    def medium(self, soup):
        """Find the main content container of an article on the medium website.

        The result is stored in ``self.article`` so ``create_parsed_text`` can
        use it, and is also returned.

        Args:
            soup: parsed page as returned by ``get_data``.

        Returns:
            The matching ``div`` tag, or ``None`` when the page has no body or
            no ``div`` with the expected class list.
        """
        class_div = 'ab ac ae af ag dr ai aj'
        body = soup.find('body')
        article = body.find('div', class_=class_div) if body is not None else None
        self.article = article
        return article

    # the two functions below replace br tags with \n
    def make_br_pattern(self):
        """Return the compiled patterns matching ``<br/>`` and ``<br>`` tags."""
        return (re.compile(r'<br *\/>'), re.compile(r'<br *>'))

    def replace_br(self, tag, br_pattern):
        """Replace every br tag inside ``tag`` with a newline and re-parse it.

        Args:
            tag: element whose markup should be rewritten.
            br_pattern: iterable of compiled patterns from ``make_br_pattern``.

        Returns:
            The first top-level node of the re-parsed markup. When the markup
            is empty the (empty) parsed document itself is returned, so the
            caller always receives an object with ``get_text``.
        """
        markup = str(tag)
        for pattern in br_pattern:
            markup = pattern.sub('\n', markup)

        parsed = BeautifulSoup(markup, 'html.parser')
        children = list(parsed.children)
        return children[0] if children else parsed

    def get_text_value(self, tag, heading_hierarchy):
        """Return the speakable text of ``tag`` according to the kind of tag.

        - ``figure``: one line per image, built from its alt text or the
          ``IMAGE_DES`` placeholder when alt is missing or empty.
        - ``p``: the paragraph text.
        - ``h1``..``h6``: ``"heading N number K: text"`` where K is the running
          count kept in ``heading_hierarchy`` (updated in place).
        - ``ul`` / ``ol``: one line per direct ``li`` child.
        - anything else: the plain text of the tag.

        Args:
            tag: element to convert.
            heading_hierarchy: dict mapping ``'h1'``..``'h6'`` to counters.

        Returns:
            A string (possibly empty, possibly containing newlines).
        """
        headings = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        if tag.name == 'figure':
            text_of_img = []
            for img in tag.find_all('img'):
                # .get avoids a KeyError for <img> tags without an alt attribute
                alt = img.get('alt') or ''
                if len(alt) > 0:
                    text_of_img.append('image description: %s' % alt)
                else:
                    text_of_img.append(self.IMAGE_DES)
            return '\n'.join(text_of_img)

        elif tag.name == 'p':
            return tag.get_text()

        elif tag.name in headings:
            heading_hierarchy[tag.name] += 1
            h = tag.name.replace('h', 'heading ')
            heading_text = tag.get_text()
            if len(heading_text) > 0:
                return '{} number {}: {}'.format(h, heading_hierarchy[tag.name], heading_text)
            # an empty heading yields no text; '' keeps the return type a string
            return ''

        elif tag.name == 'ul':
            # just for one level deep: recursive=False keeps nested items from
            # being emitted a second time on their own
            text_of_li = []
            for li in tag.find_all('li', recursive=False):
                if len(li.get_text()) > 0:
                    text_of_li.append('item: %s' % li.get_text())
            return '\n'.join(text_of_li)

        elif tag.name == 'ol':
            # just for one level deep (see the ul branch)
            text_of_li = []
            counter = 1
            for li in tag.find_all('li', recursive=False):
                if len(li.get_text()) > 0:
                    text_of_li.append('item %s: %s' % (counter, li.get_text()))
                    counter += 1
            return '\n'.join(text_of_li)

        else:
            return tag.get_text()

    def breaktomaxlen(self, text, maxlen=MAX_LINE_LEN):
        """Break ``text`` into lines of at most ``maxlen`` characters.

        The text is first split into sentences at every ``.`` that does not
        follow a digit (so decimals such as ``3.14`` stay intact); the period
        itself is dropped. Sentences longer than ``maxlen`` are then split at
        the last space that fits, or cut hard at ``maxlen`` when they contain
        no usable space.

        Args:
            text: the text to split.
            maxlen: maximum number of characters per returned line.

        Returns:
            List of non-empty lines. Text after the last period is kept.

        Raises:
            ValueError: if ``maxlen`` is smaller than 1.
        """
        if maxlen < 1:
            raise ValueError('maxlen must be at least 1')

        # split text with '.'
        sentence_end = re.compile(r'[^\d](\.)')
        sentences = []
        start = 0
        for match in sentence_end.finditer(text):
            dot = match.start(1)
            sentence = text[start:dot].strip()
            if sentence:
                sentences.append(sentence)
            start = dot + 1
        # whatever follows the last period (or the whole text when there is
        # no period at all) is a sentence too
        tail = text[start:].strip()
        if tail:
            sentences.append(tail)

        # split every sentence again so that no line exceeds maxlen characters
        lines = []
        for sentence in sentences:
            while len(sentence) > maxlen:
                cut = sentence.rfind(' ', 0, maxlen + 1)
                if cut <= 0:
                    # no space to break on: cut in the middle of the word
                    lines.append(sentence[:maxlen])
                    sentence = sentence[maxlen:]
                else:
                    lines.append(sentence[:cut])
                    sentence = sentence[cut + 1:]
            if sentence:
                lines.append(sentence)
        return lines

    def create_parsed_text(self, article=None):
        """Write the cleaned text of the article to ``textparsed.txt`` in ``self.path``.

        - figure  -> alt of each img tag, formatted as "image description: " + alt
        - heading -> "heading N number K: " + heading text
        - ul / ol -> one line per li ("item: " + text, or "item K: " + text)

        Every written line is at most ``MAX_LINE_LEN`` characters long and
        blank lines are skipped.

        Args:
            article: tag holding the article content. Defaults to
                ``self.article`` as set by ``medium``.

        Raises:
            ValueError: if no article is available.
        """
        if article is None:
            article = self.article
        if article is None:
            raise ValueError('no article to parse: call medium(soup) first or pass article')

        # the first child is skipped, as in the original notebook
        # NOTE(review): presumably a header block of the page - confirm
        tags = list(article.children)[1:]

        # compile one time br pattern
        br_pattern = self.make_br_pattern()

        # for tree of heading and Hierarchy headings
        heading_hierarchy = {'h1': 0, 'h2': 0, 'h3': 0, 'h4': 0, 'h5': 0, 'h6': 0}
        with open(os.path.join(self.path, 'textparsed.txt'), 'w', encoding='utf-8') as file:
            for tag in tags:
                tag = self.replace_br(tag, br_pattern)
                text = self.get_text_value(tag, heading_hierarchy) or ''
                for piece in text.split('\n'):
                    piece = piece.strip()
                    if not piece:
                        continue
                    if len(piece) > self.MAX_LINE_LEN:
                        for chunk in self.breaktomaxlen(piece):
                            file.write(chunk + '\n')
                    else:
                        file.write(piece + '\n')

    def option_req(self):
        """Send an OPTIONS request to soundoftext.com.

        Returns:
            True when the service answers with status 200 or 204, False when
            it answers with another status or the request itself fails.
        """
        option_url = 'https://api.soundoftext.com/sounds'
        try:
            response = requests.options(option_url)
        except requests.RequestException:
            return False
        if response.status_code == 204 or response.status_code == 200:
            print('options request is OK(No Content)')
            return True
        return False

    def post_req(self, text):
        """Ask soundoftext.com to create the sound for one line of text.

        warning: send a single line only, i.e. no \\n character in ``text``.
        important: a query must not be longer than 200 characters.

        Args:
            text: the line to convert; lines of one character or less are rejected.

        Returns:
            ``(True, sound_id)`` on success, ``(False, '-1')`` otherwise.
        """
        if len(text) > 1:
            post_url = 'https://api.soundoftext.com/sounds'
            payload = {"engine": "Google", "data": {"text": text, "voice": "en-US"}}
            response = requests.post(post_url, json=payload)
            # convert response to json format
            try:
                r_json = response.json()
            except ValueError:
                # body was not valid JSON
                return False, '-1'
            if r_json.get('success') == True and 'id' in r_json:
                return True, r_json['id']
            return False, '-1'
        return False, '-1'

    def getloc_req(self, id):
        """Ask soundoftext.com for the location of the mp3 file of sound ``id``.

        Returns:
            ``(True, location)`` when the reported status is ``'Done'``,
            ``(False, 'Invalid URL')`` otherwise.
        """
        getloc_url = 'https://api.soundoftext.com/sounds/' + id
        response = requests.get(getloc_url)
        # convert response to json format
        try:
            r_json = response.json()
        except ValueError:
            return False, 'Invalid URL'
        if r_json.get('status') == 'Done' and 'location' in r_json:
            return True, r_json['location']
        return False, 'Invalid URL'

    def download_voice(self, mp3_url, filename):
        """Download the mp3 at ``mp3_url`` and save it as ``<filename>.mp3`` in ``self.path``.

        Returns:
            ``(True, 'file saved!')`` for status 200/204,
            ``(False, "files doesn't save!")`` otherwise.
        """
        response = requests.get(mp3_url)
        if response.status_code == 200 or response.status_code == 204:
            with open(os.path.join(self.path, filename + '.mp3'), 'wb') as f:
                f.write(response.content)
            # pause between downloads
            # NOTE(review): presumably to go easy on the service - confirm
            time.sleep(1)
            return True, 'file saved!'
        return False, 'files doesn\'t save!'

    def send_req(self, text, file_name):
        """4 in 1: run the four requests needed to get the mp3 file of ``text``.

        Args:
            text: single line of text to convert to speech.
            file_name: name (without extension) of the mp3 saved in ``self.path``.

        Returns:
            True when the mp3 file was saved.

        Raises:
            Exception: naming the first of the four steps that failed.
        """
        # option method
        if not self.option_req():
            raise Exception('Option request gonna error')

        # post method
        success, sound_id = self.post_req(text)
        if not success:
            raise Exception('post request gonna error')

        # first get method to get location of mp3 file
        status, mp3_url = self.getloc_req(sound_id)
        if not status:
            raise Exception('first get method gonna error')

        # last get method to download and save the mp3 file
        status, message = self.download_voice(mp3_url, file_name)
        if not status:
            raise Exception('last get method gonna error')
        return True

    def alt_text_req(self):
        """Request the sound of the image placeholder once and save it as ``0.mp3``."""
        if self.send_req(self.IMAGE_DES, '0'):
            print('image description is saved!')

    def copy_mp3(self, line_number):
        """Copy ``0.mp3`` (sound of the image placeholder) to ``<line_number>.mp3``.

        Raises:
            Exception: if the destination file cannot be written.
        """
        with open(os.path.join(self.path, '0.mp3'), 'rb') as file_source:
            zero = file_source.read()
        try:
            with open(os.path.join(self.path, str(line_number) + '.mp3'), 'wb') as file_destination:
                file_destination.write(zero)
        except OSError as err:
            raise Exception('Copy from 0.mp3 isn\'t done!') from err

    def download_all_sound(self):
        """Create one mp3 per line of ``textparsed.txt``.

        Files are named after the 1-based line number in the text file. Lines
        equal to the image placeholder reuse ``0.mp3`` instead of sending a new
        request, and lines too short for ``post_req`` (one character or less)
        are skipped.
        """
        # save the mp3 of the placeholder used for images without alt text
        self.alt_text_req()

        with open(os.path.join(self.path, 'textparsed.txt'), 'r', encoding='utf-8') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if len(line) <= 1:
                    continue
                if line == self.IMAGE_DES:
                    # save copy of 0.mp3 to linenumber.mp3
                    self.copy_mp3(str(line_number))
                    continue
                # send request line and save the file
                if self.send_req(line, str(line_number)):
                    print('sound of line {} is saved!'.format(line_number))

    def play_sound(self, line_number):
        """Play the mp3 sound that belongs to ``line_number``."""
        playsound(os.path.join(self.path, str(line_number) + '.mp3'))

    def remove_sound(self, line_number, format_file):
        """Delete the file ``<line_number><format_file>`` from ``self.path``.

        Args:
            line_number: file name without extension.
            format_file: extension including the dot, like ``.mp3``.
        """
        os.remove(os.path.join(self.path, str(line_number) + format_file))

    def remove_dir(self):
        """Delete the working directory ``self.path``.

        ``os.rmdir`` only removes empty directories, so the files inside must
        be removed first (see ``remove_sound``).
        """
        os.rmdir(self.path)

    def convertforspeed(self, line_number, speed):
        """Rewrite ``<line_number>.mp3`` so that it plays back at ``speed``.

        The mp3 is converted to wav, time-stretched by ``speed`` and converted
        back to mp3, overwriting the original file. The two temporary wav
        files are deleted afterwards.
        """
        mp3_path = os.path.join(self.path, str(line_number) + '.mp3')
        wav_path = os.path.join(self.path, "file.wav")
        stretched_path = os.path.join(self.path, "analyzed_filepathXspeed.wav")

        sound = AudioSegment.from_mp3(mp3_path)
        sound.export(wav_path, format="wav")

        y, sr = sf.read(wav_path)
        # Play back at extra desire speed
        y_stretch = pyrb.time_stretch(y, sr, speed)
        sf.write(stretched_path, y_stretch, sr, format='wav')

        # convert wav file created to mp3 file
        sound = AudioSegment.from_wav(stretched_path)
        sound.export(mp3_path, format="mp3")

        # delete the extra files file.wav and analyzed_filepathXspeed.wav
        self.remove_sound('file', '.wav')
        self.remove_sound('analyzed_filepathXspeed', '.wav')

# These lines need comments

In [5]:
# Sample article addresses. Neither is passed to GetWebPage in the cells shown here.
# url points at a medium.com article.
url = 'https://medium.com/ironsource-levelup/aso-optimization-in-practice-how-a-game-i-made-over-the-weekend-amassed-2-million-downloads-32a7d13b093d'
# url2 points at a virgool.io article.
url2 = 'https://virgool.io/@hasangilak/emigration-the-great-escape-l1vk2gyegns6'