# WEB SCRAPING 

## Importing the required libraries

In [3]:
import requests
from bs4 import BeautifulSoup

import sys
sys.path.append('..')
from utils.file_utils import create_folder_if_not_exists
from utils.file_utils import remove_files_in_folder
from utils.file_utils import file_exists

## Scraping the data from the website

In [4]:
# Language editions to crawl, named by their Wikipedia subdomain code.
langs = [
    'fr',
    'en',
    'es',
    'de',
    'it',
]
# Host suffix and article path shared by every language edition.
base_url = 'wikipedia.org/wiki'

def get_wiki_page(lang: str, page: str, timeout: float = 10.0):
    """Download the raw HTML of one wiki page.

    Args:
        lang: Language subdomain code placed before ``base_url`` (e.g. 'fr').
        page: Path appended after the trailing '/' of the wiki URL; an empty
            string requests the bare '/wiki/' URL.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``. ``None`` is also returned when the request itself fails
        (connection error, timeout, invalid URL, ...).
    """
    url = f'https://{lang}.{base_url}/' + page
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Failure is already reported as None for non-200 answers; treat
        # transport errors the same way instead of aborting the caller.
        return None
    if response.status_code == 200:
        return response.text
    return None
    
def get_wiki_pages_links_from_html(html: str):
    """Collect the anchors of a page whose href contains '/wiki/'.

    Args:
        html: Page markup; a falsy value (``None`` or '') means "no page".

    Returns:
        The elements matched by the CSS selector ``a[href*="/wiki/"]``, or
        ``None`` when ``html`` is falsy.
    """
    if not html:
        return None
    return BeautifulSoup(html, 'html.parser').select('a[href*="/wiki/"]')
    
def get_wiki_pages_links(lang: str, page: str):
    """Fetch a wiki page and return its '/wiki/' anchors.

    Delegates to ``get_wiki_page`` for the download and to
    ``get_wiki_pages_links_from_html`` for the extraction, so the result is
    whatever the latter returns for the downloaded markup.
    """
    return get_wiki_pages_links_from_html(get_wiki_page(lang, page))

def get_wiki_page_title_from_html(html: str):
    """Extract the article title from a wiki page's markup.

    Args:
        html: Page markup; a falsy value (``None`` or '') means "no page".

    Returns:
        The text of the ``h1#firstHeading`` element, or ``None`` when
        ``html`` is falsy or the page has no such heading.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # select_one yields None for a missing element, where indexing the
    # result of select() with [0] would raise IndexError.
    heading = soup.select_one('h1#firstHeading')
    if heading is None:
        return None
    return heading.text
    
def get_wiki_page_title(lang: str, page: str):
    """Fetch a wiki page and return its title.

    Delegates to ``get_wiki_page`` for the download and to
    ``get_wiki_page_title_from_html`` for the extraction.
    """
    return get_wiki_page_title_from_html(get_wiki_page(lang, page))
    
def get_wiki_page_content_from_html(html: str):
    """Extract the article body text from a wiki page's markup.

    Args:
        html: Page markup; a falsy value (``None`` or '') means "no page".

    Returns:
        The text of the ``div#mw-content-text`` element, or ``None`` when
        ``html`` is falsy or the page has no such element.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # select_one yields None for a missing element, where indexing the
    # result of select() with [0] would raise IndexError.
    body = soup.select_one('div#mw-content-text')
    if body is None:
        return None
    return body.text
    
def get_wiki_page_content(lang: str, page: str):
    """Fetch a wiki page and return its body text.

    Delegates to ``get_wiki_page`` for the download and to
    ``get_wiki_page_content_from_html`` for the extraction.
    """
    return get_wiki_page_content_from_html(get_wiki_page(lang, page))

# Quick manual check: pull the text of the French wiki's bare '/wiki/' URL.
lang, page = 'fr', ''
content = get_wiki_page_content(lang=lang, page=page)
# Print `content` here to inspect the extracted text.

In [5]:
# Root folder of the scraped corpus; one sub-folder per language is used.
saving_path = '../data/train/'

def save_wiki_page_content(lang: str, page: str):
    """Download a wiki page and store its body text as ``<title>.txt``.

    The file is written to ``{saving_path}/{lang}/``. An already existing
    file is left untouched.

    Args:
        lang: Language subdomain code; also the name of the output sub-folder.
        page: Page path passed on to ``get_wiki_page``.

    Returns:
        ``True`` when a new file was written. ``False`` when no content or
        no title could be extracted, or when ``file_exists`` reports that
        the target file is already there.
    """
    wiki_page = get_wiki_page(lang, page)
    content = get_wiki_page_content_from_html(wiki_page)
    if not content:
        return False

    title = get_wiki_page_title_from_html(wiki_page)
    if not title:
        # Without a title there is no meaningful file name to write to.
        return False

    # A path separator inside the title (e.g. "AC/DC") would be read as a
    # directory boundary and make open() fail, so neutralise it.
    file_name = title.replace('/', '_').replace('\\', '_')
    folder = f'{saving_path}/{lang}'
    file_path = f'{folder}/{file_name}.txt'

    create_folder_if_not_exists(folder)

    if file_exists(file_path):
        return False

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent the characters found in the articles.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)
    return True
    
# Try the saver once on the French wiki's bare '/wiki/' URL.
lang, page = 'fr', ''
save_wiki_page_content(lang=lang, page=page)

True

In [15]:
# Starting page of the crawl ('' is the bare '/wiki/' URL) and the deepest
# recursion level that is still expanded.
page = ''
max_depth = 1

def save_wiki_pages_content_recursively(lang: str, page: str, depth: int):
    """Save every article linked from ``page``, then recurse into each one.

    Only hrefs that start with '/wiki/' and whose remainder contains no ':'
    are followed. Recursion stops once ``depth`` exceeds the module-level
    ``max_depth``.

    Args:
        lang: Language subdomain code of the wiki being crawled.
        page: Page whose links are followed.
        depth: Current recursion level; callers start at 0.

    Returns:
        None. The work is done through ``save_wiki_page_content``.
    """
    if depth > max_depth:
        return

    links = get_wiki_pages_links(lang, page)
    if not links:
        # None means the page could not be fetched; iterating over it would
        # raise TypeError and abort the whole crawl.
        return

    prefix = '/wiki/'
    # A page usually links to the same article several times; handling each
    # target once per page avoids repeating identical downloads.
    seen = set()
    for link in links:
        href = link['href']
        if not href.startswith(prefix):
            continue
        target = href[len(prefix):]
        if ':' in target or target in seen:
            continue
        seen.add(target)
        save_wiki_page_content(lang, target)
        save_wiki_pages_content_recursively(lang, target, depth + 1)
          
# To start from a clean French folder, first call
# remove_files_in_folder(f'{saving_path}/fr').
for lang in langs:
    # Crawl each language edition from the start page at depth 0.
    save_wiki_pages_content_recursively(lang=lang, page=page, depth=0)

True