# WEB SCRAPING 

## Importing the required libraries

In [22]:
import requests
from bs4 import BeautifulSoup

import sys
sys.path.append('..')
from utils.file_utils import create_folder_if_not_exists
from utils.file_utils import remove_files_in_folder
from utils.file_utils import file_exists

## Scraping page content from Wikipedia

In [23]:
# Language codes to scrape; each one is used as the sub-domain of the request
# URL and as the name of the per-language output folder.
langs = ['fr', 'en', 'es', 'de', 'it']
# Host suffix and path that follow the language sub-domain in every URL.
base_url = 'wikipedia.org/wiki'

def get_wiki_page(lang : str, page : str, timeout : float = 10):
    """Download the raw HTML of a Wikipedia article.

    Args:
        lang: Language sub-domain of the wiki (e.g. ``'fr'``).
        page: Page name appended after ``/wiki/`` in the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout or connection error).
    """
    url = f'https://{lang}.{base_url}/' + page
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as an unavailable page so a
        # single bad link does not abort the whole crawl.
        return None
    if response.status_code == 200:
        return response.text
    return None
    
def get_wiki_pages_links_from_html(html : str):
    """Select every ``<a>`` element of *html* whose ``href`` contains ``/wiki/``.

    Returns the result of the CSS selection, or ``None`` when *html* is empty
    or ``None``.
    """
    if not html:
        return None
    parsed = BeautifulSoup(html, 'html.parser')
    return parsed.select('a[href*="/wiki/"]')
    
def get_wiki_pages_links(lang : str, page : str):
    """Fetch a page and return its ``/wiki/`` anchors.

    The result is ``None`` when the page could not be fetched.
    """
    return get_wiki_pages_links_from_html(get_wiki_page(lang, page))

def get_wiki_page_title_from_html(html : str):
    """Extract the article title from a page's HTML.

    Args:
        html: Page markup, or ``None``/empty when the page is unavailable.

    Returns:
        The text of the ``h1#firstHeading`` element, or ``None`` when *html*
        is empty or the document has no such heading.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # select_one yields None instead of raising IndexError when the heading
    # is absent from the document.
    heading = soup.select_one('h1#firstHeading')
    if heading is None:
        return None
    return heading.text
    
def get_wiki_page_title(lang : str, page : str):
    """Fetch a page and return its title (``None`` if the fetch failed)."""
    return get_wiki_page_title_from_html(get_wiki_page(lang, page))
    
def get_wiki_page_content_from_html(html : str):
    """Extract the article body text from a page's HTML.

    Args:
        html: Page markup, or ``None``/empty when the page is unavailable.

    Returns:
        The text of the ``div#mw-content-text`` element, or ``None`` when
        *html* is empty or the document has no such element.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    # select_one yields None instead of raising IndexError when the content
    # container is absent from the document.
    body = soup.select_one('div#mw-content-text')
    if body is None:
        return None
    return body.text
    
def get_wiki_page_content(lang : str, page : str):
    """Fetch a page and return its body text (``None`` if the fetch failed)."""
    return get_wiki_page_content_from_html(get_wiki_page(lang, page))

# Manual smoke test: fetch the body text for an empty page name on the 'fr'
# sub-domain, i.e. the bare ".../wiki/" URL.
page = ''
lang = 'fr'
content = get_wiki_page_content(lang, page)
# Uncomment to inspect the extracted text.
#print(content)

In [40]:
# Root folder for the scraped text files; files are written to a per-language
# sub-folder built as f'{saving_path}/{lang}'.
saving_path = '../data/train/'

def save_wiki_page_content(lang : str, page : str):
    """Fetch one page and write its body text to ``<saving_path>/<lang>/<title>.txt``.

    Args:
        lang: Language sub-domain of the wiki; also the output sub-folder.
        page: Page name appended after ``/wiki/`` in the URL.

    Returns:
        ``True`` when a new file was written. ``False`` when the page or its
        title could not be retrieved, when the target file already exists, or
        when writing the file failed.
    """
    wiki_page = get_wiki_page(lang, page)
    content = get_wiki_page_content_from_html(wiki_page)
    if not content:
        # The title has not been extracted yet on this path, so report the
        # requested page name instead of an unbound ``title``.
        print(f'Error while getting {page}')
        return False

    title = get_wiki_page_title_from_html(wiki_page)
    if not title:
        print(f'Error while getting the title of {page}')
        return False

    # The title comes from remote HTML: neutralise path separators so that a
    # title such as "AC/DC" cannot point outside the language folder.
    file_name = title.replace('/', '_').replace('\\', '_') + '.txt'
    folder = f'{saving_path}/{lang}'
    file_path = f'{folder}/{file_name}'

    create_folder_if_not_exists(folder)

    if file_exists(file_path):
        return False

    try:
        # Explicit UTF-8 keeps the output independent of the platform's
        # default encoding, which may not cover the article's characters.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except (OSError, UnicodeError):
        print(f'Error while saving {file_name}')
        return False
    return True
    
    
# Manual check: try to save the page reached with an empty page name on the
# 'fr' sub-domain; the cell displays the boolean result.
page = ''
lang = 'fr'
save_wiki_page_content(lang, page)

False

In [42]:
# Maximum number of files saved for each language.
max_files_per_lang = 10
# Running count of files saved so far, keyed by language code.
files_saved_per_lang = {}
# Page name the crawl starts from for every language (empty page name).
page = ''

def clear_folder(lang : str):
    """Pass the output folder of *lang* to ``remove_files_in_folder``."""
    lang_folder = f'{saving_path}/{lang}'
    remove_files_in_folder(lang_folder)
    
def clear_all_folders():
    """Run ``clear_folder`` once for every entry of ``langs``."""
    for language in langs:
        clear_folder(language)
        
def save_pages_from_links(lang : str, links : list):
    """Save every page referenced by a ``/wiki/`` link.

    Args:
        lang: Language sub-domain the links belong to.
        links: Anchor elements (anything exposing ``.get('href')``), or
            ``None`` when the page they come from could not be fetched.

    Only links whose ``href`` starts with ``/wiki/`` are followed; the text
    after that prefix is passed to ``save_wiki_pages_content`` as the page
    name.
    """
    prefix = '/wiki/'
    # ``links`` is None when the source page could not be downloaded; treat
    # that as "no links" instead of raising TypeError.
    for link in links or ():
        if not link:
            continue
        # .get() tolerates an element without an href attribute.
        href = link.get('href')
        if href and href.startswith(prefix):
            save_wiki_pages_content(lang, href[len(prefix):])

def save_wiki_pages_content(lang : str, page : str):
    """Save *page*, then recurse into the pages it links to.

    Nothing happens once ``files_saved_per_lang[lang]`` has reached
    ``max_files_per_lang``. Otherwise the page is saved; if that succeeds the
    counter for *lang* is incremented, progress is printed, and the links of
    the page are fetched and handed to ``save_pages_from_links``.
    """
    # Make sure the language has a counter before it is read.
    files_saved_per_lang.setdefault(lang, 0)

    # Quota already reached for this language.
    if files_saved_per_lang[lang] >= max_files_per_lang:
        return

    # Page was not saved (fetch failed, file already present or write error).
    if not save_wiki_page_content(lang, page):
        return

    files_saved_per_lang[lang] += 1
    print(f'{files_saved_per_lang[lang]} files saved for {lang}')

    # Follow the links found on the page that was just saved.
    save_pages_from_links(lang, get_wiki_pages_links(lang, page))
          
# Clear every language folder, then run the crawl for each language starting
# from `page`.
clear_all_folders()
for lang in langs:
    save_wiki_pages_content(lang, page)

1 files saved for fr
2 files saved for fr
3 files saved for fr
4 files saved for fr
5 files saved for fr
6 files saved for fr
7 files saved for fr
8 files saved for fr
9 files saved for fr
10 files saved for fr
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 files
fr already has 10 fil