# WEB SCRAPING 

## Importing the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup

import sys
sys.path.append('..')
from utils.file_utils import create_folder_if_not_exists
from utils.file_utils import remove_files_in_folder
from utils.file_utils import file_exists

## Scraping the data from the website

In [2]:
# Wikipedia language editions to scrape, identified by their subdomain code.
langs = [
    'fr',
    'en',
    'es',
    'de',
    'it',
]
# Host suffix and article path shared by every edition; the language
# subdomain is prepended when a URL is built.
base_url = 'wikipedia.org/wiki'

def get_wiki_page(lang : str, page : str, timeout : float = 10.0):
    """Download the raw HTML of a Wikipedia article.

    Args:
        lang: Language subdomain of the Wikipedia edition (e.g. ``'fr'``).
        page: Path segment appended after ``/wiki/``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout or connection failure).
    """
    url = f'https://{lang}.{base_url}/' + page
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as a bad status.
        return None
    if response.status_code != 200:
        return None
    return response.text
    
def get_wiki_pages_links_from_html(html : str):
    """Select the anchors of a page whose ``href`` contains ``/wiki/``.

    Args:
        html: HTML source of the page; may be ``None`` or empty.

    Returns:
        The result of the CSS selection, or ``None`` when ``html`` is
        ``None`` or empty.
    """
    if not html:
        return None
    return BeautifulSoup(html, 'html.parser').select('a[href*="/wiki/"]')
    
def get_wiki_pages_links(lang : str, page : str):
    """Download a page and return its ``/wiki/`` anchors.

    Returns whatever ``get_wiki_pages_links_from_html`` returns for the
    downloaded HTML, i.e. ``None`` when the download yielded nothing.
    """
    return get_wiki_pages_links_from_html(get_wiki_page(lang, page))

def get_wiki_page_title_from_html(html : str):
    """Extract the article title from a page's HTML.

    Args:
        html: HTML source of the page; may be ``None`` or empty.

    Returns:
        The text of the ``h1#firstHeading`` element, or ``None`` when
        ``html`` is empty or the page has no such heading.
    """
    if not html:
        return None
    heading = BeautifulSoup(html, 'html.parser').select_one('h1#firstHeading')
    # select_one yields None instead of raising when nothing matches.
    if heading is None:
        return None
    return heading.text
    
def get_wiki_page_title(lang : str, page : str):
    """Download a page and return its title.

    Returns whatever ``get_wiki_page_title_from_html`` returns for the
    downloaded HTML.
    """
    return get_wiki_page_title_from_html(get_wiki_page(lang, page))
    
def get_wiki_page_content_from_html(html : str):
    """Extract the text of the article body from a page's HTML.

    Args:
        html: HTML source of the page; may be ``None`` or empty.

    Returns:
        The text of the ``div#mw-content-text`` element, or ``None`` when
        ``html`` is empty or the page has no such element.
    """
    if not html:
        return None
    body = BeautifulSoup(html, 'html.parser').select_one('div#mw-content-text')
    # select_one yields None instead of raising when nothing matches.
    if body is None:
        return None
    return body.text
    
def get_wiki_page_content(lang : str, page : str):
    """Download a page and return the text of its article body.

    Returns whatever ``get_wiki_page_content_from_html`` returns for the
    downloaded HTML.
    """
    return get_wiki_page_content_from_html(get_wiki_page(lang, page))

# Quick manual check of the scraping helpers. An empty page name requests
# the bare `/wiki/` URL of the French edition.
lang = 'fr'
page = ''
content = get_wiki_page_content(lang, page)
# print(content)

In [5]:
# Default root folder for saved pages; one sub-folder per language is
# created below it when a page is saved.
saving_path = "../data/train/"

def save_wiki_page_content(lang : str, page : str, saving_path : str = saving_path):
    """Download a Wikipedia article and store its text as ``<title>.txt``.

    The file is written to ``{saving_path}/{lang}/``. An existing file is
    never overwritten.

    Args:
        lang: Language subdomain of the Wikipedia edition.
        page: Path segment of the article, appended after ``/wiki/``.
        saving_path: Root folder that holds the per-language sub-folders.

    Returns:
        ``True`` if a new file was written; ``False`` if the page or its
        title could not be obtained, the file already exists, or writing
        failed.
    """
    wiki_page = get_wiki_page(lang, page)
    content = get_wiki_page_content_from_html(wiki_page)
    if not content:
        # The title is not known on this path, so report the requested page.
        print(f'Error while getting {page}')
        return False

    title = get_wiki_page_title_from_html(wiki_page)
    if not title:
        print(f'Error while getting the title of {page}')
        return False

    # A '/' inside the title would otherwise be read as a sub-directory.
    file_name = title.replace('/', '_') + '.txt'
    folder = f'{saving_path}/{lang}'
    file_path = f'{folder}/{file_name}'

    create_folder_if_not_exists(folder)

    if file_exists(file_path):
        return False

    try:
        # Explicit UTF-8 keeps the accented text writable whatever the
        # platform's default encoding is.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError:
        print(f'Error while saving {file_name}')
        return False
    return True
    
    
# Manual check: save the page served for an empty page name on the French
# edition. The call stays last so the notebook shows its return value.
lang = 'fr'
page = ''
save_wiki_page_content(lang, page)

False

In [20]:
# Starting page of the crawl (path segment after `/wiki/`).
page = ''

# Destination folders for the two splits.
saving_path_test = '../data/test/'
saving_path_train = '../data/train/'

# Number of pages to save per language in each split.
max_files_per_lang_test = 5
max_files_per_lang_train = 10

# Pages saved so far, keyed by language code.
files_saved_per_lang_test = dict()
files_saved_per_lang_train = dict()

def clear_folder(lang : str, path : str):
    """Call ``remove_files_in_folder`` on the ``lang`` sub-folder of ``path``."""
    lang_folder = f'{path}/{lang}'
    remove_files_in_folder(lang_folder)
    
def clear_all_folders():
    """Clear the train and test folders of every language in ``langs``."""
    for lang in langs:
        # Train first, then test, for each language in turn.
        for split_path in (saving_path_train, saving_path_test):
            clear_folder(lang, split_path)
        
def save_pages_from_links(lang : str, links : list):
    """Save every page that ``links`` points to through a ``/wiki/`` path.

    Args:
        lang: Language subdomain of the Wikipedia edition.
        links: Anchor elements exposing their ``href`` through ``.get``;
            ``None`` or an empty list means there is nothing to do.

    Only hrefs that start with ``/wiki/`` are followed; the prefix is
    stripped and the remainder is handed to ``save_wiki_pages_content``.
    """
    prefix = '/wiki/'
    # `links` is None when the page could not be downloaded.
    for link in links or []:
        if not link:
            continue
        # .get tolerates an anchor without an href attribute.
        href = link.get('href')
        if href and href.startswith(prefix):
            save_wiki_pages_content(lang, href[len(prefix):])

def save_wiki_pages_content(lang : str, page : str):
    """Save ``page`` and recursively the pages it links to.

    The train split is filled first, up to ``max_files_per_lang_train``
    pages for ``lang``, then the test split, up to
    ``max_files_per_lang_test``. Once both quotas are reached the call does
    nothing. Links are followed only from pages that were actually saved.

    Args:
        lang: Language subdomain of the Wikipedia edition.
        page: Path segment of the article, appended after ``/wiki/``.
    """
    # Counters are created the first time a language is seen.
    files_saved_per_lang_train.setdefault(lang, 0)
    files_saved_per_lang_test.setdefault(lang, 0)

    # Choose the split that still has room, train before test.
    if files_saved_per_lang_train[lang] < max_files_per_lang_train:
        counters = files_saved_per_lang_train
        target_path = saving_path_train
        split = 'train'
    elif files_saved_per_lang_test[lang] < max_files_per_lang_test:
        counters = files_saved_per_lang_test
        target_path = saving_path_test
        split = 'test'
    else:
        return

    if not save_wiki_page_content(lang, page, target_path):
        return

    counters[lang] += 1
    print(f'{lang} : {counters[lang]} files saved in {split}')

    # The page is downloaded again here to collect its links; that second
    # download can fail and yield None, in which case there is nothing to
    # follow.
    links = get_wiki_pages_links(lang, page)
    if links:
        save_pages_from_links(lang, links)
          
# Clear the train/test folders of every language before crawling:
# save_wiki_page_content returns False for a file that already exists, so
# files already on disk would not be counted.
clear_all_folders()
# Crawl each language edition in turn, starting from the page held in `page`.
for lang in langs:
    save_wiki_pages_content(lang, page)

fr : 1 files saved in train
fr : 2 files saved in train
fr : 3 files saved in train
fr : 4 files saved in train
fr : 5 files saved in train
fr : 6 files saved in train
fr : 7 files saved in train
fr : 8 files saved in train
fr : 9 files saved in train
fr : 10 files saved in train
fr : 1 files saved in test
fr : 2 files saved in test
fr : 3 files saved in test
fr : 4 files saved in test
fr : 5 files saved in test
en : 1 files saved in train
en : 2 files saved in train
en : 3 files saved in train
en : 4 files saved in train
en : 5 files saved in train
en : 6 files saved in train
en : 7 files saved in train
en : 8 files saved in train
en : 9 files saved in train
en : 10 files saved in train
en : 1 files saved in test
en : 2 files saved in test
en : 3 files saved in test
en : 4 files saved in test
en : 5 files saved in test
es : 1 files saved in train
es : 2 files saved in train
es : 3 files saved in train
es : 4 files saved in train
es : 5 files saved in train
es : 6 files saved in train
