# **Web Scraping**

In [22]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import pandas as pd
import glob
import os

In [23]:
# Landing page of the browse listing for the letter "a".
url = "https://www.webonary.org/moore/browse/browse-vernacular-english/?key=mos&letter=a&lang=en"
# A timeout keeps the request from hanging forever, and an HTTP error status
# is raised instead of being parsed as if it were a results page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_code = response.content

source = BeautifulSoup(html_code, "html.parser")

# The fourth space-separated token of the pagination element's text is read
# as the total page count (presumably text shaped like "Page 1 of N" --
# confirm against the site).
page_info_element = source.find("li", class_="page_info")
if page_info_element:
    pages = int(page_info_element.text.split(" ")[3])
else:
    # No pagination element: treat the listing as a single page.
    print("Pas de pagination.")
    pages = 1

# Page URLs to scrape; assigned from extract_url(pages) further down.
urls = []

def extract_url(pages) -> list[str]:
    """Build the browse-page URLs for page numbers 1 through *pages*.

    Args:
        pages: Total number of result pages. A value below 1 yields an
            empty list.

    Returns:
        A new list holding one URL per page number, in ascending order.
    """
    base = "https://www.webonary.org/moore/browse/browse-vernacular-english/?key=mos&letter=a&lang=en"
    # A fresh list is built on every call; appending to a shared module-level
    # list would make repeated calls accumulate duplicate URLs.
    return [f"{base}&pagenr={page}" for page in range(1, pages + 1)]

# Build the list of paginated URLs for the page count found above.
urls = extract_url(pages)

In [24]:
def extract_data(article: Tag) -> dict:
    """Pull the Mooré (``mos``) and French (``fr``) texts out of one entry.

    Args:
        article: Element searched for ``span.example`` and
            ``span.translation`` descendants.

    Returns:
        ``{"mos": ..., "fr": ...}``. ``mos`` is the text of the first
        ``lang="mos"`` span inside the first ``example`` span; ``fr`` is the
        text of the first ``lang="fr"`` span inside the first ``translation``
        span. A value is ``None`` when either the outer or the inner span is
        not found.
    """

    def _nested_text(span_class, lang_code):
        # Text of the first <span lang=...> nested in the first
        # <span class=...>, or None when either level is missing.
        container = article.find("span", class_=span_class)
        if not container:
            return None
        inner = container.find("span", lang=lang_code)
        if not inner:
            return None
        return inner.text

    french = _nested_text("translation", "fr")
    moore = _nested_text("example", "mos")
    return {"mos": moore, "fr": french}

In [25]:
def data():
    """Scrape every page in the module-level ``urls`` and collect the records.

    Returns:
        A list with one ``extract_data`` result per ``<span class="post">``
        element, in the order the pages and elements are visited.

    Raises:
        requests.RequestException: If a page cannot be fetched (timeout,
            connection failure, or HTTP error status).
    """
    all_data = []
    for page_url in urls:
        # Bound the wait and fail loudly on an HTTP error rather than
        # silently parsing an error page that contains no entries.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        page = BeautifulSoup(response.content, "html.parser")
        posts = page.find_all("span", class_="post")
        all_data.extend(extract_data(article) for article in posts)
    return all_data

In [26]:
# NOTE: this rebinds the name `data` from the function to the list it returns,
# so executing this statement a second time fails until data() is redefined.
data = data()

In [27]:
# Save the scraped records to a CSV file (index=False leaves out the row index).
df = pd.DataFrame.from_dict(data)
df.to_csv("data_a.csv", index=False)

In [None]:
# Preview the first rows of the scraped DataFrame.
df.head()

# Collecte et nettoyage

In [8]:
# Path of the directory containing the CSV files to merge
chemin_repertoire = "/content/"  # Replace with the actual path

# List every CSV file in the directory in a stable (sorted) order. The merged
# output "data_final.csv" is skipped in case it lives in this same directory,
# so that a re-run does not feed previously merged rows back in as duplicates.
tous_les_fichiers = sorted(
    chemin
    for chemin in glob.glob(os.path.join(chemin_repertoire, "*.csv"))
    if os.path.basename(chemin) != "data_final.csv"
)

# Holds one DataFrame per input file
liste_df = []

# Read each CSV file into its own DataFrame
for fichier in tous_les_fichiers:
    df = pd.read_csv(fichier)
    liste_df.append(df)


In [18]:
# Merge all per-file DataFrames into one, discard every row that has a
# missing value, then renumber the index from 0.
data_final = (
    pd.concat(liste_df, ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

In [20]:
# Write the merged, cleaned dataset to disk without the row index.
data_final.to_csv("data_final.csv", index=False)

In [21]:
# Export to JSON
import json

df = pd.read_csv("data_final.csv")

# One {"translation": {"fr": ..., "mos": ...}} record per CSV row.
data_json = [
    {"translation": {"fr": row["fr"], "mos": row["mos"]}}
    for _, row in df.iterrows()
]

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open("datas.json", "w", encoding="utf-8") as json_file:
    json.dump(data_json, json_file, ensure_ascii=False, indent=3)