In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import ast
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import textwrap
import time

In [None]:

# Scrape the case list page and write one CSV row per case.
# The browser is started outside the try block so that the finally clause only
# runs (and only calls quit) when a driver actually exists.
driver = webdriver.Firefox()

try:
    # Go to the website
    driver.get('https://www.ilo.org/dyn/normlex/en/f?p=1000:20061::FIND:NO:::')

    # Collect every table row; rows without the expected cell are skipped below
    rows = driver.find_elements(By.TAG_NAME, 'tr')

    # Column order of the CSV; also the key order of each case dictionary
    columns = ["case_and_country", "complaint_date", "organization", "short_description", "additional_texts", "href"]
    date_separator = " - Complaint date: "
    cases_written = 0

    # Open CSV writer
    with open('CFA_all_cases_final.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write header row
        writer.writerow(columns)

        # Case currently being assembled; empty until the first 'firstLevel' item is seen
        current_case = {}
        for row in rows:
            soup = BeautifulSoup(row.get_attribute('innerHTML'), 'html.parser')
            td = soup.find('td', class_="apex_report_break")
            if td is None:
                continue

            # Find all li tags under td, not just immediate children
            for li_tag in td.find_all('li'):
                a_tag = li_tag.find('a')
                href = a_tag.get('href') if a_tag else None

                if 'firstLevel' in li_tag.get('class', []):
                    # A new case starts: flush the previous one first
                    if current_case:
                        writer.writerow([current_case[column] for column in columns])
                        cases_written += 1

                    a_text = a_tag.text if a_tag else None
                    if a_text and date_separator in a_text:
                        # partition splits on the first separator only, so a
                        # repeated separator cannot break the unpacking
                        case_and_country, _, complaint_date = a_text.partition(date_separator)
                    else:
                        case_and_country, complaint_date = a_text, None

                    # NOTE(review): organization and short description are looked up
                    # in the whole cell, not in this <li>; if one cell can hold several
                    # cases they would all share the same values - confirm against the page.
                    em_tag = td.find('em')
                    organization = em_tag.text if em_tag else None

                    # The original code used td.find_all('span', class_='secondLine')
                    strong_tags = td.find_all('strong')
                    short_description = strong_tags[1].text.strip() if len(strong_tags) > 1 else None

                    current_case = {
                        "case_and_country": case_and_country,
                        "complaint_date": complaint_date,
                        "organization": organization,
                        "short_description": short_description,
                        "additional_texts": [],
                        "href": href,
                    }
                elif a_tag and current_case:
                    # Associated document of the current case. Items that appear
                    # before any case has started are ignored instead of raising
                    # a KeyError on the empty dictionary.
                    current_case["additional_texts"].append({"text": a_tag.text, "href": href})

        # Write the details of the last case
        if current_case:
            writer.writerow([current_case[column] for column in columns])
            cases_written += 1

    print(f"Wrote {cases_written} cases to CFA_all_cases_final.csv")
finally:
    # Always close the browser, even when scraping or writing failed
    driver.quit()


In [None]:
def extract_hrefs(additional_texts):
    """
    Extract the document hrefs from one 'additional_texts' value.

    Parameters
    ----------
    additional_texts : str, list, tuple or missing value
        Either the string form of a list of dicts that each carry an 'href'
        key (as stored in the CSV), or such a list itself. Missing values
        (None, NaN) and any other type are treated as "no documents".

    Returns
    -------
    str or None
        The string form of the list of hrefs, e.g. "['a', 'b']", or None when
        the value is missing, empty, cannot be parsed, or an entry lacks 'href'.
    """
    if isinstance(additional_texts, (list, tuple)):
        # Already parsed (e.g. passed straight from memory rather than the CSV)
        texts_list = additional_texts
    elif isinstance(additional_texts, str):
        try:
            texts_list = ast.literal_eval(additional_texts)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal
            return None
    else:
        # None, NaN, numbers, ...
        return None

    try:
        hrefs = [text_dict['href'] for text_dict in texts_list]
    except (KeyError, TypeError, IndexError):
        # Parsed value is not a list of dicts with an 'href' key
        return None

    # An empty result is reported the same way as the literal '[]'
    return str(hrefs) if hrefs else None

In [None]:
# Load the case list produced by the first scraping cell. That cell writes
# 'CFA_all_cases_final.csv' to the working directory, so the same relative
# path is read here instead of a machine-specific absolute path.
docs = pd.read_csv("CFA_all_cases_final.csv")
# Add an 'hrefs' column holding the string form of each case's document links
# (None when a case has no associated documents).
docs['hrefs'] = docs['additional_texts'].apply(extract_hrefs)

In [None]:
# Corrected initial scraping script
#
# For every case, download each associated document, split its numbered
# paragraphs into pieces of at most 500 characters and write one CSV row per
# piece. Documents that cannot be fetched or parsed still get a row (with an
# empty text and an explanatory log) so that nothing disappears silently.

# Reuse the DataFrame built from the case-list CSV (its 'hrefs' column is required)
df = docs

# Document hrefs are appended to this prefix
base_url = 'https://www.ilo.org/dyn/normlex/en/'

# Retry policy for gateway errors (502/503/504), with a back-off between attempts
retries = Retry(total=3, backoff_factor=5, status_forcelist=[502, 503, 504])

# Both the HTTP session and the output file are context-managed so they are
# released even if the loop fails part-way through.
with requests.Session() as session, \
        open('scraped_texts_ALL_cleaned.csv', 'w', newline='', encoding='utf-8') as f:
    # Mount the retry policy for both http and https usage
    session.mount("http://", HTTPAdapter(max_retries=retries))
    session.mount("https://", HTTPAdapter(max_retries=retries))

    writer = csv.writer(f, quoting=csv.QUOTE_ALL)  # Use QUOTE_ALL to ensure all fields are quoted
    # Write the header
    # NOTE(review): 'document_nr' and 'document_name' receive the complaint date
    # and the organization respectively; header names are kept as they were -
    # confirm what downstream code expects before renaming.
    writer.writerow(['case', 'document_nr', 'document_name', 'document_href', 'text', 'log'])

    # Loop over each row in the DataFrame
    for idx, row in df.iterrows():
        print(f"Processing row {idx + 1} out of {len(df)}")

        case = row['case_and_country']
        document_nr = row['complaint_date']
        document_name = row['organization']
        hrefs = row['hrefs']

        if pd.isna(hrefs):
            print("Skipping row with missing 'document_href'.")
            continue

        # 'hrefs' is the string form of a list; scrape every document in it
        for href in ast.literal_eval(hrefs):
            if not isinstance(href, str) or not href:
                # A link without an href cannot be turned into a URL
                writer.writerow([case, document_nr, document_name, href, '', "Skipping document without a usable href"])
                continue

            link = base_url + href

            try:
                response = session.get(link, timeout=10)  # Add a timeout
            except requests.exceptions.RequestException as e:
                print(f"Error occurred: {e}")
                # Record the failure so the document can be found and re-scraped later
                writer.writerow([case, document_nr, document_name, href, '', f"Error occurred: {e}"])
                time.sleep(10)  # Pause before moving on to the next document
                continue

            log = f"Scraping {link}, response status: {response.status_code}"  # create log entry

            page_soup = BeautifulSoup(response.text, 'html.parser')
            text_box = page_soup.find('div', class_='textBoxConvention large')

            if text_box is None:
                log = f"Could not find a 'div' with class 'textBoxConvention large' on page {link}"  # update log entry
                writer.writerow([case, document_nr, document_name, href, '', log])
                continue

            rows_written = 0
            for li in text_box.find_all('li'):
                number_tag = li.find('strong', class_='number')
                if number_tag is None:
                    # Only numbered paragraphs are kept
                    continue

                number = number_tag.get_text(strip=True)
                # NOTE(review): get_text(strip=True) joins the stripped text
                # fragments without a separator, so words split across inline
                # tags may end up glued together - check a sample of the output.
                text = li.get_text(strip=True).replace(number, '', 1)

                # Write each sub-chunk as a separate row; the href column holds
                # the single document being scraped, not the case's whole list
                for sub_chunk in textwrap.wrap(number + text, 500):
                    writer.writerow([case, document_nr, document_name, href, sub_chunk, log])
                    rows_written += 1

            if rows_written == 0:
                # The text box exists but yielded no numbered paragraph
                writer.writerow([case, document_nr, document_name, href, '', f"No numbered paragraphs found on page {link}"])

