In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import time
import re
import os
from bs4 import BeautifulSoup, Tag
import pandas as pd
import sqlite3
import requests
import json
from joblib import Parallel, delayed
import multiprocessing

In [None]:
# Read the catalogue of e-qanun links, then keep only the rows whose 'typeName'
# equals the presidential-orders category.
links = pd.read_parquet(r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\all_links.parquet")
is_target_type = links['typeName'] == 'AZƏRBAYCAN RESPUBLİKASI PREZİDENTİNİN SƏRƏNCAMLARI'
filtered_links = links.loc[is_target_type]
filtered_links

## Loop to Process All Filtered Links

In [None]:
# Bulk scrape: for every row of `filtered_links` (the 'id' column is the last URL segment)
# render the page with Selenium, save the HTML, then extract the body paragraphs to parquet.
base_url = 'https://e-qanun.az/framework/'

# Output locations do not depend on the row, so they are resolved once, before the loop.
base_path = "E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/e-qanun HTMLs/"
sub_directory = "AZƏRBAYCAN RESPUBLİKASI PREZİDENTİNİN SƏRƏNCAMLARI/"
output_dir = os.path.join(base_path, sub_directory)
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') below fails if the folder is missing

MIN_TEXT_LENGTH = 33  # paragraphs shorter than this many characters are dropped


def clean_text(text):
    """Return `text` with non-breaking spaces replaced and whitespace runs collapsed to one space."""
    text = text.replace(u'\xa0', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def _is_section_div(tag):
    """True for a <div> that has a class starting with 'SectionN' or 'WordSectionN'."""
    # Each class is tested on its own: matching the space-joined class string would only
    # recognise the section class when it happens to be listed first.
    return tag.name == 'div' and any(
        re.match(r'(WordSection|Section)\d+', css_class) for css_class in tag.get('class', [])
    )


def _is_body_paragraph(tag):
    """True for a <p> with a body-text class that is outside tables and the double-gray-bordered div."""
    if tag.name != 'p':
        return False
    classes = tag.get('class', [])
    if not any(name in classes for name in ('MsoNormal', 'Mecelle', '21')):
        return False
    # Any <table> ancestor excludes the paragraph, which also covers the
    # MsoTableTheme / MsoTableGrid tables.
    if tag.find_parent('table'):
        return False
    return not tag.find_parent('div', style=lambda s: s and 'border-bottom:double gray' in s)


for idx, row in filtered_links.iterrows():
    # The id is both the last URL segment and the stem of the output files
    file_name = row['id']
    url = f"{base_url}{file_name}"
    print(f"Processing URL: {url}")

    html_file_path = os.path.join(output_dir, f"{file_name}.html")
    parquet_file_path = os.path.join(output_dir, f"{file_name}.parquet")

    # Skip processing if both the HTML and parquet files already exist
    if os.path.exists(html_file_path) and os.path.exists(parquet_file_path):
        print(f"Skipping {file_name}: Both HTML and parquet files already exist.")
        continue

    # Render the page in a fresh headless Chrome; `finally` guarantees the browser is
    # closed even when navigation or reading the page source raises.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        try:
            # Wait (up to 30 s) for any element whose class attribute contains 'Section'
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, "//*[contains(@class, 'Section') or contains(@class, 'WordSection')]"))
            )
        except TimeoutException:
            print("Loading took too much time! Exiting...")
            continue  # Skip to the next row if loading failed
        print("Page content with 'SectionX' or 'WordSectionX' loaded successfully")
        html_content = driver.page_source
    finally:
        driver.quit()

    # Keep a copy of the fully rendered page next to the parquet output
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
    print(f"HTML saved to: {html_file_path}")

    # Parse the rendered page (the same string that was just written to disk)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Document name ('Aktın Adı') comes from <title>; a page without one gets an empty name
    document_name = soup.title.get_text(strip=True) if soup.title else ''

    word_sections = soup.find_all(_is_section_div)

    # If no matching sections are found, skip this page
    if not word_sections:
        print(f"No 'WordSectionX' or 'SectionX' found for {file_name}. Skipping.")
        continue

    paragraphs = [
        paragraph
        for word_section in word_sections
        for paragraph in word_section.find_all(_is_body_paragraph)
    ]

    # get_text() without strip=True keeps the whitespace between inline elements;
    # strip=True joins the stripped fragments with no separator and glues words together.
    # clean_text then normalises the whitespace; short and empty paragraphs are dropped.
    cleaned = (clean_text(paragraph.get_text()) for paragraph in paragraphs)
    sections = [text for text in cleaned if len(text) >= MIN_TEXT_LENGTH]

    # One row per paragraph; document-level values are repeated on every row
    data = {
        'Aktın Adı': [document_name] * len(sections),
        'Mətn': sections,
        'e-qanun reference': [url] * len(sections),
        'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
    }

    # Drop paragraphs with identical text and renumber the rows
    df = pd.DataFrame(data).drop_duplicates(subset=['Mətn']).reset_index(drop=True)

    df.to_parquet(parquet_file_path)

    print(f"Data saved to {parquet_file_path}")

## Parallel Processing

In [None]:
def process_url(row):
    """Scrape one e-qanun document and save its HTML and extracted paragraphs.

    Args:
        row: Mapping-like object with an 'id' entry (the Parallel call passes rows
            yielded by ``filtered_links.iterrows()``). The id is appended to the
            framework URL and used as the stem of both output files.

    Returns:
        None. ``<id>.html`` and ``<id>.parquet`` are written to the output folder.
        Nothing is fetched when both files already exist. A page whose section element
        does not appear within 30 seconds is skipped without writing anything; a page
        without a matching section <div> gets its HTML saved but no parquet file.
    """
    min_text_length = 33  # paragraphs shorter than this many characters are dropped

    def clean_text(text):
        """Replace non-breaking spaces and collapse whitespace runs to one space."""
        text = text.replace(u'\xa0', ' ')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def is_section_div(tag):
        """True for a <div> that has a class starting with 'SectionN' or 'WordSectionN'."""
        # Each class is tested on its own: matching the space-joined class string would
        # only recognise the section class when it happens to be listed first.
        return tag.name == 'div' and any(
            re.match(r'(WordSection|Section)\d+', css_class) for css_class in tag.get('class', [])
        )

    def is_body_paragraph(tag):
        """True for a <p> with a body-text class outside tables and the double-gray-bordered div."""
        if tag.name != 'p':
            return False
        classes = tag.get('class', [])
        if not any(name in classes for name in ('MsoNormal', 'Mecelle', '21')):
            return False
        # Any <table> ancestor excludes the paragraph, which also covers the
        # MsoTableTheme / MsoTableGrid tables.
        if tag.find_parent('table'):
            return False
        return not tag.find_parent('div', style=lambda s: s and 'border-bottom:double gray' in s)

    base_url = 'https://e-qanun.az/framework/'

    # The id is both the last URL segment and the stem of the output files
    file_name = row['id']
    url = f"{base_url}{file_name}"
    print(f"Processing URL: {url}")

    base_path = "E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/e-qanun HTMLs/"
    sub_directory = "AZƏRBAYCAN RESPUBLİKASI PREZİDENTİNİN SƏRƏNCAMLARI/"
    output_dir = os.path.join(base_path, sub_directory)

    html_file_path = os.path.join(output_dir, f"{file_name}.html")
    parquet_file_path = os.path.join(output_dir, f"{file_name}.parquet")

    # Skip processing if both the HTML and parquet files already exist
    if os.path.exists(html_file_path) and os.path.exists(parquet_file_path):
        print(f"Skipping {file_name}: Both HTML and parquet files already exist.")
        return

    # Render the page in a fresh headless Chrome; `finally` guarantees the browser is
    # closed even when navigation or reading the page source raises.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        try:
            # Wait (up to 30 s) for any element whose class attribute contains 'Section'
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, "//*[contains(@class, 'Section') or contains(@class, 'WordSection')]"))
            )
        except TimeoutException:
            print("Loading took too much time! Exiting...")
            return  # Give up on this document if loading failed
        print("Page content with 'SectionX' or 'WordSectionX' loaded successfully")
        html_content = driver.page_source
    finally:
        driver.quit()

    # Keep a copy of the fully rendered page next to the parquet output
    os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') fails if the folder is missing
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
    print(f"HTML saved to: {html_file_path}")

    # Parse the rendered page (the same string that was just written to disk)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Document name ('Aktın Adı') comes from <title>; a page without one gets an empty name
    document_name = soup.title.get_text(strip=True) if soup.title else ''

    word_sections = soup.find_all(is_section_div)

    # If no matching sections are found, skip this page
    if not word_sections:
        print(f"No 'WordSectionX' or 'SectionX' found for {file_name}. Skipping.")
        return

    paragraphs = [
        paragraph
        for word_section in word_sections
        for paragraph in word_section.find_all(is_body_paragraph)
    ]

    # get_text() without strip=True keeps the whitespace between inline elements;
    # strip=True joins the stripped fragments with no separator and glues words together.
    # clean_text then normalises the whitespace; short and empty paragraphs are dropped.
    cleaned = (clean_text(paragraph.get_text()) for paragraph in paragraphs)
    sections = [text for text in cleaned if len(text) >= min_text_length]

    # One row per paragraph; document-level values are repeated on every row
    data = {
        'Aktın Adı': [document_name] * len(sections),
        'Mətn': sections,
        'e-qanun reference': [url] * len(sections),
        'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
    }

    # Drop paragraphs with identical text and renumber the rows
    df = pd.DataFrame(data).drop_duplicates(subset=['Mətn']).reset_index(drop=True)

    df.to_parquet(parquet_file_path)

    print(f"Data saved to {parquet_file_path}")

# Use joblib to run process_url over every row in parallel, using 50% of available CPUs.
# max(1, ...) keeps n_jobs valid on a single-core machine, where cpu_count() // 2 is 0
# and joblib rejects n_jobs=0.
num_cores = max(1, multiprocessing.cpu_count() // 2)
Parallel(n_jobs=num_cores)(delayed(process_url)(row) for idx, row in filtered_links.iterrows())

## Check for Empty Ones and Remove Them

In [None]:
# Folder that holds the scraped presidential orders
base_path = "E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/e-qanun HTMLs/"
sub_directory = "AZƏRBAYCAN RESPUBLİKASI PREZİDENTİNİN SƏRƏNCAMLARI/"
output_dir = os.path.join(base_path, sub_directory)

# Directory listing, narrowed down to the parquet outputs
all_files = os.listdir(output_dir)
parquet_files = [name for name in all_files if name.endswith('.parquet')]

# Names of the empty parquet files that get deleted (printed by the next cell)
files_to_remove = []

for parquet_file in parquet_files:
    parquet_path = os.path.join(output_dir, parquet_file)

    try:
        # A file counts as empty when it loads into a DataFrame without rows or columns
        df = pd.read_parquet(parquet_path)
        if not df.empty:
            continue

        # Record the name first, then delete the parquet file ...
        files_to_remove.append(parquet_file)
        os.remove(parquet_path)

        # ... and the HTML file saved under the same stem, when there is one
        html_file = parquet_file.replace('.parquet', '.html')
        html_path = os.path.join(output_dir, html_file)
        if os.path.exists(html_path):
            os.remove(html_path)

    except Exception as e:
        # Best effort: report the problem and carry on with the remaining files
        print(f"Error processing file {parquet_file}: {e}")

# Every recorded name corresponds to exactly one empty file
empty_file_count = len(files_to_remove)
print(f"Total number of empty parquet files found and removed: {empty_file_count}")

In [None]:
# Show which empty parquet files were deleted by the clean-up cell
print("Files removed:", files_to_remove)

In [None]:
# Parquet file to inspect (3185: also check for images included)
file_name = "57645.parquet"

# base_path and sub_directory are plain strings set by an earlier cell;
# the full path is their concatenation with the file name
file_path = base_path + sub_directory + file_name

d = pd.read_parquet(file_path)

# Whole frame, the distinct document names, then the first three paragraphs
display(d)
display(d['Aktın Adı'].unique())
for row_label in (0, 1, 2):
    display(d['Mətn'][row_label])

In [None]:
# Paragraph stored under row label 4 of the frame loaded above
display(d['Mətn'].loc[4])

In [None]:
# Paragraph stored under row label 63 of the frame loaded above
display(d.loc[63, 'Mətn'])

### Testing the Above Script on Single Page

In [None]:
# Single-page test run of the scraping loop.
# Add the target id SECOND in order, because the FIRST one doesn't work!
# AFTER PROCESSING, REMOVE THE FIRST (DUMMY) ONE!
# NOTE(review): this rebinds `filtered_links`; cells run afterwards that read
# `filtered_links` see only these ids, not the frame loaded from all_links.parquet.
filtered_links = pd.DataFrame({
    'id': [3488, 3805]
})

base_url = 'https://e-qanun.az/framework/'

# Output locations do not depend on the row, so they are resolved once, before the loop.
base_path = "E:/Software/Data Science and AI/NLP/Edliyye/Legal Acts Question Answering/NLP project/e-qanun HTMLs/"
sub_directory = "AZƏRBAYCAN RESPUBLİKASI NAZİRLƏR KABİNETİNİN SƏRƏNCAMLARI/"
output_dir = os.path.join(base_path, sub_directory)
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') below fails if the folder is missing

MIN_TEXT_LENGTH = 33  # paragraphs shorter than this many characters are dropped


def clean_text(text):
    """Return `text` with non-breaking spaces replaced and whitespace runs collapsed to one space."""
    text = text.replace(u'\xa0', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def _is_section_div(tag):
    """True for a <div> that has a class starting with 'SectionN' or 'WordSectionN'."""
    # Each class is tested on its own: matching the space-joined class string would only
    # recognise the section class when it happens to be listed first.
    return tag.name == 'div' and any(
        re.match(r'(WordSection|Section)\d+', css_class) for css_class in tag.get('class', [])
    )


def _is_body_paragraph(tag):
    """True for a <p> with a body-text class that is outside tables and the double-gray-bordered div."""
    if tag.name != 'p':
        return False
    classes = tag.get('class', [])
    # NOTE(review): the bulk-scraping cells also accept class '21'; this test cell does
    # not. Confirm whether the difference is intended.
    if not any(name in classes for name in ('MsoNormal', 'Mecelle')):
        return False
    # Any <table> ancestor excludes the paragraph, which also covers the
    # MsoTableTheme / MsoTableGrid tables.
    if tag.find_parent('table'):
        return False
    return not tag.find_parent('div', style=lambda s: s and 'border-bottom:double gray' in s)


for idx, row in filtered_links.iterrows():
    # The id is both the last URL segment and the stem of the output files
    file_name = row['id']
    url = f"{base_url}{file_name}"
    print(f"Processing URL: {url}")

    html_file_path = os.path.join(output_dir, f"{file_name}.html")
    parquet_file_path = os.path.join(output_dir, f"{file_name}.parquet")

    # Skip processing if both the HTML and parquet files already exist
    if os.path.exists(html_file_path) and os.path.exists(parquet_file_path):
        print(f"Skipping {file_name}: Both HTML and parquet files already exist.")
        continue

    # Render the page in a fresh headless Chrome; `finally` guarantees the browser is
    # closed even when navigation or reading the page source raises.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        try:
            # Wait (up to 30 s) for any element whose class attribute contains 'Section'
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, "//*[contains(@class, 'Section') or contains(@class, 'WordSection')]"))
            )
        except TimeoutException:
            print("Loading took too much time! Exiting...")
            continue  # Skip to the next row if loading failed
        print("Page content with 'SectionX' or 'WordSectionX' loaded successfully")
        html_content = driver.page_source
    finally:
        driver.quit()

    # Keep a copy of the fully rendered page next to the parquet output
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
    print(f"HTML saved to: {html_file_path}")

    # Parse the rendered page (the same string that was just written to disk)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Document name ('Aktın Adı') comes from <title>; a page without one gets an empty name
    document_name = soup.title.get_text(strip=True) if soup.title else ''

    word_sections = soup.find_all(_is_section_div)

    # If no matching sections are found, skip this page
    if not word_sections:
        print(f"No 'WordSectionX' or 'SectionX' found for {file_name}. Skipping.")
        continue

    paragraphs = [
        paragraph
        for word_section in word_sections
        for paragraph in word_section.find_all(_is_body_paragraph)
    ]

    # get_text() without strip=True keeps the whitespace between inline elements;
    # strip=True joins the stripped fragments with no separator and glues words together.
    # clean_text then normalises the whitespace; short and empty paragraphs are dropped.
    cleaned = (clean_text(paragraph.get_text()) for paragraph in paragraphs)
    sections = [text for text in cleaned if len(text) >= MIN_TEXT_LENGTH]

    # One row per paragraph; document-level values are repeated on every row
    data = {
        'Aktın Adı': [document_name] * len(sections),
        'Mətn': sections,
        'e-qanun reference': [url] * len(sections),
        'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
    }

    # Drop paragraphs with identical text and renumber the rows
    df = pd.DataFrame(data).drop_duplicates(subset=['Mətn']).reset_index(drop=True)

    df.to_parquet(parquet_file_path)

    print(f"Data saved to {parquet_file_path}")

In [None]:
# Parquet file produced by the single-page test (3185: also check for images included)
file_name = "3805.parquet"

# Full path = folder strings left behind by the test cell + file name
file_path = "".join([base_path, sub_directory, file_name])

d = pd.read_parquet(file_path)

# Whole frame, the distinct document names, then the paragraphs at labels 0, 1 and 2
display(d)
display(d['Aktın Adı'].unique())
text_column = d['Mətn']
display(text_column[0])
display(text_column[1])
display(text_column[2])

## API-Based Approach

In [None]:
# Download the list of documents from the e-qanun search API and save it as parquet.
# Adjust these parameters to filter for your specific document type.
start = 100  # page size of the first (probing) request; NOT the API's `start` offset
codeType = 1  # Change this to the desired document type
secondType = 2  # Change this for subcategory filtering if necessary


def _search_url(length):
    """Build the detail-search URL that asks for `length` rows starting at offset 0."""
    return (
        f"https://api.e-qanun.az/getDetailSearch?start=0&length={length}"
        f"&orderColumn=8&orderDirection=desc&title=true&codeType={codeType}"
        f"&dateType=1&statusId=1&secondType={secondType}&specialDate=false&array="
    )


def _fetch_json(request_url):
    """GET `request_url` and return the decoded JSON body, failing loudly on HTTP errors."""
    response = requests.get(request_url, timeout=120)  # never hang forever on a stalled connection
    response.raise_for_status()
    return response.json()


# The first request is only needed for "totalCount"
url = _search_url(start)
text = _fetch_json(url)

# Second request: ask for every document at once. The URL is rebuilt with the new length
# rather than patched with str.replace, which would rewrite every "100" in the URL.
url = _search_url(text["totalCount"])
text = _fetch_json(url)

# Normalize the data to a DataFrame and drop unnecessary columns
data = pd.json_normalize(text["data"]).drop(
    columns=["rowNum", "citation", "effectDate",
             "registerCode", "registerDate", "htmlPath",
             "fields", "relation", "classCodes"])

# Define the output directory and make sure it exists
# NOTE(review): the cell that loads all_links.parquet reads it from the parent
# "NLP project" folder, not from this "e-qanun HTMLs" folder. Confirm which one is intended.
out_dir_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs"
os.makedirs(out_dir_path, exist_ok=True)

# Save the DataFrame as a Parquet file
output_file = os.path.join(out_dir_path, "all_links.parquet")
data.to_parquet(output_file, index=False)

print(f"Data successfully saved to {output_file}")

In [None]:
# Resolve the output folder and the work list first: both are cheap and can fail, and a
# failure after Chrome has been launched would leave the browser process running.
output_dir = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs"
os.makedirs(output_dir, exist_ok=True)

# `filtered_links` must already exist in the kernel; its 'id' column lists the documents to scrape
document_ids = filtered_links['id'].tolist()  # Extract the ids as a list
# Print the total number of document ids to be processed
print(f"Number of documents to process: {len(document_ids)}")

# Step 1: Initialize the webdriver shared by all documents
options = Options()
options.add_argument("--headless")  # Optional: Run browser in headless mode if you don't want to see the browser UI
driver = webdriver.Chrome(options=options)

# Function to process a document based on its id
def process_document(doc_id):
    """Scrape one e-qanun document with the shared `driver` and save HTML plus parquet.

    Uses the module-level `driver` and `output_dir`.

    Args:
        doc_id: Document id; it is the last URL segment and the stem of both output files.

    Returns:
        None. The document is skipped when both output files already exist or when no
        element with class "WordSection1" appears within 30 seconds. When the rendered
        page has no <div class="WordSection1">, the HTML is saved but no parquet file
        is written.
    """
    html_file_path = os.path.join(output_dir, f"{doc_id}.html")
    parquet_file_path = os.path.join(output_dir, f"{doc_id}.parquet")

    # Check if both the HTML and Parquet files already exist, if so, skip this document
    if os.path.exists(html_file_path) and os.path.exists(parquet_file_path):
        print(f"Skipping {doc_id}, already processed.")
        return

    # Step 1: Generate the URL dynamically
    url = f'https://e-qanun.az/framework/{doc_id}'
    driver.get(url)

    try:
        # Step 2: Wait explicitly for the content within "WordSection1" to load
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "WordSection1"))
        )
        print(f"Page content loaded successfully for document ID: {doc_id}")
    except TimeoutException:
        print(f"Loading took too much time for document ID: {doc_id}, skipping...")
        return

    # Get the fully rendered page source
    html_content = driver.page_source

    # Save the HTML to the specified directory
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f"HTML saved to: {html_file_path}")

    # Step 3: Parse the rendered page (the same string that was just written to disk)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Step 4: Document name ('Aktın Adı') comes from <title>; empty when the page has none
    document_name = soup.title.get_text(strip=True) if soup.title else ''

    # Step 5: Extract the text sections. The wait above accepts any element with the
    # class, so a <div> with that class is not guaranteed to exist.
    word_section = soup.find('div', class_='WordSection1')
    if word_section is None:
        print(f"No 'WordSection1' div found for document ID: {doc_id}, skipping...")
        return
    paragraphs = word_section.find_all('p', class_='MsoNormal')  # Adjust class as needed

    # get_text() without strip=True keeps the whitespace between inline elements
    # (strip=True glues the fragments together); split/join then collapses every run of
    # whitespace, non-breaking spaces included, into a single space.
    normalized = (' '.join(p.get_text().split()) for p in paragraphs)

    # Drop empty paragraphs and those shorter than 33 characters
    sections = [text for text in normalized if len(text) >= 33]

    # Step 6: Structure the data into a DataFrame, one row per paragraph
    data = {
        'Aktın Adı': [document_name] * len(sections),  # Document name repeated for each section
        'Mətn': sections,  # Each section as a separate row
        'e-qanun reference': [url] * len(sections),  # Use the dynamic URL from Selenium
        'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
    }
    df = pd.DataFrame(data).reset_index(drop=True)

    # Step 7: Save the DataFrame to a .parquet file
    df.to_parquet(parquet_file_path)

    print(f"Data saved to {parquet_file_path}")

# Step 3: Loop through each document id and process it.
# `finally` closes the shared browser even when one of the documents raises,
# so a failed run does not leave a headless Chrome process behind.
try:
    for doc_id in document_ids:
        process_document(doc_id)
finally:
    driver.quit()

**Loading took too much time for document ID: 15918, skipping...** Check all these files to handle them also!

In [None]:
# Load the parquet output of document 57810 and show it
sample_parquet_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs\57810.parquet"
df = pd.read_parquet(sample_parquet_path)
df

**There are empty parquet files, maybe because the page did not open during crawling! The saved HTML files do not open either, even when I click on them locally!**

## Script to Check and Remove Empty/Invalid Files After Parsing Session!

In [None]:
# Folder that received the scraped files
output_dir = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs"

# Everything in the folder, then just the parquet outputs
all_files = os.listdir(output_dir)
parquet_files = list(filter(lambda name: name.endswith('.parquet'), all_files))

# Running total of empty files, and their names (printed by the next cell)
empty_file_count = 0
files_to_remove = []

for parquet_file in parquet_files:
    parquet_path = os.path.join(output_dir, parquet_file)

    try:
        # Load the file; frames that hold data are left untouched
        df = pd.read_parquet(parquet_path)
        has_data = not df.empty
        if has_data:
            continue

        # Book-keeping happens before the deletion
        empty_file_count = empty_file_count + 1
        files_to_remove.append(parquet_file)

        # Delete the empty parquet file
        os.remove(parquet_path)

        # Delete the HTML file saved under the same stem, if it is there
        html_file = parquet_file.replace('.parquet', '.html')
        html_path = os.path.join(output_dir, html_file)
        if os.path.exists(html_path):
            os.remove(html_path)

    except Exception as e:
        # Best effort: report the problem and move on to the next file
        print(f"Error processing file {parquet_file}: {e}")

# Summary of the clean-up
print(f"Total number of empty parquet files found and removed: {empty_file_count}")

In [None]:
# Names of the parquet files collected by the cleanup cell above
print(f"Files removed: {files_to_remove}")

**This is sufficient to handle most cases at this stage of product development. Leave the remaining cases for now!**

## Parsing YERLİ İCRA HAKİMİYYƏTİ ORQANININ QƏRARI (avoiding tables)

In [None]:
# Step 1: Start a headless Chrome session
options = Options()
options.add_argument("--headless")  # Optional: Run browser in headless mode if you don't want to see the browser UI
driver = webdriver.Chrome(options=options)

# Define the URL dynamically
url = 'https://e-qanun.az/framework/29882'  # Change as needed

# Step 2: Open the page and wait (up to 30 s) for an element with class "Section1".
# try/finally guarantees the browser is closed even when loading fails.
try:
    driver.get(url)
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Section1"))
        )
        print("Page content loaded successfully")
    except TimeoutException:
        print("Loading took too much time! Exiting...")
        # Re-raise instead of calling exit(): this stops the cell without
        # touching the notebook kernel
        raise

    # Get the fully rendered page source
    html_content = driver.page_source
finally:
    driver.quit()

# Extract the last part of the URL to use as the file name (e.g., '29882')
file_name = url.split('/')[-1]

# Construct the full file path for saving the HTML
output_dir = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs\YERLİ İCRA HAKİMİYYƏTİ ORQANLARININ QƏRARLARI"
os.makedirs(output_dir, exist_ok=True)  # first run: the folder may not exist yet
file_path = os.path.join(output_dir, f"{file_name}.html")

# Save the HTML to the specified directory
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(html_content)

print(f"HTML saved to: {file_path}")

# Step 3: Load the saved HTML file and parse it using BeautifulSoup
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Step 4: Extract the document name ('Aktın Adı') from the <title>; empty when the page has none
document_name = soup.title.get_text(strip=True) if soup.title else ''

# Step 5: Locate the "Section1" div. The Selenium wait matches any tag with that
# class, so a <div> is not guaranteed and is checked explicitly.
word_section = soup.find('div', class_='Section1')
if word_section is None:
    raise ValueError(f"No <div class='Section1'> found in {file_path}")

# Keep MsoNormal paragraphs that are neither inside a table (of any class) nor
# inside the div styled with a double gray bottom border
paragraphs = word_section.find_all(lambda tag: (
    tag.name == 'p' and
    'MsoNormal' in tag.get('class', []) and
    not tag.find_parent('table') and
    not tag.find_parent('div', style=lambda s: s and 'border-bottom:double gray' in s)
))

# Extract the text (including those that might be empty or contain only whitespace)
sections = [p.get_text(strip=True).replace('\n', ' ') for p in paragraphs]

# Step 6: Structure the data into a DataFrame, one row per section
data = {
    'Aktın Adı': [document_name] * len(sections),  # Document name repeated for each section
    'Mətn': sections,  # Each section as a separate row
    'e-qanun reference': [url] * len(sections),  # Use the dynamic URL from Selenium
    'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
}
df = pd.DataFrame(data)

# Step 7: Remove rows where 'Mətn' is empty, whitespace-only, or shorter than 33 characters
min_section_length = 33
df['Mətn'] = df['Mətn'].astype(str)
df = df[df['Mətn'].str.strip().astype(bool)]
df = df[df['Mətn'].str.len() >= min_section_length]

# Step 8: Reset the index after filtering
df = df.reset_index(drop=True)

# Step 9: Save the DataFrame to a .parquet file named after the HTML file
output_parquet_path = os.path.join(output_dir, f'{file_name}.parquet')
df.to_parquet(output_parquet_path)

print(f"Data saved to {output_parquet_path}")

In [None]:
# Reload the parquet written for act 29882 to inspect what was saved
df = pd.read_parquet(r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs\YERLİ İCRA HAKİMİYYƏTİ ORQANLARININ QƏRARLARI\29882.parquet")
df

In [None]:
# Show the distinct act title(s), then the text stored under index labels 0, 1 and 2
display(df['Aktın Adı'].unique())
for row_label in (0, 1, 2):
    display(df['Mətn'][row_label])

## Looping Through the filtered_links DataFrame to Construct the URL Dynamically from the 'id' Column

In [None]:
# Assuming 'filtered_links' is a pandas DataFrame and the 'id' column holds the last number for the URL
# NOTE(review): the visible definition of filtered_links (top of the notebook) filters on the
# presidential-orders type, while this loop writes to the local-executive-decisions folder —
# confirm filtered_links was re-filtered before running this cell.
base_url = 'https://e-qanun.az/framework/'

# Output folder is the same for every row, so it is resolved (and created) once
output_dir = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs\YERLİ İCRA HAKİMİYYƏTİ ORQANLARININ QƏRARLARI"
os.makedirs(output_dir, exist_ok=True)

# Sections shorter than this many characters are discarded
min_section_length = 33

for idx, row in filtered_links.iterrows():
    # Construct the URL dynamically from the 'id' column
    url = f"{base_url}{row['id']}"
    print(f"Processing URL: {url}")

    # Define file names based on 'id'
    file_name = row['id']
    html_file_path = os.path.join(output_dir, f"{file_name}.html")
    parquet_file_path = os.path.join(output_dir, f"{file_name}.parquet")

    # Skip processing if both the HTML and parquet files already exist
    if os.path.exists(html_file_path) and os.path.exists(parquet_file_path):
        print(f"Skipping {file_name}: Both HTML and parquet files already exist.")
        continue

    # Step 1: Start a headless Chrome session for this page
    options = Options()
    options.add_argument("--headless")  # Optional: Run browser in headless mode if you don't want to see the browser UI
    driver = webdriver.Chrome(options=options)

    # Step 2: Open the page and wait (up to 30 s) for an element with class "Section1".
    # The finally clause closes the browser on success, timeout and any other error.
    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Section1"))
        )
        print("Page content loaded successfully")

        # Get the fully rendered page source
        html_content = driver.page_source
    except TimeoutException:
        print("Loading took too much time! Exiting...")
        continue  # Skip to the next iteration if loading failed
    finally:
        driver.quit()

    # Save the HTML to the specified directory
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f"HTML saved to: {html_file_path}")

    # Step 3: Load the saved HTML file and parse it using BeautifulSoup
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Step 4: Extract the document name ('Aktın Adı') from the <title>; empty when the page has none
    document_name = soup.title.get_text(strip=True) if soup.title else ''

    # Step 5: Locate the "Section1" div. The Selenium wait matches any tag with that
    # class, so a missing <div> skips this page instead of aborting the whole run.
    word_section = soup.find('div', class_='Section1')
    if word_section is None:
        print(f"Skipping {file_name}: no <div class='Section1'> in the saved HTML.")
        continue

    # Keep MsoNormal paragraphs that are neither inside a table (of any class) nor
    # inside the div styled with a double gray bottom border
    paragraphs = word_section.find_all(lambda tag: (
        tag.name == 'p' and
        'MsoNormal' in tag.get('class', []) and
        not tag.find_parent('table') and
        not tag.find_parent('div', style=lambda s: s and 'border-bottom:double gray' in s)
    ))

    # Extract the text (including those that might be empty or contain only whitespace)
    sections = [p.get_text(strip=True).replace('\n', ' ') for p in paragraphs]

    # Step 6: Structure the data into a DataFrame, one row per section
    data = {
        'Aktın Adı': [document_name] * len(sections),  # Document name repeated for each section
        'Mətn': sections,  # Each section as a separate row
        'e-qanun reference': [url] * len(sections),  # Use the dynamic URL from Selenium
        'Embeddings': ['[Empty]'] * len(sections)  # Placeholder for embeddings
    }
    df = pd.DataFrame(data)

    # Step 7: Remove rows where 'Mətn' is empty, whitespace-only, or too short
    df['Mətn'] = df['Mətn'].astype(str)
    df = df[df['Mətn'].str.strip().astype(bool)]
    df = df[df['Mətn'].str.len() >= min_section_length]

    # Step 8: Reset the index after filtering
    df = df.reset_index(drop=True)

    # Step 9: Save the DataFrame to a .parquet file named after the HTML file
    df.to_parquet(parquet_file_path)

    print(f"Data saved to {parquet_file_path}")

In [None]:
# Directory holding the scraped "<id>.html" / "<id>.parquet" pairs for this act type
output_dir = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\e-qanun HTMLs\YERLİ İCRA HAKİMİYYƏTİ ORQANLARININ QƏRARLARI"

# List all files in the directory and keep only the .parquet outputs
all_files = os.listdir(output_dir)
parquet_files = [f for f in all_files if f.endswith('.parquet')]

# Number of empty parquet files that were actually deleted
empty_file_count = 0

# Names of the deleted parquet files (printed once, in the next cell)
files_to_remove = []

for parquet_file in parquet_files:
    parquet_path = os.path.join(output_dir, parquet_file)

    # A file counts as empty when it loads into a DataFrame with no rows or no columns
    try:
        df = pd.read_parquet(parquet_path)
    except Exception as e:
        # Unreadable files are reported and left untouched
        print(f"Error processing file {parquet_file}: {e}")
        continue

    if not df.empty:
        continue

    # Record the file only after the deletion succeeded, so the counter and the
    # list never report a file that is still on disk
    try:
        os.remove(parquet_path)
    except OSError as e:
        print(f"Error processing file {parquet_file}: {e}")
        continue
    empty_file_count += 1
    files_to_remove.append(parquet_file)

    # The sibling HTML shares the base name; swap only the extension
    html_file = os.path.splitext(parquet_file)[0] + '.html'
    html_path = os.path.join(output_dir, html_file)
    if os.path.exists(html_path):
        try:
            os.remove(html_path)
        except OSError as e:
            print(f"Error processing file {html_file}: {e}")

# Print the number of empty files found and removed
print(f"Total number of empty parquet files found and removed: {empty_file_count}")

# Optional: Print the list of removed files if you want to keep track
#print(f"Files removed: {files_to_remove}")

In [None]:
# Names of the parquet files collected by the cleanup cell above
print(f"Files removed: {files_to_remove}")

## Labeled Dataset Preprocessing

In [None]:
# Define the path to your SQLite database
db_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\sqlite3.db"

# Load the whole 'user_feedback' table into a DataFrame
query = "SELECT * FROM user_feedback"

# try/finally closes the connection even when the query fails
# (e.g. the table does not exist)
conn = sqlite3.connect(db_path)
try:
    df_user_feedback = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Save the DataFrame to a .parquet file (index not written)
output_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\user_feedback.parquet"
df_user_feedback.to_parquet(output_path, index=False)

print(f"Data saved to {output_path}")

### Uploading and Exploring Dataset

In [None]:
import pandas as pd

# Load the test dataset and show its content, schema and index
path1 = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\test_data.parquet"
df1 = pd.read_parquet(path1)

display(df1)
df1.info()
df1.index

In [None]:
# Write df1 back to the file it was loaded from, without the index
df1.to_parquet(path1, index=False)

In [None]:
# Row position to inspect
n = 19

# Show the query text and the stored GPT response at that position
display(df1['query_text'].iloc[n])
display(df1['gpt_response'].iloc[n])

In [None]:
import pandas as pd

# Load the fine-tuning dataset and show its content, schema and index
path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\finetune_data.parquet"
df = pd.read_parquet(path)

display(df)
df.info()
df.index

In [None]:
# Number of distinct values in the 'query' column
len(df['query'].unique())

In [None]:
# Number of rows per query
df['query'].value_counts()

In [None]:
# Number of rows per label value
df['label'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Ensure plots are displayed in the notebook
%matplotlib inline

# Class distribution plot
df['label'].value_counts().plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Query string that the following cells use to filter the dataset
s = 'borclunun ölkədən getmək hüququnun müvəqqəti məhdudlaşdırılması və ölkədən çıxarılması proseduru necədir?'

### Randomly sampling rows from both classes (0s and 1s) to achieve a 10:10 proportion for the given query 

In [None]:
# Rows that belong to query `s` and carry label 0; a fixed-seed sample of them is dropped.
# Set "n=" to the number of rows to remove!
# Note: every run of this cell removes another n rows.
is_negative_for_query = (df['label'] == 0) & (df['query'] == s)
indices_to_remove_0 = df[is_negative_for_query].sample(n=1, random_state=42).index
#indices_to_remove_1 = df[(df['label'] == 1) & (df['query'] == s)].sample(n=1, random_state=42).index

# Remove the sampled rows in place and renumber the index from 0
df.drop(index=indices_to_remove_0, inplace=True)
#df.drop(indices_to_remove_1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.index

In [None]:
# Number of rows for query `s`, first with label 0 and then with label 1
for label_value in (0, 1):
    rows_for_label = df['query'].loc[(df['label'] == label_value) & (df['query'] == s)]
    display(len(rows_for_label))

In [None]:
# All rows whose query equals `s`
filtered_df = df.loc[df['query'] == s]
filtered_df

In [None]:
# Inspect the text stored at row position 109
df['text'].iloc[109]

### Modifying Dataset

In [None]:
#df.loc[(df['query'] == s) & (df.index != 226), 'label'] = 0

#df.loc[(df['query'] == s) & (df.index == 48), 'label'] = 0

#df.loc[(df['query'] == s) & (df.index == 294), 'text'] = ''

#df["query"] = df["query"].replace(["nikahın ləğvi "], "nikahın ləğvi")

#df['query'] = df['query'].str.lower()

#indexes_to_drop = [109, 110]  # List of indices you want to drop
#df = df.drop(indexes_to_drop)
#df = df.reset_index(drop=True)

In [None]:
#indexes_to_drop = [241]  # List of indices you want to drop
#df = df.drop(indexes_to_drop)
#df = df.reset_index(drop=True)

# Step 1: Copy the rows where query == "alimentin ödənilməsi"
new_rows = df[df['query'] == "alimentin ödənilməsi"].copy()

# Step 2: Update the 'query' value of the copied rows
new_rows['query'] = "aliment tələbi"

# Step 3: Append the new rows to the original DataFrame.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
# Note: re-running this cell appends the copied rows again.
df = pd.concat([df, new_rows], ignore_index=True)

display(df.loc[df['query'] == "alimentin ödənilməsi"])
display(df.loc[df['query'] == "aliment tələbi"])
display(df)
df.info()
df.index

### Adding Training Instances Manually

In [None]:
# Texts to add for the current query `s`; every one of them receives `label`
# NOTE(review): with the "" placeholder left as is, this cell appends a row with
# empty text — fill in real texts before running.
new_texts = [
    ""
]
label = 1

# Build one record per text, then turn the records into a DataFrame
records = []
for text in new_texts:
    records.append({'query': s, 'text': text, 'label': label})
new_rows_df = pd.DataFrame(records)

# Append the new rows to the dataset and renumber the index
df = pd.concat([df, new_rows_df], ignore_index=True)

# Display the DataFrame to verify the changes
display(df)
df.index

In [None]:
# Write df to `path` without the index
df.to_parquet(path, index=False)

**Go through all unique queries: open act-finder.com, search each query, check the returned results, and add labels to the dataset manually. If none of the results is a correct answer, find one in E-Qanun and add it manually. <u>Use Google Search, ChatGPT and E-Qanun (Mündəricat) to Find Correct (Target) Answers!</u>**

***This is the <u>Training Dataset</u>, so <u>I can add any relevant text from anywhere</u> (Article Titles and Texts by Clauses Separately!, Web Search, etc.!) <u>to make it richer for my 56 unique queries</u>!***

### OpenAI API Model Testing

In [None]:
import os

import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the key from the OPENAI_API_KEY environment variable so it never has to be
# pasted into the notebook; the original placeholder remains the fallback value
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-api-key")

# Strings to compare: texts[0] is the query, the rest are candidate passages
texts = [
    "Paylı mülkiyyətim qanunsuz dağıdılıb",
    "Maddə 354. Avtomobil yollarında yol hərəkəti təhlükəsizliyi qaydalarının pozulması 165 354.0. ...",
    "Maddə 12 Tərəflərin səlahiyyətli orqanları maliyyə pozuntularının qarşısının alınması, aşkar olunması ...",
    "Maddəyə əsasən qlobal texniki qaydaların müəyyənləşdirilməsi və dəyişdirilməsi; 14.3. 7-ci",
    "Maddə 245: Birgə və paylı mülkiyyət hüququnun əsaslarını müəyyən edir. ...",
    "Maddə 29: Hər kəsin mülkiyyət hüququ vardır və bu hüquq qanunla qorunur. ..."
]

# Generate one embedding per text using the OpenAI API
# NOTE(review): openai.Embedding.create looks like the pre-1.0 client interface —
# confirm the installed openai version still provides it.
embeddings = []
for text in texts:
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-3-large"
    )
    embeddings.append(response['data'][0]['embedding'])

# Convert embeddings to a 2-D numpy array (one row per text)
embedding_vectors = np.array(embeddings)

# Cosine similarity between the first text and each of the others
similarities = cosine_similarity([embedding_vectors[0]], embedding_vectors[1:])

# Print the similarities; "text i" refers to texts[i]
for i, similarity in enumerate(similarities[0]):
    print(f"Cosine similarity with text {i+1}: {similarity}")

```
Cosine similarity with text 1: 0.3204981110849322 (correct label: 0)
Cosine similarity with text 2: 0.34029572696519317 (correct label: 0)
Cosine similarity with text 3: 0.3275207821570136 (correct label: 0)
Cosine similarity with text 4: 0.5116664280377892 (correct label: 1)
Cosine similarity with text 5: 0.4335759279244248 (correct label: 1)
```

## Parsing Raw Text Approach

In [None]:
# Load codes.parquet and show its content, schema and index
path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\codes.parquet"
df = pd.read_parquet(path)

display(df)
df.info()
df.index

In [None]:
# Count the rows flagged as duplicates, then list them
duplicate_mask = df.duplicated()
display(duplicate_mask.sum())
display(df.loc[duplicate_mask])

In [None]:
# Inspect the text stored at row position 6472
df['text'].iloc[6472]

In [None]:
# Drop duplicated rows and renumber the index in one chain
df = df.drop_duplicates().reset_index(drop=True)

# Verify: the duplicate count and the duplicate listing should now be empty
remaining_duplicates = df.duplicated()
display(remaining_duplicates.sum())
display(df.loc[remaining_duplicates])
display(df)
df.info()
df.index

In [None]:
# Take the text at index label 0; note this rebinds `s`, which earlier held the query string
s = df['text'][0]
s

In [None]:
# Show the text from the "1. Ümumi müddəalar" heading onwards.
# The original indexed the builtin `str` type instead of the variable `s`, which raises TypeError.
# find() returns -1 when the heading is absent; in that case the whole text is shown
# rather than only its last character.
marker = "1. Ümumi müddəalar"
marker_pos = s.find(marker)
s[marker_pos:] if marker_pos != -1 else s

In [None]:
# Write df to `path` without the index
df.to_parquet(path, index=False)

## Converting CSV to Parquet and Concatenating All Parquets into the Single One

In [None]:
# Load the processed referendum acts CSV and show its content, schema and index
path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\NLP project\!referendum_acts_processed.csv"
df = pd.read_csv(path)

display(df)
df.info()
df.index

In [None]:
# Inspect the text stored at row position 1
df['text'].iloc[1]

In [None]:
# Remove Leading/Trailing Spaces
def g(text):
    """Return ``text.strip()``, i.e. `text` without leading and trailing whitespace."""
    return text.strip()

# Apply the helper to every value of the 'text' column
df['text'] = df['text'].apply(g)

In [None]:
# Delete every non-breaking space (U+00A0); it is removed, not replaced by a regular space
df['text'] = df['text'].str.replace(r'\xa0', '', regex=True)

In [None]:
# Replace every newline character with a single space
df['text'] = df['text'].str.replace('\n', ' ', regex=True)

In [None]:
# Replace Multiple Spaces with a Single One
def g(text):
    """Collapse every run of one or more space characters in `text` into a single space.

    Note: this redefines the `g` declared in the earlier strip cell.
    """
    return re.sub(' +', ' ', text)

# Apply the helper to every value of the 'text' column
df['text'] = df['text'].apply(g)

In [None]:
# Inspect the cleaned text stored at row position 6469
df['text'].iloc[6469]

In [None]:
# Define the new save path (Data Collection folder, parquet format)
save_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\referendum_acts.parquet"

# Save the DataFrame as a Parquet file (index=False is not passed here, unlike the other save cells)
df.to_parquet(save_path)

In [None]:
import os
import pandas as pd

# Directory containing the Parquet files
# NOTE(review): later cells read corpus.parquet from the "Data Collection" subfolder,
# not from this directory — confirm which location is intended.
directory = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering"

# The combined file is written into the same directory that is scanned
output_name = "corpus.parquet"
output_path = os.path.join(directory, output_name)

# List to hold DataFrames
df_list = []

# Iterate in sorted order so the row order of the result is reproducible
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".parquet"):
        continue
    # Skip the output of a previous run; otherwise a re-run would feed the
    # corpus back into itself and duplicate every row
    if filename == output_name:
        continue
    file_path = os.path.join(directory, filename)
    df_list.append(pd.read_parquet(file_path))

# pd.concat raises an opaque ValueError on an empty list; fail with a clear message instead
if not df_list:
    raise ValueError(f"No input .parquet files found in {directory}")

# Concatenate all DataFrames in the list
concatenated_df = pd.concat(df_list, ignore_index=True)

# Save the concatenated DataFrame to a single Parquet file
concatenated_df.to_parquet(output_path, index=False)

print(f"All files have been concatenated and saved to {output_path}")

In [None]:
import pandas as pd

# Load the corpus from the Data Collection folder and show its content, schema and index
path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\corpus.parquet"
df = pd.read_parquet(path)

display(df)
df.info()
df.index

In [None]:
# Define the path to save the Parquet file (same file the corpus was loaded from)
output_path = r"E:\Software\Data Science and AI\NLP\Edliyye\Legal Acts Question Answering\Data Collection\corpus.parquet"

# Save the DataFrame as a Parquet file, without the index
df.to_parquet(output_path, index=False)

In [None]:
# Number of distinct values in the 'url' column
len(df['url'].unique())

In [None]:
# Rows whose 'type' is 'Konstitusiya'
df.loc[df['type'] == 'Konstitusiya']

In [None]:
# Distinct 'name' values among rows whose 'type' is 'Məcəllələr'
df['name'].loc[df['type'] == 'Məcəllələr'].unique()

In [None]:
# Assign '-' to rows where 'name' column has null values (modifies df in place)
df.loc[df['name'].isnull(), 'name'] = '-'

In [None]:
# Drop the 'status' column. errors='ignore' keeps the cell re-runnable: the
# original in-place drop raised KeyError once the column was already gone.
df = df.drop(columns=['status'], errors='ignore')

display(df)
df.info()
df.index

In [None]:
# Count the rows flagged as duplicates, then list them
duplicate_mask = df.duplicated()
display(duplicate_mask.sum())
display(df.loc[duplicate_mask])

In [None]:
# Drop duplicated rows and renumber the index in one chain
df = df.drop_duplicates().reset_index(drop=True)

# Verify: the duplicate count and the duplicate listing should now be empty
remaining_duplicates = df.duplicated()
display(remaining_duplicates.sum())
display(df.loc[remaining_duplicates])
display(df)
df.info()
df.index

In [None]:
# Keep only rows whose stripped 'text' is not the empty string, then renumber the index
has_text = df['text'].str.strip() != ''
df = df[has_text].reset_index(drop=True)

display(df)
df.info()
df.index

In [None]:
# Rename the 'title' column to 'name'
df = df.rename(columns={'title': 'name', 'typeName': 'type'})

# Reorder the columns
df = df[['url', 'type', 'name', 'text']]

# Verify the change
display(df.head())