In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [None]:
# Base URL of the webpage to scrape
base_url = 'https://en.wikisource.org/wiki/A_Dictionary_of_the_Sunda_language/'

# List of alphabets from A to Z
alphabets = [chr(i) for i in range(ord('A'), ord('Z') + 1)]

# List to store all scraped words and hyperlinks
based_words = []

# Iterate through each alphabet
for letter in alphabets:
    # Construct the URL for the specific alphabet
    url = base_url + letter

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all elements with class 'hanging-indent'
        indent_elements = soup.find_all(class_='hanging-indent')

        # Extract text and hyperlinks from these elements
        for selected_element in indent_elements:
            for element in selected_element.find_all('p'):
                hyperlink = [link.get('href') for link in element.find_all('a')]
                based_words.append({
                    'word': element.text,
                    'hyperlink': hyperlink  
                })
    else:
        print(f"Failed to retrieve content from {url}")

# Convert based_words list to JSON format
json_data = json.dumps(based_words, indent=4, ensure_ascii=False)

# Print the JSON data (for visualization)
print(json_data)

# Optionally, save the JSON data to a file
with open('based_words_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(based_words, json_file, indent=4, ensure_ascii=False)


In [41]:
from bs4 import BeautifulSoup
import requests
import re

def extract_word_type(url):
    element_class = "mw-content-ltr"
    try:
        # Send an HTTP request to fetch the HTML content of the page
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad response status

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the specific element by its class name
        target_element = soup.find('div', {'class': element_class})

        if target_element:
            # Function to clean individual text strings
            def clean_text(text):
                # Remove unwanted patterns using regular expressions
                cleaned_text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets and the brackets themselves
                cleaned_text = re.sub(r'\(id\)', '', cleaned_text)  # Remove specific (id) occurrences
                cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
                cleaned_text = cleaned_text.strip()  # Strip leading and trailing spaces
                return cleaned_text

            # Extract text content only from <span> tags within the target element and clean it
            span_texts = [clean_text(span.get_text(strip=True)) for span in target_element.find_all('span')]

            # Filter out empty strings and remove text that only contains '*'
            cleaned_texts = [text for text in span_texts if text.strip() and text.strip() != '*']

            # Join the list of cleaned extracted texts into a single string
            cleaned_text = ' '.join(cleaned_texts)
            return cleaned_text
        else:
            return "Element not found or page structure has changed."
    except requests.exceptions.RequestException as e:
        return f"Error fetching or parsing HTML: {e}"

# Example usage:
url = "https://su.wiktionary.org/wiki/adi"

cleaned_text = extract_word_type(url)
print(cleaned_text)


Basa Sunda [ édit ] Kecap barang


In [33]:
def scrape_page(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <div> with class 'mw-category-group'
    category_group_div = soup.find('div', id='mw-pages')

    # Initialize a list to store results
    results = []

    # Find all <a> tags under the <div class="mw-category-group">
    if category_group_div:
        links = category_group_div.find_all('li')

        for link in links:
            href = "https://su.wiktionary.org" + link.find('a').get('href')
            title = link.find('a').get('title')
            results.append({'word': title, 'link': href})

    return results

def get_next_page(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    next_page = soup.find('a', {'title': 'Kategori:Kecap basa Sunda', },string= 'kaca salajengna').get('href')
    
    return next_page

In [None]:
## This block will result in error, please continue to the next block, it will run properly as intended
list_url = []

root_url = "https://su.wiktionary.org/wiki/Kategori:Kecap_basa_Sunda"

while get_next_page(root_url) is not None:
    list_url.append(root_url)
    root_url = get_next_page(root_url)
    root_url = "https://su.wiktionary.org" + root_url

In [64]:
word_list = []
for url in list_url:
    word_list.append(scrape_page(url))


In [None]:
#flatten the list
word_list = [item for sublist in word_list for item in sublist] 

word_list

In [66]:
test_link = word_list[0]['link']
word_type = extract_word_type(test_link)
def clean_word_tyoe(word_type):
    word_type = word_type.split("Basa")
    word_type = word_type[1:]
    word_type = [i.replace("[ édit ]", ":") for i in word_type]
    return word_type

In [67]:
def extract_word_mean(url):
    links = []
    response = requests.get(url)

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <div> with class 'mw-category-group'
    category_group_div = soup.find('div', id='bodyContent')

    # Initialize a list to store results
    if category_group_div:
        links = category_group_div.find_all('li')
    #remove the tag from the list
    links = [link.text for link in links]
    return links





In [68]:
def extract_content(url):
    word_type = extract_word_type(url)
    word_type = clean_word_tyoe(word_type)  # Corrected function name
    word_meaning = extract_word_mean(url)  # Corrected function name

    # Use ':' instead of '=' in dictionary creation
    content = {
        'jenis kata': word_type,
        'arti kata': word_meaning
    }

    return content  # Return the content dictionary

In [None]:
for word in word_list:
    url = word['link']
    content = extract_content(url)
    word.update(content)  # Update the word dictionary with the extracted content
    print(word)  # Print the updated word dictionary
word_list

In [72]:
#save the data to json
print(len(word_list))
with open('word_list_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(word_list, json_file, indent=4, ensure_ascii=False)


3200


In [30]:
#load word_list_data.json
with open('word_list_data.json', 'r', encoding='utf-8') as json_file:
    word_list = json.load(json_file)


# List of sentences to check and replace
sentences_to_replace = ['Kecap bilangan', 'Kecap barang', 'Kecap pagawéan', 'Kecap Sipat', 
                        'Kecap panambah', 'Kecap panyeluk', 'Kecap panyambung', 
                        "Kecap sulur", "Kecap pananya", "Kecap panuduh", "Kecap pangantét", "Kecap sipat",
                        "Kecap Barang"]

# Iterate over each word in the word_list
for word in word_list:
    # Check if "jenis kata" is present in the word dictionary
    if 'jenis kata' in word:
        # Iterate over each sentence in the "jenis kata" list
        for i in range(len(word['jenis kata'])):
            for sentence in sentences_to_replace:
                # Check if the sentence is present in the current "jenis kata" sentence
                if sentence in word['jenis kata'][i]:
                    # Replace the sentence with the desired text
                    word['jenis kata'][i] = sentence


for word in word_list:
    if len(word['word']) < 3:
        word_list.remove(word)

In [None]:
print(json.dumps(word_list, indent=4, ensure_ascii=False))


In [32]:
#find every unique word type
word_type_unique = []
word_type_unique = [word['jenis kata'] for word in word_list]
word_type_unique = [item for sublist in word_type_unique for item in sublist]
word_type_unique = list(set(word_type_unique))
word_type_unique




[' Kecap sifat',
 'Kecap panuduh',
 'Kecap Barang',
 'Kecap panyambung',
 ' Sunda : ',
 'Kecap pagawéan',
 'Kecap bilangan',
 ' Sunda :',
 'Kecap pananya',
 ' Kecap Kaayaan',
 'Kecap pangantét',
 'Kecap sipat',
 'Kecap sulur',
 ' Sunda : Conto :',
 'Kecap Sipat',
 'Kecap panyeluk',
 ' Sunda : Tempo ogé :',
 'Kecap barang',
 'Kecap panambah']

In [34]:
#save the data to json
with open('word_list_data2.json', 'w', encoding='utf-8') as json_file:
    json.dump(word_list, json_file, indent=4, ensure_ascii=False)


3185