In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import os

In [None]:
# Entry page of the Nepali Wikipedia "AllPages" special listing. The
# percent-encoded `title` and `from` query values are kept exactly as captured;
# the pieces below are adjacent literals that Python joins into one string.
base_url = (
    "https://ne.wikipedia.org/w/index.php"
    "?title=%E0%A4%B5%E0%A4%BF%E0%A4%B6%E0%A5%87%E0%A4%B7:AllPages"
    "&from=%27e%27+%28%E0%A4%97%E0%A4%A3%E0%A4%BF%E0%A4%A4%E0%A5%80%E0%A4%AF+%E0%A4%85%E0%A4%9A%E0%A4%B0%29"
)

In [None]:
# Try a regular Chrome session first; if it cannot be started, fall back to a
# headless configuration.
try:
    driver = webdriver.Chrome()
except Exception as exc:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit (e.g. interrupting the kernel) are not swallowed, and the
    # reason for the fallback is shown rather than hidden.
    print(f"Default Chrome start-up failed ({exc!r}); retrying headless.")
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # NOTE(review): the next two flags are the usual workarounds for
    # container/CI environments — confirm they are needed on your machine.
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    print("Running in headless mode.")

In [None]:
# Point the browser at the first listing page (`base_url`); the elements looked
# up afterwards are read from whatever page the driver currently has loaded.
driver.get(base_url)

In [None]:
# Empty result table: one row per scraped link, with "Title" and "URL" columns.
df = pd.DataFrame(columns=["Title", "URL"])

In [None]:
# Walk the AllPages listing page by page and collect every (title, url) pair.
#
# Rows are gathered in a plain list and appended to `df` once at the end;
# calling pd.concat for every single row is quadratic in the number of links.
i = 0                 # running count of links seen (duplicates included)
records = []          # one {"Title": ..., "URL": ...} dict per link
seen_urls = set()     # article URLs already collected
visited_pages = {base_url, driver.current_url}  # listing pages already loaded

try:
    while True:
        content = driver.find_elements(By.CLASS_NAME, "mw-allpages-chunk")
        if len(content) < 1:
            # Report the page that actually failed and stop; indexing
            # content[0] here would raise IndexError.
            print(f"Some problem with link: {driver.current_url}")
            break

        new_links = 0
        for li in content[0].find_elements(By.TAG_NAME, "a"):
            title = li.get_attribute("title")
            url = li.get_attribute("href")
            records.append({"Title": title, "URL": url})
            i += 1
            if url not in seen_urls:
                seen_urls.add(url)
                new_links += 1

        # A page that contributes nothing new means the navigation sent us
        # back to a page we already scraped: the listing is exhausted.
        if new_links == 0:
            break

        # The last anchor of the last navigation bar is followed as "next".
        # On the final page that anchor is missing or points backwards, so
        # stop when there is no link or it was already visited.
        page_nav_classes = driver.find_elements(By.CLASS_NAME, "mw-allpages-nav")
        nav_links = page_nav_classes[-1].find_elements(By.TAG_NAME, "a") if page_nav_classes else []
        link = nav_links[-1].get_attribute("href") if nav_links else None
        if not link or link in visited_pages:
            break
        visited_pages.add(link)
        driver.get(link)
finally:
    # Runs on normal exit, on errors and on a manual interrupt, so whatever
    # was collected so far always ends up in `df`.
    if records:
        df = pd.concat([df, pd.DataFrame(records, columns=["Title", "URL"])], ignore_index=True)

In [None]:
# Drop fully identical (Title, URL) rows before writing the table out.
df = df.drop_duplicates()
# index=False keeps the pandas row index out of the CSV file.
df.to_csv("nepali_wikipedia_links.csv", index=False)

In [None]:
import requests

def get_wikipedia_links(language="ne", limit=500):
    """
    Return article URLs for every page listed by a Wikipedia's `allpages` API.

    The MediaWiki Action API is queried repeatedly, following the `continue`
    token of each response, until no further batch is offered.

    Args:
        language: Wikipedia language sub-domain (default ``"ne"``, Nepali).
        limit: Value sent as ``aplimit``, i.e. pages requested per API call.

    Returns:
        list[str]: One ``https://{language}.wikipedia.org/wiki/<title>`` URL
        per page title, in the order the API returned them. Spaces become
        underscores and the characters ``%``, ``?`` and ``#`` are
        percent-escaped so they are not read as escape/query/fragment markers;
        all other characters (including non-ASCII ones) are left as-is.

    Raises:
        RuntimeError: If the API response carries an ``error`` entry.
        requests.HTTPError: If a request returns an HTTP error status.
    """
    URL = f"https://{language}.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "list": "allpages",
        "aplimit": limit,  # Number of pages to return per request
        "format": "json"
    }

    # Characters that would change the meaning of a /wiki/<title> URL if left raw.
    reserved = str.maketrans({"%": "%25", "?": "%3F", "#": "%23"})

    article_links = []

    # The context manager closes the session's pooled connections on any exit.
    with requests.Session() as S:
        while True:
            # A timeout keeps one stalled request from hanging the whole run.
            response = S.get(url=URL, params=PARAMS, timeout=30)
            response.raise_for_status()
            data = response.json()

            # Surface API-level failures instead of a bare KeyError on 'query'.
            if "error" in data:
                raise RuntimeError(f"MediaWiki API error: {data['error']}")

            # Extract page titles and create links
            for page in data['query']['allpages']:
                slug = page['title'].replace(' ', '_').translate(reserved)
                article_links.append(f"https://{language}.wikipedia.org/wiki/{slug}")

            # Feed the continuation token back in to request the next batch
            if "continue" in data:
                PARAMS.update(data['continue'])
            else:
                break

    return article_links

# Pull the complete list of Nepali Wikipedia article URLs
nepali_wiki_links = get_wikipedia_links(language="ne")

# Preview: show the first ten entries
print(nepali_wiki_links[:10])

# Persist the list to disk, one URL per line
with open('nepali_wikipedia_links.txt', 'w', encoding='utf-8') as f:
    for link in nepali_wiki_links:
        print(link, file=f)


In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
import re

def _log_alert(message):
    """Append one line to the run log 'wiki_articles_alerts.txt'."""
    with open('wiki_articles_alerts.txt', 'a', encoding="utf-8") as f:
        f.write(message + "\n")


# Set up Chrome options for headless mode
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Create a folder for wiki articles if it doesn't exist
os.makedirs('wiki_articles', exist_ok=True)

# Read the links from 'nepali_wikipedia_links.txt', ignoring blank lines so an
# empty string is never handed to the browser
with open('nepali_wikipedia_links.txt', 'r', encoding='utf-8') as f:
    links = [line.strip() for line in f if line.strip()]

# Limit to first 10 links for testing
links = links[:10]

# The browser is started only after the inputs were read successfully, and is
# always shut down by the `finally` below, even on an interrupt or an
# unexpected error outside the per-link handler.
driver = webdriver.Chrome(options=chrome_options)
try:
    # `enumerate` ties each output file to the link's position in the list.
    # A manually incremented counter is skipped by `continue`, which would make
    # the file numbering depend on which links happened to fail.
    for i, link in enumerate(tqdm(links, desc="Scraping Wikipedia Links")):
        # Use the index 'i' as the filename instead of the heading
        filename = f'{i}.txt'
        article_path = f'wiki_articles/{filename}'

        # Already scraped on an earlier run: skip without reloading the page
        if os.path.exists(article_path):
            _log_alert(f"Skipped (already exists): {filename}")
            continue

        try:
            driver.get(link)
            time.sleep(0.1)

            # A page without any <h1> is logged as a problem and skipped
            heading = driver.find_elements(By.TAG_NAME, 'h1')
            if len(heading) < 1:
                _log_alert(f"Some problem with link (missing heading): {link}")
                continue

            # Try 'mw-body-content' first, then fall back to 'mw-parser-output'
            content = driver.find_elements(By.CLASS_NAME, 'mw-body-content')
            if len(content) < 1:
                content = driver.find_elements(By.CLASS_NAME, 'mw-parser-output')
                if len(content) < 1:
                    _log_alert(f"Some problem with link (missing content): {link}")
                    continue
            body = content[0]

            # Accumulate paragraph text and list items
            parts = [para.text + '\n' for para in body.find_elements(By.TAG_NAME, 'p')]
            parts.extend(
                '- ' + li.text + '\n'
                for li in body.find_elements(By.TAG_NAME, 'li')
                if li.text
            )
            data_str = ''.join(parts)

            # Append infobox text only if it hasn't been captured yet
            infobox = body.find_elements(By.CLASS_NAME, 'infobox')
            if infobox:
                infobox_text = '\n'.join([box.text for box in infobox]) + '\n'
                if infobox_text not in data_str:
                    data_str += infobox_text

            # Remove every ASCII letter (a-z, A-Z) from the collected text
            nepali_content = re.sub(r'[a-zA-Z]', '', data_str)

            # Save the content to a text file with the index number as filename
            with open(article_path, 'w', encoding="utf-8") as g:
                g.write(f'Link: {link}\n\n' + nepali_content)

            # Logged only when a file was actually written
            _log_alert(f"Successfully saved: {filename}")

        except Exception as e:
            # Best effort: record the failure and carry on with the next link
            _log_alert(f"Error processing link: {link}, Error: {e}")
finally:
    # Close the driver after scraping
    driver.quit()
