In [None]:
! pip install Wikipedia-API

In [None]:
import wikipediaapi
import re
import pandas as pd
from tqdm import tqdm
import json
import os 
import time 

"""DB & progress paths (better for the files to be stored in the drive
 in case you're using a notebook on the cloud for limited sessions)"""

# Folder that receives one .txt file per scraped article.
pathbd = "/content/drive/MyDrive/DB"  # Change this
# Folder that holds the input title list and the resumable progress file.
pathp = "/content/drive/MyDrive/ProgressScraping"  # Change this

# The title list is read from the progress folder. The path is derived from
# `pathp` so that changing the folder above is enough; it used to be spelled
# out a second time and would silently keep pointing at the old location.
# NOTE(review): read_csv treats the first line as a header by default; if
# titles.csv has no header row, the first title is consumed as the column
# name — confirm the file layout.
titles = pd.read_csv(os.path.join(pathp, "titles.csv"), sep='\t')

# Drop exact duplicate rows and renumber the index from 0.
article_titles = titles.drop_duplicates().reset_index(drop=True)

# Load the progress from a previous session, if any. The file maps every
# already-scraped title to True; a missing file simply means a first run.
progress_file = os.path.join(pathp, "scraping_progress.json")
progress = {}
try:
    with open(progress_file, 'r') as file:
        progress = json.load(file)
except FileNotFoundError:
    # Nothing scraped yet: start with an empty progress record.
    pass

# Depending on what you're looking for, you may want to keep some of what clean_text removes.

def clean_text(text):
    """Normalise raw article text into a single whitespace-collapsed line.

    The steps are applied in this order:

    1. Every ``[...]`` span is deleted (shortest match, never across a
       newline).
    2. Every line that *starts* with ``http://`` or ``https://`` is deleted,
       together with the line breaks that follow it. URLs appearing in the
       middle of a line are left alone.
    3. Each run of whitespace (including newlines) becomes one space.
    4. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_brackets = re.sub(r'\[.*?\]', '', text)
    without_url_lines = re.sub(
        r'^https?:\/\/.*[\r\n]*', '', without_brackets, flags=re.MULTILINE
    )
    single_spaced = re.sub(r'\s+', ' ', without_url_lines)
    return single_spaced.strip()

# Function to scrape the content of a Wikipedia article

# Shared API client, created once and reused for every request.
# `language` is left empty here as a placeholder: fill in the code of the
# Wikipedia edition to scrape before running.
# NOTE(review): recent Wikipedia-API releases appear to also require a
# `user_agent` argument — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
    language='',  # language
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)


def scrape_wikipedia_article(article_title):
    """Fetch the text of one Wikipedia article.

    Args:
        article_title: Title of the page to look up through ``wiki_wiki``.

    Returns:
        The page's ``text`` when the page exists, otherwise ``False``.
    """
    page = wiki_wiki.page(article_title)
    return page.text if page.exists() else False

# Scrape the content for each article title and save it as a text file.
#
# Iterating a DataFrame directly yields its column labels rather than its
# rows, so the titles are pulled out of the frame explicitly before looping.
# Missing values are dropped and everything is converted to str so that the
# string operations below cannot fail on NaN or numeric-looking titles.
# NOTE(review): assumes the titles are in the first column of titles.csv — confirm.
titles_to_scrape = (
    article_titles.iloc[:, 0].dropna().astype(str).drop_duplicates().tolist()
)

# Create the output folder up front so the first write cannot fail on a
# missing directory.
os.makedirs(pathbd, exist_ok=True)

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(titles_to_scrape) as pbar:
    for title in pbar:
        if title in progress:
            # Skip already scraped articles
            pbar.set_postfix({'status': 'Skipped'})
            continue

        # Pause between requests to avoid hammering the API.
        time.sleep(0.5)
        try:
            article_content = scrape_wikipedia_article(title)
        except Exception as exc:
            # Best effort: report the problem and move on. The title is not
            # recorded in `progress`, so it is retried on the next run.
            # KeyboardInterrupt is deliberately not caught, so the loop can
            # still be stopped by hand.
            pbar.set_postfix({'status': 'Error'})
            print(f"Error while scraping article: {title} ({exc!r})")
            continue

        if article_content:
            # Save the content in a text file; '/' cannot appear in a file name.
            filename = f"{title.replace('/','_')}.txt"
            filename = os.path.join(pathbd, filename)
            with open(filename, "w", encoding="utf-8") as file:
                file.write(clean_text(article_content))

            # Persist the progress after every article so that an interrupted
            # session can resume where it stopped.
            progress[title] = True
            with open(progress_file, 'w') as file:
                json.dump(progress, file)

            pbar.set_postfix({'status': 'Scraped'})
            print(f"Content saved for article: {title}")
        else:
            # The page does not exist or came back empty.
            pbar.set_postfix({'status': 'Failed'})
            print(f"Failed to scrape content for article: {title}")


In [None]:
"""Some help with the post processing of the text"""
# Folder in Google Drive that holds the scraped text files.
directory_path = '/content/drive/MyDrive/DB'

# Substrings that mark a file for deletion (for example namespace prefixes
# such as "user:"). Replace the placeholder with the ones you need.
strings_to_search = ['something'] #some ids to removes like "user:" '

# Snapshot of the folder content before anything is deleted.
files = os.listdir(directory_path)

# Delete every file whose name contains at least one of the substrings.
for file_name in files:
    if any(string in file_name for string in strings_to_search):
        file_path = os.path.join(directory_path, file_name)
        os.remove(file_path)
        print(f"Removed file: {file_name}")

In [None]:
# List the folder again and report how many entries it currently contains.
files = os.listdir(directory_path)
remaining_count = len(files)
print(remaining_count)