In [None]:
pip install wikipedia-api

In [None]:
import wikipediaapi

In [None]:
# Wikipedia client configured for English pages with HTML extracts; the HTML
# form keeps the heading and paragraph tags that the parsing code in this
# notebook looks for.
wiki_html = wikipediaapi.Wikipedia(
    # NOTE(review): this looks like a placeholder identity - confirm it should
    # be replaced with real project/contact details before a large crawl.
    user_agent='MyProjectName (merlin@example.com)',
    language='en',
    extract_format=wikipediaapi.ExtractFormat.HTML
)
# Example usage, left disabled:
# p_html = wiki_html.page("(332446) 2008 AF4")
# print(p_html.text)

In [None]:
# Copy the `datasets` wheel from the attached Kaggle input into the working
# directory and install it from there.
!cp /kaggle/input/datasets-wheel/datasets-2.14.4-py3-none-any.whl /kaggle/working
!pip install  /kaggle/working/datasets-2.14.4-py3-none-any.whl
# Copy two attached dataset folders into the working directory.
# NOTE(review): the cells visible here load from /kaggle/input directly -
# confirm these copies are still needed.
!cp -r /kaggle/input/stem-wiki-cohere-no-emb /kaggle/working
!cp -r /kaggle/input/all-paraphs-parsed-expanded /kaggle/working/

In [None]:
from datasets import load_from_disk
# Take a look at what the final result looks like.
cohere_dataset_filtered = load_from_disk("/kaggle/input/all-paraphs-parsed-expanded")

# In this data, `text` is the content and `section` is the tag from the HTML.
# So one title has many sections, and one section has several texts.

# Convert to a pandas DataFrame for inspection.
df = cohere_dataset_filtered.to_pandas()

In [None]:
# Preview the first rows of the DataFrame currently bound to `df`.
df.head()

In [None]:
# stem-wiki-cohere-no-emb id	title	text	url	wiki_id	views	paragraph_id	langs
# all-paraphs-parsed-expanded 'title', 'section', 'text'

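# all-paraphs-parsed-expanded is derived from stem-wiki-cohere-no-emb by re-requesting the pages through the Wikipedia API.
# The logic: take a title as input and request the corresponding full page. `section` is a section of the page, e.g. History, Examples.
# `text` is the content belonging to that section.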

In [None]:
# Load the saved dataset whose "title" column supplies the page titles that
# are requested from Wikipedia later in this notebook.
cohere_dataset = load_from_disk("/kaggle/input/new-wiki-15-clusters")

In [None]:
# Convert the loaded dataset to a pandas DataFrame under the same name.
# The guard keeps this cell re-runnable: once the name is bound to a
# DataFrame (which has no `to_pandas` method) a second execution is a no-op
# instead of raising AttributeError.
if hasattr(cohere_dataset, "to_pandas"):
    cohere_dataset = cohere_dataset.to_pandas()

In [None]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup

from tqdm import tqdm
import pandas as pd
import regex as re

# Every distinct page title in the loaded data, plus a quick sanity print.
unique_titles = cohere_dataset["title"].unique()
print("The number of different titles:", len(unique_titles))
print("5 titles example: ", unique_titles[:5])

# Empty frame carrying the output schema (one row per extracted paragraph).
dataframe = pd.DataFrame(columns=["title", "section", "text"])

def clean_string(s):
    """Return `s` reduced to compact, ASCII-only text.

    The substitutions run in this fixed order:
      1. drop runs of '#',
      2. drop http(s):// and www. URLs (up to the next whitespace),
      3. drop every non-ASCII character,
      4. drop empty "()" pairs (typically left behind by step 3),
      5. collapse each whitespace run to one space.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'#+', ''),
        (r'https?://\S+|www\.\S+', ''),
        (r'[^\x00-\x7F]+', ''),
        (r'\(\)', ''),
        (r'\s+', ' '),
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def extract_section_and_text(title, html_content):
    """Split a page's HTML into one record per paragraph.

    Args:
        title: Page title, copied into every record.
        html_content: HTML text of the page.

    Returns:
        A list of dicts with keys "title", "section" and "text". The text is
        the paragraph's text passed through `clean_string`. `section` is:
          * "start" for paragraphs before the first heading, in document
            order. If the page has no heading at all, every paragraph is
            reported as "start" (previously such pages produced no records).
          * the heading text for `<p>` siblings that follow a heading, up to
            the next heading tag. Headings named "See also", "References",
            "Further reading" or "Bibliography" are skipped.
          * "end" for every `<p>` that follows the last heading.

    NOTE(review): the "end" pass is kept from the original implementation. It
    repeats paragraphs already emitted under the last heading's own section,
    and it also emits paragraphs that follow an excluded last heading -
    confirm whether downstream code relies on this before changing it.
    """
    heading_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    exclude_section = {"See also", "References", "Further reading", "Bibliography"}

    def make_record(section, p_tag):
        # One output row for a single <p> element.
        return {"title": title, "section": section, "text": clean_string(p_tag.get_text())}

    soup = BeautifulSoup(html_content, 'html.parser')
    headings = soup.find_all(heading_names)

    # A page without headings has no "before the first heading" anchor, so
    # treat all of its paragraphs as the lead section instead of dropping them.
    if not headings:
        return [make_record("start", p) for p in soup.find_all('p')]

    extracted_data = []

    # find_all_previous walks backwards from the heading; reverse it so the
    # lead paragraphs come out in reading order.
    lead_paragraphs = headings[0].find_all_previous('p')
    extracted_data.extend(make_record("start", p) for p in reversed(lead_paragraphs))

    for tag in headings:
        section = tag.get_text()
        if section in exclude_section:
            continue
        for sibling in tag.find_next_siblings():
            # Stop at the next real heading only; a prefix test on 'h' would
            # also stop at tags such as <hr>.
            if sibling.name in heading_names:
                break
            if sibling.name == 'p':
                extracted_data.append(make_record(section, sibling))

    for p in headings[-1].find_all_next('p'):
        extracted_data.append(make_record("end", p))

    return extracted_data

In [None]:
# Fetch and parse one chunk of titles using a thread pool.
import pandas as pd
import concurrent.futures
import requests
import json
from tqdm import tqdm

s = 8  # 1-based index of the 50,000-title chunk handled by this run
df = unique_titles[50000*(s-1):50000*(s)]  # titles of this chunk (name kept from the original cell)
out = []  # titles whose download or parsing failed
max_threads = 20  # maximum number of worker threads


def get_and_clean(use_title):
    """Download one page and return its paragraph records.

    Returns the list produced by `extract_section_and_text`. On any error the
    title is appended to `out`, a message is printed, and an empty list is
    returned so that a failed title adds no rows (it previously added a row
    of all-None values to the dataset).
    """
    try:
        p_html = wiki_html.page(use_title)
        return extract_section_and_text(use_title, p_html.text)
    except Exception as exc:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate.
        out.append(use_title)
        print(f"error {use_title}: {exc!r}")
        return []


# Collect plain records and build the DataFrame once; concatenating inside the
# loop copies the whole frame on every iteration.
records = []
try:
    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
        for result in tqdm(executor.map(get_and_clean, df), total=len(df)):
            records.extend(result)
finally:
    # Built in `finally` so an interrupted run still leaves the rows gathered
    # so far in `dataframe`.
    dataframe = pd.DataFrame(records, columns=["title", "section", "text"])

In [None]:
# Write the titles collected in `out` to out.csv as a single "title" column.
failed_titles = pd.DataFrame(out, columns=['title'])
failed_titles.to_csv('out.csv', index=False)

In [None]:
# Save as a dataset.
import datasets
# Note: this rebinds `df` to a `datasets.Dataset` built from `dataframe`
# (index reset first).
df = datasets.Dataset.from_pandas(dataframe.reset_index(drop=True))
# The output directory name embeds the bounds 50000*(s-1) and 50000*s of the
# processed chunk.
df.save_to_disk(f'new_wiki_emb_20_clusters_{50000*(s-1)}_{50000*(s)}')