In [1]:
import os
import requests
from tqdm import tqdm

def write_article_titles_to_txt(filename):
    # Base Url API wiki PT
    base_url = "https://pt.wikipedia.org/w/api.php"

    # Params to call API
    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "aplimit": "max"  # Max number of results per call
    }
    
    # Total de iterations (only to show on tqdm, not needed to be accurate)
    total_iterations = 500
    
    # Open file to write
    with open(filename, 'w', encoding='utf-8') as f:
        # Init tqdm
        progress_bar = tqdm(range(total_iterations), desc="Downloading", unit=" iteration", position=0, leave=True)
        # Do call to API
        for _ in progress_bar:
            response = requests.get(base_url, params=params)
            data = response.json()
            # Verify if the response has the key 'query' and 'allpages'
            if 'query' in data and 'allpages' in data['query']:
                # Write the title of each article in the file
                for page in data['query']['allpages']:
                    f.write(page['title'] + '\n')
                # Update progress bar
                file_size_mb = os.path.getsize(filename) / (1024 * 1024)
                progress_bar.set_postfix(file_size=f"{file_size_mb:.2f} MB")
                progress_bar.update(1)
                # If the response has the key 'continue', update the params to get the next page
                if 'continue' in data:
                    params['apcontinue'] = data['continue']['apcontinue']
                else:
                    break
            else:
                break
        # End tqdm
        progress_bar.close()

# Filename to write the article titles
filename = 'article_titles.txt'

# Write the article titles to the file
write_article_titles_to_txt(filename)

Downloading: 100%|██████████| 500/500 [07:07<00:00,  1.17 iteration/s, file_size=5.01 MB]


In [2]:
def read_article_titles_from_txt(filename):
    # Open the file to read
    with open(filename, 'r', encoding='utf-8') as f:
        # Read all lines from the file
        article_titles = f.readlines()
        # Remove the '\n' from the end of each line
        article_titles = [title.strip() for title in article_titles]
    return article_titles

# Filename to read the article titles
filename = 'article_titles.txt'

# Read the article titles from the file
article_titles = read_article_titles_from_txt(filename)

for title in article_titles[:20]:
    print(title)

!
!!
!!!
!!!Fuck You!!! and Then Some
!!! (álbum)
!!Destroy-Oh-Boy!!
!Action Pact!
!Bang!
!O!ung
!Oka Tokat
!Tchau Radar!
! (álbum)
"
"A" de Álibi
"Anos 70"
"Art"
"B" is for Burglar
"Buzz!!" The Movie
"Claudia Moda"
"Crocodylus" acer


In [4]:
import wikipediaapi

def save_article_to_txt(article_title, file_path):
    wiki_wiki = wikipediaapi.Wikipedia(user_agent='wikipedia_article_scraper',
                                       language='pt')
    page = wiki_wiki.page(article_title)
    if page.exists():
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(page.text)
        except Exception as e:
            pass   
        # print(f"Artigo '{article_title}' salvo em '{file_path}'.")
    else:
        print(f"O artigo '{article_title}' não foi encontrado.")
articles_to_download = article_titles

save_folder = "wikipedia_articles/"

for article_title in tqdm(articles_to_download):
    file_path = save_folder + article_title + ".txt"
    save_article_to_txt(article_title, file_path)

  0%|          | 1548/500000 [29:07<156:20:33,  1.13s/it]


KeyboardInterrupt: 

In [3]:
import os
import wikipediaapi
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def save_article_to_txt(article_title, file_path):
    wiki_wiki = wikipediaapi.Wikipedia(user_agent='wikipedia_article_scraper', language='pt')
    page = wiki_wiki.page(article_title)
    if page.exists():
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(page.text)
        except Exception as e:
            pass
    else:
        print(f"O artigo '{article_title}' não foi encontrado.")

def save_articles_multithreaded(articles_to_download, save_folder, max_workers=12):
    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = []
        for article_title in articles_to_download:
            file_path = os.path.join(save_folder, f"{article_title}.txt")
            future = executor.submit(save_article_to_txt, article_title, file_path)
            futures.append(future)
        for future in tqdm(futures, total=len(futures), desc="Downloading Articles"):
            future.result()  # Wait for each task to complete

# Lista de artigos da Wikipedia que você deseja baixar
articles_to_download = article_titles

# Pasta onde você deseja salvar os arquivos de texto
save_folder = "wikipedia_articles/"

# Baixar os artigos em paralelo
save_articles_multithreaded(articles_to_download, save_folder)

Downloading Articles:   3%|▎         | 6970/250000 [13:01<7:34:25,  8.91it/s]  


In [3]:
import os
import re

def is_valid_sentence(sentence, max_num_chars=0.4):
    # Calcular o número de caracteres que são dígitos
    num_chars = sum(c.isdigit() for c in sentence)
    # Calcular o total de caracteres na sentença
    total_chars = len(sentence)
    # Verificar se a proporção de caracteres numéricos não excede o limite especificado
    return num_chars / total_chars <= max_num_chars

def extract_sentences_from_file(input_file, output_file, max_sentence_words=100):
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Substituir caracteres de nova linha por espaços
    text = text.replace('\n', ' ')

    # Dividir o texto em sentenças usando expressões regulares
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    with open(output_file, 'a', encoding='utf-8') as f:
        for sentence in sentences:
            sentence = sentence.strip()
            # Verificar se a sentença é válida (não mais de 40% dos caracteres são números)
            if len(sentence.split()) >= 3 and len(sentence.split()) <= max_sentence_words and is_valid_sentence(sentence):
                f.write(sentence + '\n')

input_directory = r'C:\Users\levyb\Documents\Machine-Learning-Scratch\Pytorch\PLN\wikipedia_articles'
output_file = 'sentences_output.txt'

files = os.listdir(input_directory)

for file in files:
    if file.endswith('.txt'):
        input_file_path = os.path.join(input_directory, file)
        extract_sentences_from_file(input_file_path, output_file, max_sentence_words=100)


In [5]:
def remove_duplicate_lines(input_file, output_file):
    unique_lines = set()  # Conjunto para armazenar linhas únicas

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    for line in lines:
        # Adicionar a linha ao conjunto de linhas únicas
        unique_lines.add(line.strip())

    with open(output_file, 'w', encoding='utf-8') as f:
        # Escrever apenas as linhas únicas de volta para o arquivo
        for line in unique_lines:
            f.write(line + '\n')

# Arquivo de entrada e de saída
input_file = r'C:\Users\levyb\Documents\Machine-Learning-Scratch\Pytorch\PLN\sentences_output.txt'
output_file = r'C:\Users\levyb\Documents\Machine-Learning-Scratch\Pytorch\PLN\sentences_output_clean.txt'

# Remover linhas duplicadas
remove_duplicate_lines(input_file, output_file)