In [7]:
import os
from time import sleep

from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def setup_driver():
    """Build a Chrome WebDriver that runs without a visible window.

    Returns:
        The ``webdriver.Chrome`` instance created with the command-line
        flags listed below. The caller is responsible for calling
        ``quit()`` on it.
    """
    chrome_flags = (
        "--headless",  # Run in headless mode
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

# URLs to scrape, in scraping order. Most live on wsei.edu.pl, but one entry
# is on the biurokarier.wsei.edu.pl subdomain.
pages = [
    "https://wsei.edu.pl/oferta-edukacyjna/",
    "https://wsei.edu.pl/kandydaci/studia-i-stopnia/informatyka-stosowana/",
    "https://wsei.edu.pl/",
    "https://wsei.edu.pl/dzialalnosc-wsei/",
    "https://wsei.edu.pl/aktualnosci/",
    "https://biurokarier.wsei.edu.pl/praktyka-zawodowa/",
    "https://wsei.edu.pl/o-wsei/dlaczego-warto/",
]

def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    Args:
        element: A node exposing ``.parent.name`` (as the strings returned
            by BeautifulSoup's text search do).

    Returns:
        False when the node's parent is one of the non-rendered containers
        listed below, or when the node is an HTML comment; True otherwise.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, so the comment check only runs for nodes
    # whose parent is a rendered tag.
    hidden = (
        element.parent.name in non_rendered_parents
        or isinstance(element, Comment)
    )
    return not hidden


def text_from_html(body):
    """Extract the visible text of an HTML document as one string.

    Args:
        body: HTML markup handed to BeautifulSoup (the scraping loop passes
            ``driver.page_source``).

    Returns:
        The text nodes accepted by ``tag_visible``, each stripped of
        surrounding whitespace and joined with single spaces. Fragments that
        are 3 characters or shorter after stripping are dropped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # `find_all` is the current method name; `findAll` is only kept as a
    # deprecated alias. `string=True` matches every text node.
    stripped_fragments = (
        node.strip()
        for node in soup.find_all(string=True)
        if tag_visible(node)
    )
    # Strip once per node, then discard very short fragments.
    return " ".join(
        fragment for fragment in stripped_fragments if len(fragment) > 3
    )


# Characters that must not reach a file name: path separators, spaces and
# ':' become '_'; characters that are invalid on Windows are removed.
_FILENAME_CHARS = str.maketrans({
    '/': '_', '\\': '_', ' ': '_', ':': '_',
    '?': None, '&': None, '*': None, '"': None,
    '<': None, '>': None, '|': None,
})


def _page_filename(page):
    """Derive a filesystem-safe file stem from a page URL.

    Pages on the main ``wsei.edu.pl`` host are named after their path only
    (the bare site root becomes ``"home"``). Pages on any other host keep the
    host name in the stem so they cannot collide with main-site pages. The
    URL scheme is always dropped, so ``https:`` never ends up in a file name.

    Args:
        page: Absolute URL of the scraped page.

    Returns:
        The file name without extension.
    """
    main_host = 'wsei.edu.pl'
    stem = page.split('://', 1)[-1]
    if stem == main_host:
        stem = ''
    elif stem.startswith(main_host + '/'):
        stem = stem[len(main_host) + 1:]
    stem = stem.translate(_FILENAME_CHARS)
    return stem or "home"


# Maps page URL -> extracted visible text.
texts = {}

driver = setup_driver()
try:
    for page in pages:
        try:
            driver.get(page)
            sleep(2)  # Wait for dynamic content to load
            texts[page] = text_from_html(driver.page_source)
        except Exception as e:
            # Best effort: report the failing page and continue with the rest.
            print(f"Error scraping {page}: {str(e)}")
finally:
    # Always shut the browser down, including when the cell is interrupted
    # (KeyboardInterrupt is not an Exception and would otherwise skip this).
    driver.quit()

# Make sure the output directory exists before writing into it.
os.makedirs('./_data/raw', exist_ok=True)

for page, page_text in texts.items():
    page_filename = _page_filename(page)
    with open(f'./_data/raw/{page_filename}.txt', 'w', encoding='utf-8') as f:
        f.write(page_text)

KeyboardInterrupt: 

In [3]:
# Load the OpenAI API key from the local `.key` file. The context manager
# closes the file even if reading fails, and surrounding whitespace (such as
# a trailing newline added by an editor) is stripped so it cannot end up in
# the key.
with open('.key', 'r', encoding='utf-8') as key_file:
    OPENAI_KEY = key_file.read().strip()

In [4]:
from openai import OpenAI
# Shared OpenAI client, authenticated with the key held in OPENAI_KEY.
client = OpenAI(api_key=OPENAI_KEY)

In [5]:
def clean_text(text):
    """Ask the chat model to turn raw scraped text into readable text.

    The system prompt (written in Polish) tells the model to reformat raw
    browser text into a clear, ordered form without summarizing it, to append
    a very short summary at the end, and to answer in Polish.

    Args:
        text: Raw page text, sent as the user message.

    Returns:
        The message content of the first choice in the model's response.
    """
    system_prompt = "Jesteś narzędziem do czyszczenia tekstu. Twoim zadaniem jest przekształcenie surowego tekstu z przeglądarki w czytelną i zrozumiałą formę dla przeciętnego człowieka. Nie streszczaj go, ale zamiast tego sformatuj w jasny i uporządkowany sposób, zapewniając, że ma sens. Na końcu każdego tekstu dodaj bardzo krótkie podsumowanie. Wynik powinien być w języku polskim."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=15000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
import os
from tqdm import tqdm

# Collect the raw text files. os.listdir returns entries in arbitrary order,
# so sort them to make the processing order reproducible.
file_list = sorted(
    name for name in os.listdir('_data/raw') if name.endswith('.txt')
)

# Make sure the output directory exists before writing into it.
os.makedirs('./_data/clean', exist_ok=True)

# Clean each non-empty file and store the result under the same name.
for file in tqdm(file_list, desc="Processing files"):
    # Read and close the input first, so no file handle stays open during
    # the slow API call.
    with open(f'./_data/raw/{file}', 'r', encoding='utf-8') as raw_file:
        text = raw_file.read()

    # Whitespace-only files carry nothing worth sending to the model.
    if not text.strip():
        print(f"Skipped empty file {file}")
        continue

    summary = clean_text(text)
    with open(f'./_data/clean/{file}', 'w', encoding='utf-8') as clean_file:
        clean_file.write(summary)
    print(f"Finished processing {file}")

Processing files:  17%|█▋        | 1/6 [00:11<00:55, 11.16s/it]

Finished processing aktualnosci_.txt


Processing files:  33%|███▎      | 2/6 [00:21<00:43, 10.85s/it]

Finished processing dzialalnosc-wsei_.txt


Processing files:  50%|█████     | 3/6 [00:32<00:32, 10.76s/it]

Finished processing home.txt


Processing files:  67%|██████▋   | 4/6 [00:43<00:22, 11.03s/it]

Finished processing kandydaci_studia-i-stopnia_informatyka-stosowana_.txt


Processing files:  83%|████████▎ | 5/6 [00:54<00:10, 10.88s/it]

Finished processing o-wsei_dlaczego-warto_.txt


Processing files: 100%|██████████| 6/6 [01:00<00:00, 10.06s/it]

Finished processing oferta-edukacyjna_.txt



