# Chayka/Vechirka

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re

def scrape_chayka_links():
    """
    Scrape article links from an archived snapshot of the Chayka.org front page.

    Collects every ``<a rel="bookmark">`` whose href contains "node" and whose
    text is non-empty and does not contain ".jpg".

    Returns:
        dict: On success, ``{"bookmark_links": [...], "count": int,
        "source_url": str}``; each entry of ``bookmark_links`` is a dict with
        the keys ``"text"``, ``"relative_url"`` and ``"full_url"``.
        On failure, ``{"error": str}``.
    """
    archive_origin = "https://web.archive.org"
    snapshot_prefix = f"{archive_origin}/web/20250202215959/https://www.chayka.org"

    # URL of the archived page
    url = f"{snapshot_prefix}/"

    # Add a user agent to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # Add a small delay to be respectful to the web.archive.org servers
        time.sleep(1)

        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=60)

        # Raise an exception for bad status codes
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        bookmark_links = []
        for link in soup.find_all('a', rel='bookmark'):
            href = link.get('href', '')
            link_text = link.text

            if "node" in href and ".jpg" not in link_text and len(link_text) > 0:
                # The archived page serves hrefs that already start with
                # "/web/<timestamp>/<original url>", so prefixing the snapshot
                # base again would duplicate it. Only truly site-relative
                # hrefs need the snapshot prefix.
                if href.startswith("//"):
                    full_url = f"https:{href}"
                elif href.startswith("/web/"):
                    full_url = f"{archive_origin}{href}"
                elif href.startswith(("http://", "https://")):
                    full_url = href
                else:
                    full_url = f"{snapshot_prefix}/{href.lstrip('/')}"

                bookmark_links.append({
                    "text": link_text.strip(),
                    "relative_url": href,
                    "full_url": full_url
                })

        return {
            "bookmark_links": bookmark_links,
            "count": len(bookmark_links),
            "source_url": url
        }

    except requests.exceptions.RequestException as e:
        return {"error": f"Request error: {str(e)}"}
    except Exception as e:
        return {"error": f"General error: {str(e)}"}


In [None]:
def main():
    """Run the Chayka scraper and print every bookmark link it returns."""
    print(f"Starting to scrape bookmark links from Chayka...")
    outcome = scrape_chayka_links()

    if "error" not in outcome:
        print(f"Found {outcome['count']} bookmark links on the page.\n")

        print("Bookmark Links:")
        separator = "-" * 80
        for position, entry in enumerate(outcome['bookmark_links'], start=1):
            print(f"{position}. Text: {entry['text']}")
            print(f"   URL: {entry['relative_url']}")
            print(f"   Full URL: {entry['full_url']}")
            print(separator)
    else:
        # The scraper reports failures as a dict with a single "error" key.
        print(outcome["error"])

    print("Scraping completed.")

if __name__ == "__main__":
    main()

Starting to scrape bookmark links from Chayka...
Found 17 bookmark links on the page.

Bookmark Links:
1. Text: Православная церковь в мире. Zoom-встреча с отцом Джованни Гуайтой 9 февраля
   URL: /web/20250202215959/https://www.chayka.org/node/15874
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.org/web/20250202215959/https://www.chayka.org/node/15874
--------------------------------------------------------------------------------
2. Text: Согласись, Маня! Доводы мужа, который не хочет встать с дивана
   URL: /web/20250202215959/https://www.chayka.org/node/15912
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.org/web/20250202215959/https://www.chayka.org/node/15912
--------------------------------------------------------------------------------
3. Text: На водосборах Ойкумены. В продолжение темы
   URL: /web/20250202215959/https://www.chayka.org/node/15911
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.

# Vechirka (partial)

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re

def scrape_vechirka_links():
    """
    Scrape article links from an archived snapshot of the vechirka.pl.ua front page.

    Collects every ``<a>`` whose href contains "ukrayina" and ends with a
    digit, and whose text is non-empty.

    Returns:
        dict: On success, ``{"bookmark_links": [...], "count": int,
        "source_url": str}``; each entry of ``bookmark_links`` is a dict with
        the keys ``"text"``, ``"relative_url"`` and ``"full_url"``.
        On failure, ``{"error": str}``.
    """
    archive_origin = "https://web.archive.org"
    snapshot_prefix = f"{archive_origin}/web/20220203004906/http://vechirka.pl.ua"

    # URL of the archived page
    url = f"{snapshot_prefix}/"

    # Add a user agent to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # Add a small delay to be respectful to the web.archive.org servers
        time.sleep(1)

        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=60)

        # Raise an exception for bad status codes
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        bookmark_links = []
        for link in soup.find_all('a'):
            href = link.get('href', '')
            link_text = link.text

            # Article links sit under "ukrayina" and end with a numeric id.
            if "ukrayina" in href and len(link_text) > 0 and href[-1] in "0123456789":
                # Build the URL against the vechirka snapshot (not the Chayka
                # one). Archived hrefs already start with
                # "/web/<timestamp>/<original url>", so they only need the
                # archive origin in front.
                if href.startswith("//"):
                    full_url = f"https:{href}"
                elif href.startswith("/web/"):
                    full_url = f"{archive_origin}{href}"
                elif href.startswith(("http://", "https://")):
                    full_url = href
                else:
                    full_url = f"{snapshot_prefix}/{href.lstrip('/')}"

                bookmark_links.append({
                    "text": link_text.strip(),
                    "relative_url": href,
                    "full_url": full_url
                })

        return {
            "bookmark_links": bookmark_links,
            "count": len(bookmark_links),
            "source_url": url
        }

    except requests.exceptions.RequestException as e:
        return {"error": f"Request error: {str(e)}"}
    except Exception as e:
        return {"error": f"General error: {str(e)}"}


In [None]:
def main():
    """Run the vechirka.pl.ua scraper and print every link it returns."""
    print(f"Starting to scrape bookmark links from vechirka.pl.ua...")
    # Call the vechirka scraper; the Chayka one was invoked here by mistake.
    result = scrape_vechirka_links()

    if "error" in result:
        print(result["error"])
    else:
        print(f"Found {result['count']} bookmark links on the page.\n")

        print("Bookmark Links:")
        for i, link in enumerate(result['bookmark_links'], 1):
            print(f"{i}. Text: {link['text']}")
            print(f"   URL: {link['relative_url']}")
            print(f"   Full URL: {link['full_url']}")
            print("-" * 80)

    print("Scraping completed.")

if __name__ == "__main__":
    main()

Starting to scrape bookmark links from vechirka.pl.ua...
Found 9 bookmark links on the page.

Bookmark Links:
1. Text: «Я, звичайно, не хочу нікого нaлякa...
   URL: /web/20220203004906/http://vechirka.pl.ua/ukrayina/1614239425
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.org/web/20220203004906/http://vechirka.pl.ua/ukrayina/1614239425
--------------------------------------------------------------------------------
2. Text: Чому маємо безіменних загиблих захи...
   URL: /web/20220203004906/http://vechirka.pl.ua/ukrayina/1576827290
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.org/web/20220203004906/http://vechirka.pl.ua/ukrayina/1576827290
--------------------------------------------------------------------------------
3. Text: Росіяни вважають США та Україну най...
   URL: /web/20220203004906/http://vechirka.pl.ua/ukrayina/1573717439
   Full URL: https://web.archive.org/web/20250202215959/https://www.chayka.org/web/20220203004

# Full Vechirka newspaper scraping pipeline (updated on March 28)

## 1. Get the date & time from the Wayback Machine
## 2. Know the max_articles to scrape on that page
## 3. Know the column of interest (policy analytics/on the front line)
## 4. Obtain the relative URL (without Wayback Machine archive marks, which can make the content inaccessible)
## 5. Get content using the "extract content pipeline" function
## 6. Get a JSON file with all articles

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import json

In [None]:
def get_vechirka_links(base_url):
    """
    Collect article links from an archived vechirka.pl.ua page.

    Args:
        base_url (str): Wayback Machine URL of the page to scan. Its
            ``web/<digits>/`` timestamp is reused for site-relative links;
            "20220203004906" is used when no timestamp is present.

    Returns:
        list[dict]: One dict per matching link with the keys ``"text"``,
        ``"relative_url"`` (the href exactly as found), ``"full_url"`` and
        ``"article_content"`` (always ``None`` here). An empty list is
        returned, after printing the error, if anything fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        time.sleep(1)  # Be polite to the server
        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(base_url, headers=headers, timeout=60)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # The snapshot timestamp depends only on base_url, so resolve it once.
        timestamp_match = re.search(r'web/(\d+)/', base_url)
        archive_timestamp = timestamp_match.group(1) if timestamp_match else "20220203004906"
        archive_origin = "https://web.archive.org"
        snapshot_prefix = f"{archive_origin}/web/{archive_timestamp}/http://vechirka.pl.ua"

        bookmark_links = []
        for link in soup.find_all('a'):
            href = link.get('href', '')
            link_text = link.text.strip()

            # Article links contain "ukrayina" and end with a digit.
            if "ukrayina" in href and len(link_text) > 0 and href[-1] in "0123456789":
                # Archived hrefs already start with "/web/<timestamp>/<original url>";
                # prefixing the snapshot base again would duplicate it.
                if href.startswith("//"):
                    full_url = f"https:{href}"
                elif href.startswith("/web/"):
                    full_url = f"{archive_origin}{href}"
                elif href.startswith(("http://", "https://")):
                    full_url = href
                else:
                    full_url = f"{snapshot_prefix}/{href.lstrip('/')}"

                bookmark_links.append({
                    "text": link_text,
                    "relative_url": href,
                    "full_url": full_url,
                    "article_content": None  # Will be filled in later
                })

        return bookmark_links

    except Exception as e:
        print(f"Error getting vechirka links: {str(e)}")
        return []

In [None]:
!pip install deep-translator
from deep_translator import GoogleTranslator
def deep_translate_to_english(ukrainian_text):
    """
    Translate Ukrainian text to English, sending it in pieces of at most 500 characters.

    Long text is cut at the last whitespace inside each 500-character window,
    so words are not split between two requests. Each piece keeps its own
    trailing whitespace so the translated pieces do not run together.

    Args:
        ukrainian_text (str): Text to translate.

    Returns:
        str: The English translation; an empty string for empty or
        whitespace-only input.
    """
    print('deep translating now...')
    print('length of the text: ', len(ukrainian_text))

    max_chunk = 500

    if not ukrainian_text.strip():
        return ''

    translator = GoogleTranslator(source='uk', target='en')

    if len(ukrainian_text) < max_chunk:
        # Guard against a None result so callers always get a string.
        return translator.translate(ukrainian_text) or ''

    pieces = []
    start = 0
    total = len(ukrainian_text)
    while start < total:
        end = min(start + max_chunk, total)
        if end < total:
            # Back up to the last whitespace in the window; fall back to a
            # hard cut only when the window has no whitespace at all.
            boundary = end
            while boundary > start and not ukrainian_text[boundary - 1].isspace():
                boundary -= 1
            if boundary > start:
                end = boundary

        chunk = ukrainian_text[start:end]
        if chunk.strip():
            translated = translator.translate(chunk)
            if translated:
                trailing_whitespace = chunk[len(chunk.rstrip()):]
                pieces.append(translated.rstrip() + trailing_whitespace)
        start = end

    return ''.join(pieces)

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
def extract_article_content_pipepline(article_url):
    """
    Download an article page, extract its text and translate it to English.

    The text is taken from every ``<div class="field-item even">`` on the
    page, joined with blank lines, then passed to
    ``deep_translate_to_english``.

    Args:
        article_url (str): URL of the article page to fetch.

    Returns:
        str: The translated article text, or a string starting with
        "Error extracting article: " if any step fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        time.sleep(2)  # Longer delay for article pages to be extra polite
        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(article_url, headers=headers, timeout=60)

        response.raise_for_status()

        # The original check was inverted and reported "OK" only for non-200 codes.
        if response.status_code == 200:
            print('Webscrapping response OK.')

        soup = BeautifulSoup(response.text, 'html.parser')

        # Content lives in divs with class "field-item even" (like in Chayka)
        article_divs = soup.find_all('div', class_='field-item even')
        paragraphs = [div.get_text(strip=False) for div in article_divs]

        # Join paragraphs with double newlines to preserve structure
        content = "\n\n".join(paragraphs)

        content = deep_translate_to_english(content)

        return content

    except Exception as e:
        return f"Error extracting article: {str(e)}"

In [None]:
# def extract_article_content(article_url):
#     """
#     Visit an article page and extract the content.
#     For vechirka.pl.ua, we'll look for the main article content.
#     """
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
#     }

#     try:
#         time.sleep(2)  # Longer delay for article pages to be extra polite
#         response = requests.get(article_url, headers=headers)
#         response.raise_for_status()

#         soup = BeautifulSoup(response.text, 'html.parser')

#         # First try to find content in a div with class field-item even (like in Chayka)
#         article_div = soup.find_all('div', class_='field-item even')

#         #article_div = soup.find('p')


#         paragraphs = []
#         for p in article_div:
#         # for p in article_div.find_all('p'):
#             paragraphs.append(p.get_text(strip=True))

#         # Join paragraphs with double newlines to preserve structure
#         content = "\n\n".join(paragraphs)

#         # If no paragraphs were found, get all text from the div
#         if not content:
#             content = article_div.get_text(strip=True)

#         return content

#     except Exception as e:
#         return f"Error extracting article: {str(e)}"


def scrape_vechirka_articles(base_url, max_articles=5):
    """
    Find article links on an archived vechirka.pl.ua page and fetch their content.

    Each article is fetched from its original URL, i.e. the href with the
    Wayback Machine ``/web/<timestamp>/`` prefix removed.

    Args:
        base_url (str): Wayback Machine URL of the listing page.
        max_articles (int): Maximum number of articles to fetch.

    Returns:
        dict: ``{"error": str}`` when no links are found; otherwise
        ``{"total_links_found": int, "articles_scraped": int,
        "articles": list}`` where each article dict has its
        ``"article_content"`` filled in.
    """

    print("1. Finding relevant links on the main page...")
    vechirka_links = get_vechirka_links(base_url)

    if not vechirka_links:
        return {"error": "No relevant links found"}

    # Limit the number of articles to scrape
    articles_to_scrape = vechirka_links[:max_articles]

    # Strip the archive prefix by pattern rather than by counting path
    # segments, so article paths of any depth resolve to the right URL.
    wayback_prefix = re.compile(r'^(?:https?:)?(?://web\.archive\.org)?/web/\d+[a-z_]*/')
    urls = []
    for article in articles_to_scrape:
        original_url = wayback_prefix.sub('', article['relative_url'])
        if original_url.startswith('/'):
            # Site-relative href that never had an archive prefix.
            original_url = 'http://vechirka.pl.ua' + original_url
        urls.append(original_url)

    print(f"2. Found {len(vechirka_links)} links. Will scrape {len(urls)} articles...")

    for i, article_url in enumerate(urls):
        print(f"   Scraping article {i+1}/{len(urls)}: {article_url[:40]}...")
        articles_to_scrape[i]['article_content'] = extract_article_content_pipepline(article_url)

    return {
        "total_links_found": len(vechirka_links),
        "articles_scraped": len(urls),
        "articles": articles_to_scrape
    }

In [None]:
# Save to json
# def save_to_file(data, filename="Vechirka Articles 20230131051233 [English].json"):
#     """Save the scraped data to a JSON file"""
#     with open(filename, 'w', encoding='utf-8') as f:
#         json.dump(data, f, ensure_ascii=False, indent=2)
#     print(f"Data saved to {filename}")

# New: save to csv
import csv
def save_to_file(data, filename=None):
    """
    Save scraped articles to a CSV file, one row per article.

    Args:
        data (dict | list): Either the result dict returned by
            ``scrape_vechirka_articles`` (its ``"articles"`` list supplies the
            rows and every other key is repeated on each row), or a plain list
            of article dicts. Article dicts are read through the keys
            ``"text"``, ``"article_content"`` and ``"relative_url"``.
            ``data`` is not modified.
        filename (str, optional): Name of the output CSV file. If omitted, a
            timestamped default name is generated; ".csv" is appended when
            missing.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Generate default filename if not provided
    if filename is None:
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        filename = f"Vechirka Articles {timestamp} [English].csv"

    # Ensure the filename has .csv extension
    if not filename.lower().endswith('.csv'):
        filename += '.csv'

    if not data:
        print("No data to save.")
        return

    # Split the input into per-run summary values and the article list
    # without touching the caller's object.
    if isinstance(data, dict):
        articles = data.get('articles') or []
        summary = {key: value for key, value in data.items() if key != 'articles'}
    else:
        articles = list(data)
        summary = {}

    if not articles:
        print("No data to save.")
        return

    # DictWriter.writerows needs an iterable of row dicts; passing the result
    # dict itself made it iterate over the key strings and fail.
    rows = [
        {
            **summary,
            'Title': article.get('text'),
            'Content': article.get('article_content'),
            'URL': article.get('relative_url'),
        }
        for article in articles
    ]
    fieldnames = list(rows[0].keys())

    print(fieldnames)
    # Write to CSV
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

        print(f"Data saved to {filename}")
        print(f"Total articles saved: {len(rows)}")

    except IOError as e:
        print(f"Error saving file: {e}")
    except Exception as e:
        print(f"Unexpected error occurred: {e}")

In [None]:
def main():
    """Scrape the configured 'na-liniyi-frontu' snapshot, preview the first article and save the result."""
    print("Starting vechirka.pl.ua article scraper...")

    # Keep the number of fetched articles small to be gentle on the servers.
    # base_url = 'http://vechirka.pl.ua/'
    base_url = 'https://web.archive.org/web/20230131051233/http://vechirka.pl.ua/ukrayina/na-liniyi-frontu'
    max_articles = 1
    outcome = scrape_vechirka_articles(base_url, max_articles)

    if "error" in outcome:
        print(f"Error: {outcome['error']}")
        return

    print(f"\nSuccessfully scraped {outcome['articles_scraped']} articles out of {outcome['total_links_found']} links found.")

    # Display sample of the first article
    if outcome['articles']:
        sample = outcome['articles'][0]
        print("\nSample of first article:")
        print(f"Title: {sample['text']}")
        print(f"URL: {sample['full_url']}")

        # Show at most the first 200 characters of content
        body = sample['article_content']
        if len(body) > 200:
            content_preview = body[:200] + "..."
        else:
            content_preview = body
        print(f"Content: {content_preview}")

    # Save all data to a file
    save_to_file(outcome)

if __name__ == "__main__":
    main()

Starting vechirka.pl.ua article scraper...
1. Finding relevant links on the main page...
2. Found 66 links. Will scrape 1 articles...
   Scraping article 1/1: http://vechirka.pl.ua/ukrayina/164517005...
deep translating now...
length of the text:  679

Successfully scraped 1 articles out of 66 links found.

Sample of first article:
Title: Термін, за який можна витратити «ко...
URL: https://web.archive.org/web/20230131051233/http://vechirka.pl.ua/web/20230131051233/http://vechirka.pl.ua/ukrayina/1645170055
Content: The term of use of 1,000 UAH "support" is 9 months instead of 4. This was reported on the Page "Action" on Facebook.
If you have a "green" CVID certificate in "Action" and have not yet applied for pay...
['total_links_found', 'articles_scraped', 'Title', 'Content', 'URL']
Unexpected error occurred: 'str' object has no attribute 'keys'


# Extract one article

In [None]:
!pip install deep-translator
from deep_translator import GoogleTranslator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import json

In [None]:
def extract_article_content_pipepline(article_url):
    """
    Download one article page and return its text without translating it.

    The text of every ``<div class="field-item even">`` is collected and the
    pieces are joined with blank lines. Any failure is reported as a string
    starting with "Error extracting article: " instead of being raised.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # Pause before the request to be extra polite to the server
        time.sleep(2)
        page = requests.get(article_url, headers=request_headers)
        page.raise_for_status()

        # Status diagnostics
        print('here', page.status_code)
        print(page.status_code != 200)

        parsed = BeautifulSoup(page.text, 'html.parser')

        # Content lives in divs with class "field-item even" (like in Chayka)
        content_blocks = parsed.find_all('div', class_='field-item even')
        texts = [block.get_text(strip=False) for block in content_blocks]

        # Blank lines between blocks preserve the structure
        return "\n\n".join(texts)

    except Exception as e:
        return f"Error extracting article: {str(e)}"

# First candidate article URL (noted by the author as working); it is
# overwritten by the next assignment before it is ever used.
full_url = 'http://vechirka.pl.ua/ukrayina/1618471692' # this works

# The article that is actually fetched below.
full_url = 'http://vechirka.pl.ua/ukrayina/1617267033'

# Untranslated article text, or an "Error extracting article: ..." string on failure.
uk_output = extract_article_content_pipepline(full_url)

# def translate_to_english(ukrainian_text):
#     translator = Translator()
#     ukrainian_text_list = ukrainian_text.split(' ')[:]
#     english_translation = ''
#     print(len(ukrainian_text_list))
#     for i in range(len(ukrainian_text_list)):
#         word = ukrainian_text_list[i].strip()
#         # translated_word = translator.translate(word, src='uk', dest='en').text
#         translated_word = GoogleTranslator(source='uk', target='en').translate(word)
#         english_translation += str(translated_word) + ' '
#     print(english_translation, len(english_translation))
#     return english_translation

def deep_translate_to_english(ukrainian_text):
    """
    Translate Ukrainian text to English, sending it in pieces of at most 500 characters.

    Long text is cut at the last whitespace inside each 500-character window,
    so words are not split between two requests. Each piece keeps its own
    trailing whitespace so the translated pieces do not run together.

    Args:
        ukrainian_text (str): Text to translate.

    Returns:
        str: The English translation; an empty string for empty or
        whitespace-only input.
    """
    max_chunk = 500

    if not ukrainian_text.strip():
        return ''

    translator = GoogleTranslator(source='uk', target='en')

    if len(ukrainian_text) < max_chunk:
        # Guard against a None result so callers always get a string.
        return translator.translate(ukrainian_text) or ''

    pieces = []
    start = 0
    total = len(ukrainian_text)
    while start < total:
        end = min(start + max_chunk, total)
        if end < total:
            # Back up to the last whitespace in the window; fall back to a
            # hard cut only when the window has no whitespace at all.
            boundary = end
            while boundary > start and not ukrainian_text[boundary - 1].isspace():
                boundary -= 1
            if boundary > start:
                end = boundary

        chunk = ukrainian_text[start:end]
        if chunk.strip():
            translated = translator.translate(chunk)
            if translated:
                trailing_whitespace = chunk[len(chunk.rstrip()):]
                pieces.append(translated.rstrip() + trailing_whitespace)
        start = end

    return ''.join(pieces)



# Translate the extracted text to English.
content = deep_translate_to_english(uk_output)

def save_to_file(data, filename):
    """
    Save scraped text to ``<filename>[english].txt``.

    Args:
        data (str | list): A single string, written as-is, or a list whose
            items are written one per line.
        filename (str): Base name of the output file; "[english].txt" is
            appended to it.

    Raises:
        ValueError: If ``data`` is neither a string nor a list. The check
            runs before the file is opened, so no empty file is left behind.
    """
    if isinstance(data, list):
        text = "".join(f"{line}\n" for line in data)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("Data must be a string or a list of strings")

    filename = filename+'[english].txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Data saved to {filename}")

# Use the last path segment of the article URL as the base file name.
save_to_file(content, full_url.split("/")[-1])

here 200
False
Data saved to 1617267033[english].txt


# **"On the Frontline"** Scraping

In [None]:
!pip install googletrans==4.0.0-rc1
from googletrans import Translator

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
import json
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [None]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [None]:
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.1.0 webdriver_manager-4.0.2


In [None]:
!pip install selectolax

Collecting selectolax
  Downloading selectolax-0.3.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading selectolax-0.3.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.3.28


In [None]:
def translate_to_english(ukrainian_text):
    """Translate the given text to English with a fresh ``Translator`` and return the translated string."""
    return Translator().translate(ukrainian_text, dest='en').text

# Front-page snapshot URLs, obtained from the Wayback Machine without opening the page on WBM
sample_link_frontline_news = [
    'https://web.archive.org/web/20211118031812/http://vechirka.pl.ua/',
    'https://web.archive.org/web/20220916000436/http://vechirka.pl.ua/',
    'https://web.archive.org/web/20250221001907/http://vechirka.pl.ua/',
    'https://web.archive.org/web/20230309045149/http://vechirka.pl.ua/',
]

# Human-readable section name -> URL path of that section
section_key_word = {
    'policy - analytics': 'polityka/analityka',
    'policy - current news': 'polityka/aktualni-novyny',
    'ukraine - on the frontline': 'ukrayina/na-liniyi-frontu',
}

# Section URL built from the last snapshot in the list ...
sample_link = sample_link_frontline_news[-1] + section_key_word['ukraine - on the frontline']
# ... immediately replaced by a fixed snapshot of the same section.
sample_link = 'https://web.archive.org/web/20230131051233/http://vechirka.pl.ua/ukrayina/na-liniyi-frontu'
print(sample_link)

https://web.archive.org/web/20230131051233/http://vechirka.pl.ua/ukrayina/na-liniyi-frontu


In [None]:
def get_vechirka_links(base_url):
    """
    Collect article links from an archived vechirka.pl.ua page.

    A link qualifies when its href contains "ukrayina" and ends with a digit
    and its stripped text is non-empty.

    Args:
        base_url (str): URL of the page to scan.

    Returns:
        list[tuple[str, str]]: ``(link_text, href)`` pairs, with ``href``
        exactly as found in the page. An empty list is returned, after
        printing the error, if anything fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        #time.sleep(1)  # Be polite to the server
        print('Getting the request.')
        # A timeout keeps the notebook from hanging forever on a stalled connection
        response = requests.get(base_url, headers=headers, timeout=60)
        print('Got the request.')
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        print('Get soup.')

        # The per-anchor debug print and the archive-URL computation were
        # dropped: the URL was built for every match but never used.
        bookmark_links = []
        for link in soup.find_all('a'):
            href = link.get('href', '')
            link_text = link.text.strip()

            if "ukrayina" in href and len(link_text) > 0 and href[-1] in "0123456789":
                bookmark_links.append((link_text, href))

        return bookmark_links

    except Exception as e:
        print(f"Error getting vechirka links: {str(e)}")
        return []

In [None]:
import re
import time
from typing import List, Tuple

# Selenium Approach
def get_vechirka_links_selenium(base_url: str) -> List[Tuple[str, str]]:
    """
    Collect article links from a vechirka.pl.ua page using headless Chrome.

    A link qualifies when its href contains "ukrayina" and ends with a digit
    and its visible text is non-empty.

    Args:
        base_url: URL of the page to load.

    Returns:
        ``(link_text, href)`` pairs for every matching link. An empty list is
        returned, after printing the error, if anything fails.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager

    driver = None
    try:
        # Configure Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in background
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Setup WebDriver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to page
        driver.get(base_url)
        time.sleep(2)  # Wait for page to load

        # The unused archive-URL computation was dropped from this loop.
        bookmark_links = []
        for link in driver.find_elements(By.TAG_NAME, 'a'):
            href = link.get_attribute('href') or ''
            link_text = link.text.strip()

            if "ukrayina" in href and len(link_text) > 0 and href[-1] in "0123456789":
                bookmark_links.append((link_text, href))

        return bookmark_links

    except Exception as e:
        print(f"Selenium Scraping Error: {str(e)}")
        return []

    finally:
        # Always shut the browser down, including on errors, so failed runs
        # do not leave Chrome processes behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Selenium Scraping Error: {str(quit_error)}")


In [None]:
def extract_article_content(article_url):
    """
    Fetch an article page and return its text content.

    The text of every ``<div class="field-item even">`` on the page is
    collected and joined with blank lines.

    Args:
        article_url: URL of the article page to download.

    Returns:
        The joined article text. On any failure (network error, bad status,
        or no matching content on the page) a string starting with
        "Error extracting article:" is returned instead of raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        time.sleep(2)  # Longer delay for article pages to be extra polite
        # Bound the request so a stalled connection cannot hang the notebook.
        response = requests.get(article_url, headers=headers, timeout=60)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # find_all returns a list of matching divs (possibly empty).
        article_divs = soup.find_all('div', class_='field-item even')

        # Join the blocks with double newlines to preserve structure
        content = "\n\n".join(div.get_text(strip=False) for div in article_divs)

        if not content:
            # Report the real cause rather than failing on a list attribute lookup.
            return "Error extracting article: no 'field-item even' content found"
        return content

    except Exception as e:
        return f"Error extracting article: {str(e)}"

def save_to_file(title, content):
    """Save the scraped content to a TXT file named after the title.

    The file is written to the current working directory as
    ``<title>_English.txt``. Path separators (and NUL) in the title are
    replaced with ``_`` in the filename so a title such as "A/B" cannot point
    into another directory or make ``open`` fail; the title line written
    inside the file is left untouched.

    Args:
        title: Article title; used for the filename and the first line.
        content: A string written as-is, or a list whose items are written
            one per line.

    Raises:
        ValueError: If ``content`` is neither a string nor a list. The check
            runs before the file is opened, so no partial file is left behind.
    """
    if not isinstance(content, (list, str)):
        raise ValueError("Content must be a string or a list of strings")

    safe_title = title.translate(str.maketrans({'/': '_', '\\': '_', '\x00': '_'}))
    filename = safe_title + '_English' + '.txt'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('Title: ' + title + '\n')
        if isinstance(content, list):
            # One list item per line
            for line in content:
                f.write(f"{line}\n")
        else:
            # A single string is written directly
            f.write(content)
    print(f"Content saved to {filename}")

In [None]:
import httpx
import re
from selectolax.parser import HTMLParser
from urllib.parse import urljoin

def get_vechirka_links_selectolax(base_url):
    """
    Scrape article links from an archived vechirka.pl.ua page with httpx + selectolax.

    Args:
        base_url: URL of the archived page. If it contains ``web/<digits>/``,
            those digits are used as the archive timestamp; otherwise
            "20220203004906" is used.

    Returns:
        A list of ``(link_text, full_url)`` tuples for every ``<a>`` whose href
        contains "ukrayina" and ends with a digit and whose text is non-empty.
        ``full_url`` is the href resolved against the archive URL of the site.
        An empty list is returned (and the error printed) on an httpx error.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        print('Fetching the page...')
        response = httpx.get(base_url, headers=headers, timeout=60)
        response.raise_for_status()

        tree = HTMLParser(response.text)
        print('Parsing the page...')

        # Extract archive timestamp from base_url
        match = re.search(r'web/(\d+)/', base_url)
        archive_timestamp = match.group(1) if match else "20220203004906"
        archive_root = f"https://web.archive.org/web/{archive_timestamp}/http://vechirka.pl.ua"

        bookmark_links = []
        for node in tree.css('a'):
            # A valueless attribute (e.g. a bare `href`) is reported as None,
            # so normalise missing and None alike to an empty string.
            href = node.attributes.get('href') or ''
            link_text = node.text(strip=True)

            # Check if URL contains "ukrayina" and ends with a digit
            if "ukrayina" in href and href[-1].isdigit() and link_text:
                bookmark_links.append((link_text, urljoin(archive_root, href)))

        print(f'Found {len(bookmark_links)} links.')
        return bookmark_links

    except httpx.HTTPError as e:
        print(f"Error fetching vechirka links: {str(e)}")
        return []


ModuleNotFoundError: No module named 'selectolax'

In [None]:
def get_vechirka_links_helium(base_url: str) -> List[Tuple[str, str]]:
    """Collect article links from a page opened with helium-driven Chrome.

    Args:
        base_url: URL of the page to open in the browser.

    Returns:
        A list of ``(link_text, href)`` tuples, one per ``<a>`` element whose
        href contains "ukrayina" and ends in a digit and whose visible text is
        non-empty. ``href`` is the value reported by the browser. If anything
        fails, the error is printed and an empty list is returned.
    """
    from helium import S, start_chrome, find_all, kill_browser

    browser_started = False
    try:
        # NOTE(review): in sandboxed/headless environments start_chrome may need
        # headless=True and extra Chrome options to create a session; confirm.
        start_chrome(base_url)
        browser_started = True

        bookmark_links = []
        # find_all() takes a helium predicate rather than a bare string;
        # S('a') selects every anchor by CSS selector.
        for link in find_all(S('a')):
            # The underlying Selenium element exposes attributes and text.
            element = link.web_element
            href = element.get_attribute('href') or ''
            link_text = element.text.strip()

            # Keep only links with visible text whose href ends in a digit.
            if "ukrayina" in href and link_text and href[-1] in "0123456789":
                bookmark_links.append((link_text, href))

        return bookmark_links

    except Exception as e:
        print(f"Helium Scraping Error: {str(e)}")
        return []

    finally:
        # Close the browser on failure too; skip it when start_chrome itself
        # failed, since there is then no browser to kill.
        if browser_started:
            kill_browser()

In [None]:
!pip install helium

Collecting helium
  Downloading helium-5.1.1.tar.gz (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: helium
  Building wheel for helium (setup.py) ... [?25l[?25hdone
  Created wheel for helium: filename=helium-5.1.1-py3-none-any.whl size=41058 sha256=3520aeafa474cb9936df40dcc65cf59124d10917a70da28c5deda9d8f0597304
  Stored in directory: /root/.cache/pip/wheels/6c/4d/b0/698033cd12ab8041849e98bab1c4db30879ea8250a8e8d75c4
Successfully built helium
Installing collected packages: helium
Successfully installed helium-5.1.1


In [None]:
# Show which page is being scraped, then collect its links with the helium
# backend. The other backends take the same argument and can be swapped in:
#   get_vechirka_links(sample_link)
#   get_vechirka_links_selenium(sample_link)
#   get_vechirka_links_selectolax(sample_link)
print(sample_link)
bookmark_links = get_vechirka_links_helium(sample_link)
print(bookmark_links)

# Title -> URL mapping, filled in by the filtering cell further down.
d = {}

https://web.archive.org/web/20230131051233/http://vechirka.pl.ua/ukrayina/na-liniyi-frontu
Helium Scraping Error: Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir
Stacktrace:
#0 0x5bfc606f9ffa <unknown>
#1 0x5bfc601b8970 <unknown>
#2 0x5bfc601f242a <unknown>
#3 0x5bfc601ee18f <unknown>
#4 0x5bfc6023ebd9 <unknown>
#5 0x5bfc6023e106 <unknown>
#6 0x5bfc60230063 <unknown>
#7 0x5bfc601fc328 <unknown>
#8 0x5bfc601fd491 <unknown>
#9 0x5bfc606c142b <unknown>
#10 0x5bfc606c52ec <unknown>
#11 0x5bfc606a8a22 <unknown>
#12 0x5bfc606c5e64 <unknown>
#13 0x5bfc6068cbef <unknown>
#14 0x5bfc606e8558 <unknown>
#15 0x5bfc606e8736 <unknown>
#16 0x5bfc606f8e76 <unknown>
#17 0x7a47c54e1ac3 <unknown>

[]


In [None]:
# Translate each link title and keep only real articles in `d` (title -> URL).
for x in bookmark_links:
    if x[0] is None:
        continue
    title = translate_to_english(x[0])
    # Skip navigation links: "read more" teasers, pager labels, next/last.
    # isdigit() catches every numeric pager label (e.g. "10"), which a
    # substring test against '0123456789' would miss.
    if (
        not title
        or title == "Read the article"
        or title.isdigit()
        or 'Next ›' in title
        or 'Last ”' in title
    ):
        continue
    # NOTE(review): drops the first three '/'-separated pieces of the link,
    # presumably a leading '/web/<timestamp>/' prefix; verify against the
    # link format produced by the scraper in use.
    url = '/'.join(x[1].split("/")[3:])
    d[title] = url
for key in d:
    print(f'Title: {key}, link: {d[key]}')

In [None]:
def translate_to_english(ukrainian_text):
    """Translate Ukrainian text to English and return the translated string.

    A fresh ``Translator`` is created on every call. The source language is
    passed explicitly as 'uk' and the target as 'en'.
    """
    result = Translator().translate(ukrainian_text, src='uk', dest='en')
    return result.text

# Smoke test: translate a sample Ukrainian paragraph; as the last expression in the cell, the returned English text is displayed.
translate_to_english('\nОлександр Корнієнко більше не буде головою партії «Слуга Народу». Це може стати початком періоду хитавиці в партії і напруження серед основних її членів, але багато чого тут залежатиме від того, хто тепер очолить партію. 37-річний Олександр Корнієнко став номінальним головним «слугою» (ніхто навіть не сумнівається, що реальним партійним лідером був і залишиться Президент Володимир Зеленський) у листопаді 2019 року. Тепер, рівно за два роки, він полишає важливий пост в ієрархії')



'Oleksandr Kornienko will no longer be the chairman of the party "Servant of the People".This can be the beginning of the rowing period in the party and the tension among its main members, but much will depend on the one who will now head the party.37-year-old Alexander Kornienko became a nominal chief "servant" (no one even doubts that President Volodymyr Zelenskyy was and will remain the real party leader in November 2019.Now, exactly two years, he leaves an important post in the hierarchy'

In [None]:
def extract_to_file(d, max_num_article):
    """Download, translate and save the first ``max_num_article`` articles.

    Args:
        d: Mapping of article title to article URL; processed in iteration order.
        max_num_article: Maximum number of entries of ``d`` to process.
            Entries whose extraction fails still count toward this limit.

    For each processed entry the article text is fetched, its whitespace is
    collapsed to single spaces, the text is translated to English and written
    with ``save_to_file``. Entries whose extraction failed are reported and
    skipped rather than saved.
    """
    for index, (title, url) in enumerate(d.items(), start=1):
        if index > max_num_article:
            break
        content = extract_article_content(url)
        # extract_article_content reports failures as a string with this prefix;
        # translating and saving that message as an article would be misleading.
        if content.startswith("Error extracting article:"):
            print(f"Skipping '{title}': {content}")
            continue
        # Normalise runs of whitespace while keeping words separated, so the
        # translator receives readable text.
        content = ' '.join(content.split())
        content = translate_to_english(content)
        save_to_file(title, content)

# Process at most the first 5 entries of the title -> URL mapping `d`.
extract_to_file(d, 5)



1 Олександр
Content saved to Chemist, producer and party boss: I ... [scrapped].txt
2 Міністерство
Content saved to The Ministry of Health has expanded the list of professions, both ... [scrapped].txt


KeyboardInterrupt: 