# Web Scraping Armenian Poems from grqamol.am

This notebook scrapes poems from the Armenian poetry website grqamol.am. It extracts poems from multiple pages and collects the text content from each poem.

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin, urlparse
import re

: 

In [None]:
# --- Scraper settings ---
# Root of the site being scraped.
BASE_URL = "https://grqamol.am"
# Category listing where the crawl begins (the query string carries a tag id).
STARTING_URL = f"{BASE_URL}/category/poems/?tag=180213113413410888"

# Request headers carrying a desktop-browser User-Agent string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Accumulator for scraped poem records; starts out empty.
poems_data = []

In [None]:
def get_poem_content(poem_url):
    """
    Fetch a single poem page and extract its title and text.

    Args:
        poem_url: URL of the poem's individual page.

    Returns:
        A dict with the keys 'url', 'title' and 'content'.
        * When no text can be located, 'content' is "Content not found".
        * When the request or parsing raises, the error is printed,
          'title' is "Error" and 'content' is "Error: <message>"; the
          exception is not propagated.
    """
    try:
        response = requests.get(poem_url, headers=HEADERS, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Candidate containers for the poem body, tried in order.
        selectors = [
            '.poem-content',
            '.content',
            '.post-content',
            '.entry-content',
            'article .content',
            '.poem-text',
            '.text'
        ]

        poem_content = None
        for selector in selectors:
            content_element = soup.select_one(selector)
            if not content_element:
                continue
            # separator='\n' keeps the poem's line structure; without it the
            # text of adjacent nodes is concatenated with nothing in between.
            text = content_element.get_text(separator='\n', strip=True)
            if text:
                # Only stop at a container that actually holds text, so an
                # empty match does not hide a later, populated one.
                poem_content = text
                break

        # If no selector produced text, fall back to the longest element
        # whose own string is at least 50 characters long.
        if not poem_content:
            possible_content = soup.find_all(
                ['div', 'p', 'section'], string=re.compile(r'.{50,}')
            )
            if possible_content:
                largest = max(possible_content, key=lambda x: len(x.get_text()))
                poem_content = largest.get_text(separator='\n', strip=True)

        # Title: the first matching element wins, even if its text is empty.
        title = ""
        title_selectors = ['h1', '.title', '.post-title', '.entry-title']
        for selector in title_selectors:
            title_element = soup.select_one(selector)
            if title_element:
                title = title_element.get_text(strip=True)
                break

        return {
            'url': poem_url,
            'title': title,
            'content': poem_content or "Content not found"
        }

    except Exception as e:
        # Best-effort scraping: report the failure and return a placeholder
        # record instead of raising.
        print(f"Error scraping {poem_url}: {str(e)}")
        return {
            'url': poem_url,
            'title': "Error",
            'content': f"Error: {str(e)}"
        }

In [None]:
def scrape_poems_from_page(page_url):
    """
    Collect candidate poem URLs from one category/listing page.

    Args:
        page_url: URL of the listing page to download.

    Returns:
        A list of absolute URLs, without duplicates, in the order they were
        first encountered. Returns an empty list (after printing the error)
        if the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Selectors for poem links, tried in order; results are merged.
        link_selectors = [
            'a[href*="/article/"]',  # pattern used by the poem links on this site
            'a[href*="/poem/"]',
            'a[href*="/poetry/"]',
            '.poem-link a',
            '.post-title a',
            '.entry-title a',
            'h2 a',
            'h3 a',
            '.title a'
        ]

        # Hrefs with these prefixes do not lead to another page.
        skipped_prefixes = ('#', 'javascript:', 'mailto:', 'tel:')

        # dict keys give insertion-ordered, O(1) de-duplication.
        found = {}

        def remember(href):
            """Record href as an absolute URL unless it is empty or non-navigable."""
            href = (href or '').strip()
            if not href or href.lower().startswith(skipped_prefixes):
                return
            # Resolve against the page itself so relative links such as
            # "?page=2" or "item/" end up under the right path.
            found.setdefault(urljoin(page_url, href), None)

        for selector in link_selectors:
            for link in soup.select(selector):
                remember(link.get('href'))

        # If no selector matched, fall back to any link whose href looks
        # poem-related.
        if not found:
            for link in soup.find_all('a', href=True):
                href = link.get('href')
                if href and any(pattern in href.lower()
                                for pattern in ('poem', 'poetry', 'post')):
                    remember(href)

        poem_links = list(found)
        print(f"Found {len(poem_links)} poem links on {page_url}")
        return poem_links

    except Exception as e:
        # Best-effort: a failing page yields no links rather than an exception.
        print(f"Error scraping page {page_url}: {str(e)}")
        return []

In [None]:
def get_next_page_url(soup, current_url):
    """
    Find the URL of the page that follows `current_url` in a paginated listing.

    Args:
        soup: Parsed HTML of the current page; must provide `select_one`
            and `find_all` (a BeautifulSoup object).
        current_url: URL the page was loaded from. Relative links are
            resolved against it, and its `page=<n>` query value (1 when
            absent) is taken as the current page number.

    Returns:
        The absolute URL of the next page, or None when none is found.
    """
    # Explicit "next" links, most specific first. A catch-all such as
    # a[href*="page="] is deliberately not listed: it would return the first
    # pagination link on the page (often page 1 or the current page) and
    # keep the page-number search below from ever running.
    # NOTE(review): recent soupsieve releases prefer `:-soup-contains` over
    # `:contains`; confirm the installed version before switching.
    next_page_selectors = [
        'a[rel="next"]',
        '.next-page a',
        '.pagination .next a',
        'a:contains("Հաջորդ")',  # "Next" in Armenian
        'a:contains("այլ")',     # "More" in Armenian
    ]

    for selector in next_page_selectors:
        next_link = soup.select_one(selector)
        if next_link and next_link.get('href'):
            candidate = urljoin(current_url, next_link.get('href'))
            # A link pointing back at the current page is not a next page.
            if candidate != current_url:
                return candidate

    # Otherwise look for a numbered pagination link to page current + 1.
    page_pattern = re.compile(r'page=(\d+)')
    current_match = page_pattern.search(current_url or '')
    current_page = int(current_match.group(1)) if current_match else 1

    for link in soup.find_all('a', href=re.compile(r'page=\d+')):
        href = link.get('href') or ''
        match = page_pattern.search(href)
        if match and int(match.group(1)) == current_page + 1:
            return urljoin(current_url, href)

    return None

In [None]:
def scrape_all_poems(starting_url, max_pages=10):
    """
    Crawl the paginated listing, then scrape every poem it links to.

    Args:
        starting_url: URL of the first listing page.
        max_pages: Upper bound on the number of listing pages visited.

    Returns:
        The module-level `poems_data` list, after one record per poem URL
        has been appended to it. Records from earlier calls are kept, so
        calling this twice accumulates results.
    """
    current_url = starting_url
    page_count = 0
    visited_pages = set()
    # dict keys act as an insertion-ordered set, so a poem linked from
    # several listing pages is scraped only once.
    all_poem_links = {}

    while current_url and page_count < max_pages:
        # Stop if pagination points back at a page already processed.
        if current_url in visited_pages:
            print(f"Already visited {current_url}; stopping pagination")
            break
        visited_pages.add(current_url)

        print(f"\nScraping page {page_count + 1}: {current_url}")

        # Fetch the page here as well: the parsed HTML is needed to locate
        # the next-page link (scrape_poems_from_page downloads it again).
        try:
            response = requests.get(current_url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            print(f"Error accessing page {current_url}: {str(e)}")
            break

        # Get poem links from this page
        for poem_url in scrape_poems_from_page(current_url):
            all_poem_links.setdefault(poem_url, None)

        # Find next page
        current_url = get_next_page_url(soup, current_url)
        page_count += 1

        # Be respectful to the server
        time.sleep(1)

    print(f"\nFound total {len(all_poem_links)} poem links across {page_count} pages")

    # Now scrape each poem
    print("Starting to scrape individual poems...")
    for i, poem_url in enumerate(all_poem_links, start=1):
        print(f"Scraping poem {i}/{len(all_poem_links)}: {poem_url}")
        poems_data.append(get_poem_content(poem_url))

        # Be respectful to the server
        time.sleep(0.5)

    print(f"\nCompleted scraping {len(poems_data)} poems!")
    return poems_data

In [None]:
# Kick off the crawl. `max_pages` caps how many listing pages are followed;
# raise it to go deeper.
print("Starting to scrape Armenian poems from grqamol.am...")
scraped_poems = scrape_all_poems(starting_url=STARTING_URL, max_pages=5)

In [None]:
# Load the collected records into a table: one row per record in poems_data.
df = pd.DataFrame(data=poems_data)
print(f"Scraped {len(df)} poems")
print("\nFirst few poems:")
# Last expression of the cell, so the notebook renders the first five rows.
df.head(n=5)

In [None]:
# Display some statistics about the scraped poems
print(f"Total poems scraped: {len(df)}")

if len(df) > 0:
    content = df['content']

    # A row counts as having content only if it holds neither the
    # "Content not found" placeholder nor an error record (title "Error",
    # content "Error: <message>").
    is_placeholder = content == 'Content not found'
    is_error = (df['title'] == 'Error') & content.str.startswith('Error: ', na=False)
    print(f"Poems with content: {int((~is_placeholder & ~is_error).sum())}")

    # Averaged over every row, placeholder and error rows included.
    print(f"Average content length: {content.str.len().mean():.1f} characters")

    # Show a sample poem
    print("\n" + "="*50)
    print("SAMPLE POEM:")
    print("="*50)
    sample_poem = df.iloc[0]
    print(f"Title: {sample_poem['title']}")
    print(f"URL: {sample_poem['url']}")
    print(f"Content preview: {sample_poem['content'][:300]}...")
    print("="*50)
else:
    # An empty DataFrame has no 'content' column, so the statistics above
    # would raise KeyError; report the situation instead.
    print("No poems available for statistics")

In [None]:
# Persist the results, but only when there is at least one row to write.
if len(df) == 0:
    print("No poems were scraped successfully")
else:
    # CSV export, UTF-8 encoded, without the index column.
    filename = "armenian_poems_grqamol.csv"
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Saved {len(df)} poems to {filename}")

    # JSON export: a list of records, non-ASCII text written unescaped.
    import json
    json_filename = "armenian_poems_grqamol.json"
    df.to_json(json_filename, orient='records', force_ascii=False, indent=2)
    print(f"Also saved as {json_filename}")

In [1]:
# Smoke test: confirm that `requests` can be imported before doing real work
import requests
print("Requests library imported successfully!")

# Category listing that the following exploratory cells fetch.
# Note: no request is made in this cell; the URL is only stored and printed.
url = "https://grqamol.am/category/poems/?tag=180213113413410888"
print(f"Testing connection to: {url}")

Requests library imported successfully!
Testing connection to: https://grqamol.am/category/poems/?tag=180213113413410888


In [2]:
# Let's first examine the page structure
from bs4 import BeautifulSoup
import time

# Fetch the listing page. The timeout keeps this cell from hanging
# indefinitely if the server never answers.
response = requests.get(url, timeout=10)
print(f"Status code: {response.status_code}")

soup = BeautifulSoup(response.content, 'html.parser')
print(f"Page title: {soup.title.string if soup.title else 'No title found'}")

# Let's see what links are on the page
all_links = soup.find_all('a', href=True)
print(f"Found {len(all_links)} links on the page")

# Print the first 10 links and keep those whose href mentions
# 'poem' or 'post' (case-insensitive)
poem_links = []
for link in all_links[:10]:  # Just look at first 10 for now
    href = link.get('href')
    text = link.get_text(strip=True)
    print(f"Link: {href} - Text: {text[:50]}")
    if href and ('poem' in href.lower() or 'post' in href.lower()):
        poem_links.append(href)

Status code: 200
Page title: Բանաստեղծություններ
Found 141 links on the page
Link: javascript: - Text: 
Link: #mob_menu - Text: Toggle navigation
Link: /pulitzer/ - Text: Պուլիցերյան մրցանակաբաշխություն
Link: /category/harcazruycner/ - Text: Հարցազրույցներ
Link: /category/press/ - Text: Արձագանքներ
Link: /view/about-us/ - Text: Մեր Մասին
Link: /feedback/ - Text: Հետադարձ կապ
Link: / - Text: 
Link: / - Text: 
Link: /category/quote-book/ - Text: Մեջբերումներ Գրքերից


In [3]:
# Let's look for more specific patterns and content divs
import re
from urllib.parse import urljoin

# Count <div> elements whose class mentions content/post/poem/article
content_divs = soup.find_all('div', class_=re.compile(r'content|post|poem|article'))
print(f"Found {len(content_divs)} content divs")

# Look for actual poem content on this page.
# NOTE(review): `string=` is matched against a tag's `.string`, which
# Beautiful Soup documents as None for tags with more than one child, so
# container elements are unlikely to match - confirm before reading a zero
# count as "no text on the page".
poem_content_areas = soup.find_all(['div', 'section', 'article'], string=re.compile(r'.{100,}'))
print(f"Found {len(poem_content_areas)} areas with substantial text")

# Let's check if there are links to individual poems
base_url = "https://grqamol.am"
individual_poem_links = []

for link in all_links:
    href = link.get('href')
    if href:
        # Look for links that might lead to individual poems
        if any(pattern in href for pattern in ['/view/', '/post/', '/poem/', '/article/']):
            full_url = urljoin(base_url, href)
            individual_poem_links.append((full_url, link.get_text(strip=True)))

print(f"\nFound {len(individual_poem_links)} potential poem links:")
# Dedicated loop names: reusing `url` here would overwrite the notebook-level
# `url` (the category page) with a poem link for every later cell.
for i, (link_url, link_text) in enumerate(individual_poem_links[:5], start=1):  # Show first 5
    print(f"{i}. {link_text[:50]} - {link_url}")

Found 96 content divs
Found 0 areas with substantial text

Found 78 potential poem links:
1. Մեր Մասին - https://grqamol.am/view/about-us/
2.  - https://grqamol.am/article/paruyr-sevak-yerku-siro-aranqum/
3. Պարույր Սևակ «Երկու սիրո արանքում» - https://grqamol.am/article/paruyr-sevak-yerku-siro-aranqum/
4.  - https://grqamol.am/article/paruyr-sevak-mor-dzerqer/
5. Պարույր Սևակ «Մոր ձեռքերը» - https://grqamol.am/article/paruyr-sevak-mor-dzerqer/


In [1]:
# Now let's create a comprehensive scraping function
import pandas as pd

def scrape_poem_content(poem_url):
    """Download one poem page and return its url, title and text as a dict.

    On any exception the error is printed and a record with title "Error"
    and content "Error: <message>" is returned instead of raising.
    """
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    try:
        page = requests.get(poem_url, headers=browser_headers, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Title: text of the first selector that matches anything.
        heading = next(
            (node
             for node in map(soup.select_one,
                             ('h1', 'h2', '.title', '.post-title', '.entry-title'))
             if node),
            None,
        )
        title = heading.get_text(strip=True) if heading else ""

        # Body: first matching container, its text joined by newlines.
        container = next(
            (node
             for node in map(soup.select_one,
                             ('.article-content', '.post-content', '.content',
                              '.entry-content', '#content', '.main-content'))
             if node),
            None,
        )
        poem_content = container.get_text(separator='\n', strip=True) if container else ""

        # Nothing usable yet: take up to five <p>/<div> elements whose own
        # string is at least 20 characters long.
        if not poem_content:
            text_blocks = soup.find_all(['p', 'div'], string=re.compile(r'.{20,}'))
            if text_blocks:
                poem_content = '\n'.join(
                    block.get_text(strip=True) for block in text_blocks[:5]
                )

        return {
            'url': poem_url,
            'title': title,
            'content': poem_content or "Content not found"
        }

    except Exception as exc:
        print(f"Error scraping {poem_url}: {str(exc)}")
        return {
            'url': poem_url,
            'title': "Error",
            'content': f"Error: {str(exc)}"
        }

def get_all_poem_links_from_page(page_url):
    """Return the unique '/article/' links on a category page, in page order.

    On any exception the error is printed and an empty list is returned.
    """
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    try:
        page = requests.get(page_url, headers=browser_headers, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Absolute URL of every anchor whose href contains '/article/'.
        article_urls = (
            urljoin("https://grqamol.am", anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/article/' in anchor['href']
        )
        # dict.fromkeys drops repeats while keeping first-seen order.
        return list(dict.fromkeys(article_urls))

    except Exception as exc:
        print(f"Error getting links from {page_url}: {str(exc)}")
        return []

# Reached only after both function definitions above have executed.
print("Scraping functions defined successfully!")

: 

In [1]:
# Armenian Poems Scraper - Simple Approach
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin

# Reached only if none of the imports above raised.
print("All libraries imported successfully!")

: 