In [1]:
# Scrape the speeches from the Al-Manassa website (almanassa.com) with Selenium; a story-number cutoff is applied because scrolling only loads a limited part of the listing
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin
from selenium import webdriver
import time

# Define the URL of the main page containing links to speeches
main_page_url = "https://almanassa.com/columns/president-speeches"

# Only stories whose number is strictly below this cutoff are visited
STORY_NUMBER_CUTOFF = 4686

# Bind the name before the try block so the cleanup in `finally` never picks up
# a stale driver object left in the kernel namespace by an earlier run of this cell.
driver = None

try:
    # Initialize the Selenium WebDriver for Firefox (GeckoDriver)
    driver = webdriver.Firefox()

    # Navigate to the main page
    driver.get(main_page_url)

    # Scroll down the page to load more content (you can repeat this as many times as needed)
    for _ in range(5):  # Adjust the number of scrolls as needed
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Wait for the content to load

    # Parse the HTML content of the page
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Find all links to individual speeches on the main page
    speech_links = soup.find_all("a", class_="overlay-link")

    if speech_links:
        # Start numbering from 385
        start_number = 385

        # Iterate through the speech links
        for speech_link in speech_links:
            # Get the URL of the speech; anchors without an href are skipped
            # instead of raising KeyError and aborting the whole run
            speech_url = speech_link.get("href")
            if not speech_url:
                continue

            # Extract the story number from the URL; links that do not point
            # at /stories/<number> are skipped rather than crashing on None
            story_match = re.search(r'/stories/(\d+)', speech_url)
            if story_match is None:
                continue
            story_number = int(story_match.group(1))

            # Skip stories with numbers greater than or equal to the cutoff
            if story_number >= STORY_NUMBER_CUTOFF:
                continue

            # Count this story: increment the running number
            start_number += 1

            # Check if the speech_url is a relative URL and make it an absolute URL
            if not speech_url.startswith("http"):
                speech_url = urljoin("https://almanassa.com", speech_url)

            # Navigate to the speech page
            driver.get(speech_url)

            # Parse the HTML content of the speech page
            speech_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Rest of your code...

    else:
        print("Speech links not found on the main page. Please inspect the webpage structure.")

except Exception as e:
    # Best-effort cell: report the failure instead of raising
    print(f"An error occurred: {e}")
finally:
    # Close the Selenium WebDriver, but only if this run actually created one
    if driver is not None:
        driver.quit()


An error occurred: Message: Failed to decode response from marionette



In [None]:
# Read the speeches and sort them into per-year folders based on the date in the speech title
import os
import re
from collections import defaultdict
import shutil

# Folder that holds the speech text files
directory_path = '/path/to/your/text/files/'

# Maps a year string to the list of file names assigned to that year
files_by_year = defaultdict(list)

# Matches a date written as "dd/mm/yyyy" (day and month may be one or two digits)
date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'

# Pass 1: read every .txt file and bucket it under the year of the first date found in it
for entry_name in os.listdir(directory_path):
    # Anything that is not a .txt file is ignored
    if not entry_name.endswith('.txt'):
        continue

    with open(os.path.join(directory_path, entry_name), 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Files without any date stay where they are
    found = re.search(date_pattern, text)
    if found is None:
        continue

    # The year is the part after the last slash of the matched date
    year_key = found.group().rsplit('/', 1)[-1]
    files_by_year[year_key].append(entry_name)

# Pass 2: make one sub-folder per year and move each file into its folder
for year_key, names in files_by_year.items():
    target_dir = os.path.join(directory_path, year_key)
    os.makedirs(target_dir, exist_ok=True)

    for entry_name in names:
        shutil.move(
            os.path.join(directory_path, entry_name),
            os.path.join(target_dir, entry_name),
        )

In [2]:
# Extract each speech's metadata (title, date, and place) with regular expressions and save it to a CSV file
import os
import re
import csv

# Define the directory containing your text files
directory_path = 'speeches/2023/'

# Regular expression patterns to extract the title, date, and place.
# The title pattern is anchored at the very start of the text and `.` does not
# cross newlines, so a title is only captured when the date is on the first line.
title_pattern = r'^(.*?)\s\d{1,2}/\d{1,2}/\d{4}'  # Extracts everything before the date
date_pattern = r'(\d{1,2}/\d{1,2}/\d{4})'  # Extracts the date
place_pattern = r'ألقيت الكلمة في (\S+(?:\s+\S+)?)|في (?:محافظة|مدينة|أكاديمية|الأكاديمية) (\S+(?:\s+\S+)?)'  # Extracts the place after specified phrases

# Define the CSV file path
csv_file_path = 'speeches/2023/speech2023_metadata.csv'

# Column order of the CSV file
fieldnames = ['File Name', 'Title', 'Date', 'Place']


def extract_metadata(content):
    """Pull the title, date, and place out of one speech's text.

    Args:
        content: Full text of a speech file.

    Returns:
        A dict with the keys 'Title', 'Date', and 'Place'. A value is None
        when the corresponding pattern does not match anywhere in `content`.
    """
    title_match = re.search(title_pattern, content)
    date_match = re.search(date_pattern, content)
    place_match = re.search(place_pattern, content)

    return {
        'Title': title_match.group(1) if title_match else None,
        'Date': date_match.group(1) if date_match else None,
        # The place pattern has two alternatives; whichever matched fills its own group
        'Place': (place_match.group(1) or place_match.group(2)) if place_match else None,
    }


# Open the CSV once (created or overwritten) and write the header and all rows
# in a single pass instead of reopening the file for every speech.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Sort the listing so the row order is the same on every run
    for filename in sorted(os.listdir(directory_path)):
        # Only .txt files are speeches; this also skips the CSV being written
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)

        # Read the content of the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # One row per speech file, including the file name
        row = {'File Name': filename}
        row.update(extract_metadata(content))
        writer.writerow(row)

