In [1]:
import os
import requests
import logging

# Configure logging: timestamped, INFO-level messages on the root logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Directory to save downloaded files (relative to the current working directory)
download_directory = 'raw_texts'

# Ensure the download directory exists. exist_ok=True makes this one
# idempotent call: re-running the cell is safe, and there is no gap between
# an existence check and the creation in which the directory could appear.
os.makedirs(download_directory, exist_ok=True)

def download_file(file_number, timeout=30):
    """Download one article and save it inside ``download_directory``.

    Requests ``article_{file_number}.txt`` from the RAW_TEXT folder of the
    S3-hosted URL below and writes the decoded response text to
    ``raw_text_{file_number}.txt`` using UTF-8. Request failures (connection
    errors, timeouts, non-success status codes) are logged at ERROR level and
    not re-raised, so a caller looping over many files keeps going. The output
    file is only opened after the request has succeeded.

    Args:
        file_number: Value substituted into the remote and local file names.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        None.

    Raises:
        OSError: If the output file cannot be opened or written; only
            ``requests`` exceptions are handled here.
    """
    url = f"https://edx-ai-class.s3.us-west-1.amazonaws.com/RAW_TEXT/article_{file_number}.txt"
    output_path = os.path.join(download_directory, f"raw_text_{file_number}.txt")

    try:
        # A timeout surfaces as requests.exceptions.Timeout, a subclass of
        # RequestException, so it is reported by the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to download file from {url}: {str(e)}")
        return

    # NOTE(review): response.text is decoded with the charset requests infers
    # from the response headers; for text/* responses without a declared
    # charset that is presumably ISO-8859-1 -- confirm the server declares
    # UTF-8, otherwise non-ASCII characters may be mis-decoded before saving.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(response.text)

    # Log only after the file has been closed, i.e. once it is fully written.
    logging.info(f"Downloaded and saved: {output_path}")

def main(first=1, last=100):
    """Download a consecutive range of articles, one after another.

    Calling ``main()`` with no arguments fetches files 1 through 100. A
    narrower range can be passed to re-fetch only part of the set.
    ``download_file`` logs request failures instead of raising them, so one
    missing or unreachable file does not stop the loop.

    Args:
        first: First file number to download (inclusive).
        last: Last file number to download (inclusive).
    """
    for file_number in range(first, last + 1):  # Sequentially, both ends inclusive
        logging.info(f"Downloading file number: {file_number}")
        download_file(file_number)

# Entry point: start the full download when this code is executed directly.
# Inside a notebook kernel __name__ is also '__main__', so running this cell
# kicks off the downloads immediately (see the log output below the cell).
if __name__ == '__main__':
    main()


2024-05-20 21:05:53,520 - INFO - Downloading file number: 1


2024-05-20 21:05:54,092 - INFO - Downloaded and saved: raw_texts/raw_text_1.txt
2024-05-20 21:05:54,098 - INFO - Downloading file number: 2
2024-05-20 21:05:54,515 - INFO - Downloaded and saved: raw_texts/raw_text_2.txt
2024-05-20 21:05:54,520 - INFO - Downloading file number: 3
2024-05-20 21:05:54,959 - INFO - Downloaded and saved: raw_texts/raw_text_3.txt
2024-05-20 21:05:54,962 - INFO - Downloading file number: 4
2024-05-20 21:05:55,296 - INFO - Downloaded and saved: raw_texts/raw_text_4.txt
2024-05-20 21:05:55,300 - INFO - Downloading file number: 5
2024-05-20 21:05:55,738 - INFO - Downloaded and saved: raw_texts/raw_text_5.txt
2024-05-20 21:05:55,743 - INFO - Downloading file number: 6
2024-05-20 21:05:56,183 - INFO - Downloaded and saved: raw_texts/raw_text_6.txt
2024-05-20 21:05:56,186 - INFO - Downloading file number: 7
2024-05-20 21:05:56,624 - INFO - Downloaded and saved: raw_texts/raw_text_7.txt
2024-05-20 21:05:56,627 - INFO - Downloading file number: 8
2024-05-20 21:05:57,