<a href="https://colab.research.google.com/github/Neetagrg/Mini-Project_Spring-2025-WEB-DATA-MINING-CUS-635-0-/blob/main/miniproject635.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mini Project: Neeta Kumari & Bir Bahadur Gharti

In [11]:
!pip install boto3 requests pandas




Step 2: Import the Required Libraries
We import boto3 (for S3 access), requests (for fetching data from the API), and pandas (for handling the data as tables), plus the botocore helpers needed to configure the S3 client.

In [27]:
import os
import boto3
import requests
import pandas as pd
from botocore.config import Config
from botocore import UNSIGNED




# Set Up the AWS S3 Connection

In [14]:
# Anonymous S3 client: signature_version=UNSIGNED means requests are not signed
# with AWS credentials.
s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

# The provided S3 bucket, and the Team 6 folder that the upload/download
# helpers prepend to every object key.
BUCKET_NAME = "cus635-spring2025"
TEAM = "TEAM_6/"


# Fetching Articles from the Guardian API

In [31]:
import requests
import pandas as pd
from tqdm import tqdm

# Value sent as the 'page-size' query parameter (the maximum per request).
PAGE_SIZE = 200
# Shared accumulator: process_articles() appends one record per article here.
total_articles = list()

# Function to fetch one page of articles and handle errors
def fetch_articles(page_num, timeout=10):
    """Request one page of search results and return the decoded JSON.

    Reads the module-level names ``API_URL``, ``API_KEY`` and ``PAGE_SIZE``.
    NOTE(review): ``API_URL`` and ``API_KEY`` are not assigned in any cell
    visible next to this function -- confirm they are defined before it runs.

    Args:
        page_num: Value sent as the ``page`` query parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails instead of blocking the notebook indefinitely.

    Returns:
        The parsed JSON response, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        The error is printed rather than raised.
    """
    params = {
        'api-key': API_KEY,
        'section': 'technology',
        'q': 'AI in health',  # Search query
        'page-size': PAGE_SIZE,
        'page': page_num,  # Page to retrieve
        'show-fields': 'body'  # Request content (body) of the article
    }

    try:
        response = requests.get(API_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Raises for 4xx/5xx status codes
        return response.json()
    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching page {page_num}: {e}")
        return None

# Function to process and store articles
def process_articles(articles_data, max_length=500):
    """Append one ``{'heading', 'context'}`` record per article to ``total_articles``.

    Args:
        articles_data: Decoded API response. ``articles_data['response']['results']``
            must be an iterable of dicts, each with a ``'webTitle'`` key and,
            optionally, a ``'fields'`` dict holding ``'body'``.
        max_length: Maximum number of characters of body text to keep; longer
            text is cut to this length and suffixed with ``'...'``.

    Returns:
        None. Results are appended to the module-level ``total_articles`` list.

    Raises:
        KeyError: If ``'response'``, ``'results'`` or an article's
            ``'webTitle'`` is missing.
    """
    for article in articles_data['response']['results']:
        # An article may come back without 'fields' (or with a null value);
        # treat both like a missing body instead of raising KeyError.
        fields = article.get('fields') or {}
        body = fields.get('body')
        if body is None:
            body = 'No content available'

        # Strip first so the length check and the truncation both operate on
        # the cleaned text, whatever its length.
        body = body.strip()
        if len(body) > max_length:
            body = body[:max_length] + '...'

        total_articles.append({
            'heading': article['webTitle'],
            'context': body
        })

# Fetch and process articles
NUM_PAGES = 5  # Number of result pages to request
pages_fetched = 0  # Pages that actually returned data

for page_num in tqdm(range(1, NUM_PAGES + 1), desc="Fetching pages", unit="page"):
    articles_data = fetch_articles(page_num)
    if articles_data:
        process_articles(articles_data)
        pages_fetched += 1
    else:
        print(f"Skipping page {page_num} due to an error.")

# Create a DataFrame for better presentation and export.
# Explicit columns keep the header row in the CSV even if nothing was fetched.
df = pd.DataFrame(total_articles, columns=['heading', 'context'])

# Save the results to a CSV file for later use
df.to_csv("AI_in_Health_Articles.csv", index=False)

# Displaying first 5 articles for quick preview
print(df.head())

# Report the pages that succeeded, not the number that were attempted.
print(f"Fetched {len(total_articles)} articles across {pages_fetched} pages.")


Fetching pages: 100%|██████████| 5/5 [00:06<00:00,  1.24s/page]

                                             heading  \
0  Prioritise artists over tech in AI copyright d...   
1  EU accused of leaving ‘devastating’ copyright ...   
2  AI ‘godfather’ predicts another revolution in ...   
3  If the best defence against AI is more AI, thi...   
4  Chinese AI chatbot DeepSeek censors itself in ...   

                                             context  
0  <p>Two cross-party committees of MPs have urge...  
1  <p>An architect of EU copyright law has said l...  
2  <p>One of the “godfathers” of modern artificia...  
3  <p>Oscar Wilde’s quip, “Life imitates art far ...  
4  <p>Users experimenting with DeepSeek have seen...  
Fetched 1000 articles across 5 pages.





## Save Articles as a CSV

In [30]:

def save_articles_to_csv(articles, filename):
    """Write article title, URL, category and content to a CSV file.

    Args:
        articles: Iterable of article dicts. Each must have ``'webTitle'`` and
            ``'webUrl'``. ``'health-related'`` and ``'fields'`` are optional.
        filename: Path of the CSV file to create (overwritten if it exists).

    Returns:
        None. Prints a confirmation line after the file is written.

    Raises:
        KeyError: If an article lacks ``'webTitle'`` or ``'webUrl'``.
    """
    columns = ['Title', 'URL', 'Category', 'Content']
    rows = []

    for article in articles:
        # 'fields' may be absent or null; treat both as "no fields".
        fields = article.get('fields') or {}

        # Use the 'health-related' value when the caller supplied one.
        # Otherwise fall back to 'sectionName' -- NOTE(review): assumed to be
        # the section label present on raw API results; confirm.
        category = article.get('health-related', article.get('sectionName', 'Unknown'))

        # The fetch cell requests 'body', not 'bodyText', so accept either
        # before falling back to the placeholder.
        content = fields.get('bodyText', fields.get('body', 'No content available'))

        rows.append([article['webTitle'], article['webUrl'], category, content])

    # Convert the rows into a DataFrame and save to CSV. With no rows the
    # file still gets its header line.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(filename, index=False)
    print(f"Articles saved to {filename}")


# Upload the CSV to S3

In [21]:

def upload_file_to_s3(local_filename, s3_filename):
    """Upload a local file to the bucket under the team folder.

    The object key is ``TEAM + s3_filename`` inside ``BUCKET_NAME``. The
    outcome (success, missing local file, or any other error) is printed.
    Exceptions are not propagated and nothing is returned.
    """
    try:
        s3.upload_file(
            Filename=local_filename,
            Bucket=BUCKET_NAME,
            Key=TEAM + s3_filename,
        )
        report = f"File '{local_filename}' uploaded successfully to {TEAM} folder in the S3 bucket."
    except FileNotFoundError:
        report = f"The file {local_filename} was not found."
    except Exception as err:
        report = f"Error uploading file: {err}"
    print(report)


# Download a File from S3

In [22]:

def download_file_from_s3(s3_filename, local_filename):
    """Download an object from the team folder of the bucket to a local file.

    The object key is ``TEAM + s3_filename`` inside ``BUCKET_NAME``. The
    outcome is printed; exceptions are not propagated and nothing is returned.
    """
    try:
        s3.download_file(
            Bucket=BUCKET_NAME,
            Key=TEAM + s3_filename,
            Filename=local_filename,
        )
        report = f"File '{s3_filename}' downloaded successfully as '{local_filename}'!"
    except Exception as err:
        report = f"Error downloading file: {err}"
    print(report)
