<a href="https://colab.research.google.com/github/Neetagrg/Mini-Project_Spring-2025-WEB-DATA-MINING-CUS-635-0-/blob/main/miniproject635.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mini Project: Neeta Kumari & Bir Bahadur Gharti

In [19]:
!pip install boto3 requests pandas




# Import the Required Libraries


In [20]:
import os
import boto3
import requests
import pandas as pd
from botocore.config import Config
from botocore import UNSIGNED


# Setup the AWS S3 Connection

In [21]:
# Key prefix ("folder") used for Team 6 objects inside the bucket
TEAM = "TEAM_6/"

# Name of the S3 bucket provided for the course
BUCKET_NAME = "cus635-spring2025"

# S3 client configured with an UNSIGNED signature version, i.e. anonymous access
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)


# Fetching Articles from the Guardian API

## Save Articles as a CSV

In [22]:
import requests
import pandas as pd
from tqdm import tqdm

import os  # imported here as well so this cell can run on its own

API_URL = "https://content.guardianapis.com/search"  # Guardian content search endpoint

# SECURITY: prefer supplying the key through the GUARDIAN_API_KEY environment
# variable. The literal fallback is kept only so the notebook still runs
# unchanged; a key committed to a public notebook should be rotated and the
# fallback removed.
API_KEY = os.environ.get("GUARDIAN_API_KEY", "80d8126f-3f17-4f76-ab02-e27428b23e63")

PAGE_SIZE = 200  # Number of results requested per page
total_articles = []  # Accumulator that process_articles() appends to

# Function to fetch articles with pagination and handle errors
def fetch_articles(page_num, timeout=30):
    """Fetch one page of Guardian search results for the "AI in health" query.

    Args:
        page_num: Page number to request (sent as the ``page`` parameter).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON response, or ``None`` if the request failed, the
        server returned an error status, or the body was not valid JSON.
        Failures are reported with a printed message rather than raised.
    """
    params = {
        "api-key": API_KEY,
        "section": "technology",
        "q": "AI in health",  # Search query
        "page-size": PAGE_SIZE,  # Number of articles per request
        "page": page_num,  # Page number for pagination
        "show-fields": "body",  # Ask for the article body/content as well
    }

    try:
        response = requests.get(API_URL, params=params, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON on requests
        # versions where that error is not a RequestException subclass.
        print(f"Error fetching page {page_num}: {e}")
        return None

# Function to process and store articles
def process_articles(articles_data, max_chars=500):
    """Append a cleaned ``{"heading", "context"}`` record per article to ``total_articles``.

    Args:
        articles_data: Decoded API response; expected to contain
            ``articles_data["response"]["results"]`` as a list of article dicts.
        max_chars: Maximum length of the stored body text. Longer bodies are
            cut to this length and suffixed with ``"..."``.

    Returns:
        None. Results are accumulated in the module-level ``total_articles``
        list. A payload without the expected keys is reported with a printed
        message and otherwise ignored.
    """
    if (
        not isinstance(articles_data, dict)
        or "response" not in articles_data
        or "results" not in articles_data["response"]
    ):
        print("Invalid response format")
        return

    for article in articles_data["response"]["results"]:
        heading = article.get("webTitle", "No title available")

        # "fields" or its "body" may be missing or explicitly null.
        fields = article.get("fields") or {}
        context = fields.get("body")
        if context is None:
            context = "No content available"

        # Strip first so surrounding whitespace neither counts towards the
        # limit nor survives in truncated text.
        context = context.strip()
        if len(context) > max_chars:
            cleaned_context = context[:max_chars] + "..."
        else:
            cleaned_context = context

        total_articles.append({
            "heading": heading,
            "context": cleaned_context
        })

# Fetch initial data to determine the total number of pages
# Fetch page 1 first: its metadata reports how many results exist in total.
initial_data = fetch_articles(1)
if not initial_data:
    print("Failed to fetch initial data. Exiting.")
    # Raise SystemExit directly instead of calling exit(): inside a notebook
    # exit() is IPython's kernel-exit helper rather than a plain exception.
    raise SystemExit(1)

# Number of pages needed to cover every result (ceiling division).
total_results = initial_data["response"]["total"]
total_pages = -(-total_results // PAGE_SIZE)

# Fetch and process every page, showing progress with tqdm.
for page_num in tqdm(range(1, total_pages + 1), desc="Fetching pages", unit="page"):
    # Page 1 is already in hand, so reuse it instead of requesting it again.
    articles_data = initial_data if page_num == 1 else fetch_articles(page_num)
    if articles_data:
        process_articles(articles_data)
    else:
        print(f"Skipping page {page_num} due to an error.")

# Build a DataFrame from the accumulated records.
df = pd.DataFrame(total_articles)

# Persist the results; later cells upload this CSV.
df.to_csv("AI_in_Health_Articles.csv", index=False)

# Quick preview of the first five articles.
print(df.head())

# Summary of how much was fetched.
print(f"Fetched {len(total_articles)} articles across {total_pages} pages.")


Fetching pages: 100%|██████████| 14/14 [00:15<00:00,  1.11s/page]

                                             heading  \
0  AI-driven weather prediction breakthrough repo...   
1  Prioritise artists over tech in AI copyright d...   
2  EU accused of leaving ‘devastating’ copyright ...   
3  If the best defence against AI is more AI, thi...   
4  AI ‘godfather’ predicts another revolution in ...   

                                             context  
0  <p>A single researcher with a desktop computer...  
1  <p>Two cross-party committees of MPs have urge...  
2  <p>An architect of EU copyright law has said l...  
3  <p>Oscar Wilde’s quip, “Life imitates art far ...  
4  <p>One of the “godfathers” of modern artificia...  
Fetched 2623 articles across 14 pages.





# Upload the CSV to S3

In [15]:
import boto3
from botocore.config import Config
from botocore import UNSIGNED

# Team - Specify your team number
# Team key prefix and target bucket for the upload
TEAM = "TEAM_6/"
BUCKET_NAME = "cus635-spring2025"

# Anonymous (unsigned) S3 client
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)

# Local CSV to upload; it is stored in the bucket under the team prefix
file_name = "AI_in_Health_Articles.csv"
s3.upload_file(Filename=file_name, Bucket=BUCKET_NAME, Key=TEAM + file_name)
print(f"File {file_name} uploaded successfully to {TEAM}{file_name}")


File AI_in_Health_Articles.csv uploaded successfully to TEAM_6/AI_in_Health_Articles.csv


# Download a File from S3

In [16]:
import boto3
from botocore.config import Config
from botocore import UNSIGNED

# Team - Specify your team number
# Team key prefix and source bucket for the download
TEAM = "TEAM_6/"
BUCKET_NAME = "cus635-spring2025"

# Anonymous (unsigned) S3 client
s3 = boto3.client(
    service_name="s3",
    config=Config(signature_version=UNSIGNED),
)

# Object to fetch (relative to the team prefix) and the local name to save it under
object_name = "AI_in_Health_Articles.csv"
new_name = "downloaded_AI_in_Health_Articles.csv"

s3.download_file(Bucket=BUCKET_NAME, Key=TEAM + object_name, Filename=new_name)
print(f"File '{TEAM + object_name}' downloaded successfully as '{new_name}'!")


File 'TEAM_6/AI_in_Health_Articles.csv' downloaded successfully as 'downloaded_AI_in_Health_Articles.csv'!


In [23]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')

# Write the articles DataFrame (built in the fetching cell) to a CSV in Drive
file_path = '/content/drive/My Drive/AI_in_Health_Articles.csv'
df.to_csv(path_or_buf=file_path, index=False)
print(f"File saved to Google Drive at {file_path}")


Mounted at /content/drive
File saved to Google Drive at /content/drive/My Drive/AI_in_Health_Articles.csv


In [24]:
# Download the object again to confirm it is available in the S3 bucket
s3.download_file(
    Bucket=BUCKET_NAME,
    Key="TEAM_6/AI_in_Health_Articles.csv",
    Filename="downloaded_AI_in_Health_Articles.csv",
)
print("File downloaded successfully!")


File downloaded successfully!
