### Clustering Public Perception of Artificial Intelligence
### Author: Joshua Kwan
### Goal: Collect, clean, and cluster public discussions about AI

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date

In [6]:
# Load Data 
# Advanced AI News Scraper (GoogleNews Version)
# Purpose: Collect diverse, current media coverage on AI to
#          analyze public perception and real-world framing.

from GoogleNews import GoogleNews
import pandas as pd
from datetime import datetime, date

# CONFIGURATION

# Topics to focus on (edit freely)
SEARCH_TOPICS = [
    "Artificial Intelligence",
    "AI and mental health",
    "AI and creativity",
    "AI and education",
    "AI ethics",
    "AI and human behavior"
]

PERIOD = "1m"           # options: '1d', '7d', '1m', '3m', '6m'
MAX_PAGES = 5           # number of pages per topic (≈10 results/page)
REGION = "US"           # e.g. 'US', 'GB', 'IN'
LANG = "en"             # language
SAVE_FILENAME = f"ai_news_dataset_full_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"

# Words to exclude (filters out irrelevant stock/crypto news)
EXCLUDE_KEYWORDS = ["stock", "token", "crypto", "bitcoin", "price", "tokenization"]

# SCRAPER LOGIC

def fetch_news_for_topic(topic):
    googlenews = GoogleNews(lang=LANG, region=REGION, period=PERIOD)
    googlenews.search(topic)
    results = []
    for page in range(1, MAX_PAGES + 1):
        googlenews.getpage(page)
        results.extend(googlenews.result())
    return results

all_results = []
for topic in SEARCH_TOPICS:
    print(f"🔍 Fetching: {topic}")
    topic_results = fetch_news_for_topic(topic)
    all_results.extend(topic_results)
    print(f"✅ {len(topic_results)} articles collected for '{topic}'\n")

#  CLEANING & FORMATTING

df = pd.DataFrame(all_results)

# Ensure consistent columns
expected_cols = ['title', 'media', 'date', 'desc', 'link']
for col in expected_cols:
    if col not in df.columns:
        df[col] = None

# Drop duplicates and missing values
df = df.dropna(subset=['title'])
df = df.drop_duplicates(subset=['title'])

# Rename columns for consistency
df.rename(columns={'title': 'text', 'media': 'source'}, inplace=True)

# Add metadata
df['date_fetched'] = date.today()
df['type'] = 'news'

# Filter out irrelevant articles (stocks, crypto, etc.)
mask = ~df['text'].str.lower().str.contains('|'.join(EXCLUDE_KEYWORDS), na=False)
df = df[mask]

# Optional: remove non-English or foreign sources
df = df[~df['source'].str.contains("Noticias|Zeitung|Le Monde|El País|Der", na=False)]

# SAVE RESULTS

df.to_csv(SAVE_FILENAME, index=False)
print(f"\n✅ {len(df)} total articles saved to {SAVE_FILENAME}")
print("Example rows:\n")
pd.set_option("display.max_colwidth", 100)
display(df.sample(10))

🔍 Fetching: Artificial Intelligence
✅ 200 articles collected for 'Artificial Intelligence'

🔍 Fetching: AI and mental health
✅ 150 articles collected for 'AI and mental health'

🔍 Fetching: AI and creativity
✅ 200 articles collected for 'AI and creativity'

🔍 Fetching: AI and education
✅ 200 articles collected for 'AI and education'

🔍 Fetching: AI ethics
✅ 200 articles collected for 'AI ethics'

🔍 Fetching: AI and human behavior
✅ 200 articles collected for 'AI and human behavior'


✅ 289 total articles saved to ai_news_dataset_full_20251029_1708.csv
Example rows:



Unnamed: 0,text,source,date,datetime,desc,link,img,date_fetched,type
193,"Why Context, Not Prompts, Is Key to Enterprise Reliability",PYMNTS.com,3 hours ago,2025-10-29 14:08:38.490204,"Forget fine-tuning: The future of reliable enterprise AI could be in context engineering, not be...",https://www.pymnts.com/artificial-intelligence-2/2025/why-context-not-prompts-is-key-to-enterpri...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
199,"AI errors, Intel supply strain, nuclear plant breach | Ep. 9",Computerworld,3 hours ago,2025-10-29 14:08:38.509444,In today's Tech Briefing: A global study finds AI chatbots get the news wrong nearly half the ti...,https://www.computerworld.com/podcast/4081279/ai-errors-intel-supply-strain-nuclear-plant-breach...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
344,Preliminary Report on Dangers of AI Chatbots,Psychiatric Times,3 weeks ago,2025-10-08 17:08:42.244218,"AI chatbots pose significant risks in mental health, often exacerbating issues like self-harm an...",https://www.psychiatrictimes.com/view/preliminary-report-on-dangers-of-ai-chatbots&ved=2ahUKEwiU...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
681,"Google Invests $2.85 Million to Strengthen South Africa’s AI Skills, Education, and Cybersecurity",iAfrica.com,2 hours ago,2025-10-29 15:08:47.656804,"On the eve of AI Expo Africa 2025, Google.org announced a $2.85 million (~R49 million) investmen...",https://iafrica.com/google-invests-2-85-million-to-strengthen-south-africas-ai-skills-education-...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
752,The digital ethics curriculum: Should every university require a 'how to work with AI' course?,The Times of India,3 hours ago,2025-10-29 14:08:48.577929,News News: Universities face growing pressure to teach AI literacy as graduates enter workplaces...,https://timesofindia.indiatimes.com/education/news/the-digital-ethics-curriculum-should-every-un...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
1,MIT Technology Review and the Financial Times form strategic editorial partnership to explore th...,StreetInsider,0 minutes ago,2025-10-29 17:08:35.551529,... 2025 /PRNewswire/ -- MIT Technology Review and the Financial Times have formed a strategic e...,https://www.streetinsider.com/PRNewswire/MIT%2BTechnology%2BReview%2Band%2Bthe%2BFinancial%2BTim...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
482,Toronto based creative technologists create AI-powered Halloween playlist designed to calm dogs ...,Cantech Letter,1 day ago,2025-10-28 17:08:44.685540,"The Gentle Tails, a new 35-track playlist by Toronto-based innovation agency WTD, uses AI and ca...",https://www.cantechletter.com/newswires/toronto-based-creative-technologists-create-ai-powered-h...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
203,"As A.I. grows more realistic, experts caution against emotional dependence",13newsnow.com,4 hours ago,2025-10-29 13:08:40.291199,"As teens seek connection online, A.I. companions fill the gap, but experts address the hidden em...",https://www.13newsnow.com/article/tech/ai-grows-more-realistic-experts-caution-against-emotional...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
1147,A foundation model to predict and capture human cognition,Nature,3 months ago,2025-07-29 17:08:53.893814,"Here we introduce Centaur, a computational model that can predict and simulate human behaviour i...",https://www.nature.com/articles/s41586-025-09215-4&ved=2ahUKEwij0sXRqMqQAxWOvokEHS1XNzY4KBDF9AF6...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
996,Top 24 Applications of AI: Transforming Industries Today,Simplilearn.com,2 months ago,2025-08-29 17:08:52.715094,24 Artificial Intelligence Applications: 1. E-Commerce 2. Education 3. Lifestyle 4. Navigation 5...,https://www.simplilearn.com/tutorials/artificial-intelligence-tutorial/artificial-intelligence-a...,"data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",2025-10-29,news
