In [33]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

In [34]:
# setup
chrome_options = Options()
chrome_options.add_argument('--headless')   # no browser window
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=chrome_options)

# dictionary for urls we're going to scrape
categories = {
    'Ai': 'https://techcrunch.com/category/artificial-intelligence/',
    'Meta': 'https://techcrunch.com/tag/meta/',
    'Gaming': 'https://techcrunch.com/category/gaming/',
    'Apple': 'https://techcrunch.com/tag/apple/',
    'Space': 'https://techcrunch.com/category/space/'
}

title = ''
link = ''
date = ''
image_url = ''
category_articles = []   # storing articles for current category
processed_links = set()  # To avoid duplicates
all_articles = []        # storing articles across all categories


# main loop (lopping through each category to scrape)
for category_name, url in categories.items():
    print(f"Scraping {category_name}...")
    
    # Scrape multiple pages
    max_pages = 10
    current_page = 1
    

    # change pages to load more and new content (techcrunch uses this format)
    while current_page <= max_pages:
        if current_page == 1:
            page_url = url
        else:
            page_url = f"{url.rstrip('/')}/page/{current_page}/"
        
        print(f"Scraping page {current_page}: {page_url}")
        
        # navigates to the given url and waits 3 seconds for it to load
        driver.get(page_url)
        time.sleep(3)

        # Scrolling to load more news (waiting 2 seconds for it load)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        # Try different container selectors
        # containers are used to extract info from each individual article
        # instead of searching through the entire HTML document
        article_containers = driver.find_elements(By.CSS_SELECTOR, '.post-block, .loop-card, article, [data-destinationlink], .river--homepage > div, .content > div')


        for container in article_containers:
            try:
                # extracting title and url
                title_elem = container.find_element(By.CSS_SELECTOR, '.loop-card__title-link')
                title = title_elem.text.strip()
                link = title_elem.get_attribute('href')
                
                if not title or not link or link in processed_links:
                    continue
                    
                # adding to processed links to avoid duplicates
                processed_links.add(link)
                    
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                # Extracting date
                date_elem = container.find_element(By.CSS_SELECTOR,  'time.loop-card__meta-item.loop-card__time.wp-block-tc23-post-time-ago')
                date = date_elem.get_attribute('datetime') or date_elem.text.strip()

                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                # Extracting image url
                img_elem = container.find_element(By.CSS_SELECTOR, 'img.wp-post-image')
                image_url = img_elem.get_attribute('src') or img_elem.get_attribute('data-src')


                
                # structurize and store scraped data in this format before saving as csv (making it clean)
                article_data = {
                    'title': title,
                    'link': link,
                    'date': date,
                    'image_url': image_url,
                    'category': category_name,
                    'source': 'techcrunch.com'
                }
                # collecting all articles for the current category
                category_articles.append(article_data)
                    
            except Exception as e:
                continue
        
        # switching to the next page until we reach max_pages
        current_page += 1
        time.sleep(2)

    # adding data from each category we just collected to all_articles which contains data from all categories
    all_articles.extend(category_articles)
    print(f"Found {len(category_articles)} articles in {category_name}")
    
    
driver.quit()


# Save to csv
if all_articles:
    df = pd.DataFrame(all_articles)
    df.to_csv('techcrunch_news.csv', index=False, encoding='utf-8-sig')
    df.to_json('techcrunch_news.json', index=False, force_ascii=False, orient='records', indent=2)
    print(f"Saved {len(all_articles)} articles total")
else:
    print("No articles found!")

Scraping Ai...
Scraping page 1: https://techcrunch.com/category/artificial-intelligence/
Scraping page 2: https://techcrunch.com/category/artificial-intelligence/page/2/
Scraping page 3: https://techcrunch.com/category/artificial-intelligence/page/3/
Scraping page 4: https://techcrunch.com/category/artificial-intelligence/page/4/
Scraping page 5: https://techcrunch.com/category/artificial-intelligence/page/5/
Scraping page 6: https://techcrunch.com/category/artificial-intelligence/page/6/
Scraping page 7: https://techcrunch.com/category/artificial-intelligence/page/7/
Scraping page 8: https://techcrunch.com/category/artificial-intelligence/page/8/
Scraping page 9: https://techcrunch.com/category/artificial-intelligence/page/9/
Scraping page 10: https://techcrunch.com/category/artificial-intelligence/page/10/
Found 298 articles in Ai
Scraping Meta...
Scraping page 1: https://techcrunch.com/tag/meta/
Scraping page 2: https://techcrunch.com/tag/meta/page/2/
Scraping page 3: https://techcr

In [35]:
dataset_path = "techcrunch_news.csv"
df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,title,link,date,image_url,category,source
0,Lovable’s CEO isn’t too worried about the vibe...,https://techcrunch.com/2025/09/01/lovables-ceo...,2025-09-01T02:00:00-07:00,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com
1,Nvidia says two mystery customers accounted fo...,https://techcrunch.com/2025/08/30/nvidia-says-...,2025-08-30T14:40:49-07:00,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com
2,Taco Bell is having second thoughts about rely...,https://techcrunch.com/2025/08/30/taco-bell-is...,2025-08-30T09:50:00-07:00,https://techcrunch.com/wp-content/uploads/2023...,Ai,techcrunch.com
3,Cracks are forming in Meta’s partnership with ...,https://techcrunch.com/2025/08/29/cracks-are-f...,2025-08-29T18:34:05-07:00,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com
4,Spotlight on AI at TechCrunch Disrupt: Don’t m...,https://techcrunch.com/2025/08/29/spotlight-on...,2025-08-29T14:05:00-07:00,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4314 entries, 0 to 4313
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      4314 non-null   object
 1   link       4314 non-null   object
 2   date       4314 non-null   object
 3   image_url  4314 non-null   object
 4   category   4314 non-null   object
 5   source     4314 non-null   object
dtypes: object(6)
memory usage: 202.3+ KB


In [37]:
# cleaning date column

# extracting just the date from the original column and storing it in a new one
df['date_only'] = df['date'].str[:10]

# doing to the with time
df['time_only'] = df['date'].str[11:19]

# dropping the original column cuz we dont need it anymore
df = df.drop('date', axis=1)

df.head()

Unnamed: 0,title,link,image_url,category,source,date_only,time_only
0,Lovable’s CEO isn’t too worried about the vibe...,https://techcrunch.com/2025/09/01/lovables-ceo...,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com,2025-09-01,02:00:00
1,Nvidia says two mystery customers accounted fo...,https://techcrunch.com/2025/08/30/nvidia-says-...,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com,2025-08-30,14:40:49
2,Taco Bell is having second thoughts about rely...,https://techcrunch.com/2025/08/30/taco-bell-is...,https://techcrunch.com/wp-content/uploads/2023...,Ai,techcrunch.com,2025-08-30,09:50:00
3,Cracks are forming in Meta’s partnership with ...,https://techcrunch.com/2025/08/29/cracks-are-f...,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com,2025-08-29,18:34:05
4,Spotlight on AI at TechCrunch Disrupt: Don’t m...,https://techcrunch.com/2025/08/29/spotlight-on...,https://techcrunch.com/wp-content/uploads/2025...,Ai,techcrunch.com,2025-08-29,14:05:00


In [38]:
df.isnull().sum()

title        0
link         0
image_url    0
category     0
source       0
date_only    0
time_only    0
dtype: int64