In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_articles_from_page(page_number, timeout=30):
    """Scrape the articles linked from one page of the express.pk science archive.

    Downloads the archive listing page, follows every anchor carrying the
    ``image`` class inside a ``<div>`` carrying the ``story`` class, and
    extracts the title and paragraph text of each linked article.

    Args:
        page_number: Value substituted into the ``?page=`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A ``(titles, urls, contents)`` tuple of three parallel lists with one
        entry per article that was fetched successfully. A title is
        ``'No title found'`` when the article has no ``<h1 class="title">``;
        a content is ``''`` when the story-content ``<div>`` is missing.

    Network and HTTP errors are reported with ``print`` and the affected page
    or article is skipped instead of aborting the run.
    """
    from urllib.parse import urljoin

    base_url = f'https://www.express.pk/science/archives/?page={page_number}'
    titles = []
    urls = []
    contents = []

    try:
        response = requests.get(base_url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page as a listing.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_number}: {exc}")
        return titles, urls, contents

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all article containers on the page
    article_containers = soup.find_all('div', {'class': 'story'})

    # Debug to check the containers found
    if not article_containers:
        print(f"No articles found on page {page_number}")

    for container in article_containers:
        # Find all 'a' tags with the class 'image' within the article container
        for a_tag in container.find_all('a', href=True, class_='image'):
            # Resolve against the listing URL so relative hrefs are fetchable;
            # absolute hrefs pass through unchanged.
            article_url = urljoin(base_url, a_tag['href'])

            try:
                article_response = requests.get(article_url, timeout=timeout)
                article_response.raise_for_status()
            except requests.RequestException as exc:
                # One bad article must not discard everything scraped so far.
                print(f"Skipping article {article_url}: {exc}")
                continue

            article_soup = BeautifulSoup(article_response.text, 'html.parser')

            # Extract the title
            title_tag = article_soup.find('h1', class_='title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title found'

            # Extract the content: all paragraph texts joined by single spaces
            content_div = article_soup.find('div', class_='span-16 story-content last mobile-story-content fix-l-r')
            content = ''
            if content_div:
                content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))

            # Keep the three lists aligned: one entry each per article
            titles.append(title)
            urls.append(article_url)
            contents.append(content)

    return titles, urls, contents

# Function to scrape articles from multiple pages
def scrape_articles(max_pages):
    """Collect articles from archive pages 1 through ``max_pages`` inclusive.

    Calls ``scrape_articles_from_page`` once per page number, in ascending
    order, and concatenates the per-page results.

    Returns:
        A ``(titles, urls, contents)`` tuple of three lists holding the
        combined results of every page, in page order.
    """
    all_titles, all_urls, all_contents = [], [], []

    for page_number in range(1, max_pages + 1):
        page_titles, page_urls, page_contents = scrape_articles_from_page(page_number)
        all_titles += page_titles
        all_urls += page_urls
        all_contents += page_contents

    return all_titles, all_urls, all_contents

# Number of archive pages to crawl (pages 1..max_pages inclusive).
# Lower this for a quick trial run.
max_pages = 643
all_titles, all_urls, all_contents = scrape_articles(max_pages)

# One row per article; columns are Title, URL, Content in that order.
df = pd.DataFrame(dict(Title=all_titles, URL=all_urls, Content=all_contents))

# Write the table to disk without the pandas row index.
df.to_csv('express_science_articles.csv', index=False)
print("Scraping completed")


Scraping completed


In [None]:
!pip install datasets
import pandas as pd
import os
import time
import requests
from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_dataset, Dataset
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, HfFolder



# Target dataset repository on the Hugging Face Hub and the local CSV to append to it.
repo_name = "snehagautam/nlp_webscraping"
csv_file_path = "/content/jang_archive_articles_02-08-2016 to 01-08-2017.csv"

# SECURITY: an access token used to be hard-coded at this point. Tokens must never
# live in a notebook; the previously embedded token should be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
# The token is now taken from the HF_TOKEN environment variable, falling back to an
# interactive prompt whose input is not echoed or stored in the notebook.
from getpass import getpass
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Load what is already published so the new rows are appended rather than replacing it.
existing_dataset = load_dataset(repo_name, split='train')
existing_df = existing_dataset.to_pandas().reset_index(drop=True)

# Rename the scraper's CSV columns to the names used by the published dataset.
new_df = pd.read_csv(csv_file_path)
new_df = new_df.rename(columns={
    'Title': 'heading',
    'URL': 'url',
    'Content': 'content'
})

combined_df = pd.concat([existing_df, new_df], ignore_index=True)

# Drop the leftover pandas index column if an earlier upload stored one.
if '__index_level_0__' in combined_df.columns:
    combined_df = combined_df.drop(columns=['__index_level_0__'])

combined_dataset = Dataset.from_pandas(combined_df, preserve_index=False)

# Pass the token explicitly instead of persisting it to disk.
api = HfApi(token=hf_token)
combined_dataset.push_to_hub(repo_name, token=hf_token)


In [None]:
!pip install datasets
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import HfApi, HfFolder

# Imported here because this cell's import block does not provide them.
import os
from getpass import getpass

# Hugging Face repo name and CSV file path
repo_name = "snehagautam/nlp_webscraping"
csv_file_path = "/content/jang_archive_articles_02-08-2016 to 01-08-2017.csv"

# SECURITY: an access token used to be hard-coded in this cell. Tokens must never
# live in a notebook; the previously embedded token should be treated as leaked and
# revoked at https://huggingface.co/settings/tokens.
# The token is now taken from the HF_TOKEN environment variable, falling back to an
# interactive prompt whose input is not echoed or stored in the notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Load the existing dataset from the Hugging Face repository
existing_dataset = load_dataset(repo_name, split='train')
existing_df = existing_dataset.to_pandas().reset_index(drop=True)

# Read the large CSV in chunks, renaming columns to match the existing dataset
chunk_size = 10000  # Rows per chunk
new_df_chunks = pd.read_csv(csv_file_path, chunksize=chunk_size)
column_renames = {'Title': 'heading', 'URL': 'url', 'Content': 'content'}
renamed_chunks = [chunk.rename(columns=column_renames) for chunk in new_df_chunks]

# Concatenate once: growing a DataFrame with concat inside the loop re-copies all
# previously read rows on every iteration (quadratic work).
# pd.concat raises on an empty list, so fall back to an empty frame.
new_df = pd.concat(renamed_chunks, ignore_index=True) if renamed_chunks else pd.DataFrame()

# Combine the old and new datasets
combined_df = pd.concat([existing_df, new_df], ignore_index=True)

# Drop unnecessary index column if present
if '__index_level_0__' in combined_df.columns:
    combined_df = combined_df.drop(columns=['__index_level_0__'])

# Convert the combined DataFrame to a Hugging Face Dataset
combined_dataset = DatasetDict({
    'train': Dataset.from_pandas(combined_df, preserve_index=False)
})

# Hugging Face API setup; the token is passed explicitly instead of saved to disk
api = HfApi(token=hf_token)

# Push the combined dataset to the Hugging Face Hub
combined_dataset.push_to_hub(repo_name, token=hf_token)

print("Dataset push complete!")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_articles_from_page(page_number, timeout=30):
    """Scrape the articles linked from one page of the express.pk crime archive.

    Downloads the archive listing page, follows every anchor carrying the
    ``image`` class inside a ``<div>`` carrying the ``story`` class, and
    extracts the title and paragraph text of each linked article.

    Args:
        page_number: Value substituted into the ``?page=`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A ``(titles, urls, contents)`` tuple of three parallel lists with one
        entry per article that was fetched successfully. A title is
        ``'No title found'`` when the article has no ``<h1 class="title">``;
        a content is ``''`` when the story-content ``<div>`` is missing.

    Network and HTTP errors are reported with ``print`` and the affected page
    or article is skipped instead of aborting the run.
    """
    from urllib.parse import urljoin

    base_url = f'https://www.express.pk/crime/archives/?page={page_number}'
    titles = []
    urls = []
    contents = []

    try:
        response = requests.get(base_url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page as a listing.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_number}: {exc}")
        return titles, urls, contents

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all article containers on the page
    article_containers = soup.find_all('div', {'class': 'story'})

    # Debug to check the containers found
    if not article_containers:
        print(f"No articles found on page {page_number}")

    for container in article_containers:
        # Find all 'a' tags with the class 'image' within the article container
        for a_tag in container.find_all('a', href=True, class_='image'):
            # Resolve against the listing URL so relative hrefs are fetchable;
            # absolute hrefs pass through unchanged.
            article_url = urljoin(base_url, a_tag['href'])

            try:
                article_response = requests.get(article_url, timeout=timeout)
                article_response.raise_for_status()
            except requests.RequestException as exc:
                # One bad article must not discard everything scraped so far.
                print(f"Skipping article {article_url}: {exc}")
                continue

            article_soup = BeautifulSoup(article_response.text, 'html.parser')

            # Extract the title
            title_tag = article_soup.find('h1', class_='title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title found'

            # Extract the content: all paragraph texts joined by single spaces
            content_div = article_soup.find('div', class_='span-16 story-content last mobile-story-content fix-l-r')
            content = ''
            if content_div:
                content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))

            # Keep the three lists aligned: one entry each per article
            titles.append(title)
            urls.append(article_url)
            contents.append(content)

    return titles, urls, contents

# Function to scrape articles from multiple pages
def scrape_articles(max_pages):
    """Collect articles from archive pages 1 through ``max_pages`` inclusive.

    Calls ``scrape_articles_from_page`` once per page number, in ascending
    order, and concatenates the per-page results.

    Returns:
        A ``(titles, urls, contents)`` tuple of three lists holding the
        combined results of every page, in page order.
    """
    all_titles, all_urls, all_contents = [], [], []

    for page_number in range(1, max_pages + 1):
        page_titles, page_urls, page_contents = scrape_articles_from_page(page_number)
        all_titles += page_titles
        all_urls += page_urls
        all_contents += page_contents

    return all_titles, all_urls, all_contents

# Number of archive pages to crawl (pages 1..max_pages inclusive).
# Lower this for a quick trial run.
max_pages = 280
all_titles, all_urls, all_contents = scrape_articles(max_pages)

# One row per article; columns are Title, URL, Content in that order.
df = pd.DataFrame(dict(Title=all_titles, URL=all_urls, Content=all_contents))

# Write the table to disk without the pandas row index.
df.to_csv('express_crime_articles.csv', index=False)
print("Scraping completed")


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_articles_from_page(page_number, timeout=30):
    """Scrape the articles linked from one page of the express.pk business archive.

    Downloads the archive listing page, follows every anchor carrying the
    ``image`` class inside a ``<div>`` carrying the ``story`` class, and
    extracts the title and paragraph text of each linked article.

    Args:
        page_number: Value substituted into the ``?page=`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A ``(titles, urls, contents)`` tuple of three parallel lists with one
        entry per article that was fetched successfully. A title is
        ``'No title found'`` when the article has no ``<h1 class="title">``;
        a content is ``''`` when the story-content ``<div>`` is missing.

    Network and HTTP errors are reported with ``print`` and the affected page
    or article is skipped instead of aborting the run.
    """
    from urllib.parse import urljoin

    base_url = f'https://www.express.pk/business/archives/?page={page_number}'
    titles = []
    urls = []
    contents = []

    try:
        response = requests.get(base_url, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page as a listing.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_number}: {exc}")
        return titles, urls, contents

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all article containers on the page
    article_containers = soup.find_all('div', {'class': 'story'})

    # Debug to check the containers found
    if not article_containers:
        print(f"No articles found on page {page_number}")

    for container in article_containers:
        # Find all 'a' tags with the class 'image' within the article container
        for a_tag in container.find_all('a', href=True, class_='image'):
            # Resolve against the listing URL so relative hrefs are fetchable;
            # absolute hrefs pass through unchanged.
            article_url = urljoin(base_url, a_tag['href'])

            try:
                article_response = requests.get(article_url, timeout=timeout)
                article_response.raise_for_status()
            except requests.RequestException as exc:
                # One bad article must not discard everything scraped so far.
                print(f"Skipping article {article_url}: {exc}")
                continue

            article_soup = BeautifulSoup(article_response.text, 'html.parser')

            # Extract the title
            title_tag = article_soup.find('h1', class_='title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title found'

            # Extract the content: all paragraph texts joined by single spaces
            content_div = article_soup.find('div', class_='span-16 story-content last mobile-story-content fix-l-r')
            content = ''
            if content_div:
                content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))

            # Keep the three lists aligned: one entry each per article
            titles.append(title)
            urls.append(article_url)
            contents.append(content)

    return titles, urls, contents

# Function to scrape articles from multiple pages
def scrape_articles(max_pages):
    """Collect articles from archive pages 1 through ``max_pages`` inclusive.

    Calls ``scrape_articles_from_page`` once per page number, in ascending
    order, and concatenates the per-page results.

    Returns:
        A ``(titles, urls, contents)`` tuple of three lists holding the
        combined results of every page, in page order.
    """
    all_titles, all_urls, all_contents = [], [], []

    for page_number in range(1, max_pages + 1):
        page_titles, page_urls, page_contents = scrape_articles_from_page(page_number)
        all_titles += page_titles
        all_urls += page_urls
        all_contents += page_contents

    return all_titles, all_urls, all_contents

# Number of archive pages to crawl (pages 1..max_pages inclusive).
# Lower this for a quick trial run.
max_pages = 2114
all_titles, all_urls, all_contents = scrape_articles(max_pages)

# One row per article; columns are Title, URL, Content in that order.
df = pd.DataFrame(dict(Title=all_titles, URL=all_urls, Content=all_contents))

# Write the table to disk without the pandas row index.
df.to_csv('express_business_articles.csv', index=False)
print("Scraping completed")
