In [6]:
from datetime import datetime
from airflow.operators.python import PythonOperator
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import requests




In [18]:
def preprocess_text(text):
    """Normalize the whitespace in ``text``.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then leading and trailing whitespace is
    removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_data(url):
    """Scrape heading and paragraph text from ``url`` using headless Chrome.

    The page is loaded with Selenium, its HTML is parsed with BeautifulSoup,
    and every ``h1``-``h6`` heading becomes one row of the result.

    Args:
        url: Address of the page to load.

    Returns:
        A list of ``[title, description]`` lists, one per heading. The
        description is the text of the ``<p>`` tag found at the same position
        in document order, or ``"No description available"`` when the page has
        fewer paragraphs than headings.

    Note:
        Titles and descriptions are paired purely by position, so a stored
        description is not necessarily related to its title.
    """
    # Configure Chrome options to run headless
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # A Chrome WebDriver must be available for Selenium to start the browser.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # implicitly_wait() only sets the timeout used by element lookups;
        # it does not pause execution here.
        driver.implicitly_wait(10)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed.
        driver.quit()

    # Parse the HTML content of the page
    soup = BeautifulSoup(page_source, 'html.parser')

    # One row per heading (h1 to h6), each starting with the placeholder description.
    data = [
        [preprocess_text(heading.get_text(strip=True)), "No description available"]
        for heading in soup.find_all(re.compile('^h[1-6]$'))
    ]

    # Fill in descriptions from the <p> tags by position. zip() stops at the
    # shorter sequence, so paragraphs beyond the last heading are ignored.
    for row, paragraph in zip(data, soup.find_all('p')):
        row[1] = preprocess_text(paragraph.get_text(strip=True))

    return data




In [19]:
# NOTE: this repeats the preprocess_text definition from the earlier cell;
# when the cells run in order, this later definition is the one in effect.
def preprocess_text(text):
    """Return ``text`` with whitespace runs collapsed to one space and the ends trimmed."""
    return re.sub(r'\s+', ' ', text).strip()

def extract_data(url, timeout=10):
    """Scrape heading and paragraph text from ``url`` with requests + BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A list of ``[title, description]`` lists, one per ``h1``-``h6``
        heading. The description is the text of the ``<p>`` tag found at the
        same position in document order, or ``"No description available"``
        when the page has fewer paragraphs than headings.
        ``None`` is returned (after printing a message) when the request
        fails or the response status is not 200.

    Note:
        Titles and descriptions are paired purely by position, so a stored
        description is not necessarily related to its title.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network-level failures like a bad status: report and return None.
        print(f"Failed to fetch data from {url}: {exc}")
        return None

    # Anything other than 200 is reported as a failed fetch.
    if response.status_code != 200:
        print(f"Failed to fetch data from {url}")
        return None

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # One row per heading (h1 to h6), each starting with the placeholder description.
    data = [
        [preprocess_text(heading.get_text(strip=True)), "No description available"]
        for heading in soup.find_all(re.compile('^h[1-6]$'))
    ]

    # Fill in descriptions from the <p> tags by position. zip() stops at the
    # shorter sequence, so paragraphs beyond the last heading are ignored.
    for row, paragraph in zip(data, soup.find_all('p')):
        row[1] = preprocess_text(paragraph.get_text(strip=True))

    return data

# Extract data from Dawn.com
dawn_data = extract_data("https://www.dawn.com/")
dawn_df = pd.DataFrame(dawn_data, columns=['Title', 'Description'])

# Extract data from BBC.com
# (the variables are named geo_*, but the URL fetched here is bbc.com)
geo_data = extract_data("https://www.bbc.com/")
geo_df = pd.DataFrame(geo_data, columns=['Title', 'Description'])







In [20]:
# Display the Dawn dataframe on its own, before the sources are combined
print("Dawn Data:")

dawn_df




Dawn Data:


Unnamed: 0,Title,Description
0,"Today's Paper | May 15, 2024",Compunode.com Pvt. Ltd. (www.compunode.com).De...
1,End of live blog for elections 2024,"Copyright © 2024, Dawn"
2,Scrutiny concludes for nomination papers of 48...,NewsKit Publishing Platform
3,ECP notifies Mahmood Khan as PTI-P chairman,No description available
4,IHC approves Imran Khan’s bail in £190m corrup...,No description available
...,...,...
98,Chinese films shine once more at Cannes,No description available
99,Putin’s visit signifies high level of cooperation,No description available
100,China’s Swap Connect further enhanced to promo...,No description available
101,Chinese films shine once more at Cannes,No description available


In [21]:
# Display the BBC dataframe (stored in geo_df) on its own, before the sources are combined
print("\nBBC Data:")
geo_df





BBC Data:


Unnamed: 0,Title,Description
0,Slovak PM Robert Fico carried to car after bei...,PM Robert Fico was shot in what the Slovak Int...
1,"Crew trapped on Baltimore ship, weeks after br...","Morale is low for the Dali's crew members, who..."
2,Slovak PM Robert Fico carried to car after bei...,PM Robert Fico was shot in what the Slovak Int...
3,"Crew trapped on Baltimore ship, weeks after br...","Morale is low for the Dali's crew members, who..."
4,Slovak PM in life-threatening condition after ...,Robert Fico was shot as he left a meeting in H...
...,...,...
116,Travel,No description available
117,An F1-fanatic chef's guide to Emilia-Romagna,No description available
118,Where do all those Mother's Day flowers come f...,No description available
119,The US Founding Father who travelled the globe,No description available


In [22]:
# Combine both sources, clean the rows, and renumber the index.
# Built as one chain so each step returns a new frame instead of mutating a
# filtered slice in place.
combined_df = (
    # Concatenate DataFrames
    pd.concat([dawn_df, geo_df], ignore_index=True)
    # Remove rows containing '<h1>' to '<h6>' or '<p>' tags in 'Title' or 'Description'.
    # na=False makes a missing value count as "no match" so the negation stays boolean.
    .loc[lambda df: ~df['Title'].str.contains(r'<h[1-6]>|<p>', regex=True, case=False, na=False)]
    .loc[lambda df: ~df['Description'].str.contains(r'<h[1-6]>|<p>', regex=True, case=False, na=False)]
    # Remove rows with titles shorter than 15 characters
    .loc[lambda df: df['Title'].str.len() >= 15]
    # Drop duplicate rows based on the 'Title' column, keeping the first occurrence
    .drop_duplicates(subset=['Title'], keep='first')
    # Reset the index of the combined DataFrame
    .reset_index(drop=True)
)

# Save DataFrame to CSV file
csv_file = "articles_data.csv"
combined_df.to_csv(csv_file, index=False, encoding='utf-8')

# Print confirmation message
print(f"Data has been stored in '{csv_file}'.")




Data has been stored in 'articles_data.csv'.
