In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

# Function to extract the main heading (news title)
def extract_heading(soup):
    """Return the text of the page's main heading.

    Looks at the first ``<h1>``, then the first ``<h2>``, then the first
    ``<h3>`` in the parsed document and returns the stripped text of the
    first one that actually contains text.

    Args:
        soup: Parsed document exposing ``find(tag_name)``; the returned
            element must expose ``get_text(strip=True)``.

    Returns:
        The heading text, or ``"No Title Found"`` when no heading with
        non-empty text exists.
    """
    for heading_tag in ('h1', 'h2', 'h3'):
        heading = soup.find(heading_tag)
        if heading is None:
            continue
        title = heading.get_text(strip=True)
        # A heading that is present but empty is not a usable title; fall
        # through to the next heading level instead of returning "".
        if title:
            return title
    return "No Title Found"

# Optimized function to extract the main news content by analyzing text blocks within <div> and <p> tags
def extract_news_content(soup, significant_drop_threshold=0.2):
    """Extract the main article text from a parsed page.

    For every ``<div>``, the non-empty ``<p>`` texts found under it are
    collected (whitespace collapsed to single spaces). The div whose joined
    paragraph text is longest is taken as the article body; on a tie the
    earliest div wins. Its paragraphs are then kept in order until one is
    shorter than ``significant_drop_threshold`` times the previous
    paragraph's length, at which point the rest is discarded.

    Args:
        soup: Parsed document exposing ``find_all('div')``; each div must
            expose ``find_all('p')`` and each paragraph
            ``get_text(separator=' ', strip=True)``.
        significant_drop_threshold: Fraction of the previous paragraph's
            length below which a paragraph ends the article. The default
            ``0.2`` means an 80% drop in length stops the extraction.

    Returns:
        The selected paragraphs joined by single spaces, or
        ``"No Content Found"`` when no div contains paragraph text.
    """
    best_paragraphs = []
    best_length = 0

    for div in soup.find_all('div'):
        paragraphs = []
        for p in div.find_all('p'):
            text = p.get_text(separator=' ', strip=True)
            text = re.sub(r'\s+', ' ', text).strip()
            if text:
                paragraphs.append(text)

        if not paragraphs:
            continue

        # Length of the space-joined text, i.e. the same measure that was
        # used to rank divs before. Strict ">" keeps the first div on ties.
        total_length = sum(len(t) for t in paragraphs) + len(paragraphs) - 1
        if total_length > best_length:
            best_paragraphs = paragraphs
            best_length = total_length

    if not best_paragraphs:
        return "No Content Found"

    # Walk the real paragraph list. Re-splitting the joined text on newlines
    # can never work because all whitespace has already been collapsed to
    # single spaces, which would leave the cutoff below with nothing to do.
    selected = []
    previous_length = len(best_paragraphs[0])
    for text in best_paragraphs:
        current_length = len(text)
        if current_length < previous_length * significant_drop_threshold:
            break
        selected.append(text)
        previous_length = current_length

    return " ".join(selected)

# Step 1: Read the CSV file with low_memory=False to avoid DtypeWarning
csv_file_path ='C:\\Users\\Milan\\OneDrive\\Desktop\\New folder\\Data\\Random\\ayo\\2015.csv'
df = pd.read_csv(csv_file_path, low_memory=False)

# Step 2: Specify the column that contains the URLs
url_column = 'SOURCEURL'

# Step 3: Create a folder to save the scraped content
output_folder = 'scraped_contents_2024_earthqake'
os.makedirs(output_folder, exist_ok=True)

# Step 4: Define excluded domains (lower-case; compared against the
# lower-cased host because host names are case-insensitive)
excluded_domains = {
    "www.linkedin.com", "www.youtube.com", "www.facebook.com",
    "www.twitter.com", "www.help.twitter.com"
}

# Seconds to wait for a server before giving up, so that a single
# unresponsive host cannot stall the whole run.
request_timeout = 30

# Step 5: Open a log file to record any failed URLs. The context manager
# closes it even if an unexpected exception aborts the loop.
with open('eq_failed_urls_optimized.log', 'w', encoding='utf-8') as log_file:

    # Step 6: Iterate through the URLs and scrape content
    for index, row in df.iterrows():
        url = row[url_column]

        # Missing cells are read by pandas as NaN (a float); skip anything
        # that is not a non-empty string instead of crashing on it.
        if not isinstance(url, str) or not url.strip():
            log_file.write(f"Skipping row {index}: missing or invalid URL {url!r}\n")
            print(f"Skipping row {index}: missing or invalid URL {url!r}")
            continue

        domain = urlparse(url).netloc.lower()

        if domain in excluded_domains:
            print(f"Skipping {url} (excluded domain)")
            continue

        try:
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted sections (ads, footers, headers, etc.)
            for ad_section in soup(['aside', 'footer', 'header', 'nav', 'iframe', 'script', 'style']):
                ad_section.decompose()

            # Extract the main heading
            heading = extract_heading(soup)

            # Extract the optimized news content
            news_content = extract_news_content(soup)

            # Combine the heading and news content
            combined_content = f"{heading}\n\n{news_content}"

            # Clean up the text (note: this also collapses the blank line
            # between heading and content into a single space)
            combined_content = re.sub(r'\s+', ' ', combined_content)
            combined_content = combined_content.strip()

            # Define the file name
            file_name = f"url_content_{index + 1}.txt"
            file_path = os.path.join(output_folder, file_name)

            # Save the combined content to a text file
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(combined_content)

            print(f"Content from {url} saved to {file_path}")

        except requests.exceptions.RequestException as e:
            # Log the failed URL and error message (timeouts land here too)
            log_file.write(f"Failed to retrieve {url}: {e}\n")
            print(f"Failed to retrieve {url}: {e}")

print("Optimized scraping completed!")

Failed to retrieve http://www.couriermail.com.au/news/world/nepal-earthquake-fears-for-australians-missing/story-fnihsmjt-1227321312652?nk=09696f5f0b7330aa50f18308f0ccdacb: 404 Client Error: Not Found for url: https://www.couriermail.com.au/news/world/nepal-earthquake-fears-for-australians-missing/story-fnihsmjt-1227321312652?nk=09696f5f0b7330aa50f18308f0ccdacb
Failed to retrieve http://www.seacoastonline.com/article/20150425/NEWS/150429361/101017/NEWS: 404 Client Error: OK for url: https://www.seacoastonline.com/article/20150425/NEWS/150429361/101017/NEWS
Failed to retrieve http://www.seacoastonline.com/article/20150425/NEWS/150429361/101017/NEWS: 404 Client Error: OK for url: https://www.seacoastonline.com/article/20150425/NEWS/150429361/101017/NEWS
Failed to retrieve http://www.KAALtv.com/article/stories/S3777715.shtml: 404 Client Error: Not Found for url: https://www.kaaltv.com/article/stories/S3777715.shtml
Failed to retrieve http://www.KAALtv.com/article/stories/S3777715.shtml: 4