In [17]:
import os
import requests
from bs4 import BeautifulSoup
from googlesearch import search

def _safe_folder_name(title, max_bytes=255):
    """Return *title* reduced to one filesystem-friendly path component.

    Every non-alphanumeric character is replaced by ``_``. The result is then
    cut to at most *max_bytes* of UTF-8 so that a very long page title cannot
    produce a directory name the filesystem refuses to create.
    """
    sanitized = ''.join(c if c.isalnum() else '_' for c in title)
    # Truncate on the encoded size; 'ignore' drops a character split by the cut.
    return sanitized.encode('utf-8')[:max_bytes].decode('utf-8', 'ignore')


def search_articles(query, num_results, timeout=10):
    """Search Google for news articles about *query* and archive each result.

    For every result URL (links containing ``twitter.com`` or ``facebook.com``
    are skipped) the page is fetched and its ``<title>`` and meta description
    are read. A folder named after the title is created in the current
    working directory; every ``<img>`` whose ``src`` / ``dataset-src`` is an
    absolute http(s) link is downloaded into it, and an ``article_info.txt``
    file with the title, summary and URL is written alongside.

    Args:
        query: Search terms; ``" news article"`` is appended before searching.
        num_results: Maximum number of articles to retrieve. Zero or a
            negative value returns an empty list without searching.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``summary``
        (``None`` when the page has no meta description) and ``images``
        (one dict per ``<img>`` tag, mapping the attribute name to its link;
        the dict is empty when the tag carried no usable link). Pages that
        cannot be fetched are skipped, so the list may be shorter than
        *num_results*.
    """
    articles = []

    # NOTE(review): googlesearch appears to treat a falsy ``stop`` as
    # "no limit", so a request for zero articles must not reach search().
    if num_results <= 0:
        return articles

    # Perform Google search and retrieve news articles only
    search_query = query + " news article"

    for url in search(search_query, stop=num_results):
        if 'twitter.com' in url or 'facebook.com' in url:
            continue  # Skip social media links

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            continue  # One unreachable page must not abort the whole search
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract article title; fall back to the URL when the page has no
        # <title> (or an empty one) so there is always a usable folder name.
        title_tag = soup.find('title')
        title = (title_tag.get_text() if title_tag else '') or url

        # Extract article summary (if available)
        summary_tag = soup.find('meta', attrs={'name': 'description'})
        summary = summary_tag.get('content') if summary_tag else None

        # Create a folder for the article's images
        folder_name = _safe_folder_name(title)
        os.makedirs(folder_name, exist_ok=True)

        image_info = []
        for i, img_tag in enumerate(soup.find_all('img')):
            image_data = {}

            # Keep only absolute links; relative paths and data: URIs
            # cannot be passed to requests.get() as they are.
            src = img_tag.get('src')
            if src and src.startswith(('http://', 'https://')):
                image_data['src'] = src

            dataset_src = img_tag.get('dataset-src')
            if dataset_src and dataset_src.startswith(('http://', 'https://')):
                image_data['dataset-src'] = dataset_src

            # Download and save each image in the folder
            for attr_name, img_link in image_data.items():
                try:
                    img_data = requests.get(img_link, timeout=timeout).content
                except requests.RequestException:
                    # The link stays in the metadata even though the file
                    # itself could not be saved.
                    continue
                img_path = os.path.join(folder_name, f'image_{i}_{attr_name}.jpg')
                with open(img_path, 'wb') as img_file:
                    img_file.write(img_data)

            image_info.append(image_data)

        # Store article data in a dictionary
        articles.append({
            'title': title,
            'url': url,
            'summary': summary,
            'images': image_info,
        })

        # Create a text file with article information. The encoding is
        # explicit because titles routinely contain non-ASCII characters
        # that the platform default codec may not be able to encode.
        text_file_path = os.path.join(folder_name, 'article_info.txt')
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(f"Title: {title}\n")
            text_file.write(f"Summary: {summary}\n")
            text_file.write(f"URL: {url}\n")

        if len(articles) >= num_results:
            break  # Stop retrieving more articles once desired count is reached

    return articles

# Example usage: ask for the search terms and the article count, then search
query = input("Enter your search query: ")
num_results = int(input("Enter the number of articles to retrieve: "))
articles = search_articles(query, num_results)

# Print a short report for every article that was retrieved
for position, entry in enumerate(articles, 1):
    print(f"Article {position}:")
    for label, key in (("Title:", 'title'), ("URL:", 'url'), ("Summary:", 'summary')):
        print(label, entry[key])
    print("Images:")
    # Each <img> tag contributed one dict of attribute name -> image link
    for image_links in entry['images']:
        for attribute, link in image_links.items():
            print(f"Attribute: {attribute}")
            print("Image Link:", link)
            print()
    print()


Enter your search query: mehul choksi
Enter the number of articles to retrieve: 3
Article 1:
Title: ‘Mehul Choksi can’t be removed from Antigua and Barbuda’: What the court ruled | Explained News,The Indian Express
URL: https://indianexpress.com/article/explained/mehul-choksi-cannot-be-removed-antigua-8557711/
Summary: According to Mehul Choksi's account, in 2021, RAW agents kidnapped him, torturing him and coercing him to admit to wrongdoing.
Images:
Attribute: src
Image Link: https://sb.scorecardresearch.com/p?c1=2&c2=8738137&cv=2.0&cj=1

Attribute: src
Image Link: https://www.facebook.com/tr?id=444470064056909&ev=PageView&noscript=1

Attribute: src
Image Link: https://indianexpress.com/wp-content/themes/indianexpress/images/facebook-icon.svg

Attribute: src
Image Link: https://indianexpress.com/wp-content/themes/indianexpress/images/twitter-icon.svg

Attribute: src
Image Link: https://indianexpress.com/wp-content/themes/indianexpress/images/youtube-icon.svg

Attribute: src
Image Lin