In [3]:
pip install yake



In [4]:
import requests
from bs4 import BeautifulSoup
import time
from itertools import cycle
import asyncio  # ✅ Added for async execution
from transformers import pipeline  # ✅ Keep this as it is
import yake
import spacy
from diffusers import StableDiffusionPipeline
import torch
import os
import time


In [5]:
BASE_URL = "https://indianexpress.com/section/{topics}/{subtopics}/page/{page_no}"

categories = {
    "maharashtra": ["", "pune", "nagpur"],
    "sports": ["", "cricket", "football", "chess"]
}

category_pairs = [(cat, sub) for cat, subs in categories.items() for sub in subs]  # ✅ Kept unchanged
cyclic_iterator = cycle(category_pairs)  # ✅ Kept unchanged

headers = {  # ✅ Your headers are now properly included
    "Referer": "https://www.google.com/",
    "User-Agent": "Mozilla/5.0"
}

article_indices = {pair: 0 for pair in category_pairs}
article_cache = {pair: [] for pair in category_pairs}
current_page = {pair: 1 for pair in category_pairs}

async def get_article_links(topics, subtopics, page_no):
    """Fetch article links for a given category and page asynchronously."""
    if article_cache[(topics, subtopics)]:  # If cache is already filled, return
        return article_cache[(topics, subtopics)]

    url = BASE_URL.format(topics=topics, subtopics=subtopics, page_no=page_no)
    response = await asyncio.to_thread(requests.get, url, headers=headers)  # ✅ Now using your headers

    if response.status_code != 200:
        print(f"Unable to fetch {url}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    articles_link = []
    for link in soup.find_all(['h2', 'h3']):  # ✅ Handle different page structures
        a_tag = link.find('a')
        if a_tag and "href" in a_tag.attrs:
            articles_link.append(a_tag["href"])

    article_cache[(topics, subtopics)] = articles_link
    return articles_link

async def get_article(article_url,topic,subtopic):
    """Scrape article details from the given URL asynchronously."""
    response_article = await asyncio.to_thread(requests.get, article_url, headers=headers)  # ✅ Now using your headers

    if response_article.status_code != 200:
        print(f"Unable to fetch article: {article_url}")
        return None

    soup_article = BeautifulSoup(response_article.text, 'html.parser')

    heading_tag = soup_article.find('h1', class_='native_story_title')
    heading = heading_tag.text.strip() if heading_tag else "No Title Found"

    content_tag = soup_article.find('div', class_='story_details')
    all_p = [p.get_text(strip=True) for p in content_tag.find_all(['p', 'h4'])] if content_tag else []
    full_text = " ".join(all_p)

    discription_tag = soup_article.find('h2', class_='synopsis')
    discription = discription_tag.text.strip() if discription_tag else "No Description Found"

    metadata_tag = soup_article.find('div', class_='editor')
    by = metadata_tag.find('a').text if metadata_tag else "No Author Found"
    date = metadata_tag.find('span').text if metadata_tag else "No Date Found"

    return {'heading': heading, 'by': by, 'date': date, 'discription': discription,'topic':topic,'subtopic':subtopic, 'content': full_text, 'og_link': article_url}


In [6]:
async def scrape_article():
    """Scraping pipeline: Fetch article links, extract content, and process data asynchronously."""
    topics, subtopics = next(cyclic_iterator)

    if not article_cache[(topics, subtopics)]:
        current_page[(topics, subtopics)] += 1
        await get_article_links(topics, subtopics, current_page[(topics, subtopics)])

    if not article_cache[(topics, subtopics)]:
        return None

    index = article_indices[(topics, subtopics)] % len(article_cache[(topics, subtopics)])
    article_url = article_cache[(topics, subtopics)].pop(index)

    article = await get_article(article_url,topics,subtopics)
    return article


In [7]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def async_summarize_long_text(text):
    """Summarizes long text asynchronously by first chunking it."""

    def chunk_text(text, max_tokens=400):
        """Breaks long text into manageable chunks."""
        words = text.split()
        return [" ".join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]

    chunks = chunk_text(text)

    # ✅ Summarize each chunk in parallel
    summaries = await asyncio.gather(
        *[asyncio.to_thread(summarizer, chunk,
            max_length=min(int(len(chunk.split()) * 0.8), 200),
            min_length=50,
            do_sample=False
        ) for chunk in chunks]
    )

    # ✅ Extract text from summaries
    summaries = [s[0]['summary_text'] for s in summaries]

    # ✅ Merge and summarize again if multiple chunks exist
    final_summary_text = " ".join(summaries)
    if len(summaries) > 1:
        final_summary = await asyncio.to_thread(
            summarizer, final_summary_text,
            max_length=min(int(len(final_summary_text.split()) * 0.8), 200),
            min_length=50,
            do_sample=False
        )
        return final_summary[0]['summary_text']

    return final_summary_text


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
def summerize_article(article):
  summary = async_summarize_long_text(article)
  return summary

In [9]:
article = scrape_article()
print(article)

<coroutine object scrape_article at 0x78b9f09f2f00>


In [10]:
!pip install yake
!pip install spacy



In [11]:
nlp = spacy.load("en_core_web_sm")  # ✅ Keeping your exact SpaCy model

async def extract_keywords(text, num_keywords=7):
    """Extracts keywords using YAKE and spaCy for better SEO optimization."""

    # ✅ Using YAKE for keyword ranking (async)
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=num_keywords)
    yake_keywords = await asyncio.to_thread(kw_extractor.extract_keywords, text)  # ✅ Async call
    yake_keywords = [kw[0] for kw in yake_keywords]

    # ✅ Using spaCy for extracting nouns & proper nouns (async)
    doc = await asyncio.to_thread(nlp, text)  # ✅ Running SpaCy processing asynchronously
    spacy_keywords = [token.text.lower() for token in doc if token.pos_ in ["NOUN", "PROPN"] and not token.is_stop]

    # ✅ Merge YAKE & SpaCy keywords (remove duplicates)
    all_keywords = list(set(yake_keywords + spacy_keywords))[:num_keywords]

    return all_keywords

In [12]:
async def seo_optimize(content, title, description, author, date_time, summary, article_url):
    """Formats content for better SEO with metadata, Open Graph, and structured data."""

    # ✅ Extract keywords asynchronously
    keywords = await extract_keywords(content)

    seo_content = f"""
    <html lang="en">
    <head>
        <title>{title} | {' '.join(keywords)}</title>

        <!-- Basic SEO Meta Tags -->
        <meta name="description" content="{description}">
        <meta name="keywords" content="{', '.join(keywords)}">
        <meta name="author" content="{author}">
        <meta name="date&time" content="{date_time}">
        <meta name="robots" content="index, follow">

        <!-- Open Graph Metadata (For Social Media) -->
        <meta property="og:title" content="{title}">
        <meta property="og:description" content="{description}">
        <meta property="og:image" content="dfsf">
        <meta property="og:url" content="{article_url}">
        <meta property="og:type" content="article">

        <!-- Twitter Card Metadata -->
        <meta name="twitter:card" content="summary_large_image">
        <meta name="twitter:title" content="{title}">
        <meta name="twitter:description" content="{description}">
        <meta name="twitter:image" content="fdsfd">

        <!-- Structured Data (Schema.org) -->
        <script type="application/ld+json">
        {{
            "@context": "https://schema.org",
            "@type": "NewsArticle",
            "headline": "{title}",
            "author": {{
                "@type": "Person",
                "name": "{author}"
            }},
            "datePublished": "{date_time}",
            "mainEntityOfPage": {{
                "@type": "WebPage",
                "@id": "{article_url}"
            }},
            "image": "dfgdfg",
            "publisher": {{
                "@type": "Organization",
                "name": "Your News Website",
                "logo": {{
                    "@type": "ImageObject",
                    "url": "https://yourwebsite.com/logo.png"
                }}
            }}
        }}
        </script>
    </head>
    <body>
        <article>
            <h1>{title}</h1>

            <p><strong>Keywords:</strong> {', '.join(keywords)}</p>

            <h2>🔹 Summary</h2>
            <p>{summary}</p>

            <h2>📌 Key Takeaways</h2>
            <ul>
                <li>{summary[:100]}...</li>
                <li>Stay updated with the latest news.</li>
                <li>More details available at <a href="{article_url}">{article_url}</a></li>
            </ul>

            <p>For more updates, follow us on <a href="https://twitter.com/yournews">@YourNews</a>.</p>
        </article>
    </body>
    </html>
    """
    return seo_content

In [13]:
async def scrape_and_optimize(article):
    """Scrapes news articles from Indian Express and optimizes them asynchronously."""
    try:
        # ✅ Run summarization & SEO in parallel
        summary, seo_content = await asyncio.gather(
            async_summarize_long_text(article['content']),
            seo_optimize(
                article['content'], article['heading'], article['discription'],
                article['by'], article['date'], article['content'], article['og_link']
            )
        )

        return seo_content
    except Exception as e:
        print(f"Error: {e}")
        return None


In [14]:
# ✅ Choose CPU if no GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# ✅ Load Stable Diffusion Model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe.to(device)

async def generate_image(prompt, filename="ai_newsroom.png"):
    """Generates an AI image using Stable Diffusion asynchronously and returns its path."""
    image = await asyncio.to_thread(pipe, prompt)
    image = image.images[0]

    save_path = os.path.join("/content/", filename)  # ✅ Store in Colab directory
    image.save(save_path)


    return save_path

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

(…)ure_extractor%2Fpreprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

safety_checker%2Fconfig.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

scheduler%2Fscheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

text_encoder%2Fconfig.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer%2Fspecial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

tokenizer%2Ftokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer%2Fmerges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

unet%2Fconfig.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

tokenizer%2Fvocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

vae%2Fconfig.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [15]:
for file in os.listdir("/content"):
    if file.endswith(".png"):
        print("Image Found:", file)


In [16]:
def update_seo_with_image(seo_content, image_path):
    """Finds and replaces all image placeholders in SEO content with the generated image path."""

    # ✅ Convert local file path to a web-accessible URL (Replace this with actual hosting logic)
    image_url = f"https://yourwebsite.com/{os.path.basename(image_path)}"

    # ✅ Replace all image placeholders
    seo_content = re.sub(r'content="dfsf"', f'content="{image_url}"', seo_content)  # Replace Open Graph image
    seo_content = re.sub(r'content="fdsfd"', f'content="{image_url}"', seo_content)  # Replace Twitter Card image
    seo_content = re.sub(r'"image": "dfgdfg"', f'"image": "{image_url}"', seo_content)  # Replace JSON-LD Schema image

    return seo_content


In [17]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m62.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.11.1


In [18]:
from pymongo import MongoClient

MONGO_URI = "mongodb+srv://jainshrutd211204:EhsTdMgsTHr2jAK7@fliprdb.ubake.mongodb.net/?retryWrites=true&w=majority&appName=fliprDB"
client = MongoClient(MONGO_URI)
client = MongoClient(MONGO_URI)
db = client["fliprDB"]  # Tumhara database name
collection = db["articles"]  # Collection jisme articles store honge

async def process_article():
    """Runs the entire scraping, summarization, SEO, and AI image generation pipeline."""
    # ✅ Step 1: Scrape the article
    article = await scrape_article()
    if not article:
        print("❌ No article fetched.")
        return None

    print(f"⏳ Fetching metadata, summarizing, and optimizing SEO... (Running in parallel)")

    # ✅ Step 2: Run all tasks asynchronously
    metadata_task = asyncio.create_task(get_article(article['og_link'],article['topic'],article['subtopic']))
    summary_task = asyncio.create_task(async_summarize_long_text(article['content']))
    seo_task = asyncio.create_task(seo_optimize(
        article['content'], article['heading'], article['discription'],
        article['by'], article['date'], article['content'], article['og_link']
    ))

    # ✅ Step 3: Generate AI-powered image
    image_filename = f"{article['heading'].replace(' ', '_')}.png"
    image_task = asyncio.create_task(generate_image(article['heading'], image_filename))

    # ✅ Step 4: Wait for all tasks to complete
    metadata, summary, seo_content, image_path = await asyncio.gather(metadata_task, summary_task, seo_task, image_task)

    # ✅ Step 5: Insert the correct image URL into `seo_content`
    updated_seo_content = update_seo_with_image(seo_content, image_path)

    # ✅ Step 6: Store everything into final structured output
    article.update(metadata)
    article['summary'] = summary
    article['seo_content'] = updated_seo_content
    article['ai_image'] = image_path  # ✅ Store the actual generated image path


    # ✅ Step 7: Store article into MongoDB
    result = collection.insert_one(article)

    return article

# ✅ Execute the fully optimized async pipeline
# ✅ Correct way in Jupyter Notebook
article = await process_article()



Unable to fetch https://indianexpress.com/section/maharashtra//page/2
❌ No article fetched.


In [20]:
!pip install flask flask-ngrok beautifulsoup4 spacy requests
!python -m spacy download en_core_web_sm


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [22]:
!pip uninstall -y flask-ngrok


Found existing installation: flask-ngrok 0.0.25
Uninstalling flask-ngrok-0.0.25:
  Successfully uninstalled flask-ngrok-0.0.25


In [23]:
!pip install pyngrok
!ngrok authtoken 2tUJfNKOiGzAUAbo3heOXMEJFw3_7WawmBYSHH5RZRTM3uvrd  # Replace with your actual auth token from https://dashboard.ngrok.com/auth


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [25]:
!pip install flask flask_cors pyngrok


Collecting flask_cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Downloading flask_cors-5.0.1-py3-none-any.whl (11 kB)
Installing collected packages: flask_cors
Successfully installed flask_cors-5.0.1


In [36]:
import os
from flask import Flask, render_template, request
from pyngrok import ngrok

# Kill any previous tunnels
!pkill -f ngrok

app = Flask(__name__)

# Open a new ngrok tunnel for Flask
port = 5000
public_url = ngrok.connect(port).public_url
print(f"Public URL: {public_url}")

# Sample article data
articles = {
    "Technology": {
        "AI": ["AI is transforming the world!", "Latest trends in AI"],
        "Cybersecurity": ["How to stay safe online?", "Latest security threats"]
    },
    "Health": {
        "Nutrition": ["Best foods for a healthy life", "Benefits of a balanced diet"],
        "Mental Health": ["How to manage stress?", "Importance of mindfulness"]
    }
}

@app.route('/')
def home():
    return '''
    <html>
    <head>
        <title>News Articles</title>
        <style>
            body {
                font-family: 'Arial', sans-serif;
                background: linear-gradient(135deg, #ff9a9e, #fad0c4);
                text-align: center;
                padding: 50px;
                color: #fff;
            }
            h1 {
                font-size: 2.5em;
                margin-bottom: 20px;
            }
            form {
                background: rgba(255, 255, 255, 0.9);
                padding: 20px;
                border-radius: 15px;
                box-shadow: 0px 10px 20px rgba(0, 0, 0, 0.2);
                display: inline-block;
                width: 50%;
                max-width: 400px;
            }
            select, input {
                width: 90%;
                padding: 12px;
                margin: 10px;
                border: none;
                border-radius: 8px;
                font-size: 16px;
            }
            input[type="submit"] {
                background: #ff758c;
                color: white;
                font-size: 18px;
                border: none;
                cursor: pointer;
                transition: 0.3s;
            }
            input[type="submit"]:hover {
                background: #ff5e7e;
                transform: scale(1.05);
            }
        </style>
    </head>
    <body>
        <h1>Discover News Articles</h1>
        <form action="/get_articles" method="post">
            <label for="topic"><b>Select Topic:</b></label>
            <select name="topic" id="topic">
                <option value="Technology">Technology</option>
                <option value="Health">Health</option>
            </select>
            <br><br>
            <label for="subtopic"><b>Select Subtopic:</b></label>
            <select name="subtopic" id="subtopic">
                <option value="AI">AI</option>
                <option value="Cybersecurity">Cybersecurity</option>
                <option value="Nutrition">Nutrition</option>
                <option value="Mental Health">Mental Health</option>
            </select>
            <br><br>
            <input type="submit" value="Get Articles">
        </form>
    </body>
    </html>
    '''

@app.route('/get_articles', methods=['POST'])
def get_articles():
    topic = request.form.get("topic")
    subtopic = request.form.get("subtopic")

    if topic in articles and subtopic in articles[topic]:
        articles_list = articles[topic][subtopic]
    else:
        articles_list = ["No articles found"]

    article_html = "".join(f"<li>{article}</li>" for article in articles_list)

    return f'''
    <html>
    <head>
        <title>Articles on {subtopic}</title>
        <style>
            body {{
                font-family: 'Arial', sans-serif;
                background: linear-gradient(135deg, #667eea, #764ba2);
                text-align: center;
                padding: 50px;
                color: #fff;
            }}
            h2 {{
                font-size: 2.2em;
                margin-bottom: 15px;
            }}
            ul {{
                list-style: none;
                padding: 0;
            }}
            li {{
                background: rgba(255, 255, 255, 0.9);
                margin: 10px auto;
                padding: 15px;
                border-radius: 10px;
                box-shadow: 0px 6px 12px rgba(0, 0, 0, 0.2);
                font-size: 18px;
                width: 50%;
                max-width: 500px;
                color: #333;
            }}
            a {{
                display: block;
                margin-top: 20px;
                text-decoration: none;
                color: #ffd700;
                font-weight: bold;
                font-size: 18px;
                transition: 0.3s;
            }}
            a:hover {{
                color: #ffea00;
                transform: scale(1.1);
            }}
        </style>
    </head>
    <body>
        <h2>Articles on {subtopic}</h2>
        <ul>{article_html}</ul>
        <a href="/">⬅ Go Back</a>
    </body>
    </html>
    '''

if __name__ == '__main__':
    app.run(port=port)


Public URL: https://c423-35-204-161-46.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:08] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:09] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:13] "POST /get_articles HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:16] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:21] "POST /get_articles HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Feb/2025 14:20:26] "GET / HTTP/1.1" 200 -


In [38]:
pip freeze > requirements.txt
