In [1]:
!pip install requests beautifulsoup4 scikit-learn nltk

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
   ---------------------------------------- 0.0/147.9 kB ? eta -:--:--
   -------------------------------------- - 143.4/147.9 kB ? eta -:--:--
   ---------------------------------------- 147.9/147.9 kB 2.2 MB/s eta 0:00:00
Downloading soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6



[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import nltk

In [3]:
# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WELCOME\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\WELCOME\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Function to scrape article content from URL
def fetch_article(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        content = " ".join([p.get_text() for p in paragraphs])
        return content
    except requests.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

In [81]:
#URL of the news articles
urls = [
    'https://timesofindia.indiatimes.com/blogs/voices/water-contamination-still-a-serious-national-challenge/',
    'https://timesofindia.indiatimes.com/blogs/darksides/the-rising-toll-of-alcohol-addiction-in-goa-a-sobering-reality/',
    'https://timesofindia.indiatimes.com/blogs/fireflies-in-the-jar/sip-savor-and-snack-the-art-of-drink-couture/',
    'https://timesofindia.indiatimes.com/life-style/health-fitness/health-a-z/alcoholism-/-alcohol-use-disorder/diseasearticle/53598144.cms',
    'https://timesofindia.indiatimes.com/city/pune/rising-trend-of-binge-drinking-among-city-youths/articleshow/110547412.cms'
]

In [82]:
# Step 1: Read the articles and store content
articles = [fetch_article(url) for url in urls if fetch_article(url) is not None]

In [83]:
# Step 2: Preprocess and vectorize articles
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    words = text.lower().split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

In [84]:
# Preprocess each article
processed_articles = [preprocess_text(article) for article in articles]

In [85]:
# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_articles)

In [86]:
# Step 3: Compute similarity
similarity_matrix = cosine_similarity(X)

In [97]:
# Step 4: Cluster articles
num_clusters = 3  # Adjust based on the data
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(X)
clusters = kmeans.labels_

In [98]:
# Step 5: Summarize articles in each cluster
def summarize_text(text):
    sentences = sent_tokenize(text)
    word_frequencies = FreqDist(word.lower() for word in text.split() if word not in stop_words)
    ranking = {sentence: sum(word_frequencies[word.lower()] for word in sentence.split()) for sentence in sentences}
    ranked_sentences = sorted(ranking, key=ranking.get, reverse=True)
    summary = " ".join(ranked_sentences[:3])  # Select top sentences as summary
    return summary

In [99]:
# Organize and summarize each cluster
clustered_articles = {}
for i in range(num_clusters):
    cluster_text = " ".join([articles[j] for j in range(len(articles)) if clusters[j] == i])
    clustered_articles[f'Cluster {i+1}'] = summarize_text(cluster_text)

In [100]:
# Output clustered summaries
for cluster, summary in clustered_articles.items():
    print(f"\n{cluster} Summary:\n{summary}\n")


Cluster 1 Summary:
Well, whether it’s a fruit or a flower, a carafe or a decanter, a goblet or a flute, a straw or a stirrer— one thing’s for sure: their main mission is to make your drink look like it belongs in an art gallery while sneaking in extra layers of flavour and aroma. Chhath puja can safeguard the environment “Dyslexia awareness month: Celebrating strengths and going beyond limitations” Barfi to Baklava Faith and the American promise One more disaster Blowing in the wind The safety net Gaining knowledge via quantum entanglement A journey of love, light and inner transformation British Raj is so yesterday Interested in blogging for timesofindia.com? Next time you raise your glass, remember: it’s not just about the sip, it’s about the experience—the art, the flavour, and the flair that makes your drink a masterpiece in a glass.


Cluster 2 Summary:
What you see first reveals your outlook towards life The reason why iconic artist MF Hussain agreed to design Shalini Passis' we

In [102]:
clustered_articles

{'Cluster 1': 'Well, whether it’s a fruit or a flower, a carafe or a decanter, a goblet or a flute, a straw or a stirrer— one thing’s for sure: their main mission is to make your drink look like it belongs in an art gallery while sneaking in extra layers of flavour and aroma. Chhath puja can safeguard the environment “Dyslexia awareness month: Celebrating strengths and going beyond limitations” Barfi to Baklava Faith and the American promise One more disaster Blowing in the wind The safety net Gaining knowledge via quantum entanglement A journey of love, light and inner transformation British Raj is so yesterday Interested in blogging for timesofindia.com? Next time you raise your glass, remember: it’s not just about the sip, it’s about the experience—the art, the flavour, and the flair that makes your drink a masterpiece in a glass.',
 'Cluster 2': "What you see first reveals your outlook towards life The reason why iconic artist MF Hussain agreed to design Shalini Passis' wedding car