In [95]:
import Fetcher
import transformers
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
import numpy as np
import random
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster

feed_url = 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml'
articles = Fetcher.scrape_news_from_feed(feed_url)

# Key every scraped article by its position in the returned list
# ("article0", "article1", ...) so later steps can refer to it by name.
article_dict = {
    "article{}".format(position): article
    for position, article in enumerate(articles)
}


#variables:

# Distance at which the dendrogram is cut into flat clusters (passed to
# fcluster as `t` with criterion='distance'). The value is arbitrary.
threshold = 1.3
# list of articles to be clustered
list_of_articles = list(article_dict.values())

# sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed exactly one string per article. Handing the article dicts themselves
# to encode() would leave the choice of field to the library (it appears to
# use the first value of each dict, i.e. the title -- TODO confirm against the
# installed sentence-transformers version), so the field is selected
# explicitly here. To cluster on body text instead, use article["content"]
# and re-tune `threshold`.
texts_to_embed = [article["title"] for article in list_of_articles]
embeddings = model.encode(texts_to_embed)

if len(texts_to_embed) < 2:
    # linkage() needs at least two observations. With zero or one article
    # there is nothing to merge, so fall back to an empty linkage matrix and
    # give the (at most one) article its own cluster id.
    Z = np.empty((0, 4))
    clusters = np.ones(len(texts_to_embed), dtype=np.int32)
else:
    # links embeddings using ward method
    Z = linkage(embeddings, method='ward')

    # Create clusters by cutting the dendrogram at a certain threshold (t)
    clusters = fcluster(Z, t=threshold, criterion='distance')

# Map each numbered article name ("article0", ...) to the cluster id assigned
# to the article at that position in `clusters`.
cluster_dict = {
    "article{}".format(index): cluster_id
    for index, cluster_id in enumerate(clusters)
}

# One empty bucket per distinct cluster id; np.unique returns the ids sorted,
# so the buckets are created in ascending cluster-id order.
grouped_dict = {cluster_num: [] for cluster_num in np.unique(clusters)}

# Fill each bucket of grouped_dict with the names of the articles that share
# that cluster id. Every id in cluster_dict has a bucket, so no membership
# check is needed before appending.
for article_name, cluster_num in cluster_dict.items():
    grouped_dict[cluster_num].append(article_name)


# removes all clusters with only one article
filtered_grouped_dict = {k: v for k, v in grouped_dict.items() if len(v) > 1}

# Build one string per multi-article cluster: the "content" fields of all
# articles in the cluster, concatenated with a tab between articles.
articles_to_summarize = [
    "\t".join(article_dict[article_name]["content"] for article_name in member_names)
    for member_names in filtered_grouped_dict.values()
]


{'title': 'Why Is Trump Holding a Rally at Madison Square Garden?', 'author': ['Maggie Haberman'], 'publish_date': datetime.datetime(2024, 10, 26, 0, 0), 'content': 'When former President Donald J. Trump decided to take a day off the battleground campaign trail in the waning days of the race to hold a rally Sunday at Madison Square Garden, it prompted a question from many political observers: Why?\n\nNew York is hardly a battleground state, and New York City is still a Democratic stronghold. So how come Mr. Trump is planning an event in Midtown Manhattan in the final two weeks of his presidential campaign?\n\nHere are five reasons:\n\nHe will get to see his name in lights.\n\nMr. Trump was a performer and reality TV star before he was a political candidate and president. (It is worth recalling that at the Republican National Convention this summer in Milwaukee, Mr. Trump appeared onstage with a Broadway-style light display spelling out T-R-U-M-P.)\n\nFor years, Mr. Trump has measured t

In [105]:
# Load the BERT2BERT summarization model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
# NOTE: this rebinds `model`, the name the clustering cell uses for its
# SentenceTransformer; re-run that cell before clustering again.
checkpoint_name = "patrickvonplaten/bert2bert_cnn_daily_mail"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

# Decoding settings shared by every generate() call below.
generation_settings = dict(
    min_length=40,
    max_length=256,
    num_beams=10,
    repetition_penalty=2.0,
    length_penalty=3.0,
    no_repeat_ngram_size=3,
    use_cache=True,
    do_sample=False,
    top_k=50,
)

summaries_list = []
for cluster_text in articles_to_summarize:
    # The input is truncated to 512 tokens, so for a cluster of several
    # concatenated articles only the leading part of the text is encoded.
    encoded_ids = tokenizer.encode(
        cluster_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    generated_ids = model.generate(encoded_ids, **generation_settings)

    # Decode the first sequence of the returned batch into plain text.
    summaries_list.append(
        tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    )

print("done")


Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "transformers_version": "4.46.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "_name_or_path": "bert-base-uncased",
  "add_cross_attention"

done


In [106]:
# Print each cluster summary followed by one blank line.
for text in summaries_list:
    print(f"{text}\n")

israel's strikes on iran destroyed air - defense systems set up to protect several critical oil and petrochemical refineries, according to three iranian officials and three senior israeli officials. if it retaliates, it risks further escalation at a time when its economy is struggling and its allies are faltering, the three iranians say.

the tirupati temple in andhra pradesh has given laddu, a ball - shaped sweet, to devotees. it is the richest hindu holy site in the world with revenues each year of hundreds of millions of dollars. some states have banned the slaughter of cows and made transportation of beef a punishable offense.

eric adams, the mayor of america's largest city and one of the country's most prominent black elected officials, made the comments at a time when mr. trump has been trying to make inroads with black voters. he was asked if he believed the former president was a fascist or compared to adolf hitler.

the former president is planning an event in midtown manhatt