<a href="https://colab.research.google.com/github/MeIsAlien/MUN-researcher_code/blob/main/MUN_researcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook uses, via shell commands.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones that produced the saved outputs.
!pip install spacy
!pip install transformers
!pip install sentencepiece
!pip install newspaper3k
!pip install google
!pip install fpdf
!pip install scraparazzie

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip 

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
from newspaper import Article
from scraparazzie import scraparazzie
import nltk
from nltk.corpus import stopwords
import googlesearch
import re
import numpy as np
from datetime import datetime
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
#topic modelling: top-weighted words per LDA topic for a single text
def topic_modeller(text, amount_of_words=3, random_state=None):
  """Return the highest-weighted words of each LDA topic fitted on one text.

  The text is turned into a bag-of-words count matrix (lower-cased, NLTK
  English stop words removed), re-weighted with TF-IDF, and a two-component
  LatentDirichletAllocation model is fitted on that single document.

  Args:
    text: The document to analyse.
    amount_of_words: How many words to keep per topic.
    random_state: Forwarded to LatentDirichletAllocation. Pass an int to get
      reproducible topics; the default ``None`` keeps the fit unseeded, as it
      was before this parameter existed.

  Returns:
    A list with one entry per LDA component (two entries). Each entry is a
    list of at most ``amount_of_words`` vocabulary words, ordered by
    descending component weight; ties keep vocabulary order.

  Note:
    Errors raised by scikit-learn while vectorising or fitting (for example
    when the text yields no vocabulary at all) are propagated to the caller.
  """
  dimension = 2

  count_vect = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
  x_counts = count_vect.fit_transform([text])
  x_tfidf = TfidfTransformer().fit_transform(x_counts)

  lda = LDA(n_components=dimension, random_state=random_state)
  lda.fit(x_tfidf)

  features = count_vect.get_feature_names_out()
  top_words = []
  for component in lda.components_:
    # Stable argsort on the negated weights gives a descending ranking and,
    # like sorted(..., reverse=True), keeps the original order among ties.
    ranking = np.argsort(-component, kind="stable")[:amount_of_words]
    top_words.append([features[index] for index in ranking])
  return top_words

#paraphrase model
# Both the model and its tokenizer are loaded from the same pretrained
# checkpoint when this cell runs.
_PARAPHRASE_CHECKPOINT = "tuner007/pegasus_paraphrase"
paraphraser_model = PegasusForConditionalGeneration.from_pretrained(_PARAPHRASE_CHECKPOINT)
paraphraser_tokenizer = PegasusTokenizerFast.from_pretrained(_PARAPHRASE_CHECKPOINT)

#paraphrasing code
def get_paraphrased_sentences(input_text, num_return_sequences, num_beams, max_length=60):
  """Generate paraphrases of ``input_text`` with the module-level Pegasus model.

  Args:
    input_text: The sentence to paraphrase. It is truncated to ``max_length``
      tokens by the tokenizer.
    num_return_sequences: How many paraphrases to return.
    num_beams: Beam width passed to ``generate``.
    max_length: Token limit applied both to the tokenised input and to the
      generated output. Defaults to 60, the value that used to be hard-coded.

  Returns:
    A list of decoded paraphrase strings with special tokens stripped.
  """
  batch = paraphraser_tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=max_length,
      return_tensors="pt",
  )
  # The former temperature=1.5 argument is gone: temperature only rescales
  # logits when sampling (do_sample=True) and this call uses plain beam
  # search, so the value never influenced the result.
  generated = paraphraser_model.generate(
      **batch,
      max_length=max_length,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
  )
  return paraphraser_tokenizer.batch_decode(generated, skip_special_tokens=True)

#extractive summarization model
# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once instead of on every summarisation call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    """Return the spaCy pipeline called ``name``, loading it on first use."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def extractive_summarize(text, per):
    """Pick the highest-scoring sentences of ``text`` by word frequency.

    Every word that is not a stop word, punctuation or whitespace is counted
    case-insensitively; counts are normalised by the largest count, and each
    sentence is scored by the sum of the normalised counts of its words.

    Args:
        text: The document to summarise.
        per: Fraction of the sentences to keep; ``1`` keeps every scored
            sentence.

    Returns:
        The selected sentences joined with newlines, ordered from highest to
        lowest score. An empty string is returned when the text is empty or
        contains no countable word.
    """
    # Nothing to score: bail out before loading the model.
    if not text or not text.strip():
        return ''

    doc = _get_spacy_pipeline()(text)

    # Count under the lower-cased form, which is also the form used for the
    # lookup below. Counting under the original casing made capitalised words
    # miss (or borrow the count of a differently-cased spelling).
    word_frequencies = {}
    for token in doc:
        word = token.text.lower()
        if not word.strip() or word in STOP_WORDS or word in punctuation:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Guard the max() below against texts made only of stop words/punctuation.
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= max_frequency

    sentences = list(doc.sents)
    sentence_scores = {}
    for sent in sentences:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentences) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return '\n'.join(sent.text for sent in best_sentences)

#google news scraper
def get_news(topic):
    """Collect news entries for ``topic`` and return them newest first.

    Each entry is a dict with the keys ``title`` (text before the first
    ' - '), ``source``, ``link`` and ``publish_date`` (the raw date string).
    At most 100 entries are returned.
    """
    NEWS_LIMIT = 100
    DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

    def published_at(entry):
        # Parse the raw date string so entries can be ordered chronologically.
        return datetime.strptime(entry['publish_date'], DATE_FORMAT)

    client = scraparazzie.NewsClient(language = 'english', location = 'India', query = topic, max_results = 100)
    entries = [
        {
            'title': news.title.text.split(' - ', 1)[0],
            'source': news.source.text,
            'link': news.link.text,
            'publish_date': news.pubDate.text,
        }
        for news in client.get_news()[:NEWS_LIMIT]
    ]
    entries.sort(key=published_at, reverse=True)
    return entries

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#initialization code
# Canonical links of the articles that have already been summarised.
searched_websites: list = []
# URLs whose download, parsing or summarisation raised an exception.
failed_to_search_websites: list = []
def research(topic, should_goto_news=False, normal_search_limit=10):
  """Research ``topic`` on Google and, optionally, in the news.

  Args:
    topic: The search query.
    should_goto_news: When true, news articles are researched as well. The
      flag used to be ignored (news was always scraped); it is honoured now.
    normal_search_limit: Passed to ``research_from_google`` as its result
      limit.

  Returns:
    A dict with the keys ``"google"`` and ``"news"``, each holding a list of
    per-article result dicts. ``"news"`` is an empty list when
    ``should_goto_news`` is false, so callers can always index both keys.
  """
  google_results = research_from_google(topic, normal_search_limit)
  news_results = research_from_news(topic) if should_goto_news else []
  return {"google": google_results, "news": news_results}


def _research_article(url):
  """Download, parse and summarise one article; shared by both researchers.

  Returns the result dict (``topics``, ``url``, ``summary``,
  ``important facts``) or ``None`` when the URL is skipped or fails.

  Side effects: appends the article's de-duplication key to
  ``searched_websites`` on success and the URL to
  ``failed_to_search_websites`` when any step raises.
  """
  # PDFs are skipped up front so they are not downloaded just to be dropped.
  if ".pdf" in url:
    return None

  article = Article(url, language="en")
  try:
    article.download()
    article.parse()
  except Exception:
    failed_to_search_websites.append(url)
    return None

  # Fall back to the URL when no canonical link was extracted; otherwise the
  # first link-less article would mark every later one as a duplicate.
  dedup_key = article.canonical_link or url
  if dedup_key in searched_websites:
    return None

  try:
    article.nlp()
    result = {
        "topics": topic_modeller(article.text, amount_of_words=10)[0],
        "url": url,
        "summary": article.summary,
        "important facts": extractive_summarize(article.text, 1).replace('\n\n', ''),
    }
  except Exception:
    # Best effort: one bad article must not abort the whole research run.
    failed_to_search_websites.append(url)
    return None

  searched_websites.append(dedup_key)
  return result


def research_from_news(topic):
  """Summarise the news articles that ``get_news`` returns for ``topic``.

  Returns a list of result dicts, one per article that was processed
  successfully and had not been summarised before.
  """
  search_results = []
  for news_item in get_news(topic):
    # The PDF check now looks at the link itself; it used to test the item
    # dict, which only compares against its keys and never matched.
    result = _research_article(news_item["link"])
    if result is not None:
      search_results.append(result)
  return search_results


def research_from_google(topic, normal_search_limit):
  """Summarise the first ``normal_search_limit`` Google results for ``topic``.

  Returns a list of result dicts, one per page that was processed
  successfully and had not been summarised before.
  """
  if normal_search_limit < 1:
    return []

  search_results = []
  # ``stop`` is what bounds the number of results; it used to be fixed at 20,
  # so the requested limit only changed the page size.
  urls = googlesearch.search(
      topic,
      tld="co.in",
      num=normal_search_limit,
      stop=normal_search_limit,
      pause=2,
  )
  for url in urls:
    result = _research_article(url)
    if result is not None:
      search_results.append(result)
  return search_results

In [10]:
import warnings

topic = "Ram Mandir riots"


def _print_results(entries):
    """Print the URL, topics, summary and notes of each result dict."""
    for entry in entries:
        print(f"\n\nResults from: {entry['url']}")
        print(f"\nTopics: {entry['topics']}")
        print(f"\nSummary: {entry['summary']}")
        print(f"\nImportant notes: {entry['important facts']}")


# FutureWarnings raised while researching are silenced for this run only.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    results = research(topic, should_goto_news=True, normal_search_limit=5)
    _print_results(results["google"])

    print("results from the news")
    _print_results(results["news"])


results from the news


Results from: https://news.google.com/rss/articles/CBMimwFodHRwczovL3d3dy5uZXdzMTguY29tL2luZGlhL25ld3MxOC1ldmVuaW5nLWRpZ2VzdC1haGVhZC1vZi1yYW0tdGVtcGxlLW9wZW5pbmctY29uZy1sZWFkZXItc2F5cy1nb2RocmEtbGlrZS1yaW90cy1saWtlbHktYW5kLW90aGVyLXRvcC1zdG9yaWVzLTg3MjY3NzguaHRtbNIBnwFodHRwczovL3d3dy5uZXdzMTguY29tL2FtcC9pbmRpYS9uZXdzMTgtZXZlbmluZy1kaWdlc3QtYWhlYWQtb2YtcmFtLXRlbXBsZS1vcGVuaW5nLWNvbmctbGVhZGVyLXNheXMtZ29kaHJhLWxpa2UtcmlvdHMtbGlrZWx5LWFuZC1vdGhlci10b3Atc3Rvcmllcy04NzI2Nzc4Lmh0bWw?oc=5

Topics: ['india', 'read', 'like', 'hariprasad', 'siddaramaiah', 'bk', 'chief', 'son', 'ahead', 'godhra']

Summary: In today’s evening digest, News18 brings to you the latest updates on Congress leader BK Hariprasad’s comments ahead of Ram Mandir inauguration, Siddaramaiah’s son Yatindra’s comment on India becoming a Hindu-Rashtra and other stories.
Ahead of Ram Temple Opening, Cong Leader BK Hariprasad Says ‘Godhra-like Riots likely’; BJP Demands His ArrestDays ahead of the inaugura