<a href="https://colab.research.google.com/github/Kirushikesh/Schlumberger-s-Hackathon/blob/main/Scl_hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Crawler

For now we are focussing on the 4 websites provided in the problem statement. We input a search query and get the top results url from the results.

Output will be the list of news feed url links

In [10]:
data_sources=[
    #'https://electrical-engineering-portal.com/?s=%s&post_type_page=&post_type_post=',
    #'https://climate.mit.edu/search/google?keys=%s',
    'https://netl.doe.gov/search/node?keys=%s',
    'https://www.iea.org/search/news?q=%s'
]

In [45]:
import requests
from bs4 import BeautifulSoup

def iea_crawler(site,k):
    r = requests.get(site)
    soup = BeautifulSoup(r.content, 'html.parser')

    out=[]
    for article in soup.find_all('article',class_='m-news-listing',limit=k):
        out.append('https://www.iea.org'+article.find('a').get('href'))
    return out

def netl_crawler(site,k):
    r = requests.get(site)
    soup = BeautifulSoup(r.content, 'html.parser')

    out=[]
    for content in soup.find_all('div',class_='netlsearch-results',limit=k):
        out.append(content.find('a').get('href'))
    return out

In [12]:
def return_topk(query,k):
    crawler_list=[]
    crawler_list.extend(iea_crawler('https://www.iea.org/search/news?q=%s' %query,k))

    crawler_list.extend(netl_crawler('https://netl.doe.gov/search/node?keys=%s' %query,k))
    return crawler_list

# Scraper

Find the crawled websites and web scrap each and every document in that list and preprocess it.

Output will be the list of articles in english

In [13]:
import re

def remove_script_code(data):
    pattern = r'<[ ]*script.*?\/[ ]*script[ ]*>'  # mach any char zero or more times
    return re.sub(pattern, '', data, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL))

# remove whitespace from text
def remove_whitespace(text):
    return  " ".join(text.split())
 
# Condenses all repeating newline characters into one single newline character
def condense_newline(text):
    return ' '.join([p for p in re.split('\n|\r', text) if len(p) > 0])

def remove_htmltag(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub(' ', str(text))

def scrap(site):
    r = requests.get(site)
    soup = BeautifulSoup(r.content,'html.parser')
    return remove_whitespace(condense_newline(remove_htmltag(remove_script_code(str(soup)))))

In [14]:
def scrapper_agg(sites):
    texts=[]
    for site in sites:
        texts.append(scrap(site))
    
    return texts

# Summarizer

In [15]:
!pip install -q -U transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
#importing libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request  

def _create_dictionary_table(text_string) -> dict:
   
    #removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    #reducing words to their root form
    stem = PorterStemmer()
    
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table


def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words

       

    return sentence_weight

def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, .5 * threshold)

    return article_summary

In [17]:
from transformers import pipeline

pipe=pipeline("summarization",model='t5-small')

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [38]:
from nltk.tokenize import sent_tokenize

def summarize(texts):
    summary=[]
    
    for text in texts:
        summary_results = _run_article_summary(text)
        #summary_results=text
        pipe_out=pipe(summary_results)
        summary.append("\n".join(sent_tokenize(pipe_out[0]['summary_text'])))
    
    return summary

# Aggregation

In [27]:
def aggregate_results(query,sites,summaries,k):
    print('The Query is :',query,'\n')
    print(f'The Top {k} Results are :')

    print('\n Results from IEA :')
    for site,summary in zip(sites[:k],summaries[:k]):
        print('According to',site,':')
        print('Summary: '+summary)
        print('\n')

    print('Results from National Energy Technology Laboratory :')
    for site,summary in zip(sites[k:2*k],summaries[k:2*k]):
        print('According to',site,':')
        print('Summary: '+summary)
        print('\n')

    return

# Example Runs

In [42]:
query='carbon dioxide removal'
k=2

In [46]:
sites=return_topk(query,k)
sites[0]

'https://www.iea.org/news/belgium-needs-to-build-on-success-with-offshore-wind-to-reduce-reliance-on-imported-fossil-fuels-new-iea-policy-review-says'

In [47]:
sites

['https://www.iea.org/news/belgium-needs-to-build-on-success-with-offshore-wind-to-reduce-reliance-on-imported-fossil-fuels-new-iea-policy-review-says',
 'https://www.iea.org/news/new-iea-book-addresses-the-role-of-electricity-in-meeting-climate-change-goals',
 'https://netl.doe.gov/22CM-CDR-proceedings',
 'https://netl.doe.gov/carbon-dioxide-removal']

In [48]:
plain_texts=scrapper_agg(sites)
plain_texts[0][:500]

'Belgium needs to build on success with offshore wind to reduce reliance on imported fossil fuels, new IEA policy review says - News - IEA IEA Close Search Submit IEA Skip navigation Countries Find out about the world, a region, or a country All countries circle-arrow Explore world circle-arrow Member countries Australia Austria Belgium Canada Czech Republic Denmark Estonia Finland France Germany Greece Hungary Ireland Italy Japan Korea Lithuania Luxembourg Mexico New Zealand Norway Poland Portug'

In [49]:
summaries=summarize(plain_texts)
summaries[3]

'NETL is a cosponsor of NCCC .\nDOE Announces $1.236 billion in funding for four direct air capture projects .\nthe project is focusing on a wide array of CDR approaches .'

In [50]:
aggregate_results(query,sites,summaries,k)

The Query is : carbon dioxide removal 

The Top 2 Results are :

 Results from IEA :
According to https://www.iea.org/news/belgium-needs-to-build-on-success-with-offshore-wind-to-reduce-reliance-on-imported-fossil-fuels-new-iea-policy-review-says :
Summary: Belgium needs to build on success with offshore wind to reduce reliance on imported fossil fuels, new IEA policy review says .
in 2021, Belgium had the sixth highest offshore wind capacity in the world, a major accomplishment given the country’s small and busy territorial waters .
this output will continue to grow following recently announced steps to further accelerate and expand offshore wind deployment .


According to https://www.iea.org/news/new-iea-book-addresses-the-role-of-electricity-in-meeting-climate-change-goals :
Summary: a new book from the IEA addresses the role of electricity in meeting climate-change goals 27 May 2011 . the report aims to bring forward policy questions that must be faced if electricity is to play it