# Config File

In [48]:
import json, yaml
from datetime import timedelta, date, datetime

PAPER_SITES_FILE = "paper_to_site.json"
with open("paper_to_site.json", 'r') as f:
    paper_site = json.load(f)

class SERPLinkScraperConfig:
    def __init__(self, config_file = None):
        self.config = {
            "searchnws": False, 
            "gl": "bd", 
            "hl": "bn", 
            "filter": "0", 
            "num": 100,
            "output_dir": "",
            "output_file": "serp_{papers}_{query}.json",
            "quote_sub": "[QUOTE]",
            "overwrite": False,
            "skip_existing": True,
            "seen_sites_file": None
        }
        
        if config_file is not None:
            self.load_config(config_file)
    
    def range_dates(self, date_start:date, date_end:date, increment:timedelta):
        """
        Helper function to get time intervals
        :param date_start: Starting date
        :param date_end: ending date
        :param increment: increments dates by value
        :return: list of dates
        """
        while date_start + increment < date_end:
            yield date_start, date_start + increment
            date_start += increment
        yield date_start, date_end

    def generate_dates_from_date_ranges(self, date_ranges):
        dates = []
        for start, end, inc in date_ranges:
            start = datetime.strptime(start, "%m/%d/%Y").date()
            end = datetime.strptime(end, "%m/%d/%Y").date()
            inc = timedelta(days=inc)
            dates.extend((s.strftime("%m/%d/%Y"), t.strftime("%m/%d/%Y")) for s, t in self.range_dates(start, end, inc))
        return dates
    
    def query_paper_date_from_spec(self, specs):
        if 'query' in specs:
            queries = [specs['query']]
        elif 'queries' in specs:
            queries = specs['queries']
        else:
            raise KeyError("Expecting 'queries' in config file")
        
        if 'paper' in specs:
            papers = [specs['paper']]
        elif 'papers' in specs:
            papers = specs['papers']
        else:
            raise KeyError("Expecting 'paper' in config file")
            
        unknown_papers = [paper for paper in papers if paper not in paper_site]
        if len(unknown_papers) > 0:
            raise KeyError("Unknown papers: {}\nPlease add to paper list".format(", ".join(unknown_papers)))
        
        if 'date_ranges' not in specs:
            raise KeyError("Expecting 'date_ranges' in config file")
        
        dates = self.generate_dates_from_date_ranges(specs['date_ranges'])
        
        for q in queries:
            for p in papers:
                for d in dates:
                    yield q, p, d  
    
    def params(self, query, paper, date_interval):
        site = paper_site[paper]
        start_date, end_date = date_interval
        params = {
            "engine": "google",
            "q": "{} site:{}".format(query, site),
            "google_domain": "google.com",
            "gl": self.config['gl'],
            "hl": self.config['hl'],
            'filter':self.config['filter'],
            "num": self.config['num'],
            "api_key": self.config['SERPAPI_KEY'],
            "tbs": "cdr:1,cd_min:{},cd_max:{}".format(start_date, end_date)
        }

        if self.config['searchnws']:
            params["tbm"] = "nws"

        return params 
    
    def update_config(self, config_file):
        with open(config_file, "r", encoding='utf-8') as f:
            self.config.update(yaml.load(f, Loader=yaml.FullLoader))
    
    def check_config(self):
        missing = [param for param in ('SERPAPI_KEY',) if param not in self.config]
        if len(missing) > 0:
            raise KeyError("Config missing: {}".format(", ".join(missing)))

    def load_config(self, config_file):
        self.update_config(config_file)
        self.check_config()
        
    def searches(self, spec = None):
        if spec is None:
            spec = self.config
            
        if "searches" in spec:
            for spec in self.config['searches'].values():
                for search in self.searches(spec):
                    yield search
        else:
            for query, paper, date in self.query_paper_date_from_spec(spec):
                yield query, paper, date, self.params(query, paper, date)
                
    def outfile(self):
        filename = self.config["output_file"]
        outdir = self.config["output_dir"] if 'output_dir' in self.config else ""
        return os.path.join(outdir, filename)
        
    def __repr__(self):
        return self.config.__repr__()

In [49]:
s = SERPLinkScraperConfig("RandomDenseSERPSearch.yaml")

In [50]:
for q, i in zip(s.searches(), range(10)):
    print(q)

('"হয়েছে"', 'the_daily_janakantha', ('01/01/1996', '12/30/2000'), {'engine': 'google', 'q': '"হয়েছে" site:https://www.dailyjanakantha.com', 'google_domain': 'google.com', 'gl': 'bd', 'hl': 'bn', 'filter': '0', 'num': 100, 'api_key': 'c8f7c4a413d768d9b454c42f233867c4bf3f4dba7be4b31bae4ed7b9b05dd268', 'tbs': 'cdr:1,cd_min:01/01/1996,cd_max:12/30/2000'})
('"হয়েছে"', 'the_daily_janakantha', ('12/30/2000', '12/29/2005'), {'engine': 'google', 'q': '"হয়েছে" site:https://www.dailyjanakantha.com', 'google_domain': 'google.com', 'gl': 'bd', 'hl': 'bn', 'filter': '0', 'num': 100, 'api_key': 'c8f7c4a413d768d9b454c42f233867c4bf3f4dba7be4b31bae4ed7b9b05dd268', 'tbs': 'cdr:1,cd_min:12/30/2000,cd_max:12/29/2005'})
('"হয়েছে"', 'the_daily_janakantha', ('12/29/2005', '12/28/2010'), {'engine': 'google', 'q': '"হয়েছে" site:https://www.dailyjanakantha.com', 'google_domain': 'google.com', 'gl': 'bd', 'hl': 'bn', 'filter': '0', 'num': 100, 'api_key': 'c8f7c4a413d768d9b454c42f233867c4bf3f4dba7be4b31bae4ed7b9b0

In [51]:
s.outfile()

'testing/dense_serp.csv'

# Scraper

In [101]:
import os, sys, time
from datetime import timedelta, date, datetime

import csv
import pandas as pd

import logging
from logging.handlers import RotatingFileHandler

#from serpapi import GoogleSearch

# Logger for full SERP Queries
queryLogger = logging.getLogger('SERP Queries')
#queryLogger.setLevel(logging.INFO)
#ch = RotatingFileHandler("log/serpQuery.log", maxBytes=400000, backupCount=1000, encoding="utf-8")
#formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s \n%(message)s')
#ch.setFormatter(formatter)
#queryLogger.addHandler(ch)
if not queryLogger.handlers:
    sth = logging.StreamHandler()
    queryLogger.addHandler(sth)

# Logger for basic output
logger = logging.getLogger("SERP Logger")
logger.setLevel(logging.DEBUG)

#ch = logging.handlers.RotatingFileHandler("log/serpScrape.log", maxBytes=400000, encoding="utf-8")
#formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s \n%(message)s')
#ch.setFormatter(formatter)
#logger.addHandler(ch)

if not logger.handlers:
    sth = logging.StreamHandler()
    logger.addHandler(sth)
    
class SERPError(Exception):
    def __init__(self, msg):
        super().__init__(msg)

import random
class SERPLinkScraper:
    def __init__(self, config):
        self.config = config
        self.search_delay = 3.601 # Delay to avoid hiting the 1000 queries per hour limit
        self.last_search = - self.search_delay

    def search_results_for(self, params):
        results = {}
        
        for _ in range(10):
            rdm_link = str(random.randint(1000, 9999))[::-1]
            title = "title"
            results[rdm_link] = {'title': title, 'link': rdm_link}
            
        return results

        # Loop through till end of search results or error encountered
        MAX_HITS = int(1e10)
        for start in range(0, MAX_HITS, self.config['num']):
            params['start'] = start

            time.sleep(max(0, self.search_delay + self.last_search - time.time()))

            queryLogger.info(params)
            client = GoogleSearch(params)
            search_results = client.get_dict()
            search_results = {'error': "Google hasn't returned any results for this query"}


            self.last_search = time.time()

            if 'error' in search_results:
                if "Google hasn't returned any results for this query" in search_results['error']:
                    return results
                elif "Your searches for the month are exhausted" in search_results['error']:
                    raise SERPError(f"SERP searches exhausted\n{search_results['error']}")
                else:
                    raise SERPError(f"Found SERP error:\n{search_results['error']}")

            news_results = search_results['news_results'] if self.config['searchnws'] else search_results['organic_results']
            
            if news_results is None or news_results:
                return results

            for news in news_results:
                link = news['link']
                if link in results:
                    continue

                if link in self.seen_sites:
                    continue
                
                info = {k:v for k, v in news.items() if k in ('title', 'link', 'date')}
                results[link] = info
        raise OverflowError(f"Excepted at most {MAX_HITS} results for each query")

    def query_SERP(self, query:str, site:str, dates:list=[(None, None)]) -> list:
        """
        Make dict for serp query using make_params()
        query serpAPI
        :param query: query string for google news
        :param site: site root url (eg. www.google.com)
        :param dates: list of dates

        :return: List of results in format [{'title':[TITLE], 'link':[LINK], 'date':[DATE]}]: (date according to SERP)
        """
        self.seen_sites = set()
        if self.config["seen_sites_file"] is not None:
            with open(self.config["seen_sites_file"], 'r') as f:
                self.seen_sites = set(json.load(f))
        
        overlap = 0
        results = {}
        for start_date, end_date in dates:
            params = {
                "engine": "google",
                "q": "{} site:{}".format(query, site),
                "google_domain": "google.com",
                "gl": self.config['gl'],
                "hl": self.config['hl'],
                'filter':self.config['filter'],
                "num": self.config['num'],
                "api_key": self.config['SERPAPI_KEY'],
                "tbs": "cdr:1,cd_min:{},cd_max:{}".format(start_date, end_date)
            }
                        
            if self.config['searchnws']:
                params["tbm"] = "nws"

            try:
                new_results = self.search_results_for(params)
            except SERPError as e:
                logger.warning(f"Issue with SERP {e}")
                raise e
            results.update(new_results)
            logger.debug(f'Date Range: {start_date}-{end_date}\tSites Found: {len(new_results)}\tTotal Found: {len(results)}')

        logger.debug('Total Sites: {}, Overlap: {}'.format(len(results), overlap))
        return list(results.values())
    
    def execute_search(self, params):
        try:
            news_results = self.search_results_for(params)
        except SERPError as e:
            logger.warning(f"Issue with SERP {e}")
            raise e
        
        for result in news_results.values():
            yield result
        
    def run(self):
        columnnames = ('title', 'paper', 'date', 'link', 'query', 'search_date_start', 'search_date_end')
        outfile = self.config.outfile()
        if os.path.exists(outfile):
            logger.warning(f"An output File {outfile} Already Exists!")
            mode = 'a+'
            self.seen_links = set(pd.read_csv(outfile, delimiter = "\t")['link'])
        else:
            mode = 'w'
            self.seen_links = set()
            
        executed_searches, found_links = set(), set()
        total_searches = sum(1 for _ in self.config.searches())
        
        start = time.time()
        with open(outfile, mode, newline = '', encoding="utf-8") as f:
            writer = csv.DictWriter(f, delimiter = "\t", fieldnames = columnnames)
            
            if mode == 'w':
                writer.writeheader()
            
            q_links = 0
            for i, search in enumerate(self.config.searches()):
                query, paper, date, params = search
                if (query, paper, date) in executed_searches:
                    continue

                q_new_links = 0
                for result in self.execute_search(params):
                    if result['link'] in found_links:
                        continue
                    else:
                        q_new_links += 1
                        q_links += 1
                        found_links.add(result['link'])
                    
                    result['query'] = query
                    result['paper'] = paper
                    result['search_date_start'], result['search_date_end'] = date
                    writer.writerow({k:(v.replace("\t", "    ") if k != 'category' else v) for k, v in result.items()})
                
                executed_searches.add((query, paper, date))
                    
                logger.debug(f'Total New Sites from {query} on {paper} in {date[0]}-{date[1]}: {q_new_links}')
                
                if i % 50 == 0:
                    logger.info(f"Executed {i}/{total_searches} searches in {timedelta(seconds = time.time() - start)}\nTotal Sites: {q_links}")
                    
        logger.info(f"Completed in {timedelta(seconds = time.time() - start)}")

In [102]:
config = SERPLinkScraperConfig("RandomDenseSERPSearch.yaml")
scraper = SERPLinkScraper(config)

In [103]:
scraper.run()

An output File testing/dense_serp.csv Already Exists!
Total New Sites from "হয়েছে" on the_daily_janakantha in 01/01/1996-12/30/2000: 10
Executed 0/740 searches in 0:00:00.001246
Total Sites: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 12/30/2000-12/29/2005: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 12/29/2005-12/28/2010: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 12/28/2010-12/27/2015: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 12/27/2015-12/31/2015: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 01/01/2016-02/20/2016: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 02/20/2016-04/10/2016: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 04/10/2016-05/30/2016: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 05/30/2016-07/19/2016: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 07/19/2016-09/07/2016: 10
Total New Sites from "হয়েছে" on the_daily_janakantha in 09/07/2016-1

Total New Sites from "হয়েছে" on the_daily_ittefaq in 12/29/2005-12/28/2010: 8
Executed 100/740 searches in 0:00:00.062598
Total Sites: 947
Total New Sites from "হয়েছে" on the_daily_ittefaq in 12/28/2010-12/27/2015: 8
Total New Sites from "হয়েছে" on the_daily_ittefaq in 12/27/2015-12/31/2015: 7
Total New Sites from "হয়েছে" on the_daily_ittefaq in 01/01/2016-02/20/2016: 9
Total New Sites from "হয়েছে" on the_daily_ittefaq in 02/20/2016-04/10/2016: 10
Total New Sites from "হয়েছে" on the_daily_ittefaq in 04/10/2016-05/30/2016: 9
Total New Sites from "হয়েছে" on the_daily_ittefaq in 05/30/2016-07/19/2016: 10
Total New Sites from "হয়েছে" on the_daily_ittefaq in 07/19/2016-09/07/2016: 10
Total New Sites from "হয়েছে" on the_daily_ittefaq in 09/07/2016-10/27/2016: 8
Total New Sites from "হয়েছে" on the_daily_ittefaq in 10/27/2016-12/16/2016: 10
Total New Sites from "হয়েছে" on the_daily_ittefaq in 12/16/2016-02/04/2017: 9
Total New Sites from "হয়েছে" on the_daily_ittefaq in 02/04/2017-03/26/2017: 9

Total New Sites from "হয়েছে" on bhorer_kagoj in 09/07/2016-10/27/2016: 7
Total New Sites from "হয়েছে" on bhorer_kagoj in 10/27/2016-12/16/2016: 10
Total New Sites from "হয়েছে" on bhorer_kagoj in 12/16/2016-02/04/2017: 6
Total New Sites from "হয়েছে" on bhorer_kagoj in 02/04/2017-03/26/2017: 5
Total New Sites from "হয়েছে" on bhorer_kagoj in 03/26/2017-05/15/2017: 7
Total New Sites from "হয়েছে" on bhorer_kagoj in 05/15/2017-07/04/2017: 7
Total New Sites from "হয়েছে" on bhorer_kagoj in 07/04/2017-08/23/2017: 9
Total New Sites from "হয়েছে" on bhorer_kagoj in 08/23/2017-10/12/2017: 10
Total New Sites from "হয়েছে" on bhorer_kagoj in 10/12/2017-12/01/2017: 9
Total New Sites from "হয়েছে" on bhorer_kagoj in 12/01/2017-01/20/2018: 9
Total New Sites from "হয়েছে" on bhorer_kagoj in 01/20/2018-03/11/2018: 8
Total New Sites from "হয়েছে" on bhorer_kagoj in 03/11/2018-04/30/2018: 5
Total New Sites from "হয়েছে" on bhorer_kagoj in 04/30/2018-06/19/2018: 6
Total New Sites from "হয়েছে" on bhorer_kagoj in 0

Total New Sites from "হয়েছে" on alokito_bangladesh in 10/12/2017-12/01/2017: 9
Total New Sites from "হয়েছে" on alokito_bangladesh in 12/01/2017-01/20/2018: 7
Total New Sites from "হয়েছে" on alokito_bangladesh in 01/20/2018-03/11/2018: 6
Total New Sites from "হয়েছে" on alokito_bangladesh in 03/11/2018-04/30/2018: 5
Total New Sites from "হয়েছে" on alokito_bangladesh in 04/30/2018-06/19/2018: 7
Total New Sites from "হয়েছে" on alokito_bangladesh in 06/19/2018-08/08/2018: 7
Total New Sites from "হয়েছে" on alokito_bangladesh in 08/08/2018-09/27/2018: 7
Total New Sites from "হয়েছে" on alokito_bangladesh in 09/27/2018-11/16/2018: 5
Total New Sites from "হয়েছে" on alokito_bangladesh in 11/16/2018-01/05/2019: 8
Total New Sites from "হয়েছে" on alokito_bangladesh in 01/05/2019-02/24/2019: 9
Total New Sites from "হয়েছে" on alokito_bangladesh in 02/24/2019-04/15/2019: 9
Total New Sites from "হয়েছে" on alokito_bangladesh in 04/15/2019-06/04/2019: 5
Total New Sites from "হয়েছে" on alokito_bangladesh i

Total New Sites from "হয়েছে" on the_daily_inqilab in 01/05/2019-02/24/2019: 7
Total New Sites from "হয়েছে" on the_daily_inqilab in 02/24/2019-04/15/2019: 5
Total New Sites from "হয়েছে" on the_daily_inqilab in 04/15/2019-06/04/2019: 5
Total New Sites from "হয়েছে" on the_daily_inqilab in 06/04/2019-07/24/2019: 7
Total New Sites from "হয়েছে" on the_daily_inqilab in 07/24/2019-09/12/2019: 6
Total New Sites from "হয়েছে" on the_daily_inqilab in 09/12/2019-11/01/2019: 6
Total New Sites from "হয়েছে" on the_daily_inqilab in 11/01/2019-12/21/2019: 6
Total New Sites from "হয়েছে" on the_daily_inqilab in 12/21/2019-02/09/2020: 10
Total New Sites from "হয়েছে" on the_daily_inqilab in 02/09/2020-03/30/2020: 4
Total New Sites from "হয়েছে" on the_daily_inqilab in 03/30/2020-05/19/2020: 5
Total New Sites from "হয়েছে" on the_daily_inqilab in 05/19/2020-07/08/2020: 8
Total New Sites from "হয়েছে" on the_daily_inqilab in 07/08/2020-08/27/2020: 8
Total New Sites from "হয়েছে" on the_daily_inqilab in 08/27/2020

Total New Sites from "হয়েছে" on daily_naya_diganta in 05/19/2020-07/08/2020: 3
Total New Sites from "হয়েছে" on daily_naya_diganta in 07/08/2020-08/27/2020: 6
Total New Sites from "হয়েছে" on daily_naya_diganta in 08/27/2020-10/16/2020: 5
Total New Sites from "হয়েছে" on daily_naya_diganta in 10/16/2020-12/05/2020: 6
Total New Sites from "হয়েছে" on daily_naya_diganta in 12/05/2020-01/24/2021: 5
Total New Sites from "হয়েছে" on daily_naya_diganta in 01/24/2021-03/15/2021: 5
Total New Sites from "হয়েছে" on daily_naya_diganta in 03/15/2021-05/04/2021: 2
Total New Sites from "হয়েছে" on daily_naya_diganta in 05/04/2021-06/23/2021: 5
Total New Sites from "হয়েছে" on daily_naya_diganta in 06/23/2021-08/12/2021: 9
Total New Sites from "হয়েছে" on daily_naya_diganta in 08/12/2021-10/01/2021: 7
Total New Sites from "হয়েছে" on daily_naya_diganta in 10/01/2021-11/20/2021: 5
Total New Sites from "হয়েছে" on daily_naya_diganta in 11/20/2021-12/31/2021: 7
Total New Sites from "হয়েছে" on the_azadi in 01/01/1

Total New Sites from "হয়েছে" on sangbad_pratidin in 12/30/2000-12/29/2005: 3
Total New Sites from "হয়েছে" on sangbad_pratidin in 12/29/2005-12/28/2010: 4
Total New Sites from "হয়েছে" on sangbad_pratidin in 12/28/2010-12/27/2015: 3
Total New Sites from "হয়েছে" on sangbad_pratidin in 12/27/2015-12/31/2015: 4
Total New Sites from "হয়েছে" on sangbad_pratidin in 01/01/2016-02/20/2016: 4
Total New Sites from "হয়েছে" on sangbad_pratidin in 02/20/2016-04/10/2016: 8
Total New Sites from "হয়েছে" on sangbad_pratidin in 04/10/2016-05/30/2016: 2
Total New Sites from "হয়েছে" on sangbad_pratidin in 05/30/2016-07/19/2016: 6
Total New Sites from "হয়েছে" on sangbad_pratidin in 07/19/2016-09/07/2016: 6
Total New Sites from "হয়েছে" on sangbad_pratidin in 09/07/2016-10/27/2016: 5
Total New Sites from "হয়েছে" on sangbad_pratidin in 10/27/2016-12/16/2016: 6
Total New Sites from "হয়েছে" on sangbad_pratidin in 12/16/2016-02/04/2017: 3
Total New Sites from "হয়েছে" on sangbad_pratidin in 02/04/2017-03/26/2017: 1

In [2]:
import json

with open("paper_to_site.json", 'w') as f:
    json.dump(names_site, f)

In [22]:
config_file = "RandomDenseSERPSearch.yaml"
with open(config_file, "r", encoding='utf-8') as f:
    loaded = yaml.load(f, Loader=yaml.FullLoader)

In [23]:
loaded

{'searches': {'popular': {'queries': ['"হয়েছে"'],
   'papers': ['the_daily_janakantha',
    'the_daily_jugantor',
    'the_daily_ittefaq',
    'amader_shomoy',
    'bhorer_kagoj',
    'daily_manab_zamin',
    'alokito_bangladesh',
    'the_sangbad',
    'the_daily_inqilab',
    'jaijaidin',
    'daily_naya_diganta',
    'the_azadi',
    'daily_sangram',
    'sangbad_pratidin',
    'the_daily_star'],
   'date_ranges': [['1/1/1996', '12/31/2015', 1825],
    ['1/1/2016', '12/31/2021', 50]]},
  'prothom_alo': {'queries': ['"হয়েছে"'],
   'papers': ['prothom_alo'],
   'date_ranges': [['1/1/1996', '12/31/2015', 1825]]}},
 'output_dir': 'testing',
 'output_file': '{paper}_serp_{query}.json',
 'quote_sub': '[RVDQUOTE]',
 'overwrite': False,
 'skip_existing': True,
 'searchnws': False,
 'gl': 'bd',
 'hl': 'bn',
 'filter': '0',
 'num': 100,
 'SERPAPI_KEY': 'c8f7c4a413d768d9b454c42f233867c4bf3f4dba7be4b31bae4ed7b9b05dd268'}

# Paper Sites

In [1]:
paper_sites_str ="""1. prothom_alo - https://www.prothomalo.com
2. bangladesh_pratidin - https://www.bd-pratidin.com
3. the_daily_ittefaq - https://www.ittefaq.com.bd
4. kaler_kantho - https://www.kalerkantho.com
5. daily_naya_diganta - https://dailynayadiganta.com
6. daily_amar_sangbad - https://www.amarsangbad.com
7. protidiner_sangbad - https://www.protidinersangbad.com
8. the_daily_jugantor - https://www.jugantor.com
9. daily_sangram - https://dailysangram.com
10. daily_manab_zamin - https://mzamin.com
11. amader_shomoy - http://www.dainikamadershomoy.com
12. samakal - https://samakal.com
13. the_daily_janakantha - https://www.dailyjanakantha.com
14. jaijaidin - https://www.jaijaidinbd.com
15. bhorer_kagoj - https://www.bhorerkagoj.com
16. arthoniteer_kagoj - http://www.arthoniteerkagoj.com
17. the_daily_inqilab - https://www.dailyinqilab.com
18. sangbad - http://sangbad.net.bd
19. manob_kantha - https://www.manobkantha.com.bd
20. daily_suprobhat - https://suprobhat.com
21. bangladesh_journal - https://www.bd-journal.com
22. bonik_barta - https://bonikbarta.net
23. alokito_bangladesh - https://www.alokitobangladesh.com
24. ajker_bazar - https://www.ajkerbazzar.com
25. amadar_orthoneeti - https://amaderorthoneeti.com/new/
26. bangladesh_post - https://bn.bangladeshpost.net
27. sorejomin_barta - http://sorejominbarta.com
28. ajker_patrika - https://www.ajkerpatrika.com
29. khabar_patra - https://khoborpatrabd.com
30. vorer_pata - https://dailyvorerpata.com
31. shomoyer_alo - https://www.shomoyeralo.com
32. dhakar_dak - https://dhakardak-bd.com
33. natun_barta - http://www.natun-barta.com
34. share_biz - https://sharebiz.net
35. daily_bartoman - http://dailybartoman.com
36. ajkaler_khobor - https://www.ajkalerkhobor.net
37. sangbad_konika - https://www.sangbadkonika.com
38. somoyer_kantha - http://www.gonokantho.com
39. daily_star - https://www.thedailystar.net/bangla/
40. khola_kagoj - http://kholakagojbd.com
41. desh_rupantor - https://www.deshrupantor.com
42. bangladesher_khabor - https://www.bangladesherkhabor.net
43. amar_desh - http://bd-bulletin.com
44. daily_jagran - https://dailyjagaran.com
45. business_standard - https://www.tbsnews.net/bangla
46. dhaka_tribune - https://bangla.dhakatribune.com
47. khulna_news - https://khulnanews.com
48. daily_azadi - https://dainikazadi.org
49. daily_purbokone - https://dainikpurbokone.net
50. amader_comilla - https://dailyamadercomilla.com
51. daily_purbanchal - https://purbanchal.com
52. ukhiya_news - https://ukhiyanews.com
53. matha_bhanga - https://mathabhanga.com
54. rajshahi_news_24 - https://rajshahinews24.com
55. chattagram_news - https://chattagramnews.com
56. daily_khowai - https://dailykhowai.com
57. daily_karatoa - https://karatoa.com.bd
58. satkhira_news - https://satkhiranews.com
59. dainik_sylhet - https://dainiksylhet.com
60. daily_coxsbazar - https://dailycoxsbazar.com
61. uttorpurbo - https://euttorpurbo.com
62. ajker_jamalpur - https://ajkerjamalpur.com
63. amader_barisal - https://amaderbarisal.com
64. surma_times - https://surmatimes.com
65. dinajpur_news - https://dinajpurnews.com
66. kuakata_news - https://kuakatanews.com
67. chadpur_times - https://chandpurtimes.com
68. mukto_khobor_24 - https://muktokhobor24.com
69. teknaf_news - https://teknafnews.com
70. bogra_sangbad - https://bograsangbad.com
71. rajshahir_somoy - https://rajshahirsomoy.com
72. chandpurweb - https://chandpurweb.com
73. barisal_news - https://barisalnews.com
74. sylhet_express - https://sylhetexpress.com
75. laksmipur_24 - https://lakshmipur24.com
76. prothom_feni - https://prothom-feni.com
77. amar_noakhali - https://amarnoakhali.com
78. chakaria_news - https://chakarianews.com
79. gramer_kagoj - https://gramerkagoj.com
80. andoloner_bazar - https://dailyandolonerbazar.com
81. khulnanchal - https://khulnanchal.com
82. mymensingh_pratidin - https://mymensinghpratidin.com"""

In [2]:
import re
ext = re.compile("\d+\.\s([a-z_1-9]+)\s-\s([^$]*)")
paper_site = {ext.match(line)[1]: ext.match(line)[2] for line in paper_sites_str.split('\n')}

In [3]:
import json
PAPER_SITES_FILE = "paper_to_site.json"
with open("paper_to_site.json", 'r') as f:
    old_paper_site = json.load(f)

In [4]:
for k, v in old_paper_site.items():
    paper_site[k] = old_paper_site[k]
    assert(old_paper_site[k] == paper_site[k] or print(k, old_paper_site[k], paper_site[k]))
    
#for site in 

In [39]:
https://www.dailynayadiganta.com
https://dailynayadiganta.com

SyntaxError: invalid syntax (<ipython-input-39-2057a7048f9b>, line 1)

In [50]:
import requests
resp = requests.get("https://www.samakal.com")

In [55]:
for url in paper_site.values():
    try:
        resp = requests.get(url)
    except:
        print(url, " error")
    if not resp.ok:
        print(url)

https://www.bd-pratidin.com
https://www.dailysangram.com  error
https://www.samakal.com
http://www.natun-barta.com
https://www.sangbadkonika.com
http://www.gonokantho.com
https://www.bangla.dhakatribune.com  error
https://www.khulnanews.com  error
https://www.dailyamadercomilla.com
https://www.ukhiyanews.com  error
https://www.chattagramnews.com  error
https://www.dailykhowai.com  error
https://www.satkhiranews.com
https://www.dailycoxsbazar.com  error
https://www.euttorpurbo.com
https://www.amaderbarisal.com  error
https://www.surmatimes.com
https://www.dinajpurnews.com  error
https://www.dinajpurnews.com
https://www.teknafnews.com  error
https://www.barisalnews.com  error
https://www.sylhetexpress.com  error


In [62]:
from fake_useragent import UserAgent
import urllib
from bs4 import BeautifulSoup

url = "http://www.sylhetexpress.com"
ua = UserAgent()
request = urllib.request.Request(
    url,
    headers={
        'User-Agent': ua.random
    }
)
page = urllib.request.urlopen(request)
soup = BeautifulSoup(page, "html.parser")
soup

<html lang="bn">
<head>
<title>Sylhet Express | সিলেট এক্সপ্রেস</title>
<meta charset="utf-8" content="text/html" http-equiv="Content-Type"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, minimum-scale=1, initial-scale=1, maximum-scale=1, user-scalable=0" name="viewport"/>
<style type="text/css">
		.topicTextColor{
			color: white
		}
		.topicShow{
		  background-color: #887474;
		}
	</style>
<link href="http://www.sylhetexpress.com" rel="canonical"/>
<meta content="bd" name="x-country"/>
<meta content="Asia" name="x-audience"/>
<meta content="Asia" name="CPS_AUDIENCE"/>
<meta content="on" http-equiv="cleartype"/>
<meta content="Global" name="distribution"/>
<meta content="fc13e7a3d58fc9ca" name="yandex-verification">
<meta content="bn" http-equiv="Content-Language"/>
<!--meta http-equiv="refresh" content="120"-->
<link href="http://www.sylhetexpress.com/wp-content/themes/sylhet-expresss/images/favicon.png" rel="shortcut icon"/>
<lin

In [63]:
if 

page.status != 200

200

In [64]:
from fake_useragent import UserAgent
import urllib

def requestWithUA(url):
    ua = UserAgent()
    request = urllib.request.Request(
        url,
        headers={
            'User-Agent': ua.random
        }
    )
    page = urllib.request.urlopen(request)
    return page


In [69]:
for url in paper_site.values():
    try:
        resp = requestWithUA(url)
    except:
        print(url, " error")
    if resp.status != 200:
        print(url)

https://ukhiyanews.com  error
https://chattagramnews.com  error
https://dailykhowai.com  error
https://dailycoxsbazar.com  error
https://amaderbarisal.com  error
https://surmatimes.com  error
https://teknafnews.com  error
https://barisalnews.com  error
https://sylhetexpress.com  error


In [5]:
old_paper_site

{'prothom_alo': 'https://www.prothomalo.com',
 'the_daily_janakantha': 'https://www.dailyjanakantha.com',
 'kaler_kantho': 'https://www.kalerkantho.com',
 'the_daily_jugantor': 'https://www.jugantor.com',
 'the_daily_ittefaq': 'https://www.ittefaq.com.bd',
 'samakal': 'https://samakal.com',
 'amader_shomoy': 'http://www.dainikamadershomoy.com',
 'bhorer_kagoj': 'https://www.bhorerkagoj.com',
 'daily_manab_zamin': 'https://mzamin.com',
 'alokito_bangladesh': 'https://www.alokitobangladesh.com',
 'the_sangbad': 'http://sangbad.net.bd',
 'the_daily_inqilab': 'https://www.dailyinqilab.com',
 'jaijaidin': 'https://www.jaijaidinbd.com',
 'daily_naya_diganta': 'https://www.dailynayadiganta.com',
 'the_azadi': 'https://dainikazadi.net',
 'daily_sangram': 'https://dailysangram.com',
 'sangbad_pratidin': 'https://www.sangbadpratidin.in',
 'the_daily_star': 'https://www.thedailystar.net/bangla',
 'bd_news': 'bangla.bdnews24.com'}

In [6]:
paper_site

{'prothom_alo': 'https://www.prothomalo.com',
 'bangladesh_pratidin': 'https://www.bd-pratidin.com',
 'the_daily_ittefaq': 'https://www.ittefaq.com.bd',
 'kaler_kantho': 'https://www.kalerkantho.com',
 'daily_naya_diganta': 'https://www.dailynayadiganta.com',
 'daily_amar_sangbad': 'https://www.amarsangbad.com',
 'protidiner_sangbad': 'https://www.protidinersangbad.com',
 'the_daily_jugantor': 'https://www.jugantor.com',
 'daily_sangram': 'https://dailysangram.com',
 'daily_manab_zamin': 'https://mzamin.com',
 'amader_shomoy': 'http://www.dainikamadershomoy.com',
 'samakal': 'https://samakal.com',
 'the_daily_janakantha': 'https://www.dailyjanakantha.com',
 'jaijaidin': 'https://www.jaijaidinbd.com',
 'bhorer_kagoj': 'https://www.bhorerkagoj.com',
 'arthoniteer_kagoj': 'http://www.arthoniteerkagoj.com',
 'the_daily_inqilab': 'https://www.dailyinqilab.com',
 'sangbad': 'http://sangbad.net.bd',
 'manob_kantha': 'https://www.manobkantha.com.bd',
 'daily_suprobhat': 'https://suprobhat.com'

In [None]:
https://ukhiyanews.com  error
https://chattagramnews.com  error
https://dailykhowai.com  error
https://dailycoxsbazar.com  error
https://amaderbarisal.com  error
https://surmatimes.com  error
https://teknafnews.com  error
https://barisalnews.com  error
https://sylhetexpress.com  error

In [17]:
del paper_site['ukhiya_news']
del paper_site['chattagram_news']
del paper_site['daily_khowai']
del paper_site['daily_coxsbazar']
del paper_site['amader_barisal']
del paper_site['surma_times']
del paper_site['teknaf_news']
del paper_site['barisal_news']
del paper_site['sylhet_express']

In [18]:
paper_site

{'prothom_alo': 'https://www.prothomalo.com',
 'bangladesh_pratidin': 'https://www.bd-pratidin.com',
 'the_daily_ittefaq': 'https://www.ittefaq.com.bd',
 'kaler_kantho': 'https://www.kalerkantho.com',
 'daily_naya_diganta': 'https://www.dailynayadiganta.com',
 'daily_amar_sangbad': 'https://www.amarsangbad.com',
 'protidiner_sangbad': 'https://www.protidinersangbad.com',
 'the_daily_jugantor': 'https://www.jugantor.com',
 'daily_sangram': 'https://dailysangram.com',
 'daily_manab_zamin': 'https://mzamin.com',
 'amader_shomoy': 'http://www.dainikamadershomoy.com',
 'samakal': 'https://samakal.com',
 'the_daily_janakantha': 'https://www.dailyjanakantha.com',
 'jaijaidin': 'https://www.jaijaidinbd.com',
 'bhorer_kagoj': 'https://www.bhorerkagoj.com',
 'arthoniteer_kagoj': 'http://www.arthoniteerkagoj.com',
 'the_daily_inqilab': 'https://www.dailyinqilab.com',
 'sangbad': 'http://sangbad.net.bd',
 'manob_kantha': 'https://www.manobkantha.com.bd',
 'daily_suprobhat': 'https://suprobhat.com'

In [26]:
for v in paper_site.keys() - old_paper_site.keys():
    print(f"      - '{v}'")

      - 'daily_azadi'
      - 'dhaka_tribune'
      - 'daily_karatoa'
      - 'somoyer_kantha'
      - 'protidiner_sangbad'
      - 'ajker_patrika'
      - 'amar_noakhali'
      - 'laksmipur_24'
      - 'daily_amar_sangbad'
      - 'ajkaler_khobor'
      - 'satkhira_news'
      - 'dainik_sylhet'
      - 'amadar_orthoneeti'
      - 'vorer_pata'
      - 'sorejomin_barta'
      - 'business_standard'
      - 'chadpur_times'
      - 'prothom_feni'
      - 'share_biz'
      - 'daily_purbanchal'
      - 'bangladesh_journal'
      - 'sangbad'
      - 'bangladesh_pratidin'
      - 'dinajpur_news'
      - 'daily_jagran'
      - 'chandpurweb'
      - 'ajker_jamalpur'
      - 'gramer_kagoj'
      - 'sangbad_konika'
      - 'natun_barta'
      - 'ajker_bazar'
      - 'daily_purbokone'
      - 'mymensingh_pratidin'
      - 'shomoyer_alo'
      - 'kuakata_news'
      - 'arthoniteer_kagoj'
      - 'khulnanchal'
      - 'desh_rupantor'
      - 'chakaria_news'
      - 'daily_star'
      - 'khulna_news'


In [21]:
keywords = {"জলমগ্ন": "submerged",
    "জোয়ারের":"tidal", 
    "প্লাবিত": "flooded", 
    "বন্যা": "flood", 
    "জলাবদ্ধ": "waterlogged", 
    "উজান": "upstream", 
    "ঘূর্ণিঝড়": "cyclone",
    "নদী": "river", 
    "ভাঙ্গন": "erosion",
    "বাঁধ": "embankment",
    "বেড়িবাঁধ": "embankment",
    "পোল্ডার": "polder"
}

In [25]:
for v in keywords.keys():
    print(f"      - \"\\\"{v}\\\"\"")

      - "\"জলমগ্ন\""
      - "\"জোয়ারের\""
      - "\"প্লাবিত\""
      - "\"বন্যা\""
      - "\"জলাবদ্ধ\""
      - "\"উজান\""
      - "\"ঘূর্ণিঝড়\""
      - "\"নদী\""
      - "\"ভাঙ্গন\""
      - "\"বাঁধ\""
      - "\"বেড়িবাঁধ\""
      - "\"পোল্ডার\""


In [27]:
with open("paper_to_site.json", "w") as f:
    json.dump(paper_site, f)