<a href="https://colab.research.google.com/github/KhanShaheb34/BD-ProtidinScraper/blob/master/ScrapeBDProtidin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import bs4
import requests
import os
import pandas as pd
import re
from datetime import timedelta, datetime

In [2]:
def getNewsFromCategoryAndDate(category, date):
  url = f"https://www.bd-pratidin.com/{category}/{date}"
  page = requests.get(url)
  soup = bs4.BeautifulSoup(page.content,"html.parser").find("div",{"class": "container-left-area col-md-9"})
  newsLinks = []

  for a in soup.findAll("a", {"href": re.compile(r"^[a-z-]+\/\d+\/\d+\/\d+\/\d+$")}):
    newsLinks.append(a["href"])

  return newsLinks

In [3]:
def getNewsFromLink(link):
  url = f"https://www.bd-pratidin.com/{link}"
  page = requests.get(url)
  soup = bs4.BeautifulSoup(page.content, "html.parser")

  title = soup.find("h1", {"class":"post-title"}).text.strip()
  description = soup.find("meta",{"property":"og:description"})["content"].strip()
  category = link.split("/")[0]
  id = int(link.split("/")[4])
  date = "/".join(link.split("/")[1:4])

  articleSoup = soup.find("article")
  article = ""
  for p in articleSoup.findAll("p"):
    article += p.text

  return {"id": id,
          "title": title, 
          "description": description, 
          "category": category,
          "date": date,
          "article": article}

In [7]:
def getNewsFromDate(date, save=0, verbose=1):
  url = f"https://www.bd-pratidin.com/archive/{date}"
  page = requests.get(url)
  soup = bs4.BeautifulSoup(page.content, "html.parser").find("div", {"class": "container-left-area printversion col-md-9"})
  categories = set()

  for a in soup.findAll("a", {"href": re.compile(r"^[a-z-]+\/\d+\/\d+\/\d+")}):
    categories.add(a["href"].split("/")[0])
  categories = list(categories)

  if verbose==1:
    print(f"There are {len(categories)} categories.")

  news = []
  for category in categories:
    newsLink = getNewsFromCategoryAndDate(category, date)

    if verbose==1:
      print(f"Downloading {len(newsLink)} news from '{category}' category...")

    for link in newsLink:
      news.append(getNewsFromLink(link))

    if verbose==1:
      print(f"Done!")

  news_df = pd.DataFrame(news)

  if save != 0:
    news_df.to_csv(save, index=False)

  return news_df

In [8]:
def saveNewsFromMultipleDate(days=0, startDate=datetime.today(), verbose=1):
  for day in range(days+1):
    date = (startDate - timedelta(days=day)).strftime("%Y/%m/%d")
    filename = "-".join(date.split("/")) + ".csv"

    if verbose==1:
      print(f"Saving news from {date} in {filename}...")

    getNewsFromDate(date, filename, verbose)

    if verbose==1:
      print(f"Saving news from {date} in {filename} is complete.\n")

In [None]:
saveNewsFromMultipleDate(10)

Saving news from 2020/09/02 in 2020-09-02.csv...
There are 11 categories.
Downloading 6 news from 'editorial' category...
Done!
Downloading 23 news from 'country-village' category...
Done!
Downloading 15 news from 'sport-news' category...
Done!
Downloading 26 news from 'last-page' category...
Done!
Downloading 12 news from 'international' category...
Done!
Downloading 5 news from 'news' category...
Done!
Downloading 16 news from 'first-page' category...
Done!
Downloading 9 news from 'entertainment-news' category...
Done!
Downloading 1 news from 'horoscope' category...
Done!
Downloading 30 news from 'city' category...
Done!
Downloading 5 news from 'various-lifestyles' category...
Done!
Saving news from 2020/09/02 in 2020-09-02.csv is complete.

Saving news from 2020/09/01 in 2020-09-01.csv...
There are 12 categories.
Downloading 6 news from 'editorial' category...
Done!
Downloading 24 news from 'country-village' category...
Done!
Downloading 5 news from 'various-city-roundup' category..