<div dir="auto" align="center">
    <h3>
        بسم الله الرحمن الرحیم
    </h3>
    <br>
    <h1>
        <strong>
            بازیابی پیشرفته اطلاعات
        </strong>
    </h1>
    <h2>
        <strong>
            تمرین سوم (موتور جستجوی اخبار)
        </strong>
    </h2>
    <br>
    <h3>
        محمد هجری - ٩٨١٠٦١٥٦
        <br><br>
        ارشان دلیلی - ٩٨١٠٥٧٥١
        <br><br>
        سروش جهان‌زاد - ٩٨١٠٠٣٨٩
    </h3>
    <br>
</div>
<hr>

In [1]:
import re
import csv
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

In [6]:
class Scraper:
    """Crawler for the hamshahrionline.ir news archive.

    The crawl happens in three stages: enumerate archive listing pages month by
    month, collect article links from every listing page, then download and
    parse each article.

    Args:
        current_year: Year of the last month the crawl is guaranteed to reach.
        current_month: Month of the last month the crawl is guaranteed to reach.
            Months before (current_year, current_month) are skipped over when
            they have no archive pages instead of ending the crawl.

    NOTE(review): the year/month values used by the callers look like Solar
    Hijri (Jalali) dates -- confirm.
    """

    BASE_URL = 'https://www.hamshahrionline.ir'

    # Value of the archive's `tp` query parameter for each category name.
    CATEGORY_TYPE_IDS = {
        'IranPolitics': 6, 'World': 11, 'Economy': 10, 'Society': 5, 'City': 7,
        'LifeSkills': 21, 'IT': 718, 'Science': 20, 'Culture': 26, 'Sport': 9,
    }

    # Categories scraped when scrape() is called without an explicit list.
    # Kept equal to the subset that used to be hard-coded inside scrape() so
    # existing calls behave the same; pass `categories=` to scrape anything else
    # (e.g. `list(Scraper.CATEGORY_TYPE_IDS)` for every category).
    DEFAULT_CATEGORIES = ('Society', 'City', 'LifeSkills')

    def __init__(self, current_year, current_month):
        self.current_year = current_year
        self.current_month = current_month

    def get_URL_content(self, URL, timeout=30, max_retries=None):
        """Download `URL` and return the raw response body (bytes).

        Failed requests are retried. By default the retry is unbounded.

        Args:
            URL: Address to fetch.
            timeout: Seconds passed to `requests.get` so a stalled connection
                fails (and is retried) instead of hanging forever.
            max_retries: Maximum number of attempts, or None to retry forever.

        Raises:
            requests.exceptions.RequestException: the last network error, only
                when `max_retries` is set and exhausted.
        """
        attempts = 0
        while True:
            try:
                return requests.get(URL, timeout=timeout).content
            # Only network-level failures are retried; KeyboardInterrupt and
            # programming errors propagate so the loop can be interrupted.
            except requests.exceptions.RequestException:
                attempts += 1
                if max_retries is not None and attempts >= max_retries:
                    raise

    def generate_page_URL(self, page_index, category, year, month):
        """Build the archive listing URL for one page of one category/month.

        Raises:
            KeyError: if `category` is not a key of CATEGORY_TYPE_IDS.
        """
        tp = self.CATEGORY_TYPE_IDS[category]
        return f'{self.BASE_URL}/archive?pi={page_index}&tp={tp}&ty=1&ms=0&mn={month}&yr={year}'

    def get_page_URLs_by_time(self, category, year, month):
        """Return the listing-page URLs that exist for `category` in year/month.

        Pages are probed from index 1 upward; probing stops at the first page
        whose content does not contain the text 'pagination'.
        """
        URLs = []
        page_index = 1
        while True:
            URL = self.generate_page_URL(page_index, category, year, month)
            content = self.get_URL_content(URL)
            if 'pagination' not in str(content):
                return URLs
            URLs.append(URL)
            page_index += 1

    def get_page_URLs_since(self, category, year, month):
        """Collect listing-page URLs for every month starting at year/month.

        The walk moves forward one month at a time and ends at the first month
        that has no listing pages and is not earlier than
        (self.current_year, self.current_month).
        """
        URLs = []
        with tqdm() as pbar:
            while True:
                if month > 12:
                    month = 1
                    year += 1
                pbar.set_description(f'[{category}] [Extracting page URLs] [Date: {year}/{month}]')
                URLs_by_time = self.get_page_URLs_by_time(category, year, month)
                before_current = (year, month) < (self.current_year, self.current_month)
                if not URLs_by_time and not before_current:
                    break
                URLs.extend(URLs_by_time)
                month += 1
                # Advance the bar once per processed month so it shows progress.
                pbar.update(1)
        return URLs

    def get_news_URLs_since(self, category, year, month):
        """Return absolute article URLs found on all listing pages since year/month."""
        news_URLs = []
        page_URLs = self.get_page_URLs_since(category, year, month)
        with tqdm(page_URLs) as pbar:
            for page_URL in pbar:
                content = self.get_URL_content(page_URL)
                soup = BeautifulSoup(content, 'html5lib')
                for item in soup.find_all('li', attrs={'class': 'news'}):
                    link = item.select_one('div.desc h3 a[href]')
                    if link is None:
                        # Listing entry without a headline link: nothing to scrape.
                        continue
                    news_URLs.append(self.BASE_URL + link['href'])
                pbar.set_description(f'[{category}] [Extracting news URLs] [{len(news_URLs)} news until now]')
        return news_URLs

    @staticmethod
    def _text_or_empty(node):
        """Return the stripped text of a parsed element, or '' when it is missing."""
        return node.text.strip() if node is not None else ''

    def parse_news(self, URL):
        """Download one article and extract its fields.

        Returns:
            dict with keys 'date', 'title', 'intro', 'body' (strings) and
            'category' (list of breadcrumb labels with the first one dropped).
            A field whose element is absent from the page is returned as ''
            so that one malformed article does not abort a whole crawl.
        """
        content = self.get_URL_content(URL)
        soup = BeautifulSoup(content, 'html.parser')
        date_box = soup.find("div", {"class": "col-6 col-sm-4 col-xl-4 item-date"})
        title_box = soup.find("div", {"class": "item-title"})
        intro_node = soup.find("p", {"class": "introtext", "itemprop": "description"})
        body_node = soup.find("div", {"class": "item-text", "itemprop": "articleBody"})
        breadcrumb = soup.find_all("li", {"class": "breadcrumb-item"})
        return {
            'date': self._text_or_empty(date_box.span if date_box is not None else None),
            'title': self._text_or_empty(title_box.h1 if title_box is not None else None),
            'intro': self._text_or_empty(intro_node),
            'body': self._text_or_empty(body_node),
            'category': [crumb.text.strip() for crumb in breadcrumb][1:],
        }

    def scrape(self, from_year, from_month, categories=None):
        """Scrape every article of the selected categories since from_year/from_month.

        Args:
            from_year: Year of the first month to crawl.
            from_month: Month of the first month to crawl.
            categories: Iterable of category names (keys of CATEGORY_TYPE_IDS).
                Defaults to DEFAULT_CATEGORIES.

        Returns:
            dict mapping each category name to a list of article dicts as
            produced by parse_news().
        """
        if categories is None:
            categories = self.DEFAULT_CATEGORIES
        category_news = {}
        for category in categories:
            URLs = self.get_news_URLs_since(category, from_year, from_month)
            news = []
            with tqdm(URLs) as pbar:
                pbar.set_description(f'[{category}] [Scraping news]')
                for URL in pbar:
                    news.append(self.parse_news(URL))
            category_news[category] = news
        return category_news

# Run the crawl. (current_year, current_month) is the month the scraper must
# reach before an empty month is allowed to end the walk; (from_year,
# from_month) is where the walk starts.
# NOTE(review): these look like Solar Hijri (Jalali) dates -- confirm.
# scrape() issues a live HTTP request for every archive page and every article.
scraper = Scraper(current_year=1401, current_month=3)
category_news = scraper.scrape(from_year=1399, from_month=1)

[Society] [Extracting page URLs] [Date: 1400/9]: : 0it [06:48, ?it/s] 

In [None]:
# Persist each category's articles to its own "<category>_dataset.csv" file.
# to_csv is called without index=False, so the DataFrame index is written as
# the first (unnamed) column. `category`, `news` and `df` stay bound to the
# last category's values after the loop finishes.
for category, news in category_news.items():
    df = pd.DataFrame(news)
    df.to_csv(f"{category}_dataset.csv", encoding='utf-8')

In [None]:
# Write every scraped article, across all categories, into one dataset.csv.
# `category_news` maps a category name to a list of article dicts whose keys
# are exactly the names in `news_info` (the shape returned by Scraper.scrape).
news_info = ['date', 'title', 'intro', 'body', 'category']

# Flatten explicitly from `category_news` rather than relying on a loop
# variable left over from another cell, which would hold a single category.
all_news = [article for articles in category_news.values() for article in articles]

# newline='' is required by the csv module so that it controls line endings
# itself (otherwise extra blank rows appear on platforms that translate '\n').
with open('dataset.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=news_info)
    writer.writeheader()
    writer.writerows(all_news)