# In [1]:
from requests import Session ,get
from bs4 import BeautifulSoup
from collections import deque
from concurrent.futures import ThreadPoolExecutor, wait
from threading import Lock
import json

import time
import random

import warnings
warnings.filterwarnings("ignore")

# In [None]:
class Crawler:
    """
    Threaded crawler that collects posts from healthunlocked.com.

    Seed links are taken from the paginated listing of a tag. Every crawled post
    contributes its related-post links to the frontier, until
    ``crawling_threshold`` posts have been collected or the frontier runs dry.

    Put your own user agent in ``headers``.
    """
    # Base headers sent with every request. The 'User-Agent' entry is overridden
    # per request with a random entry of USER_AGENTS (see ``crawl``).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US'
    }
    top_URL = 'https://healthunlocked.com/'

    # One of these is picked at random for each request.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    )

    # Seconds before an HTTP request is abandoned, so a stalled connection
    # cannot block a worker thread forever.
    REQUEST_TIMEOUT = 30
    # Size of the thread pool used by ``start_crawling``.
    MAX_WORKERS = 20

    def __init__(self, crawling_threshold=1000, request_delay=10, max_retries=3):
        """
        Initialize the crawler

        Parameters
        ----------
        crawling_threshold: int
            The number of pages to crawl
        request_delay: float
            Seconds each worker sleeps after a request
        max_retries: int
            How many times a URL whose request failed is put back on the queue
        """
        self.crawling_threshold = crawling_threshold
        self.request_delay = request_delay
        self.max_retries = max_retries
        self.session = None
        # Plain lists so they can be dumped to JSON directly.
        self.not_crawled = []   # URLs waiting to be crawled (FIFO)
        self.crawled = []       # extracted page dicts
        self.crawled_ids = []   # ids of the pages stored in ``crawled``
        self.added_ids = []     # ids that were queued and are not crawled yet
        self._failed_attempts = {}  # page id -> number of failed requests
        self.add_list_lock = Lock()
        self.add_queue_lock = Lock()

    def get_id_from_URL(self, URL):
        """
        Get the id of a page from its URL. The id is everything that follows the
        host part, i.e. the URL path joined with '/'.

        Parameters
        ----------
        URL: str
            The URL of the site
        Returns
        ----------
        str
            The id of the site
        """
        return '/'.join(URL.split('/')[3:])

    def write_to_file_as_json(self):
        """
        Save the crawled pages and the remaining queue as json files
        in the current working directory.
        """
        with open('HealthUnlocked_crawled.json', 'w', encoding='utf-8') as f:
            json.dump(self.crawled, f)

        with open('HealthUnlocked_not_crawled.json', 'w', encoding='utf-8') as f:
            json.dump(self.not_crawled, f)

    def read_from_file_as_json(self):
        """
        Restore the crawler state from the json files written by
        ``write_to_file_as_json``.
        """
        with open('HealthUnlocked_crawled.json', 'r', encoding='utf-8') as f:
            self.crawled = json.load(f)

        # Must be opened for reading: mode 'w' would truncate the saved queue.
        with open('HealthUnlocked_not_crawled.json', 'r', encoding='utf-8') as f:
            self.not_crawled = json.load(f)

        self.added_ids = [self.get_id_from_URL(link) for link in self.not_crawled]
        # Rebuild the visited ids too, so restored pages are not crawled again.
        self.crawled_ids = [page['id'] for page in self.crawled]

    def login(self, username=None, password=None):
        """
        Open a session and post the login form.

        Credentials are never stored in the source. They are taken from the
        arguments or, when omitted, from the environment variables
        ``HEALTHUNLOCKED_USERNAME`` and ``HEALTHUNLOCKED_PASSWORD``.

        Parameters
        ----------
        username: str, optional
            Account name / e-mail address
        password: str, optional
            Account password
        Raises
        ----------
        ValueError
            If no credentials are available
        RuntimeError
            If the login request is not answered with a success status
        """
        import os

        username = username or os.environ.get('HEALTHUNLOCKED_USERNAME')
        password = password or os.environ.get('HEALTHUNLOCKED_PASSWORD')
        if not username or not password:
            raise ValueError(
                'Missing credentials: pass username/password or set '
                'HEALTHUNLOCKED_USERNAME and HEALTHUNLOCKED_PASSWORD'
            )

        self.session = Session()

        login_url = 'https://healthunlocked.com/login'
        credentials = {
            'username': username,
            'password': password
        }

        response = self.session.post(login_url, data=credentials, timeout=self.REQUEST_TIMEOUT)
        if not response.ok:
            raise RuntimeError('Login failed')

    def crawl(self, URL):
        """
        Make a get request to the URL and return the response

        Parameters
        ----------
        URL: str
            The URL of the site
        Returns
        ----------
        requests.models.Response or None
            The response of the get request, or None when the request failed
            or the status code is not 200
        Raises
        ----------
        RuntimeError
            If ``login`` has not been called yet
        """
        from requests import RequestException

        if self.session is None:
            raise RuntimeError('No active session: call login() before crawling')

        # Build the headers per request instead of mutating the class-level
        # dict, which is shared by all worker threads.
        request_headers = dict(self.headers)
        request_headers['User-Agent'] = random.choice(self.USER_AGENTS)

        try:
            res = self.session.get(URL, headers=request_headers, timeout=self.REQUEST_TIMEOUT)
        except RequestException:
            # Network errors are reported like a bad status so the caller
            # can re-queue the URL.
            return None

        if res.status_code == 200:
            return res
        return None

    def extract_top_links(self, tag='diabetes', pages=10):
        """
        Collect the post links of the first listing pages of a tag and use them
        as seeds for the crawler. Listing pages that cannot be fetched are skipped.

        Parameters
        ----------
        tag: str
            The tag whose listing is used
        pages: int
            The number of listing pages to read
        """
        for page_number in range(1, pages + 1):
            listing_URL = f'https://healthunlocked.com/tag/{tag}?page={page_number}'
            res = self.crawl(listing_URL)
            if res is None:
                continue

            listing_soup = BeautifulSoup(res.content, 'html.parser')
            links = self.extract_source_links(listing_soup)
            with self.add_list_lock:
                for link in links:
                    self._enqueue(link)

    def _enqueue(self, link):
        """
        Queue a link unless its id is already queued or crawled.
        The caller must hold ``add_list_lock``.

        Returns
        ----------
        bool
            True if the link was added to the queue
        """
        link_id = self.get_id_from_URL(link)
        if link_id in self.added_ids or link_id in self.crawled_ids:
            return False
        self.not_crawled.append(link)
        self.added_ids.append(link_id)
        return True

    @staticmethod
    def _extract_post_links(soup, container_class):
        """
        Return the absolute URLs of all post links found inside the ``div``
        elements carrying ``container_class``. Links to the responses view
        ('?responses') and anchors without an href are ignored.
        """
        links = []
        for container in soup.find_all('div', class_=container_class):
            for anchor in container.find_all('a', href=True):
                href = anchor['href']
                if 'posts' in href and '?responses' not in href:
                    links.append('https://healthunlocked.com' + href)
        return links

    def extract_source_links(self, soup):
        """
        Get the post links of a tag listing page

        Parameters
        ----------
        soup: BeautifulSoup
            The soup of the listing page
        Returns
        ----------
        List[str]
            The links of the listed posts
        """
        return self._extract_post_links(soup, 'sc-6d595cea-6 ecnMcz')

    def get_page_instance(self):
        """
        Return an empty record for one post; ``extract_movie_info`` fills it.
        """
        return {
            'id': None,
            'title': None,
            'date': None,
            '#replies': None,
            'post': None,
            'comments': None,
            'related_links': None
        }

    def start_crawling(self):
        """
        Crawl posts until the crawling threshold is reached or there is nothing
        left to crawl.

        The seed links are extracted first. The queue is then drained into a
        thread pool; whenever the queue is empty the loop waits for the running
        workers, because they may add new links.
        """
        self.extract_top_links()
        futures = []

        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            while True:
                with self.add_list_lock:
                    if len(self.crawled) >= self.crawling_threshold:
                        break
                    URL = self.not_crawled.pop(0) if self.not_crawled else None

                if URL is not None:
                    futures.append(executor.submit(self.crawl_page_info, URL))
                    continue

                if not futures:
                    # Empty queue and no worker that could refill it.
                    break

                wait(futures)
                futures = []

    def crawl_page_info(self, URL):
        """
        Main logic of the crawler. It crawls one post, stores the extracted
        information and queues the related links of the post.

        A URL whose request failed is put back on the queue at most
        ``max_retries`` times.

        Parameters
        ----------
        URL: str
            The URL of the site
        Returns
        ----------
        dict or None
            The stored page, or None if nothing was stored
        """
        page_id = self.get_id_from_URL(URL)

        with self.add_list_lock:
            if page_id in self.crawled_ids or len(self.crawled) >= self.crawling_threshold:
                return None

        res = self.crawl(URL)
        time.sleep(self.request_delay)

        if res is None:
            with self.add_list_lock:
                attempts = self._failed_attempts.get(page_id, 0) + 1
                self._failed_attempts[page_id] = attempts
                # The id stays in added_ids either way, so a URL that was
                # given up on is not queued again through a related link.
                if attempts <= self.max_retries:
                    self.not_crawled.append(URL)
            return None

        page_instance = self.get_page_instance()
        self.extract_movie_info(res, page_instance, URL)

        # 'with' releases the lock even if one of the updates raises.
        with self.add_list_lock:
            # Check again: another worker may have stored this page or filled
            # the quota while this one was busy with the request.
            if page_id in self.crawled_ids or len(self.crawled) >= self.crawling_threshold:
                return None

            self.crawled.append(page_instance)
            self.crawled_ids.append(page_id)
            if page_id in self.added_ids:
                self.added_ids.remove(page_id)

            for link in page_instance['related_links']:
                self._enqueue(link)

        return page_instance

    def extract_movie_info(self, res, page, URL):
        """
        Extract the information of a post from the response and save it in the
        page instance.

        Parameters
        ----------
        res: requests.models.Response
            The response of the get request
        page: dict
            The instance of the page (see ``get_page_instance``)
        URL: str
            The URL of the site
        """
        main_soup = BeautifulSoup(res.content, 'html.parser')

        page['id'] = self.get_id_from_URL(URL)
        page['title'] = self.get_title(main_soup)
        page['date'] = self.get_date(main_soup)
        page['#replies'] = self.get_num_replies(main_soup)
        page['post'] = self.get_post(main_soup)
        page['comments'] = self.get_comments(main_soup)
        page['related_links'] = self.get_related_links(main_soup)

    # NOTE(review): the class names in the selectors below look build-generated;
    # presumably they need updating whenever the site markup changes.

    def get_title(self, soup):
        """
        Get the text of the post title, or None if the heading is missing.
        """
        heading = soup.find('h1', class_='sc-20504436-1 leApID')
        if heading is None:
            return None
        return heading.text

    def get_date(self, soup):
        """
        Get the text of the first ``time`` element in the post header,
        or None if it is missing.
        """
        header = soup.find('header', class_='sc-20504436-0 KVPgp post-header')
        if header is None:
            return None
        time_tag = header.find('time')
        if time_tag is None:
            return None
        return time_tag.text

    def get_num_replies(self, soup):
        """
        Get the number of replies: the leading integer of the third link in the
        post header. Returns 0 if it cannot be read.
        """
        header = soup.find('header', class_='sc-20504436-0 KVPgp post-header')
        if header is None:
            return 0
        anchors = header.find_all('a')
        try:
            return int(anchors[2].text.split(' ')[0])
        except (IndexError, ValueError):
            return 0

    def get_post(self, soup):
        """
        Get the post body: the text of every paragraph, each followed by a newline.
        """
        lines = []
        for body in soup.find_all('div', class_='sc-eceb18a8-1 gdVBMy js-post-body'):
            lines.extend(paragraph.text + '\n' for paragraph in body.find_all('p'))
        return ''.join(lines)

    def get_comments(self, soup):
        """
        Get the comments of the post.

        Returns
        ----------
        List[str]
            One string per comment block, built like the post body
        """
        comments = []
        for block in soup.find_all('div', class_='sc-4221636f-0 bYgtgn'):
            comments.append(''.join(paragraph.text + '\n' for paragraph in block.find_all('p')))
        return comments

    def get_related_links(self, soup):
        """
        Get the links of the related posts shown on the page

        Parameters
        ----------
        soup: BeautifulSoup
            The soup of the page
        Returns
        ----------
        List[str]
            The related links of the post
        """
        return self._extract_post_links(soup, 'sc-b58d9291-1 dNPdhv')



# In [3]:
# Run the crawl for up to 20000 posts and save the result as json.
crawler = Crawler(crawling_threshold=20000)
crawler.login()
try:
    crawler.start_crawling()
except BaseException:
    # The crawl failed or was interrupted: keep what was collected so far, but
    # do not overwrite earlier output files with empty lists.
    if crawler.crawled or crawler.not_crawled:
        crawler.write_to_file_as_json()
    raise
crawler.write_to_file_as_json()