In [1]:
from requests import get
from bs4 import BeautifulSoup
from collections import deque
from concurrent.futures import ThreadPoolExecutor, wait
from threading import Lock
import json

import time
import random

In [None]:
class Crawler:
    """
    Multi-threaded crawler for the TuDiabetes forum (https://forum.tudiabetes.org/).

    Put your own user agent in the headers below.
    """
    # Default HTTP headers sent with every request. Note that crawl() substitutes
    # a randomly chosen 'User-Agent' on every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US'
    }
    # Landing page whose topic links seed the crawl (see extract_top_links).
    top_URL = 'https://forum.tudiabetes.org/'
    def __init__(self, crawling_threshold=1000):
        """
        Initialize the crawler

        Parameters
        ----------
        crawling_threshold: int
            The number of pages to crawl
        """
        
        self.crawling_threshold = crawling_threshold
        self.not_crawled = []
        self.crawled = []
        self.crawled_ids = []
        self.added_ids = []
        self.add_list_lock = Lock()
        self.add_queue_lock = Lock()

    def get_id_from_URL(self, URL):
        """
        Get the id from the URL of the site. The id is what comes exactly after title.
        for example the id for the movie https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1 is tt0111161.

        Parameters
        ----------
        URL: str
            The URL of the site
        Returns
        ----------
        str
            The id of the site
        """
        
        return '/'.join(URL.split('/')[3:])

    def write_to_file_as_json(self):
        """
        Save the crawled files into json
        """
        with open('TU_crawled.json', 'w') as f:
            f.write(json.dumps(self.crawled))
            f.close()

        with open('TU_not_crawled.json', 'w') as f:
            f.write(json.dumps(self.not_crawled))
            f.close()

    def read_from_file_as_json(self):
        """
        Read the crawled files from json
        """
        
        with open('TU_crawled.json', 'r') as f:
            raw_data_0 = f.read()
            f.close()

        self.crawled = json.loads(raw_data_0)

        with open('TU_not_crawled.json', 'w') as f:
            raw_data_1 = f.read()
            f.close()

        self.not_crawled = json.loads(raw_data_1)

        self.added_ids = [self.get_id_from_URL(link) for link in self.not_crawled]

    def crawl(self, URL, timeout=30):
        """
        Make a get request to the URL and return the response

        A User-Agent is picked at random from a fixed list for every request.
        It is applied to a per-request copy of Crawler.headers, so the shared
        class-level dict is not modified by concurrent worker threads.

        Parameters
        ----------
        URL: str
            The URL of the site
        timeout: float
            Seconds to wait for the server before giving up on the request
        Returns
        ----------
        requests.models.Response or None
            The response of the get request, or None when the request failed
            (network error, timeout) or the status code is not 200
        """
        # Imported here so the method does not depend on a new top-of-file import.
        from requests import RequestException

        # Each entry needs its trailing comma: without it Python concatenates the
        # adjacent string literals into one single (invalid) user agent.
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        ]
        request_headers = dict(Crawler.headers)
        request_headers['User-Agent'] = random.choice(user_agents)

        try:
            # Without a timeout a stalled connection would block the worker forever.
            res = get(URL, headers=request_headers, timeout=timeout)
        except RequestException:
            # Treated like a non-200 answer so the caller can re-queue the URL.
            return None

        if res.status_code == 200:
            return res
        return None


    def extract_top_links(self):
        """
        Seed the crawler with the topic links found on the forum landing page.

        Fetches self.top_URL, collects every anchor whose href points to a
        topic (top_URL followed by 't/'), and appends each URL not already
        queued to self.not_crawled, recording its id in self.added_ids.

        Raises
        ----------
        RuntimeError
            If the landing page could not be fetched
        """
        res = self.crawl(self.top_URL)
        if res is None:
            # crawl() returns None on failure; fail with a clear message instead
            # of an AttributeError on res.content.
            raise RuntimeError('Could not fetch the seed page ' + self.top_URL)

        top_soup = BeautifulSoup(res.content, 'html.parser')
        # Relies on top_URL ending with '/', as the class attribute does.
        topic_prefix = self.top_URL + 't/'
        for link in top_soup.select('a[href]'):
            href = link['href']
            if not href.startswith(topic_prefix):
                continue
            topic_id = self.get_id_from_URL(href)
            new_url = self.top_URL + topic_id
            if new_url not in self.not_crawled:
                self.not_crawled.append(new_url)
                self.added_ids.append(topic_id)


    def get_page_instance(self):
        return {
            'id': None,
            'title': None,
            'category': None,
            'chat_list': None,
            'related_links': None
        }

    def start_crawling(self):
        """
        Start crawling the movies until the crawling threshold is reached.
    
        ThreadPoolExecutor is used to make the crawler faster by using multiple threads to crawl the pages.
        You are free to use it or not. If used, not to forget safe access to the shared resources.
        """

        

        self.extract_top_links()
        futures = []
        crawled_counter = 0

        with ThreadPoolExecutor(max_workers=20) as executor:
            while len(self.crawled) < self.crawling_threshold:

                self.add_list_lock.acquire()
                URL = self.not_crawled.pop(0)
                self.add_list_lock.release()
                futures.append(executor.submit(self.crawl_page_info, URL))
                if len(self.not_crawled) == 0:
                    wait(futures)
                    futures = []

    def crawl_page_info(self, URL):
        """
        Main Logic of the crawler. It crawls the page and extracts the information of the movie.
        Use related links of a movie to crawl more movies.

        Parameters
        ----------
        URL: str
            The URL of the site
        """
        

        if self.get_id_from_URL(URL) in self.crawled_ids or len(self.crawled) >= self.crawling_threshold:
          return

        page_id = self.get_id_from_URL(URL)

        res = self.crawl(URL)
        time.sleep(10)

        if (res is not None):
            page_instance = self.get_page_instance()
            self.extract_movie_info(res, page_instance, URL)

            self.add_list_lock.acquire()
            self.crawled.append(page_instance)
            self.added_ids.remove(page_id)
            self.crawled_ids.append(page_id)

            for link in page_instance['related_links']:
                self.not_crawled.append(link)
                self.added_ids.append(self.get_id_from_URL(link))

            self.add_list_lock.release()

            return page_instance
        else:

            self.add_list_lock.acquire()
            self.not_crawled.append(URL)
            self.add_list_lock.release()

    def extract_movie_info(self, res, page, URL):
        """
        Parse a fetched page and fill the given page record in place.

        Parameters
        ----------
        res: requests.models.Response
            The response of the get request
        page: dict
            The page record to fill (see get_page_instance)
        URL: str
            The URL of the site
        """
        parsed = BeautifulSoup(res.content, 'html.parser')

        page['id'] = self.get_id_from_URL(URL)
        # The title is the second-to-last '/'-separated segment of the URL.
        page['title'] = URL.rsplit('/', 2)[-2]

        # The remaining fields are each produced by a dedicated extractor,
        # applied to the parsed document in this order.
        extractors = (
            ('category', self.get_category),
            ('chat_list', self.get_chatlist),
            ('related_links', self.get_related_links),
        )
        for field, extractor in extractors:
            page[field] = extractor(parsed)


    def get_chatlist(self, soup):
      chat_elements = soup.find_all(class_ = 'post')
      chats = []
      for chat in chat_elements:
        s = chat.find_all('p')
        text = ''
        for d in s:
          text = text +  d.text + '\n'
        chats.append(text)
      return chats

    def get_category(self, soup):
      spans = soup.find_all('span', class_ = 'category-name')
      return spans[0].get_text()

    def get_related_links(self, soup):
        """
        Get the related links of the movie from the More like this section of the page from the soup

        Parameters
        ----------
        soup: BeautifulSoup
            The soup of the page
        Returns
        ----------
        List[str]
            The related links of the movie
        """
        links = []
        link_elements = soup.select('a[href]')
        for link in link_elements:
          if link['href'].startswith('https://forum.tudiabetes.org/t'):
            links.append(link['href'])
        return links



In [3]:
# Run a crawl with a threshold of 20000 pages, then save the result as JSON.
crawler = Crawler(crawling_threshold=20000)
crawler.start_crawling()  # seeds from the forum landing page and crawls on worker threads
crawler.write_to_file_as_json()  # writes TU_crawled.json and TU_not_crawled.json