In [4]:
import requests
from bs4 import BeautifulSoup
from itertools import chain

class PostMetaDataScraper:
    """Collect metadata for dating posts listed on the 1point3acres forum.

    The scraper builds the URLs of the forum listing pages, downloads each
    page, extracts the per-thread ``<tbody>`` elements and converts every
    thread that carries the dating-post type link into a small dictionary.
    """

    # Listing URL (already filtered by sortid=195); the page index is appended.
    # example link: 'http://www.1point3acres.com/bbs/forum-80-1.html'
    MAIN_PAGE_PREFIX = 'http://www.1point3acres.com/bbs/forum.php?mod=forumdisplay&fid=80&sortid=195&filter=sortid&sortid=195&page='

    # A thread is treated as a dating post only if it contains a link to this URL.
    DATING_TYPE_HREF = 'http://www.1point3acres.com/bbs/forum.php?mod=forumdisplay&fid=80&filter=sortid&sortid=195'

    def getLinksForMainPage(self, num_pages=9):
        """Build the URLs of the forum listing pages.

        Args:
            num_pages: Number of listing pages to generate links for.
                Defaults to 9, the value that used to be hard-coded.

        Returns:
            A list of ``num_pages`` URL strings with page indices
            ``0 .. num_pages - 1`` appended to the listing prefix.
        """
        # NOTE(review): numbering starts at page=0 while the example URL above
        # is 1-based; if the forum serves page=0 as page 1, the first page is
        # fetched twice and its posts are duplicated -- confirm.
        return [self.MAIN_PAGE_PREFIX + str(page_index) for page_index in range(num_pages)]

    def getPostsSoupForMainPage(self, main_page_links, timeout=30):
        """Download every listing page and return its thread elements.

        Args:
            main_page_links: Iterable of listing page URLs.
            timeout: Seconds to wait for each HTTP request. Without a timeout
                ``requests`` may block indefinitely on a stalled connection.

        Returns:
            A flat list with one ``<tbody>`` element per thread whose ``id``
            starts with ``'normalthread_'``, in page order.

        Raises:
            requests.exceptions.RequestException: If a page cannot be fetched,
                including when ``timeout`` is exceeded.
        """
        posts = []
        for link in main_page_links:
            page = requests.get(link, timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')
            # The id filter receives None for <tbody> elements without an id,
            # hence the truthiness check before startswith().
            posts.extend(
                soup.find_all("tbody", id=lambda x: x and x.startswith('normalthread_'))
            )
        return posts

    def getPostMetaDataFromSoup(self, soup):
        """Convert one thread ``<tbody>`` element into a metadata dictionary.

        Args:
            soup: The ``<tbody>`` element of a single thread.

        Returns:
            ``None`` if the thread is not a dating post (it has no link to
            ``DATING_TYPE_HREF``); otherwise a dict with the keys
            ``post_creation_date``, ``post_link``, ``post_comment_num`` (int)
            and ``post_title``.

        Raises:
            AttributeError, IndexError, TypeError, ValueError: If a dating
                post does not have the expected markup (missing cell, span or
                link, or a non-numeric comment count).
        """
        if soup.find('a', {'href': self.DATING_TYPE_HREF}) is None:
            return None

        # The creation date is read from the <span> inside the second <td>.
        post_creation_date = soup.find_all('td')[1].find('span').text
        # The first <a> of the thread element is used as the post link.
        post_link = soup.find('a')['href']
        post_comment_num = int(soup.find('td', {'class': 'num'}).find('a').text)
        post_title = soup.find('a', {'class': 's xst'}).text

        return {
            'post_creation_date': post_creation_date,
            'post_link': post_link,
            'post_comment_num': post_comment_num,
            'post_title': post_title,
        }

    def run(self, num_pages=9):
        """Scrape the listing pages and return metadata of all dating posts.

        Args:
            num_pages: Number of listing pages to scrape (default 9).

        Returns:
            A list of metadata dictionaries, one per dating post, in the order
            the posts appear on the pages. Non-dating posts are skipped.
        """
        main_page_links = self.getLinksForMainPage(num_pages)
        posts_soup = self.getPostsSoupForMainPage(main_page_links)
        posts_metadata = []
        for post_soup in posts_soup:
            post_metadata = self.getPostMetaDataFromSoup(post_soup)
            if post_metadata is not None:
                posts_metadata.append(post_metadata)
        return posts_metadata
    

In [5]:
# Scrape the forum listing pages and collect the metadata of every dating post.
# run() downloads the pages, so this cell performs network requests.
post_metadata_scraper = PostMetaDataScraper()
posts_metadata = post_metadata_scraper.run()

In [6]:
# Number of posts that passed the dating-post filter in run().
len(posts_metadata)

450

In [7]:
# Execute the FileManager notebook in this kernel; it is expected to define the
# FileManager class used below (defined outside this notebook -- confirm).
%run FileManager.ipynb
file_manager = FileManager()
# Hand the scraped metadata to FileManager under the name 'posts_metadata'.
# The storage format and location are decided by saveToFile, which is not visible here.
file_manager.saveToFile(posts_metadata, 'posts_metadata')