In [1]:
import requests
from bs4 import BeautifulSoup
from itertools import chain
import sys

class PostDetailedDataScraper:
    """Scrape a single forum post page and extract structured dating-post fields.

    The scraper downloads the post page, locates the post body and the
    "dating type" label, and derives the message text, the poster's gender,
    the gender being sought and an optional location.
    """

    # Marker text that can appear inside the post body; when it is present the
    # segment following its first occurrence is used as the message text.
    REGISTRATION_MARKER = u'注册获取更多干货'

    def getSoupForPost(self, post_link, timeout=30):
        """Download ``post_link`` and return the element holding the post.

        Args:
            post_link: URL of the post page.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a single stalled connection would block the whole
                scraping run indefinitely.

        Returns:
            The first ``<div>`` whose ``id`` starts with ``'post_'``, or
            ``None`` when the page contains no such element.

        Raises:
            requests.exceptions.RequestException: on network failure/timeout.
        """
        page = requests.get(post_link, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')
        return soup.find("div", id=lambda x: x and x.startswith('post_'))

    def getPostDetailFromSoup(self, soup):
        """Extract the detail fields from a post element.

        Args:
            soup: Element returned by ``getSoupForPost``.

        Returns:
            A dict with keys ``'message_text'``, ``'source_gender'``,
            ``'target_gender'`` and ``'location'``; or ``None`` when source
            and target gender are equal (those posts are filtered out).

        Raises:
            AttributeError: if ``soup`` is ``None`` or an expected element is
                missing from it.
            IndexError: if the dating-type label does not contain '找'.
        """
        post_message = soup.find("td", id=lambda x: x and x.startswith('postmessage_'))

        # Keep the segment right after the marker when the marker is present,
        # otherwise keep the whole text.
        segments = post_message.text.split(self.REGISTRATION_MARKER)
        message_text = segments[1] if len(segments) > 1 else segments[0]
        # Only '\n' is removed; any '\r' characters in the page text are kept.
        message_text = message_text.strip().replace('\n', '')

        dating_type_text = soup.find("span", {'style': 'margin-top: 3px'}).text
        source_gender, target_gender, location = \
            self.getSourceAndTargetGendersFromDatingTypeText(dating_type_text)

        # filter out same gender dating, volume too small
        if source_gender == target_gender:
            return None

        return {
            'message_text': message_text,
            'source_gender': source_gender,
            'target_gender': target_gender,
            'location': location,
        }

    def getSourceAndTargetGendersFromDatingTypeText(self, dating_type_text):
        """Parse a label of the form ``<source>找<target>[@<location>]``.

        A side is classified as ``'Female'`` when it contains '女' and as
        ``'Male'`` otherwise.

        Args:
            dating_type_text: Raw text of the dating-type label.

        Returns:
            Tuple ``(source_gender, target_gender, location)``. ``location``
            is the text after the first '@' (not stripped), or ``None`` when
            the label has no '@'.

        Raises:
            IndexError: if ``dating_type_text`` does not contain '找'.
        """
        dating_type_texts = dating_type_text.split('找')
        source_gender = 'Female' if '女' in dating_type_texts[0] else 'Male'
        target_gender = 'Female' if '女' in dating_type_texts[1] else 'Male'

        location_texts = dating_type_text.split('@')
        location = location_texts[1] if len(location_texts) > 1 else None

        return (source_gender, target_gender, location)

    def getPostDetailFromPostLink(self, post_link):
        """Download ``post_link`` and return its detail dict (or ``None``).

        See ``getPostDetailFromSoup`` for the return value and the errors
        that can be raised.
        """
        soup = self.getSoupForPost(post_link)
        return self.getPostDetailFromSoup(soup)

    # helper function to continue on from PostMetaDataScraper
    # and combine the detail data with the metadata
    def run(self, posts_metadata):
        """Scrape details for every metadata entry and merge both dicts.

        Args:
            posts_metadata: Iterable of dicts, each with a ``'post_link'`` key.

        Returns:
            List of dicts combining each metadata dict with its detail dict
            (detail keys win on collision). Posts that fail to download/parse
            and posts filtered out by ``getPostDetailFromSoup`` are omitted.

        Progress is printed every 10 posts; the link of each filtered-out
        post is printed to stdout and each failed post is reported on stderr.
        """
        posts_data = []
        for i, post_metadata in enumerate(posts_metadata):

            if i % 10 == 0:
                print('starting for # ' + str(i) + ' post')

            post_link = post_metadata['post_link']

            try:
                post_detail = self.getPostDetailFromPostLink(post_link)
            except Exception as error:
                # Best-effort scraping: skip posts that cannot be fetched or
                # parsed. Only Exception is caught so that KeyboardInterrupt /
                # SystemExit still stop a long-running scrape.
                print('skipping {}: {!r}'.format(post_link, error), file=sys.stderr)
                continue

            if post_detail is None:
                print(post_link)
                continue
            posts_data.append({**post_metadata, **post_detail})
        return posts_data

In [2]:
# Execute FileManager.ipynb in this kernel; it is expected to define the
# FileManager class instantiated below.
%run FileManager.ipynb
file_manager = FileManager()
# Read previously stored data by name.
posts_metadata = file_manager.readFromFile('posts_metadata')
# NOTE(review): the following cell reassigns posts_data with freshly scraped
# results, so this load only matters when that scraping cell is skipped.
posts_data = file_manager.readFromFile('posts_data')

In [3]:
# Scrape the detail page of every entry in posts_metadata and merge the
# detail fields into the metadata. This performs one HTTP request per post.
detail_scraper = PostDetailedDataScraper()
posts_data = detail_scraper.run(posts_metadata)

starting for # 0 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=426487&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=407479&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
starting for # 10 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=432963&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=218654&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
starting for # 20 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=299693&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=311517&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
starting for # 30 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=334965&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sorti

http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=8140&extra=page%3D4%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
starting for # 250 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=311618&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=273639&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=276343&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=101050&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=201322&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=173563&extra=page%3D5%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=

http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=275206&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=172731&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=204959&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=229915&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=218700&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=217477&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=5690&extra=page%3D6%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=202709&extra=page%3D6%26

http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=100117&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=98263&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=191970&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=44365&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=93578&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
starting for # 410 post
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=59865&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=79971&extra=page%3D8%26filter%3Dsortid%26sortid%3D195%26sortid%3D195
http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=974

In [66]:
# Store posts_data through FileManager under the name 'posts_data'
# (presumably the same name readFromFile('posts_data') loads - verify in FileManager).
file_manager.saveToFile(posts_data, 'posts_data')

In [4]:
# Number of records currently held in posts_data.
len(posts_data)

216

In [11]:
# Inspect one sample record (index 2) to check the merged metadata and detail fields.
posts_data[2]

{'location': ' 尔湾orange 地区',
 'message_text': 'x大家好，\r本人女IT，现在南加州尔湾地区，orange county地区，在大公司稳定工作，来美已久，有身份。开始考虑个人问题。嘻嘻，长相75-85，个人自己评判，身高165公分，体重110，CS硕士毕业，所以现在也从事本职行业。 个性友善随和，为人真诚踏实，老家是长江三角洲地区。 \r希望找个。。。。。。\r1.年龄在29-39左右，但是如果心里年龄成熟，就不太care了。：）\r2. 身材偏瘦，不要太胖，(sorry，只是个人偏好而已，没有别的意思，opps.)\r3,身高176以上，聪明，最好也是理工科的或者码农吧。人家说同行是冤家，不过我倒不这么认为，哈哈。\r4.工作稳定，期望长期留美，至少5年内打算在美国吧，以后的事情谁也不好说。啧啧。\r5.喜欢看书（option..lol）\r6.学历硕士、博士（特聪明的本科也可以，O(∩_∩)O哈哈~）\r随缘吧。缘分天注定，如果有想交朋友的，可以留言，我会加你的，平时工作有点小忙碌，不过偶也会尽快回复的。谢谢大家啦，围观的不要喷我。(#^.^#)',
 'post_comment_num': 8,
 'post_creation_date': '2018-7-9',
 'post_link': 'http://www.1point3acres.com/bbs/forum.php?mod=viewthread&tid=433264&extra=page%3D1%26filter%3Dsortid%26sortid%3D195%26sortid%3D195',
 'post_title': '尔湾OC地区真诚找男朋友',
 'source_gender': 'Female',
 'target_gender': 'Male'}