In [4]:
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup as bs

def get_article(query):
    """Download one page and return its ``<section class="sidebar_1">`` element.

    Parameters
    ----------
    query : str
        URL of the page to download.

    Returns
    -------
    The first ``section`` tag whose class is ``sidebar_1``, or ``None`` when
    the page contains no such section.

    Raises
    ------
    Whatever ``urllib.request.urlopen`` raises for an unreachable or invalid URL.
    """
    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the markup has been read, instead of leaking the connection.
    with urllib.request.urlopen(query) as response:
        page = bs(response, 'html.parser')
    return page.find("section", attrs={"class": "sidebar_1"})

class Article(object):
    """Pull the interesting fields out of one parsed article section.

    ``article_soup`` is the object returned by ``get_article`` (a parsed tag,
    or ``None`` when the page had no matching section).  Every field is
    extracted eagerly in ``__init__``; a field that cannot be found is stored
    as ``None`` (or as a dict of ``None`` values for ``imgs``) instead of
    raising.

    Attributes
    ----------
    article  : the parsed section that was passed in
    author   : text of the ``<strong>`` inside ``<p class="author_mail">``, or None
    datetime : text of the first ``<span class="time left">``, or None
    text     : concatenated text of all ``<p class="Normal">`` paragraphs, or None
    synopsis : text of the first ``<h2>``, or None
    imgs     : dict ``{'alt': [...], 'src': [...]}`` with parallel lists
    videos   : list of ``data-src`` values of the ``<video>`` tags, or None
    """

    def __init__(self, article_soup):
        self.article = article_soup
        self.author = self.get_author()
        self.datetime = self.get_datetime()
        self.text = self.get_text()
        self.synopsis = self.get_synopsis()
        self.imgs = self.get_imgs()  # dict with parallel 'alt' and 'src' lists
        self.videos = self.get_videos()

    def get_author(self):
        """Return the author name, or None when the author paragraph is absent."""
        try:
            # find() yields a single tag (or None).  The previous find_all()
            # returned a list, so ``.strong`` always failed and the author
            # was never extracted.
            author_tag = self.article.find('p', attrs={'class': 'author_mail'})
            return author_tag.strong.get_text()
        except AttributeError:
            return None

    def get_datetime(self):
        """Return the publication time text, or None when it is absent."""
        try:
            # find() returns None for "no match", which is handled below;
            # indexing find_all(...)[0] raised an uncaught IndexError instead.
            return self.article.find('span', attrs={'class': 'time left'}).get_text()
        except AttributeError:
            return None

    def get_text(self):
        """Return the body text with tabs/newlines removed and NBSPs turned into spaces."""
        try:
            paragraphs = self.article.find_all('p', attrs={'class': 'Normal'})
            article_text = ''.join(paragraph.get_text() for paragraph in paragraphs)
            return article_text.replace('\t', '').replace(u'\xa0', u' ').replace('\n', '')
        except AttributeError:
            return None

    def get_synopsis(self):
        """Return the text of the first ``<h2>``, or None when there is none."""
        try:
            return self.article.h2.get_text()
        except AttributeError:
            return None

    def get_imgs(self):
        """Return ``{'alt': [...], 'src': [...]}`` for every ``<img>`` tag.

        Both values are None when the section is missing or when any image
        lacks an ``alt`` or ``src`` attribute.
        """
        try:
            images = self.article.find_all('img')  # walk the tree only once
            return {'alt': [img['alt'] for img in images],
                    'src': [img['src'] for img in images]}
        except (AttributeError, KeyError):
            return {'alt': None, 'src': None}

    def get_videos(self):
        """Return the ``data-src`` of every ``<video>`` tag, or None on failure."""
        try:
            return [video['data-src'] for video in self.article.find_all('video')]
        except (AttributeError, KeyError):
            return None
    

class Articles(object):
    """Download and collect the content of every article in a listing.

    Parameters
    ----------
    articles_list : object exposing ``link``, ``title`` and ``id`` as
        equally long iterables (the notebook passes a pandas DataFrame).
    output_msg : bool, default False
        When True, print a short summary after each article is processed.

    The collected values are stored column-wise in parallel lists
    (``titles``, ``links``, ``ids``, ``authors``, ``datetime``, ``text``,
    ``synopses``, ``img_alts``, ``img_links``, ``video_links``).
    """

    def __init__(self, articles_list, output_msg = False):
        self.articles_list = articles_list
        self.show_output_msg = output_msg
        self._reset_content()

    def _reset_content(self):
        """Start every output column from a fresh, empty list."""
        self.titles = []
        self.links = []
        self.ids = []
        self.authors = []
        self.datetime = []
        self.text = []
        self.synopses = []
        self.img_alts = []
        self.img_links = []
        self.video_links = []

    def generate_articles_content(self):
        """Fetch every listed article and fill the column lists.

        Each call downloads the pages again (via ``get_article``) and rebuilds
        the columns from scratch, so calling it repeatedly no longer
        duplicates rows.

        Returns
        -------
        bool
            True when all columns ended up with the same length.
        """
        self._reset_content()

        listing = self.articles_list
        # Walk the rows directly rather than searching the listing for each
        # link: linear instead of quadratic, and correct for repeated links.
        for link, title, article_id in zip(listing.link, listing.title, listing.id):
            current_article = Article(get_article(link))

            self.titles.append(title)
            self.links.append(link)
            self.ids.append(article_id)
            self.authors.append(current_article.author)
            self.datetime.append(current_article.datetime)
            self.text.append(current_article.text)
            self.synopses.append(current_article.synopsis)
            self.img_alts.append(current_article.imgs['alt'])
            self.img_links.append(current_article.imgs['src'])
            self.video_links.append(current_article.videos)

            if self.show_output_msg:
                self.output_msg_update_article_info(current_article, title, link, article_id)

        columns = (self.titles, self.links, self.ids, self.authors, self.datetime,
                   self.text, self.synopses, self.img_alts, self.img_links,
                   self.video_links)
        return len({len(column) for column in columns}) == 1

    def generate_articles_content_dict(self):
        """Run ``generate_articles_content`` and return the columns as a dict.

        Returns an empty dict when the columns are not all the same length.
        """
        if not self.generate_articles_content():
            return {}

        return {'title': self.titles,  # from the listing passed to __init__
                'link': self.links,  # from the listing passed to __init__
                'id': self.ids,  # from the listing passed to __init__
                'author': self.authors,  # attribute of the Article object
                'datetime': self.datetime,
                'text': self.text,
                'synopsis': self.synopses,
                'img_alt': self.img_alts,
                'img_link': self.img_links,
                'video_link': self.video_links}

    def output_msg_update_article_info(self, this_article, this_title, this_link, this_id):
        """Print a banner with the id, then the title, author and link of one article."""
        print("==================================================={}===========================================================".format(this_id))
        print("Title: {}".format(this_title))
        print("Author: {}".format(this_article.author))
        print("Link: {}\n".format(this_link))
    

In [None]:

# Listing-page URL template; the single `{}` placeholder is filled with the page number
# when the pages are fetched further down in this cell.
# NOTE(review): fromdate/todate look like Unix timestamps (seconds) bounding the listing,
# and cateid/allcate like a site category id — confirm before changing them.
query_str = 'https://vnexpress.net/category/day/page/{}.html?cateid=1001002&fromdate=1529964000&todate=1531517760&allcate=1001002|1001002|'
# Alternative listing URL, currently disabled:
#query_str = "https://vnexpress.net/chien-dich-giai-cuu/tag-810752-1.html"

def get_articles(query):
    """Download one listing page and return its article teasers.

    Parameters
    ----------
    query : str
        URL of the listing page.

    Returns
    -------
    list
        Every ``<article class="list_news">`` tag found inside
        ``<section class="sidebar_1">``; an empty list when the page has no
        such section.

    Raises
    ------
    Whatever ``urllib.request.urlopen`` raises for an unreachable or invalid URL.
    """
    # Close the HTTP response as soon as the markup has been parsed.
    with urllib.request.urlopen(query) as response:
        page = bs(response, 'html.parser')

    listing = page.find("section", attrs={"class": "sidebar_1"})
    if listing is None:
        # A page without the listing section used to raise AttributeError;
        # it now simply contributes no articles.
        return []
    return listing.find_all("article", attrs={"class": "list_news"})

def distribute_articles(articles, return_type='df'):
    """Turn article teaser tags into a table of title, link, description and id.

    Parameters
    ----------
    articles : iterable
        Teaser tags; each must expose an ``a`` child carrying ``href`` and
        may expose ``img`` (with ``alt``) and ``h4`` children.
    return_type : {'df', 'dict'}, default 'df'
        ``'df'`` returns a pandas DataFrame, ``'dict'`` a dict of lists.

    Returns
    -------
    pandas.DataFrame or dict
        Columns/keys ``title``, ``link``, ``description`` and ``id``.

    Raises
    ------
    ValueError
        If ``return_type`` is neither ``'df'`` nor ``'dict'`` (this used to
        return None silently).
    """
    if return_type not in ('df', 'dict'):
        raise ValueError(
            "return_type must be 'df' or 'dict', got {!r}".format(return_type))

    article_dict = {'title': [],
                    'link': [],
                    'description': [],
                    'id': []}
    html_suffix = '.html'

    for article in articles:
        anchor = article.a
        link = anchor.get('href')

        # Prefer the image alt text; fall back to the anchor's title when the
        # teaser has no image (or the image has no alt text).
        image = article.img
        title = image.get('alt') if image is not None else None
        if not title:
            title = anchor.get('title')

        # A teaser without an <h4> gets an empty description instead of crashing.
        heading = article.h4
        description = heading.get_text() if heading is not None else ''

        # The id is the last dash-separated chunk of the link; strip the
        # extension only when it really is ".html".
        article_id = link.split('-')[-1]
        if article_id.endswith(html_suffix):
            article_id = article_id[:-len(html_suffix)]

        article_dict['title'].append(title)
        article_dict['link'].append(link)
        article_dict['description'].append(description)
        article_dict['id'].append(article_id)

    if return_type == 'df':
        return pd.DataFrame(data=article_dict)
    return article_dict
    
# Build the URL of listing pages 1..22, scrape each one into a DataFrame,
# then stack the per-page frames into one table with a fresh 0..n-1 index.
page_urls = (query_str.format(str(page_no)) for page_no in range(1, 23))
df_list = [distribute_articles(get_articles(url)) for url in page_urls]

result = pd.concat(df_list, ignore_index=True)

# Keep only the articles whose title contains at least one of these keywords.
kws = ['Thái Lan', 'đội bóng', 'Đội bóng', 'hang', 'mắc kẹt', 'thiếu niên']

kw_filtered = []
for kw in kws:
    # Literal, case-sensitive substring match; rows with a missing title
    # never match (they used to raise TypeError in `kw in title`).
    hits = result[result.title.str.contains(kw, regex=False, na=False)]
    # Skip keywords without any hit: concatenating nothing raised
    # "No objects to concatenate" in the previous per-title loop.
    if not hits.empty:
        kw_filtered.append(hits)

if kw_filtered:
    # A title matching several keywords appears once per keyword, so drop the
    # repeated rows before renumbering the index.
    filtered_result = pd.concat(kw_filtered).drop_duplicates().reset_index(drop=True)
else:
    # No keyword matched anything: keep the column layout, with zero rows.
    filtered_result = result.iloc[0:0].reset_index(drop=True)

In [None]:
# Empty column template for the collected article data.
# The first key was misspelled 'tiltle'; it is now 'title', matching the key
# produced by Articles.generate_articles_content_dict().
art_df = {'title': [],  # from filtered_result
          'link': [],  # from filtered_result
          'id': [],  # from filtered_result
          'author': [],  # attribute of the Article object
          'datetime': [],
          'text': [],
          'synopsis': [],
          'img_alt': [],
          'img_link': [],
          'video_alt': [],
          'video_link': []}

# Download every filtered article and collect its fields column-wise.
articles_data = Articles(filtered_result)

articles_data_dict = articles_data.generate_articles_content_dict()

In [5]:
# Fetch the sidebar_1 section of one specific page for manual inspection.
test = get_article('https://vnexpress.net/infographics/tu-lieu/hanh-trinh-gian-nan-thoat-khoi-hang-cua-doi-bong-nhi-thai-lan-mac-ket-3775068.html')

In [6]:
# Look for author paragraphs in the fetched section; the recorded output is an empty list.
test.find_all('p', attrs = {'class': 'author_mail'})

[]

In [10]:
# Display the fetched section to inspect its markup.
test

<section class="sidebar_1">
<!-- BOX SHARE BOTTOM -->
<div class="social_share width_common" id="social_like">
<div class="item_social font_icon" data-component="true" data-component-objectid="3775068" data-component-siteid="1000000" data-component-type="like" data-objecttype="1"></div> <div class="right luu_tin myvne_save_for_later" data-article-id="3775068" data-token="40035e52601f8c3e1672fdeda44c16ad"></div>
</div>
<!-- END BOX SHARE BOTTOM -->
<!-- BOX TIN KHAC VA LINK CHEO -->
<div class="box_bottom_detail width_common clearfix" id="box_topic_detail">
<h4 class="header_toppic">
<a href="https://vnexpress.net/giai-cuu-doi-bong-nhi-thai-lan-mac-ket-trong-hang/topic-23121.html"> Giải cứu đội bóng nhí Thái Lan mắc kẹt trong hang </a>
</h4>
<ul class="list_title">
<li>
<a href="https://vnexpress.net/tin-tuc/the-gioi/phan-tich/18-ngay-thuc-hien-nhiem-vu-bat-kha-thi-giai-cuu-cac-thieu-nien-thai-lan-3776032.html" title="18 ngày thực hiện 'nhiệm vụ bất khả thi' giải cứu các thiếu niên Thái