In [1]:
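# Notebook for experiments in gathering data: re-fetches the Hollywood Reporter articles listed in the errors CSV and fills in their text and metadata.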

In [8]:
import os
import re
import csv
import requests
import warnings
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm, tqdm_notebook
#warnings.filterwarnings('ignore')

In [3]:
# Short outlet name; also used for the output directory and output file names.
media_name = 'hollywood_reporter'
# Both inputs are read from the sibling "1-most-common-page-structure" folder.
_structure_dir = "../1-most-common-page-structure"
df_path = _structure_dir + "/mod_df/mod_hwr_df.csv"
errors_path = _structure_dir + "/errors/hwr_errors.csv"

In [4]:
# Load the article table and the table of URLs that failed earlier,
# report their row counts, then preview the failures.
df = pd.read_csv(df_path)
err_df = pd.read_csv(errors_path)
print(f"df size = {len(df)}")
print(f"err_df size = {len(err_df)}")
err_df.head()

df size = 101918
err_df size = 2561


Unnamed: 0,url,reason
0,https://www.hollywoodreporter.com/features/woo...,cannot get text
1,https://www.hollywoodreporter.com/news/directo...,cannot get meta
2,https://www.hollywoodreporter.com/features/dir...,cannot get text
3,https://www.hollywoodreporter.com/features/lor...,cannot get text
4,https://www.hollywoodreporter.com/features/jam...,cannot get text


In [5]:
# Create the per-outlet output directory.
# An already-existing directory is the normal case when the cell is re-run, so
# it is not reported; any other OS-level failure is printed with its cause
# instead of being hidden behind a bare `except`.
try:
    os.mkdir(media_name)
except FileExistsError:
    pass
except OSError as exc:
    print("cannot create dir", exc)

# `class` attribute strings of the <div> holding the article body, tried in order.
df_article_class_names = ["article__body js-fitvids-content", "longform__body-primary"]
# Suffixes of the `sailthru.<name>` <meta> tags whose content is collected.
df_meta_names = ['title', 'description', 'date', 'author', 'vertical', 'tags']

cannot create dir


In [6]:
# import requests
def get_html(url, timeout=30):
    """Download `url` and return the response body as text.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float or tuple, optional
        Passed straight to `requests.get`. Without a timeout a stalled
        connection would block the calling loop indefinitely.

    Returns
    -------
    str
        `response.text`. The HTTP status code is not checked, so the body of
        an error page is returned like any other page.

    Any exception raised by `requests.get` propagates to the caller.
    """
    # Browser-like User-Agent sent with every request.
    fake_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=fake_headers, timeout=timeout)
    return response.text

def get_html_test():
    """Manual smoke check: print the first article URL in `df`, then its HTML."""
    first_url = df['article_url'].iloc[0]
    print(first_url)
    page_html = get_html(first_url)
    print(page_html)

In [7]:
#from bs4 import BeautifulSoup
def get_article_text(html_text):
    """Extract the article body text from a page's HTML.

    Each class string in `df_article_class_names` is tried in order; the first
    <div> found with that class is used.

    Returns
    -------
    str or None
        The text of every <p> inside the matched div, each followed by a
        single space, or None when no div matches any class string.
    """
    soup = BeautifulSoup(html_text, "lxml")
    for class_name_str in df_article_class_names:
        body_div = soup.find("div", {"class": class_name_str})
        if body_div is not None:
            # One join instead of repeated `+=` on a growing string.
            return ''.join(p.text + ' ' for p in body_div.find_all("p"))
    return None

def get_article_meta(html_text, meta_names):
    """Collect the content of `sailthru.<name>` <meta> tags from a page.

    Parameters
    ----------
    html_text : str
        HTML of the page.
    meta_names : iterable of str
        Suffixes to look up; each is searched as `<meta name="sailthru.<name>">`.

    Returns
    -------
    dict
        Maps each name that had at least one matching tag to its content.
        When several tags share a name, their distinct contents are joined
        with ', ' in document order.

    Raises
    ------
    KeyError
        If a matching tag has no `content` attribute.
    """
    soup = BeautifulSoup(html_text, "lxml")
    ret = {}
    for name in meta_names:
        # Track the individual values seen so far. Comparing a new value only
        # against the already-joined string would let repeats slip through as
        # soon as two values have been joined.
        values = []
        for meta in soup.find_all("meta", {"name": "sailthru." + name}):
            content = meta['content']
            if content not in values:
                values.append(content)
        if values:
            ret[name] = ', '.join(values)
    return ret

def get_article_text_test():
    """Manual smoke check: fetch the first article in `df` and return its body text."""
    first_url = df['article_url'].iloc[0]
    return get_article_text(get_html(first_url))

def get_article_meta_test():
    """Manual smoke check: fetch the first article in `df` and return its meta dict."""
    wanted_names = ['title', 'description', 'date', 'author', 'vertical', 'tags']
    first_url = df['article_url'].iloc[0]
    return get_article_meta(get_html(first_url), wanted_names)

#get_article_text_test()

In [8]:
def parse_article_urls(df, err_df, meta_names):
    """Re-scrape every URL in `err_df` and fill in its metadata and text.

    For each URL the page is downloaded, its body text and `sailthru.*` meta
    values are extracted, the meta values found are written into the matching
    row of a copy of `df`, and the text is written to that row's `text_path`.

    Parameters
    ----------
    df : pandas.DataFrame
        Article table; must have `article_url` and `text_path` columns.
    err_df : pandas.DataFrame
        Table of URLs to retry; must have a `url` column.
    meta_names : iterable of str
        Meta names handed to `get_article_meta` and used as column names.

    Returns
    -------
    tuple
        `(modified_df, cannot_parsed)`: the updated copy of `df`, and a dict
        mapping each URL that failed to the stage that failed
        ('cannot get html', 'cannot get text', 'cannot get meta' or
        'cannot save data').
    """
    # Work on a copy so the caller's frame is not modified behind its back.
    modified_df = df.copy()
    cannot_parsed = {}  # {url : reason}
    # Letting tqdm drive the iteration advances the bar on every path,
    # including the early `continue`s.
    for row in tqdm(err_df.itertuples(), desc="rows", total=len(err_df)):
        url = row.url

        # part 0: get html text
        # `except Exception` (not a bare except) so the loop can still be
        # interrupted from the keyboard.
        try:
            html_text = get_html(url)
        except Exception:
            cannot_parsed[url] = 'cannot get html'
            continue

        # part 1: get article text
        try:
            article_text = get_article_text(html_text)
        except Exception:
            cannot_parsed[url] = 'cannot get text'
            continue
        if article_text is None:
            # No body container was found. Report it as a text failure here
            # rather than opening (and thereby truncating) the target file and
            # then failing on write(None) with a misleading reason.
            cannot_parsed[url] = 'cannot get text'
            continue

        # part 2: get article meta
        try:
            article_meta = get_article_meta(html_text, meta_names)
        except Exception:
            cannot_parsed[url] = 'cannot get meta'
            continue

        # part 3: save data
        try:
            # First row of `df` with this URL; IndexError (no such row) is
            # reported as a save failure.
            text_paths = df.loc[df.article_url == url, 'text_path']
            index = text_paths.index[0]
            text_path = text_paths.iloc[0]
            for name in meta_names:
                if name in article_meta:
                    modified_df.at[index, name] = article_meta[name]
            # `with` closes the file even when the write raises.
            with open(text_path, 'w') as file:
                file.write(article_text)
        except Exception:
            cannot_parsed[url] = 'cannot save data'
    return (modified_df, cannot_parsed)

In [9]:
# Retry every previously failed URL; one HTTP request is made per row of err_df.
mod_df, errors = parse_article_urls(df=df, err_df=err_df, meta_names=df_meta_names)

rows: 100%|██████████| 2561/2561 [38:18<00:00,  1.08s/it] 


In [10]:
# Display the {url: reason} dict of URLs that failed again during the retry.
errors

{'https://www.hollywoodreporter.com/news/isabelle-huppert-wins-best-performance-by-an-actress-a-motion-picture-drama-golden-globes-2017': 'cannot save data',
 'https://www.hollywoodreporter.com/features/making-hidden-figures-how-taraji-p-henson-octavia-spencer-pharrell-williams-revisited-60': 'cannot save data',
 'https://www.hollywoodreporter.com/features/elton-johns-oscar-party-turns-25': 'cannot save data',
 'https://www.hollywoodreporter.com/news/gotham-awards-red-carpet-interviews-2016': 'cannot save data',
 'https://www.hollywoodreporter.com/news/office-christmas-party-trailer-2': 'cannot save data',
 'https://www.hollywoodreporter.com/news/dev-patel-lion-just-be-an-man-a-modern-20': 'cannot save data',
 'https://www.hollywoodreporter.com/features/composer-roundtable-6': 'cannot save data',
 'https://www.hollywoodreporter.com/features/actress-roundtable-emma-stone-natalie-portman-taraji-p-henson-4': 'cannot save data',
 'https://www.hollywoodreporter.com/news/kirk-douglass-100': 

In [11]:
# Save the remaining failures as a two-column CSV (url, reason); nothing is
# written when there are none.
if errors:
    # newline="" is what the csv module asks for on files it writes to, and
    # `with` closes the file even if a row fails to serialize.
    with open(f'{media_name}_errors.csv', "w", newline="") as file:
        f = csv.writer(file)
        f.writerow(["url", "reason"])
        f.writerows(errors.items())
    
#mod_df.head()    

In [12]:
# Remove the first column of the loaded table from the result
# (presumably the index column saved by an earlier to_csv; TODO confirm).
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
mod_df = mod_df.drop(columns=[df.columns[0]], errors="ignore")

In [13]:
# Write the updated table next to the notebook; the index is saved as the
# first CSV column.
full_df_path = f"full_{media_name}_df.csv"
mod_df.to_csv(full_df_path, index=True)

In [24]:
# Reload the saved table and the new error list, then print both row counts.
# NOTE(review): the table is written as full_{media_name}_df.csv in the
# current directory but read here from ./full_df/. Presumably the file was
# moved by hand between the two cells; confirm.
df = pd.read_csv('./full_df/full_hollywood_reporter_df.csv')
err_df = pd.read_csv('hollywood_reporter_errors.csv')
print(f"{len(df)} {len(err_df)}")

101918 38


In [25]:
# Remove every article whose URL is still listed as failed.
# One boolean mask replaces the per-URL lookup + drop, which copied the whole
# frame on each iteration and raised IndexError for a URL not present in `df`
# (including on a second run of this cell). All rows sharing a failed URL are
# removed.
df = df.loc[~df.article_url.isin(err_df.url)]
len(df)

101880