In [1]:
# Notebook for experiments in gathering data: scrape article text and Open Graph metadata for one media outlet.

In [1]:
import os
import re
import csv
import requests
import warnings
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm, tqdm_notebook
#warnings.filterwarnings('ignore')

In [4]:
# CSV index of all articles; its columns include media, article_url, text_path and category.
df_path = './raw_data/film_media_df.csv'
# Outlet to process in this notebook; also used as the output directory name and error-file prefix.
media_name = 'filmcomment'

In [5]:
# Load the full article index and show its size, the outlets it covers, and a preview.
df = pd.read_csv(df_path)
print(df.shape[0])
print(df.media.unique())
df.head()

344225
['deadline' 'hollywood_reporter' 'variety' 'filmcomment']


Unnamed: 0,media,article_url,text_path,category
0,deadline,https://deadline.com/2019/01/4-percent-challen...,deadline/0.txt,film
1,deadline,https://deadline.com/2019/03/jared-harris-jare...,deadline/1.txt,film
2,deadline,https://deadline.com/2019/03/sky-tin-star-tim-...,deadline/2.txt,film
3,deadline,https://deadline.com/2019/02/jj-abrams-jedi-st...,deadline/3.txt,film
4,deadline,https://deadline.com/2019/03/june-harding-dies...,deadline/4.txt,film


In [6]:
# Keep only the rows of the selected outlet.
# .copy() makes this an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning or write through to a slice.
df = df.loc[df['media'] == media_name].copy()
print(len(df))
df.head()

4046


Unnamed: 0,media,article_url,text_path,category
340179,filmcomment,https://www.filmcomment.com/blog/interview-bre...,filmcomment/340179.txt,blog
340180,filmcomment,https://www.filmcomment.com/blog/deep-focus-ma...,filmcomment/340180.txt,blog
340181,filmcomment,https://www.filmcomment.com/blog/lois-weber-du...,filmcomment/340181.txt,blog
340182,filmcomment,https://www.filmcomment.com/blog/making-case-c...,filmcomment/340182.txt,blog
340183,filmcomment,https://www.filmcomment.com/blog/the-film-comm...,filmcomment/340183.txt,blog


In [7]:
# import requests
def get_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx), so error pages are never mistaken
            for article HTML.
    """
    # Browser-like User-Agent sent with every request.
    fake_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=fake_headers, timeout=timeout)
    response.raise_for_status()
    return response.text

def get_html_test():
    """Smoke test: print a known article URL followed by the raw HTML fetched from it."""
    sample_url = 'https://www.filmcomment.com/article/jia-zhangke-interview/'
    print(sample_url)
    page_html = get_html(sample_url)
    print(page_html)

# Run the fetch smoke test (performs a live HTTP request and prints the page).
get_html_test()

https://www.filmcomment.com/article/jia-zhangke-interview/

<!doctype html>
<!--[if IE 8 ]>
<html lang="en" class="no-js oldie ie8">
<![endif]-->
<!--[if IE 9 ]>
<html lang="en" class="no-js oldie ie9">
<![endif]-->
<!--[if (gt IE 9)|!(IE)]><!--> <html lang="en" class="no-js"> <!--<![endif]-->
    <head>
        <meta charset="UTF-8">
                <meta content="True" name="HandheldFriendly">
        <meta name="viewport" content="initial-scale=1.0001, minimum-scale=1.0001, maximum-scale=1.0001, user-scalable=no,width=device-width"/>

        <title>Interview: Jia Zhang-ke - Film Comment</title>

        <script src="https://use.typekit.net/qdo0kfw.js"></script>
        <script>try{Typekit.load({ async: true });}catch(e){}</script>

                    <link rel="stylesheet" href="https://fgmxi4acxur9qbg31y9s3a15-wpengine.netdna-ssl.com/wp-content/themes/filmcomment/static/css/screen-77f46cb6.css">
        
        <link rel="shortcut icon" href="https://fgmxi4acxur9qbg31y9s3a15-wpe

In [9]:
#from bs4 import BeautifulSoup
def get_article_text(html_text):
    """Extract the article body from a page's HTML.

    The body is taken from the <p> elements inside the first
    <div class="post-content">; each paragraph's text is followed by a
    single space.

    Args:
        html_text: Raw HTML of the article page.

    Returns:
        The concatenated paragraph text (empty string if the container has
        no paragraphs).

    Raises:
        ValueError: If the page has no 'post-content' container, which means
            it is not laid out like a regular article page.
    """
    soup = BeautifulSoup(html_text, "lxml")
    content_div = soup.find("div", {"class": 'post-content'})
    if content_div is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError("no div with class 'post-content' found in page")
    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(p.text + ' ' for p in content_div.find_all("p"))

def get_article_meta(html_text, meta_names):
    """Collect the values of selected <meta property="..."> tags.

    Args:
        html_text: Raw HTML of the article page.
        meta_names: Iterable of property names to look up
            (e.g. 'og:title', 'article:section').

    Returns:
        Dict mapping each property name that was found to its content.
        When a property occurs several times, its distinct values are joined
        with ', ' in document order. Properties that are absent from the page
        are simply missing from the dict.
    """
    soup = BeautifulSoup(html_text, "lxml")
    ret = {}
    for name in meta_names:
        values = []
        for meta in soup.find_all("meta", {"property": name}):
            # Tags without a content attribute carry no value; skip them
            # rather than failing the whole article with a KeyError.
            content = meta.get('content')
            # Compare against individual values seen so far so a repeated
            # value is never appended twice.
            if content is not None and content not in values:
                values.append(content)
        if values:
            ret[name] = ', '.join(values)
    return ret

def get_article_text_test():
    """Smoke test: fetch a known article and return its extracted body text."""
    sample_url = 'https://www.filmcomment.com/article/jia-zhangke-interview/'
    page_html = get_html(sample_url)
    body_text = get_article_text(page_html)
    return body_text

def get_article_meta_test():
    """Smoke test: fetch a known article and return its selected meta properties."""
    sample_url = 'https://www.filmcomment.com/article/jia-zhangke-interview/'
    wanted_properties = [
        'og:type',
        'og:title',
        'og:description',
        'article:section',
        'article:published_time',
    ]
    page_html = get_html(sample_url)
    return get_article_meta(page_html, wanted_properties)

# Run the metadata smoke test; the returned dict is shown as the cell output.
get_article_meta_test()

{'og:type': 'article',
 'og:title': 'Interview: Jia Zhang-ke - Film Comment',
 'og:description': '24 City director Jia Zhangke discusses the realities and responsibilities of Chinese cinema with Andrew Chan in this expanded, exclusively-online interview',
 'article:section': 'Interviews'}

In [33]:
# Make sure the output directory for the article text files exists.
# exist_ok=True keeps the cell re-runnable: an existing directory is not an error.
try:
    os.makedirs(media_name, exist_ok=True)
except OSError as err:
    # Report the real cause (permissions, a file with that name, ...) instead of hiding it.
    print("cannot create dir:", err)

# Meta properties to extract for every article; the part after ':' becomes the column name.
df_meta_names = ['og:type', 'og:title', 'og:description', 'article:section', 'article:published_time']

In [38]:
def parse_article_urls(df, meta_names):
    """Download every article in ``df``, save its text and record its metadata.

    For each row the page at ``article_url`` is fetched, the article body is
    written to the file named by ``text_path``, and the requested meta
    properties are stored in new columns of the returned frame.

    Args:
        df: DataFrame with at least 'article_url' and 'text_path' columns.
            It is not modified.
        meta_names: Meta property names to extract (e.g. 'og:title'). The
            column for each is the part after the first ':' ('title').

    Returns:
        Tuple ``(modified_df, cannot_parsed)``: a copy of ``df`` with one
        extra column per meta name (None where the page did not provide it),
        and a dict mapping each failed URL to the stage that failed
        ('cannot get html', 'cannot get text', 'cannot get meta' or
        'cannot save data').
    """
    # Work on a copy so the caller's frame is not silently mutated and
    # assignments never land on a view of another frame.
    modified_df = df.copy()

    # Map each meta property to its column name, e.g. 'og:title' -> 'title'.
    # Splitting once and taking the last piece also tolerates names without ':'.
    column_by_meta = {name: name.split(':', 1)[-1] for name in meta_names}
    for column in column_by_meta.values():
        modified_df[column] = None

    cannot_parsed = {}  # {url : reason}
    for row in tqdm(df.itertuples(), desc="rows", total=len(df)):
        url = row.article_url

        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during the long-running scrape.

        # part 0: get html text
        try:
            html_text = get_html(url)
        except Exception:
            cannot_parsed[url] = 'cannot get html'
            continue

        # part 1: get article text
        try:
            article_text = get_article_text(html_text)
        except Exception:
            cannot_parsed[url] = 'cannot get text'
            continue

        # part 2: get article meta
        try:
            article_meta = get_article_meta(html_text, meta_names)
        except Exception:
            cannot_parsed[url] = 'cannot get meta'
            continue

        # part 3: save data
        try:
            for name, column in column_by_meta.items():
                # A page may lack some properties; leave those cells as None
                # instead of failing and losing the article text.
                modified_df.at[row.Index, column] = article_meta.get(name)
            # Explicit UTF-8 so non-ASCII text is written the same way on every
            # platform; the context manager closes the file even on error.
            with open(row.text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(article_text)
        except Exception:
            cannot_parsed[url] = 'cannot save data'

    return (modified_df, cannot_parsed)

In [39]:
# Scrape every article of the selected outlet: mod_df is the frame with the meta
# columns added, errors maps each failed URL to the stage that failed.
mod_df, errors = parse_article_urls(df, df_meta_names)

rows:   0%|          | 1/4046 [00:00<1:01:53,  1.09it/s]


In [40]:
# Dump the failed URLs and their reasons to '<media_name>_errors.csv' (only when there are any).
if errors:
    # newline='' is what the csv module expects (avoids blank lines on Windows);
    # the context manager closes the file even if writing fails.
    with open(f'{media_name}_errors.csv', "w", newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["url", "reason"])
        writer.writerows(errors.items())
    
#mod_df.head()    

Unnamed: 0,media,article_url,text_path,category,type,title,description,section,published_time
340179,filmcomment,https://www.filmcomment.com/blog/interview-bre...,filmcomment/340179.txt,blog,article,Interview: Brett Story - Film Comment,Warming up: the filmmaker behind The Hottest A...,Interviews,2019-03-11T16:57:43+00:00
340180,filmcomment,https://www.filmcomment.com/blog/deep-focus-ma...,filmcomment/340180.txt,blog,,,,,
340181,filmcomment,https://www.filmcomment.com/blog/lois-weber-du...,filmcomment/340181.txt,blog,,,,,
340182,filmcomment,https://www.filmcomment.com/blog/making-case-c...,filmcomment/340182.txt,blog,,,,,
340183,filmcomment,https://www.filmcomment.com/blog/the-film-comm...,filmcomment/340183.txt,blog,,,,,


In [18]:
# Persist the enriched frame; index=True writes the row labels as the first CSV column.
mod_df.to_csv("mod_df.csv", index=True)