In [1]:
# Notebook for experiments in gathering data: download article text and metadata for one media outlet.

In [2]:
import os
import re
import csv
import requests
import warnings
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm, tqdm_notebook
warnings.filterwarnings('ignore')

In [5]:
# Path to the CSV index of articles, relative to the notebook's working directory.
df_path = '../../raw_data/film_media_df.csv'
# Value of the `media` column selecting which outlet's rows this notebook processes.
media_name = 'variety'

In [6]:
# Load the article index, then show its row count, the outlets it contains,
# and the first few rows.
# NOTE(review): the displayed frame has an `Unnamed: 0` column, which suggests the
# CSV was written with its index; `index_col=0` may be intended — confirm.
df = pd.read_csv(df_path)
print(len(df))
print(df['media'].unique())
df.head()

344225
['deadline' 'hollywood_reporter' 'variety' 'filmcomment']


Unnamed: 0,media,article_url,text_path,category
0,deadline,https://deadline.com/2019/01/4-percent-challen...,deadline/0.txt,film
1,deadline,https://deadline.com/2019/03/jared-harris-jare...,deadline/1.txt,film
2,deadline,https://deadline.com/2019/03/sky-tin-star-tim-...,deadline/2.txt,film
3,deadline,https://deadline.com/2019/02/jj-abrams-jedi-st...,deadline/3.txt,film
4,deadline,https://deadline.com/2019/03/june-harding-dies...,deadline/4.txt,film


In [7]:
# Keep only the rows of the selected outlet. `.copy()` makes the result an
# independent frame, so columns assigned to it later are not written into a
# slice of the full frame (the situation pandas reports as SettingWithCopyWarning).
df = df.loc[df.media == media_name].copy()
print(len(df))
df.head()

187170


Unnamed: 0,media,article_url,text_path,category
153009,variety,https://variety.com/2016/film/columns/spotligh...,variety/153009.txt,columns
153010,variety,https://variety.com/2017/film/columns/sofia-co...,variety/153010.txt,columns
153011,variety,https://variety.com/2016/film/columns/reboots-...,variety/153011.txt,columns
153012,variety,https://variety.com/2016/film/columns/oscars-2...,variety/153012.txt,columns
153013,variety,https://variety.com/2018/film/columns/oscars-2...,variety/153013.txt,columns


In [9]:
# import requests
def get_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``. Without a timeout a stalled connection would
            block the whole run indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a 4xx/5xx response status.
    """
    # Browser-like User-Agent sent with every request.
    fake_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=fake_headers, timeout=timeout)
    # Fail on HTTP errors instead of handing back an error page as if it were the article.
    response.raise_for_status()
    return response.text

def get_html_test():
    """Fetch the first article URL in ``df`` and print the URL, then its HTML."""
    first_url = df['article_url'].iloc[0]
    print(first_url)
    page_html = get_html(first_url)
    print(page_html)

#get_html_test()

https://variety.com/2016/film/columns/spotlight-oscars-analysis-2016-1201719746/
<!DOCTYPE html>
<!--[if IE 6]>
<html id="ie6" lang="en">
<![endif]-->
<!--[if IE 7]>
<html id="ie7" lang="en">
<![endif]-->
<!--[if IE 8]>
<html id="ie8" lang="en">
<![endif]-->
<!--[if !(IE 6) | !(IE 7) | !(IE 8) ]><!-->
<html lang="en"  itemscope itemtype="http://schema.org/NewsArticle"  xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
<!--<![endif]-->
<head>
	<meta charset="UTF-8" />
	<!-- Google Chrome Frame for IE -->
	<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">

	<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" />
	<link rel="profile" href="http://gmpg.org/xfn/11" />
		<link rel="pingback" href="https://variety.com/xmlrpc.php" />

	
	
<meta name="description" content="What was the key to &quot;Spotlight&#039;s&quot; success in navigating the closest Oscar race in recent memory?" />
<meta name="alexaV

In [10]:
#from bs4 import BeautifulSoup
def get_article_text(html_text):
    """Return the ``content`` attribute of the page's ``<meta name="body">`` tag.

    The HTML is parsed with the lxml parser. When the page has no such tag,
    ``find`` yields ``None`` and the subscript below raises.
    """
    parsed = BeautifulSoup(html_text, "lxml")
    body_meta = parsed.find("meta", attrs={"name": "body"})
    return body_meta["content"]

def get_article_meta(html_text, meta_names):
    """Collect the ``content`` of ``<meta name=...>`` tags for the given names.

    Args:
        html_text: HTML source of the page.
        meta_names: Iterable of ``name`` attribute values to look up.

    Returns:
        Dict mapping each name found in the page to its content. When a name
        occurs several times, its distinct values are joined with ', ' in the
        order the tags are found. Names with no usable tag are absent from
        the dict.
    """
    ret = {}
    soup = BeautifulSoup(html_text, "lxml")
    for name in meta_names:
        values = []
        for meta in soup.find_all("meta", {"name": name}):
            content = meta.get('content')
            # Skip tags that carry no content attribute, and de-duplicate against
            # every value seen so far. Comparing against the already-joined string
            # would let a repeated value through once two distinct values exist.
            if content is not None and content not in values:
                values.append(content)
        if values:
            ret[name] = ', '.join(values)
    return ret

def get_article_text_test():
    """Download the first article listed in ``df`` and return its extracted text."""
    first_url = df['article_url'].iloc[0]
    return get_article_text(get_html(first_url))

def get_article_meta_test():
    """Download the first article listed in ``df`` and return its meta dict."""
    wanted = ['content_type', 'topics', 'title', 'author', 'published_at', 'tags']
    first_url = df['article_url'].iloc[0]
    page_html = get_html(first_url)
    return get_article_meta(page_html, wanted)

# Smoke check: display the meta fields extracted from the first article in `df`.
get_article_meta_test()

{'content_type': 'Article',
 'topics': 'Awards, Columns, Features',
 'title': "'Spotlight': A Win for 'the Film Everyone Loved' Caps an Unconventional Season",
 'author': 'Kristopher Tapley',
 'published_at': '2016-03-02 10:00:41',
 'tags': 'Oscars 2016, Spotlight'}

In [11]:
# Create the directory named after the outlet (the `text_path` values shown
# above have the form `variety/<id>.txt`). `exist_ok=True` keeps a re-run from
# reporting a failure merely because the directory is already there.
try:
    os.makedirs(media_name, exist_ok=True)
except OSError as err:
    # Still best effort: report and carry on, but say what actually went wrong.
    print(f"cannot create dir: {err}")

# Meta tag names to extract for every article.
df_meta_names = ['content_type', 'topics', 'title', 'author', 'published_at', 'tags']

In [15]:
def parse_article_urls(df, meta_names):
    """Download every article in ``df``, save its text, and collect its meta tags.

    For each row the page at ``article_url`` is fetched, the requested meta
    values are stored in columns named after ``meta_names``, and the article
    text is written to the row's ``text_path``.

    Args:
        df: Frame with ``article_url`` and ``text_path`` columns. It is not
            modified; the result is built on a copy.
        meta_names: Meta tag names to extract; one column is added per name.

    Returns:
        Tuple ``(modified_df, cannot_parsed)``: the copy of ``df`` with the
        meta columns filled in (``None`` where a value is missing), and a dict
        ``{url: reason}`` for rows that failed at some step.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    modified_df = df.copy()
    # expand dataframe, add columns for meta
    for name in meta_names:
        modified_df[name] = None

    cannot_parsed = {}  # {url : reason}
    # Wrapping the iterator lets tqdm advance once per row, including rows that
    # leave the loop body early via `continue`.
    for row in tqdm(df.itertuples(), desc="rows", total=len(df)):
        url = row.article_url
        # Each step catches `Exception` rather than using a bare `except`, so
        # Ctrl-C (KeyboardInterrupt) still stops a long run.

        # part 0: get html text
        try:
            html_text = get_html(url)
        except Exception:
            cannot_parsed[url] = 'cannot get html'
            continue

        # part 1: get article text
        try:
            article_text = get_article_text(html_text)
        except Exception:
            cannot_parsed[url] = 'cannot get text'
            continue

        # part 2: get article meta
        try:
            article_meta = get_article_meta(html_text, meta_names)
        except Exception:
            cannot_parsed[url] = 'cannot get meta'
            continue

        # part 3: save data
        try:
            for name in meta_names:
                # .get(): a tag missing from the page leaves None in that cell
                # instead of raising and preventing the text from being saved.
                modified_df.at[row.Index, name] = article_meta.get(name)
            # `with` closes the file even if the write fails; explicit UTF-8
            # avoids depending on the platform's default encoding.
            with open(row.text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(article_text)
        except Exception:
            cannot_parsed[url] = 'cannot save data'
    return (modified_df, cannot_parsed)

In [16]:
# Process every row of `df`; pages are fetched one at a time, so a full run is slow.
mod_df, errors = parse_article_urls(df, df_meta_names)

rows:   0%|          | 1/187170 [00:01<60:15:03,  1.16s/it]


In [17]:
# Persist the failures (if any) as url/reason pairs so they can be inspected later.
if len(errors) > 0:
    # newline='' is what the csv module expects for files it writes to;
    # `with` closes the file even if a write fails.
    with open(f'{media_name}_errors.csv', "w", newline='', encoding='utf-8') as errors_file:
        writer = csv.writer(errors_file)
        writer.writerow(["url", "reason"])
        writer.writerows(errors.items())

mod_df.head()

Unnamed: 0,media,article_url,text_path,category,content_type,topics,title,author,published_at,tags
153009,variety,https://variety.com/2016/film/columns/spotligh...,variety/153009.txt,columns,Article,"Awards, Columns, Features",'Spotlight': A Win for 'the Film Everyone Love...,Kristopher Tapley,2016-03-02 10:00:41,"Oscars 2016, Spotlight"
153010,variety,https://variety.com/2017/film/columns/sofia-co...,variety/153010.txt,columns,,,,,,
153011,variety,https://variety.com/2016/film/columns/reboots-...,variety/153011.txt,columns,,,,,,
153012,variety,https://variety.com/2016/film/columns/oscars-2...,variety/153012.txt,columns,,,,,,
153013,variety,https://variety.com/2018/film/columns/oscars-2...,variety/153013.txt,columns,,,,,,


In [18]:
# Save the enriched frame; index=True also writes the frame's row index as the first column.
mod_df.to_csv("mod_df.csv", index=True)