# Request & Save Data from The Guardian

## Settings & Libraries

In [None]:
import argparse
from datetime import date, timedelta
import json
import numpy as np
import os
from os import makedirs
from os.path import join, exists
import pandas as pd
import requests
import sys

In [None]:
# Work from the project root so that the relative 'Data/...' paths used in the
# following cells resolve. NOTE: this absolute path is machine-specific.
project_root = '/Users/M/Google_Drive/Thesis/Topic-Modeling'
os.chdir(project_root)

## 1. Request Data (The Guardian API)
Data was collected on 07/10/2020.

#### Parameters

In [None]:
# Target directory for the raw .json downloads; created on first use and
# left untouched if it already exists.
path_json = 'Data/Technology-Data/raw/'
os.makedirs(path_json, exist_ok=True)

In [None]:
# Read the Guardian API key from a local text file (surrounding whitespace
# and the trailing newline are stripped). The context manager closes the
# file handle immediately instead of leaving it open.
with open('Data/Infos/API_Key_Guardian.txt') as key_file:
    API_Key = key_file.read().strip()

In [None]:
# Inclusive date window that is requested from the API, processed day by day.
from_date = date(year=2000, month=1, day=1)
to_date = date(year=2019, month=12, day=31)  # alternative: date.today() - timedelta(days=1)

# +1 because both end points belong to the window.
span_in_days = (to_date - from_date).days + 1
dayrange = range(span_in_days)
print('Request Data from {} to {}, i.e. {} days'.format(from_date, to_date, len(dayrange)))

Request Data from 2000-01-01 to 2019-12-31, i.e. 7305 days


In [None]:
# Query parameters sent with every search request. 'from-date' and 'to-date'
# start out empty and are overwritten for each requested day.
params = {
    # authentication
    'api-key': API_Key,
    # date filter
    'use-date': 'published',
    'from-date': '',
    'to-date': '',
    # ordering and level of detail of the returned records
    'order-by': 'newest',
    'show-fields': 'all',
    'show-tags': 'all',
    'show-sections': 'true',
    # content filter
    'section': 'technology',
    'show-references': 'all',
    'lang': 'en',
}

### Request & Save Data (.json)

In [None]:
# Bookkeeping for the download loop:
#   empty_response_dates      - days for which the API returned no articles
#   already_existing_dates    - days skipped because a .json file already exists
#   successful_download_count - number of day-files written in this run
empty_response_dates, already_existing_dates = [], []
successful_download_count = 0

In [None]:
# partly adapted from https://gist.github.com/dannguyen/c9cb220093ee4c12b840, 09/2020
#
# Download the articles of every day in `dayrange` and store them as one
# .json file per day in `path_json`. Days whose file already exists are
# skipped, so the cell can be re-run to resume after an interruption.
# The loop stops at the first error (e.g. when the API quota is exhausted).

# HTTPS, because the API key travels as a query parameter.
search_url = 'https://content.guardianapis.com/search'
# Seconds to wait for the server; without a timeout a stalled connection
# would block the loop indefinitely.
request_timeout = 30

for day in dayrange:
    try:
        dt = from_date + timedelta(days=day)
        date_str = dt.strftime('%Y-%m-%d')
        file_name = join(path_json, date_str + '.json')

        if exists(file_name):
            already_existing_dates.append(date_str)
            continue

        all_results = []
        params['from-date'] = date_str
        params['to-date'] = date_str
        current_page = 1
        total_pages = 1  # placeholder; the real value comes from the first response

        # The API returns results page by page; collect all pages of the day.
        while current_page <= total_pages:
            params['page'] = current_page
            resp = requests.get(search_url, params=params, timeout=request_timeout)
            data = resp.json()
            all_results.extend(data['response']['results'])
            total_pages = data['response']['pages']
            current_page += 1

        if all_results:
            with open(file_name, 'w') as f:
                f.write(json.dumps(all_results, indent=2))
            successful_download_count += 1
        else:
            empty_response_dates.append(date_str)

    except KeyError as e:
        # A payload without the 'response' key is presumably an API error
        # message (e.g. quota exceeded) rather than search results.
        if e.args and e.args[0] == 'response':
            print('Stop Iteration (API limitations): ', e)
        else:
            print('Exception: ', e)
        break
    except Exception as e:
        print('Exception: ', e)
        break

In [None]:
# Summary of the download run: one line per category.
dates_on_disk = len(already_existing_dates) + successful_download_count
summary_lines = (
    ('  ...Number of dates for which articles exist in the json-directory: %d.', dates_on_disk),
    ('  ...Number of dates for which articles were successfully downloaded: %d.', successful_download_count),
    ('  ...Number of dates for which articles were already saved: %d.', len(already_existing_dates)),
    ('  ...Number of dates for which no data could be received: %d.', len(empty_response_dates)),
)
for template, count in summary_lines:
    print(template % count)

  ...Number of dates for which articles exist in the json-directory: 6979.
  ...Number of dates for which articles were successfully downloaded: 6979.
  ...Number of dates for which articles were already saved: 0.
  ...Number of dates for which no data could be received: 326.


## 2. Create DataFrame from nested .json files (.csv)

In [None]:
# Output directory for the flattened .csv file; created if it does not exist.
path_csv = 'Data/Technology-Data/raw/'
os.makedirs(path_csv, exist_ok=True)

In [None]:
# Full path of the .csv file that the combined DataFrame is written to.
csv_filename = join(path_csv, 'raw.csv')

# Article-level keys that are kept as meta columns while the nested 'tags'
# records are normalised.
meta_columns = [
    'id', 'type', 'sectionId', 'sectionName',
    'webPublicationDate', 'webTitle', 'webUrl', 'apiUrl',
    'fields', 'isHosted', 'pillarId', 'pillarName',
]

# Columns (and their order) of the resulting .csv file.
csv_columns = [
    'filename', 'id', 'webUrl',
    'author',
    'headline', 'trailText', 'bodyText',
    'publication',
    'webPublicationDate', 'newspaperEditionDate', 'firstPublicationDate',
    'charCount', 'wordcount',
]

In [None]:
# Start with an empty result table and an empty list of problematic files.
error_files = list()
news = pd.DataFrame()

In [None]:
# Flatten every downloaded .json file into a per-article table and collect
# the tables in `frames`; they are concatenated into `news` once at the end
# (DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# file by file copies all previous rows on every step).
# Files that cannot be processed are recorded in `error_files`.
frames = []
for root, dirs, files in sorted(os.walk(path_json)):
    # sorted() gives a reproducible processing order within each directory
    for file in sorted(files):
        # skip non-json files and notebook checkpoint copies
        if not file.endswith(".json") or 'checkpoint' in file:
            continue
        json_file = os.path.join(root, file)
        try:
            with open(json_file, encoding='utf-8') as f:
                json_data = json.load(f)
            # normalise data from nested .json (one row per article/tag pair;
            # NOTE: an article with an empty 'tags' list yields no rows here):
            df1 = pd.json_normalize(json_data, 'tags', meta=meta_columns, record_prefix='tag_')
            df2 = pd.json_normalize(df1['fields'].tolist())
            df_all = pd.concat([df1.drop(columns=['fields']), df2], axis=1)
            df_all['filename'] = json_file
            # extract authors (some articles have multiple authors!):
            # one stringified list of contributor names per article id
            authors = (
                df_all.loc[df_all['tag_type'] == 'contributor', ['id', 'tag_webTitle']]
                .drop_duplicates()
                .rename(columns={'tag_webTitle': 'author'})
                .groupby('id')['author']
                .apply(list)
                .astype('str')
            )
            # drop the per-tag columns so that each article collapses to one row
            df_all = df_all.drop(columns=list(df_all.filter(regex='tag_')))
            df_all = df_all.drop_duplicates()
            df_all = df_all.merge(authors, on='id', how='left')
            # keep only the selected columns, in csv order; columns missing
            # in this file are filled with NaN
            df = df_all.reindex(columns=csv_columns).drop_duplicates()
            frames.append(df)
        except Exception as E:
            error_files.append(json_file)
            print(E, 'in line: ', sys.exc_info()[2].tb_lineno)

if frames:
    # keep rows that `news` may already hold, then add all new ones in one go
    previous = [] if news.empty else [news]
    news = pd.concat(previous + frames, ignore_index=True)
# Write the combined table to a ';'-separated .csv file and report the result;
# a failure while saving is reported instead of raised.
try:
    news.to_csv(csv_filename, sep=';')
    print('  ...A (new) DataFrame called "news" has been constructed and saved as ', csv_filename)
    print('  ...Shape of that DataFrame:', news.shape)
except Exception as save_error:
    print('  ...Saving of .csv-file failed: ', save_error)

# Mention problematic files only when there are any.
if error_files:
    print('  ...Files that could not be added to the DataFrame: ', error_files)

  ...A (new) DataFrame called "news" has been constructed and saved as  Data/Technology-Data/raw/raw.csv
  ...Shape of that DataFrame: (51819, 13)
