In [None]:
from time import sleep
from urllib.parse import quote

import requests

from guardian_key import KEY

In [None]:
def fmt(query):
    """Percent-encode ``query`` so it is safe as a single URL query-string value.

    Every reserved character is escaped (``safe=''``), including ``/``, ``&``,
    ``=``, ``#``, ``+``, ``%`` and double quotes, and non-ASCII text is encoded
    as UTF-8. Spaces become ``%20`` and ``/`` becomes ``%2F``.

    Args:
        query: The raw, un-encoded text.

    Returns:
        The percent-encoded string.
    """
    return quote(query, safe='')

def make_query(query, section='uk-news', page_num=1, page_size=50):
    """Assemble the full search URL for the Guardian content API.

    Args:
        query: Search text; passed through ``fmt`` before being placed in ``q``.
        section: Value sent as the ``section`` parameter.
        page_num: Value sent as the ``page`` parameter.
        page_size: Value sent as the ``page-size`` parameter.

    Returns:
        The endpoint followed by ``name=value`` pairs joined with ``&``.
    """
    endpoint = 'https://content.guardianapis.com/search?'

    # Dict insertion order determines the order of the pairs in the URL.
    params = {
        'api-key': KEY,
        'q': fmt(query),
        'page-size': page_size,
        'show-tags': 'keyword',
        'tag': fmt('type/article'),
        'order-by': 'newest',
        'page': page_num,
        'section': section,
    }

    pairs = []
    for name, value in params.items():
        pairs.append(f"{name}={value}")
    return endpoint + '&'.join(pairs)


In [None]:
def extract_story_data(article):
    """Copy the core metadata fields of one result record into a new dict.

    Args:
        article: Mapping for a single search result. It must contain the keys
            ``id``, ``webPublicationDate``, ``webTitle``, ``webUrl``,
            ``sectionId``, ``sectionName`` and ``type``.

    Returns:
        A dict with the keys ``id``, ``creation_data``, ``title``, ``url``,
        ``sectionid``, ``section`` and ``type``, in that order.

    Raises:
        KeyError: If any of the required keys is missing from ``article``.
    """
    # (output key, key in the incoming record), listed in output order.
    field_map = (
        ('id', 'id'),
        ('creation_data', 'webPublicationDate'),
        ('title', 'webTitle'),
        ('url', 'webUrl'),
        ('sectionid', 'sectionId'),
        ('section', 'sectionName'),
        ('type', 'type'),
    )
    return {out_key: article[src_key] for out_key, src_key in field_map}

def extract_tags(article):
    """Return the ``webTitle`` of every entry in ``article['tags']``, in order.

    Args:
        article: Mapping for a single search result with a ``tags`` iterable
            whose items each have a ``webTitle`` key.

    Returns:
        A new list of tag titles (empty when ``article['tags']`` is empty).
    """
    return [entry['webTitle'] for entry in article['tags']]



In [None]:
# Fetch one page and inspect the raw payload.
# The URL is built here so this cell runs on a fresh kernel; previously it
# relied on a `url` left over from the collection loop further down.
url = make_query('"royal family"')
http_reply = requests.get(url, timeout=30)  # never hang indefinitely on a stalled connection
http_reply.raise_for_status()  # surface HTTP errors instead of a KeyError on 'response'
response = http_reply.json()['response']
response

In [None]:
# Download `n_pages` pages of search results and flatten each article into a
# record dict (metadata fields plus its list of tag titles).
collected_data = []
n_pages = 1
query = '"royal family"'

for page in range(1, n_pages + 1):
    print(f'Collecting Page {page} of {n_pages}')
    url = make_query(query, page_num=page)
    http_reply = requests.get(url, timeout=30)  # never hang indefinitely on a stalled connection
    http_reply.raise_for_status()  # fail loudly on an HTTP error status
    response = http_reply.json()['response']
    results = response['results']
    for article in results:
        main_data = extract_story_data(article)
        main_data['tags'] = extract_tags(article)
        collected_data.append(main_data)
    # Pause between consecutive requests only; there is nothing to wait for
    # after the final page.
    if page < n_pages:
        sleep(2)


In [None]:
import pandas as pd
# Preview the collected records as a table (one row per dict in collected_data).
pd.DataFrame(collected_data)

In [None]:
# Persist the collected records to guardian_data.json using pandas' default JSON layout.
pd.DataFrame(collected_data).to_json('guardian_data.json')

In [None]:
# Reload the saved records from disk; the cells below work from this frame.
data = pd.read_json('guardian_data.json')

In [None]:
# Count how many rows carry each value of the 'section' column.
data['section'].value_counts()

In [None]:
# Expand 'tags' to one row per (article, tag) pair, then drop every row that
# has a missing value in any column. The original row index is repeated for
# each tag of the same article.
# NOTE(review): this rebinds `data`, so the un-exploded frame is no longer
# available after this cell runs.
data = data.explode('tags').dropna()

In [None]:
import networkx as nx

In [None]:
# Build the article -> tag edge list in one idempotent step.
# 'source' is the row index of `data` (shared by all rows of the same
# article), and 'target' is the tag title. Building it in a single expression
# means the cell can be re-run at any time; the previous multi-cell version
# failed with KeyError('tags') when a middle cell was re-run after the final
# column selection.
edge_list = (
    data[['tags']]
    .rename(columns={'tags': 'target'})
    .assign(source=lambda frame: frame.index)
    .loc[:, ['source', 'target']]
)

In [None]:
# Build a graph from the edge list; from_pandas_edgelist reads the 'source'
# and 'target' columns by default.
G = nx.from_pandas_edgelist(edge_list)

In [None]:
# Sanity check before projecting: the graph must be bipartite.
nx.is_bipartite(G)

In [None]:
# Project the graph onto the tag nodes (the unique 'target' values), producing
# a weighted tag-to-tag graph.
new_G = nx.bipartite.weighted_projected_graph(G,edge_list['target'].unique())

In [None]:
from netwulf import visualize

In [None]:
# Export the projected tag graph to G.gexf.
nx.write_gexf(new_G,'G.gexf')

In [None]:
# Display the edge list for inspection.
edge_list