Project for analyzing news about "Subject", using the Guardian Media Group API and Python programming language, identify any
unusual events in the time series, and investigate the cause of these unusual events. For using this code, you must replace "subject" with your query. Also enter your own API-Key

1- Extract information about subject:

In [1]:
import requests

# Fetch articles about subject
def get_subject_articles(api_key, page_size=200): 
    # based on default behavior of the Guardian API, it limits the number of results returned per request.
    # by default, the API returns 10 results per request unless specified otherwise. Based on my test, The max page_size in 4/18/2024 could be 200!
    base_url = "https://content.guardianapis.com/search"
    params = {
        "q": "Subject",  # Enter your own query
        "api-key": api_key,
        "page-size": page_size  # specify the number of results per page
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200: #success
        data = response.json()
        return data['response']['results']
    else:
        print("Failed to retrieve articles.")
        return []

api_key = #"Enter your own API-Key here"
subject_articles = get_subject_articles(api_key)

#print subject articles
for result in subject_articles:
    print("Title:", result['webTitle'])
    print("URL:", result['webUrl'])
    print("Publication Date:", result['webPublicationDate'])
    print("Section Name:", result['sectionName'])
    print("---------------------------------------------------------------------------------------")


Failed to retrieve articles.


2- Count the number of articles about subject since 01.01.2018:

In [2]:
# based on API limitation, we have just 200 articles about subject, and the min date of them is 2022-02-17, 
# as it's becuase of API limitation, I already write a code for filter articles to be after 01.01.2018
# our API is Developer version, we can order commercial API to have more articles

from collections import defaultdict
from datetime import datetime

# convert string to datetime
def convert_to_datetime(date_string):
    return datetime.strptime(date_string , '%Y-%m-%dT%H:%M:%SZ' )
    
# count num of subject articles for each date 
def count_articles_by_date(articles):
    article_counts = defaultdict(int)
    for article in articles:
        publication_date = datetime.strptime(article['webPublicationDate'], "%Y-%m-%dT%H:%M:%SZ").date()
        article_counts[publication_date] += 1
    return article_counts

subject_articles_since_2018 = [article for article in subject_articles
                                      if convert_to_datetime(article['webPublicationDate']) > convert_to_datetime('2018-01-01T00:00:00Z')]
article_counts = count_articles_by_date(subject_articles_since_2018)
print("Number of articles about subject since 01.01.2018:")
for date, count in sorted(article_counts.items()):
    print(f"{date}: {count}")

Number of articles about subject since 01.01.2018:


3. Calculate the average of all days for the above-mentioned period from “No. of articles”.

In [3]:
# if I had complete output from API, the correct code is that we count number of days from 01.01.2018 till now and use it as divisor
# but now I just count num of days between min and max dates that we have article
def calculate_average(article_counts):
    print("min date: ", min(article_counts.keys()))
    print("max date: ", max(article_counts.keys()))
    total_days = (max(article_counts.keys()) - min(article_counts.keys())).days + 1
    total_articles = sum(article_counts.values())
    return total_articles / total_days

average_articles_per_day = calculate_average(article_counts)
print("average_articles_per_day : ",average_articles_per_day)

ValueError: min() arg is an empty sequence

4. In which section are most articles written?

In [None]:
# calculate section with most articles
def find_most_common_section(articles):
    section_counts = defaultdict(int)
    for article in articles:
        section_counts[article['sectionName']] += 1
        most_common_section = max(section_counts, key=section_counts.get)
    return (most_common_section , section_counts[most_common_section])

most_common_section = find_most_common_section(subject_articles_since_2018)
print("most common section since 01.2018: " , most_common_section[0]," with ",most_common_section[1] , " Articles")

5. Show the evolution of the "No. of articles" over time for the above period.

In [None]:
# I use line chart becauseas we are looking for trend
import plotly.graph_objects as go

def plot_article_counts_interactive(article_counts):
    dates = sorted(article_counts.keys())
    counts = [article_counts[date] for date in dates]
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dates, y=counts, mode='lines+markers', name='No. of Articles'))
    
    fig.update_layout(
        title="Evolution of No. of Articles about subject (01.01.2018 - Today)",
        xaxis_title="Date",
        yaxis_title="No. of Articles",
        xaxis=dict(tickangle=45),
        hovermode="x",
        showlegend=False
    )
    
    fig.show()
plot_article_counts_interactive(article_counts)


6. Are there any unusual events in the time series under investigation?

In [None]:
import plotly.graph_objects as go

# Extract x and y data for the box plot
x_data = list(article_counts.keys())
y_data = list(article_counts.values())

# Create the box plot
fig = go.Figure()
fig.add_trace(go.Box(
    y=y_data,
    name='Article Counts',
    hoverinfo='y+text',  # Display y-value and custom text on hover
    text=x_data,          # Custom text (date) to display on hover
    customdata=x_data     # Additional data (date) to be used in hover labels
))

# Update layout
fig.update_layout(
    title="Box Plot of Article Counts since 01.01.2018",
    xaxis_title="Date",
    yaxis_title="Number of Articles",
    showlegend=False
)

# Show the plot
fig.show()


In [None]:
import numpy as np

# Calculate Q1 and Q3
counts_sorted = sorted(article_counts.values())
q1 = np.percentile(counts_sorted, 25)
q3 = np.percentile(counts_sorted, 75)

print("Q1 (first quartile):", q1)
print("Q3 (third quartile):", q3)

# Calculate interquartile range (IQR)
iqr = q3 - q1

# Identify outliers
outliers = [count for count in article_counts.values() if count < q1 - 1.5 * iqr or count > q3 + 1.5 * iqr]
outlier_dates = []
if outliers:
    print("Unusual events detected:")
    for date, count in article_counts.items():
        if count in outliers:
            outlier_dates.append(str(date))
            print(f"Date: {date}, Number of Articles: {count}")
else:
    print("No unusual events detected.")


7. If so, show these. Why are these unusual?
This code calculates Q1 and Q3 of the number of articles per day and identifies any unusual events as outliers beyond 1.5 times the interquartile range.
In statistical analysis, outliers are data points that significantly differ from other observations in a dataset and may indicate anomalies, errors, or rare events. These outliers could be unusual in the context of the dataset being analyzed.Outliers could represent days with an unusually high or low number of articles compared to the rest of the dataset. These deviations from the typical pattern may warrant further investigation to understand the underlying reasons, such as major events or errors in data collection.

8. Based on question one. Show the cause of the unusual event.


In [None]:
from collections import Counter
from bs4 import BeautifulSoup

subject_articles_Unusual = [d for d in subject_articles if d['webPublicationDate'][0:10] in outlier_dates]

# Dictionary to store article texts by date
articles_by_date = {}

# Fetch and store article texts by date
for result in subject_articles_Unusual:
    # Fetch the HTML content of the page
    response = requests.get(result['webUrl'])
    html_content = response.text

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the article content
    article_content = soup.find("div", class_="dcr-ch7w1w")

    # Extract and store the text
    if article_content:
        article_text = article_content.get_text()
        publication_date = result['webPublicationDate'][0:10]
        if publication_date not in articles_by_date:
            articles_by_date[publication_date] = []
        articles_by_date[publication_date].append(article_text)

# Analyze article texts for each date
for date, articles in articles_by_date.items():
    print("Date:", date)
    all_text = ' '.join(articles)
    # Tokenize the text (split by spaces)
    words = all_text.split()
    # Count the occurrences of each word
    word_counts = Counter(words)
    # Print the most common words
    for word, count in word_counts.most_common(70):
        if word not in ['the','of','in','and','to','a','on','that','was','for','with','is',
                        'by','as','has','said','will','his','he','be','I','it','at','been',
                        'had','an','this','are','this','have','–','were','from','not','our',
                        'about','new','New','more','The', 'their','which','who','all','its','we',
                        'one','also','would','or','but','do','so', 'in','In','But','out','It','see',
                        'We','like','content','they','these','some','than','after','For','may','two',
                        'other','can','see'
                       ] :
           print(f"{word}: {count}")
    print()