In [1]:
import os

import pandas as pd
import requests

## Setting the Guardian API key

In [2]:
# Prefer a key supplied through the GUARDIAN_API_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback only keeps existing runs working. It is a
# credential committed to source: rotate it and drop the fallback once the
# environment variable is set wherever this notebook runs.
API_KEY = os.environ.get("GUARDIAN_API_KEY", "408cea44-7deb-4ad1-a1a4-bbe559b52e56")

## Search function and a URL creator

In [3]:
def search_articles(api_key, search_term='', page=1, page_size=100, format_='json'):
    """Query the Guardian content search endpoint.

    Parameters
    ----------
    api_key : str
        Key sent as the ``api-key`` query parameter.
    search_term : str
        Free-text query sent as ``q``. Pass it unescaped: ``requests``
        URL-encodes every parameter, so escaping spaces by hand would encode
        them twice (``%20`` becoming ``%2520``).
    page : int
        Value sent as ``page``.
    page_size : int
        Value sent as ``page-size``.
    format_ : str
        Value sent as ``format``.

    Returns
    -------
    The decoded JSON body when ``format_`` is ``'json'`` (the default),
    otherwise the raw response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    url = 'https://content.guardianapis.com/search'
    params = {
        'api-key': api_key,
        'format': format_,
        'page': page,
        'page-size': page_size,
        'q': search_term,
    }
    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    # Fail here with the HTTP status rather than later on a missing JSON key.
    response.raise_for_status()
    if format_ == 'json':
        return response.json()
    return response.text

In [4]:
def create_urls(search_results, api_key=None):
    """Build one article-request URL per search result.

    Parameters
    ----------
    search_results : iterable of dict
        Search result entries; each must contain an ``'apiUrl'`` key.
    api_key : str, optional
        Key appended as the ``api-key`` query parameter. When omitted, the
        module-level ``API_KEY`` is used, so the key is defined in one place
        instead of being repeated as a literal here.

    Returns
    -------
    list of str
        Each result's ``apiUrl`` followed by
        ``?show-fields=body&api-key=<key>``, in input order.
    """
    if api_key is None:
        api_key = API_KEY
    query = '?show-fields=body&api-key=' + api_key
    return [result['apiUrl'] + query for result in search_results]

## Preparing the search terms 

In [5]:
# Topics to look up; each entry is used as one free-text search term.
search_terms = [
    'donald trump',
    'joe biden',
    'ukraine',
    'economy',
    'israel',
    'cryptocurrency',
    'immigration',
]

In [6]:
# One inner list of article URLs per search term, in the order of search_terms.
urls_list = []
for term in search_terms:
    payload = search_articles(API_KEY, search_term=term)
    search_results = payload['response']['results']
    term_urls = create_urls(search_results)
    urls_list.append(term_urls)

In [7]:
from bs4 import BeautifulSoup

# Extra query parameters sent with every article request. The URLs produced by
# create_urls already carry show-fields=body, so this repeats that field.
params = {
    'show-fields': 'body',
}

# Plain-text article bodies, in the order they were fetched.
texts = []
# (url, error) pairs for articles that could not be fetched or parsed, kept for
# inspection instead of being discarded silently.
failed_urls = []

for urls in urls_list:
    for url in urls:
        try:
            # A timeout stops one stalled request from hanging the whole run.
            response = requests.get(url, params=params, timeout=30)
            response_json = response.json()
            response_body = response_json['response']['content']['fields']['body']
            soup = BeautifulSoup(response_body, 'html.parser')
            text = soup.get_text(separator=' ')
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            # Best effort: skip articles that fail to download, decode or
            # expose a body, but let KeyboardInterrupt and real bugs surface.
            failed_urls.append((url, repr(error)))
        else:
            texts.append(text)

if failed_urls:
    print('Skipped ' + str(len(failed_urls)) + ' article(s); see failed_urls for details.')
        

In [8]:
# Put the collected article texts into a single-column frame and display it.
data_frame = pd.DataFrame(data=texts, columns=['Texts'])
data_frame

Unnamed: 0,Texts
0,Donald Trump continued his march toward the GO...
1,"Donald Payne Jr, a US congressman from New Jer..."
2,Donald Trump has been jubilant over the US su...
3,"The New York City mayor, Eric Adams, said the ..."
4,"Andrzej Duda, Poland’s conservative president,..."
...,...
528,Lenny Henry says he hopes his new Windrush-bas...
529,"Indefinite immigration detention is unlawful, ..."
530,There has been a self-harm incident or suicide...
531,It was the late 1970s when Bartley Duggan fled...


## Importing libraries for text preprocessing and cleaning

In [9]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# word_tokenize needs NLTK's Punkt tokenizer data, which this notebook never
# downloads. Fetch it only when the tokenizer cannot find it, so environments
# that already have the data behave exactly as before.
try:
    word_tokenize('probe')
except LookupError:
    # The resource is named 'punkt' in older NLTK releases and 'punkt_tab' in
    # newer ones; request both so whichever the installed release needs is present.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)
# Kept as the last expression so the cell still shows the wordnet download result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mwojc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Cleaning the text and adding a cleaned_text column to the dataframe

In [10]:
# Single WordNetLemmatizer instance, reused by clean_and_lemmatize for every word.
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    """Normalise an article string and lemmatize each of its tokens.

    Steps, in order: lower-case the text, delete every character that is not
    an ASCII letter, digit or whitespace, collapse whitespace runs into single
    spaces, tokenize, lemmatize each token with the shared ``lemmatizer``, and
    join the tokens back with single spaces.

    Two properties are visible in the notebook outputs:

    * Punctuation is deleted, not replaced by a space, so hyphenated words are
      fused ('far-right' becomes 'farright').
    * ``lemmatize`` is called with its default arguments, and 'has' / 'was'
      come out as 'ha' / 'wa'.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned, lemmatized text.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    normalized = re.sub(r'\s+', ' ', alnum_only).strip()
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(normalized))

In [12]:
# Store the cleaned, lemmatized form of every article next to its raw text.
data_frame['cleaned_text'] = data_frame['Texts'].map(clean_and_lemmatize)

In [14]:
# Display the cleaned column to check the preprocessing result.
data_frame.loc[:, 'cleaned_text']

0      donald trump continued his march toward the go...
1      donald payne jr a u congressman from new jerse...
2      donald trump ha been jubilant over the u supre...
3      the new york city mayor eric adam said the cit...
4      andrzej duda poland conservative president is ...
                             ...                        
528    lenny henry say he hope his new windrushbased ...
529    indefinite immigration detention is unlawful t...
530    there ha been a selfharm incident or suicide a...
531    it wa the late 1970s when bartley duggan fled ...
532    we seem to be living in time of unprecedented ...
Name: cleaned_text, Length: 533, dtype: object

## Saving the dataframe and displaying a sample preprocessed text

In [15]:
# Keep only the raw and cleaned text columns, in that order, then write them to
# a UTF-8 CSV file without the index column.
data_frame_cleaned = data_frame.reindex(columns=['Texts', 'cleaned_text'])
data_frame_cleaned.to_csv(
    'article_data_cleaned.csv',
    encoding='utf-8',
    index=False,
)

In [16]:
# Show the cleaned text of the row labelled 0 as a sample.
data_frame_cleaned.loc[0, 'cleaned_text']

'donald trump continued his march toward the gop nomination at the michigan republican party convention on saturday sweeping all 39 delegate related a farright u youth group is ramping up it movement to back election denier the delegate awarded will fuel the former president ahead of tuesday 5 march when 15 state will hold primary and trump nomination could be all but decided the michigan state party delegate met on saturday at the sprawling amway plaza hotel in grand rapid huddling in 13 separate meeting room representing the state 13 congressional district their nearuniform support for trump at the convention eclipsed the support he earned in the primary when former un ambassador nikki haley garnered about 26 of the vote she did not win any delegate awarded on saturday for the republican national convention in milwaukee where the party in july will officially nominate a candidate for the november presidential election the michigan republican party process for awarding delegate to the