# The Guardian 

In [2]:
import pandas as pd
import requests
import spacy

## Scrape the Data from API

I want to retrieve every article published in the "World" section since November 1, 2023, so I first define the base search URL; the page number is appended to it later.

In [4]:
import os  # imported here so this cell runs on its own

# The API key is a credential and must not be committed with the notebook:
# read it from the GUARDIAN_API_KEY environment variable instead.
# NOTE(review): the fallback "test" is the key used in the Guardian's own
# documentation examples -- confirm it is still accepted and mind its rate
# limit. The key that used to be hard-coded here should be revoked.
api_key = os.environ.get("GUARDIAN_API_KEY", "test")

# Base search URL; it deliberately ends in "&page=" so that a page number
# can be appended to it.
url = (
    "https://content.guardianapis.com/search"
    "?section=world&from-date=2023-11-01&show-blocks=all"
    "&api-key=" + api_key + "&page="
)

In [5]:
# Number of result pages to request (pages are numbered from 1).
# NOTE(review): the original loop carried a bare "#115" marker next to this
# bound, presumably the page count of a full run -- confirm before raising it.
N_PAGES = 4

# One request URL per page: the base URL followed by the page number.
urllist = [url + str(page) for page in range(1, N_PAGES + 1)]
# Accumulates one decoded JSON payload per fetched page.
info=[]
def json(url1, timeout=30):
    """Fetch one URL, decode its JSON body and store it in ``info``.

    NOTE: the name shadows the standard-library ``json`` module; it is kept
    because other cells call the function under this name.

    Parameters
    ----------
    url1 : str
        Fully formed request URL.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout ``requests`` may
        block indefinitely on an unresponsive connection.

    Returns
    -------
    object
        The decoded payload, which is also appended to the module-level
        ``info`` list.

    Notes
    -----
    The HTTP status is deliberately not checked here: whatever JSON the
    server returns is stored as-is.
    """
    response = requests.get(url1, timeout=timeout)
    payload = response.json()
    info.append(payload)
    return payload

In [6]:
# Empty the shared buffer first so that re-running this cell does not append
# every page to `info` a second time.
info.clear()
# json() stores each page's payload in `info`; `output` only collects the
# values it returns.
output=[json(url1) for url1 in urllist]

I only need part of each API response: the title, section name, publication date, URL, and the body text of every content block. The cell below extracts these fields from the downloaded results.

In [7]:
def _extract_article(item):
    """Reduce one search result to the fields kept in the corpus."""
    return {
        'webTitle': item['webTitle'],
        'sectionName': item['sectionName'],
        'webPublicationDate': item['webPublicationDate'],
        'webUrl': item['webUrl'],
        # One entry per body block of the article.
        'elements': [
            {
                'id': block['id'],
                'bodyTextSummary': block['bodyTextSummary'],
                'lastModifiedDate': block['lastModifiedDate'],
            }
            for block in item['blocks']['body']
        ],
    }


# Flatten every page in `info` into one list of article records.
# Payloads without a 'results' list are skipped; .get() keeps a payload that
# lacks the 'response' envelope altogether from raising KeyError.
extracted_data = [
    _extract_article(item)
    for response in info
    if 'results' in response.get('response', {})
    for item in response['response']['results']
]

## Save corpus in text files

I use each article's title as its filename, so characters that are not allowed in filenames must first be replaced with underscores.

In [8]:
#save corpus to txt 
import os
def cleanFilename(filename):
    """Return ``filename`` with each character listed in ``forbidden`` replaced by an underscore."""
    forbidden = '<>:"/\\|?*'
    # A translation table performs all substitutions in a single pass.
    underscore_map = str.maketrans({symbol: '_' for symbol in forbidden})
    return filename.translate(underscore_map)
output_directory = 'guardian_corpus_spacy/'
# open(..., 'w') does not create missing folders, so make sure the target
# directory exists before writing (a fresh run would otherwise fail).
os.makedirs(output_directory, exist_ok=True)
for article in extracted_data:
    # The cleaned title is the filename, so two articles whose cleaned titles
    # coincide end up in the same file and the later one overwrites the earlier.
    filename = cleanFilename(article['webTitle']) + '.txt'
    filepath = os.path.join(output_directory, filename)
    with open(filepath, 'w', encoding='utf-8') as file:
        # The record is stored as the string form of the dictionary.
        file.write(str(article))

## Bring corpus into csv 

In [10]:
import glob
import os
# Use glob to get a list of file paths
# Use glob to get a list of file paths
file_paths = glob.glob('guardian_corpus_spacy/*.txt')

# Collect one {Filename, Document} record per file. Building the frame once
# from a list of records avoids creating and concatenating a one-row DataFrame
# per file, and also works when no file matches (pd.concat([]) raises).
records = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()
    records.append({
        'Filename': os.path.basename(file_path),
        'Document': text_content,
    })

# Explicit columns keep the schema stable even for an empty corpus.
corpus_df = pd.DataFrame(records, columns=['Filename', 'Document'])

# Save the corpus and show a short preview.
corpus_df.to_csv('Guardian_spacy.csv', index=False)
print(corpus_df.head(2))

                                            Filename  \
0  Israel-Gaza war live_ any attempt to isolate G...   
1  Macron confident Orbán can be persuaded to sup...   

                                            Document  
0  {'webTitle': 'Israel-Gaza war live: any attemp...  
1  {'webTitle': 'Macron confident Orbán can be pe...  


## Get Tokens with spacy

In [3]:
# Read the corpus CSV and seed a Token column with a copy of the raw document text.
df = (
    pd.read_csv('Guardian_spacy.csv')
    .assign(Token=lambda frame: frame['Document'].copy())
)
print(df.loc[:, 'Token'].head(2))

0    {'webTitle': 'Israel-Gaza war live: any attemp...
1    {'webTitle': 'Macron confident Orbán can be pe...
Name: Tokens, dtype: object


In [21]:
# Lazily loaded spaCy pipeline shared by every get_token() call; loading the
# model once instead of once per document avoids repeating an expensive step.
_NLP = None


def get_token(text):
    """Strip punctuation from ``text`` and run it through the spaCy pipeline.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    The object produced by calling the loaded ``en_core_web_sm`` pipeline on
    the stripped text; iterating over it yields the tokens.
    """
    global _NLP
    if _NLP is None:
        _NLP = spacy.load("en_core_web_sm")
    punctuation = '!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`'
    # Characters are deleted, not replaced by spaces, so hyphenated words fuse
    # (e.g. "Israel-Gaza" becomes "IsraelGaza").
    text = ''.join(character for character in text
                   if character not in punctuation)
    return _NLP(text)

# Tokenise every document; each cell of the column holds what get_token returns.
df['Token'] = df['Document'].map(get_token)

print(df.loc[:, 'Token'].head(2))

0    (webTitle, IsraelGaza, war, live, any, attempt...
1    (webTitle, Macron, confident, Orbán, can, be, ...
Name: Token, dtype: object


## Get Lemmas with Spacy

In [26]:
def lemma(text):
    """Return the ``lemma_`` attribute of every token in ``text``, in order, as a list."""
    lemmas = []
    for tok in text:
        lemmas.append(tok.lemma_)
    return lemmas
# One list of lemmas per row, derived from the Token column.
df['Lemma'] = df['Token'].map(lemma)
print(df.loc[:, 'Lemma'].head(2))

0    [webTitle, IsraelGaza, war, live, any, attempt...
1    [webTitle, Macron, confident, Orbán, can, be, ...
Name: Lemma, dtype: object


## Get POS with Spacy

In [25]:
def pos(text):
    """Return the ``pos_`` attribute of every token in ``text``, in order, as a list."""
    tags = []
    for tok in text:
        tags.append(tok.pos_)
    return tags

# One list of part-of-speech tags per row, derived from the Token column.
df['Pos'] = df['Token'].map(pos)
print(df.loc[:, 'Pos'].head(2))

0    [PROPN, PROPN, NOUN, VERB, DET, NOUN, PART, VE...
1    [PROPN, PROPN, ADJ, PROPN, AUX, AUX, VERB, PAR...
Name: Pos, dtype: object


In [33]:
# Preview the first two rows of the frame, including the columns added above.
print(df.head(2))

                                            Filename  \
0  Israel-Gaza war live_ any attempt to isolate G...   
1  Macron confident Orbán can be persuaded to sup...   

                                            Document  \
0  {'webTitle': 'Israel-Gaza war live: any attemp...   
1  {'webTitle': 'Macron confident Orbán can be pe...   

                                               Token  \
0  (webTitle, IsraelGaza, war, live, any, attempt...   
1  (webTitle, Macron, confident, Orbán, can, be, ...   

                                                 Pos  \
0  [PROPN, PROPN, NOUN, VERB, DET, NOUN, PART, VE...   
1  [PROPN, PROPN, ADJ, PROPN, AUX, AUX, VERB, PAR...   

                                               Lemma  
0  [webTitle, IsraelGaza, war, live, any, attempt...  
1  [webTitle, Macron, confident, Orbán, can, be, ...  


In [34]:
# Write the frame to CSV without the index column.
# NOTE(review): CSV stores every cell as text, so non-string cells are
# presumably written in their string form -- confirm this suits later readers.
df.to_csv('Guardian_pandas_spacy_03.csv', index=False)