# Step 1. Import the Library

In [160]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Step 2. Access the HTML content from the webpage by assigning the URL and creating a soup object

In [191]:
# Downloading cna data
headers = {'Accept-Language': 'en-US,en;q=0.8'}
url = 'https://www.channelnewsasia.com/international'
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Step 3. Extract the File

In [192]:
newstitle = soup.select('h6.list-object__heading')
print(newstitle[0])
print(" ")
print("how many titles are available? ", len(soup.select('h6.list-object__heading')))

<h6 class="h6 list-object__heading">
<a class="h6__link list-object__heading-link" href="/world/putin-xi-meet-october-russia-china-3782246">
      Putin and Xi to meet in Beijing in October, Russia says 
  </a>
</h6>
 
how many titles are available?  67


In [193]:
text = soup.select('h6 a')
print(text)

[<a class="h6__link list-object__heading-link" href="/world/putin-xi-meet-october-russia-china-3782246">
      Putin and Xi to meet in Beijing in October, Russia says 
  </a>, <a class="h6__link list-object__heading-link" href="/world/libya-floods-climate-change-conflict-derna-likely-research-3782936">
      Climate change, conflict made Libya deluge more likely: Study
  </a>, <a class="h6__link list-object__heading-link" href="/business/china-files-appeal-against-wto-panel-report-us-steel-aluminium-tariffs-3783066">
      China files appeal against WTO panel report on US steel, aluminium tariffs
  </a>, <a class="h6__link list-object__heading-link" href="/world/uk-says-no-evidence-political-bank-account-closures-3783016">
      UK says no evidence of political bank account closures
  </a>, <a class="h6__link list-object__heading-link" href="/asia/philippines-activists-freed-after-alleged-military-abduction-3782986">
      Philippines activists freed after alleged military abduction
  

In [194]:
links = []

for a in soup.select('h6 a'):
  links.append('https://www.channelnewsasia.com' + a.attrs.get('href'))

print(links[0])

https://www.channelnewsasia.com/world/putin-xi-meet-october-russia-china-3782246


# Step 4. Looping the process and storing the data

In [195]:
# create article title and links list
article_title = []
article_link = []

for a in soup.select('h6 a'):
  # extract the article title text and clean the title by removing extra spaces and newline characters
  title = a.get_text()
  cleaned_title = ' '.join(title.split())

  article_title.append(cleaned_title)

  # extract the article link
  article_link.append('https://www.channelnewsasia.com' + a.attrs.get('href'))

print(article_title)
print(article_link)

['https://www.channelnewsasia.com/world/putin-xi-meet-october-russia-china-3782246', 'https://www.channelnewsasia.com/world/libya-floods-climate-change-conflict-derna-likely-research-3782936', 'https://www.channelnewsasia.com/business/china-files-appeal-against-wto-panel-report-us-steel-aluminium-tariffs-3783066', 'https://www.channelnewsasia.com/world/uk-says-no-evidence-political-bank-account-closures-3783016', 'https://www.channelnewsasia.com/asia/philippines-activists-freed-after-alleged-military-abduction-3782986', 'https://www.channelnewsasia.com/asia/india-will-receive-heavy-monsoon-rains-september-says-weather-chief-3782781', 'https://www.channelnewsasia.com/world/strained-ties-relations-between-india-canada-accusations-sikh-leader-murder-3782811', 'https://www.channelnewsasia.com/business/taiwans-tsmc-help-train-german-students-semiconductor-careers-3782786', 'https://www.channelnewsasia.com/world/strained-ties-relations-between-india-canada-accusations-sikh-leader-murder-3782

In [197]:
# create description, datetime, and image link list
article_genre = []
article_description = []
article_published_datetime = []
article_image_link = []
article_source = []

# loop through the 'article_link' link, where each link redirects to an article
for link in article_link:
  # re-send the HTTP Get requests and parse the HTML content inside the article
  response_link = requests.get(link, headers=headers)
  soup_link = BeautifulSoup(response_link.text, "html.parser")

  genre = soup_link.select('p.content-detail__category a')
  paragraph_tag = soup_link.select('.text .text-long p')
  published_datetime = soup_link.select('.article-publish')
  image_link = soup_link.select('picture.image img')
  source = soup_link.select('.source__block .source')

  # check if all the elements are found on the page
  if genre and paragraph_tag and published_datetime and image_link and source:
    # extract the genre text and clean the genre by removing extra spaces and newline characters
    genre_text = genre[0].get_text()
    cleaned_genre = ' '.join(genre_text.split())

    article_genre.append(cleaned_genre)

    # extract the text content from the first p element
    article_description.append(paragraph_tag[0].get_text())

    # extract the published datetime text and clean the datetime by removing extra spaces and newline characters
    datetime = published_datetime[0].get_text()
    cleaned_datetime = ' '.join(datetime.split())

    article_published_datetime.append(cleaned_datetime)

    # extract the image link
    article_image_link.append(image_link[4].attrs.get('src'))

    # extract the source text, replace "Source: " string with "", and clean the source by removing extra spaces and newline characters
    source_text = source[0].get_text()
    source_only = source_text.replace("Source: ", "")
    cleaned_source = ' '.join(source_only.split())

    article_source.append(cleaned_source)

  else:
    # make missing values on the table when the above elements are not found
    article_genre.append(None)
    article_description.append(None)
    article_published_datetime.append(None)
    article_image_link.append(None)
    article_source.append(None)

print(article_genre)
print(article_description)
print(article_published_datetime)
print(article_image_link)
print(article_source)

['World', 'World', 'Business', 'World', 'Asia', 'Asia', 'World', 'Business', 'World', 'World', 'Asia', None, 'Asia', 'Wellness', 'Asia', 'Singapore', 'Singapore', 'Singapore', 'World', 'Asia', 'World', 'Commentary', 'Business', 'Business', 'Asia', 'World', 'Asia', 'Sport', 'Asia', 'World', 'World', None, 'World', 'World', 'World', 'Singapore', 'Singapore', 'Singapore', 'Singapore', 'Singapore', 'Singapore', 'Commentary', 'Commentary', 'Commentary', 'Commentary', 'Commentary', 'Commentary', 'Business', 'Business', 'World', 'Business', 'Business', 'Business', 'Wellness', 'Dining', 'Experiences', 'Sport', 'Sport', 'Sport', 'World', 'World', 'Business', 'World', 'Asia', 'Asia', 'World', 'Business']
["MOSCOW:\xa0Russian President Vladimir Putin will travel to Beijing in October for talks with China's Xi Jinping, the Kremlin's chief's first known trip abroad since the International Criminal Court (ICC) issued an arrest warrant against him.", 'PARIS: Climate change made torrential rains that 

# Step 5. Save as DataFrame and store it as CSV for further analysis

In [198]:
print(len(article_title))
print(len(article_published_datetime))
print(len(article_genre))
print(len(article_description))
print(len(article_image_link))
print(len(article_source))
print(len(article_link))

67
67
67
67
67
67
67


In [199]:
df = pd.DataFrame(
    {'Article Title': article_title,
     'Genre' : article_genre,
     'Published Datetime': article_published_datetime,
     'Description': article_description,
     'Image Link' : article_image_link,
     'Source' : article_source,
     'Article Link': article_link
     }
)

print(df.head())

df.to_csv('cnaarticle.csv', index=False)

                                       Article Title     Genre  \
0  Putin and Xi to meet in Beijing in October, Ru...     World   
1  Climate change, conflict made Libya deluge mor...     World   
2  China files appeal against WTO panel report on...  Business   
3  UK says no evidence of political bank account ...     World   
4  Philippines activists freed after alleged mili...      Asia   

                                  Published Datetime  \
0  19 Sep 2023 10:11PM (Updated: 19 Sep 2023 10:2...   
1  19 Sep 2023 10:08PM (Updated: 19 Sep 2023 10:0...   
2                                19 Sep 2023 10:06PM   
3  19 Sep 2023 09:42PM (Updated: 19 Sep 2023 09:4...   
4  19 Sep 2023 09:26PM (Updated: 19 Sep 2023 09:4...   

                                         Description  \
0  MOSCOW: Russian President Vladimir Putin will ...   
1  PARIS: Climate change made torrential rains th...   
2  BEIJING: China has filed an appeal against a W...   
3  LONDON: Britain on Tuesday (Sep 19) sai