In [19]:
import os
import re
import time
from datetime import datetime, timedelta

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

### Click the "Load More" button for scraping more news

In [20]:
def extendPage(max_attempts=10):
    """Click the "Load More" button of the page currently open in ``driver``.

    The function first moves the pointer to the element with class
    ``quicklinks`` so that the button is brought into view, then waits up to
    10 seconds for the element with id ``loadMorestories`` to become clickable
    and clicks it once.

    Parameters
    ----------
    max_attempts : int, optional
        Upper bound on the number of click attempts. Previously the function
        retried forever, which hangs the notebook when the button never
        becomes clickable.

    Returns
    -------
    None
        Progress is reported on stdout: ``Skip`` when the ``quicklinks``
        element is missing or stale, ``Try again`` after each failed attempt.

    Notes
    -----
    Relies on the module-level ``driver`` created in a later cell.
    """
    for _ in range(max_attempts):
        try:
            try:
                # Scroll to the area that holds the Load More button.
                load_more = driver.find_element(By.CLASS_NAME, 'quicklinks')
                ActionChains(driver).move_to_element(load_more).perform()
            except (NoSuchElementException, StaleElementReferenceException):
                # Nothing to extend on this page.
                print('Skip')
                return

            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "loadMorestories"))).click()
            return
        except (ElementClickInterceptedException, TimeoutException, NoSuchElementException):
            # Usually an overlay or a slow page; try the whole sequence again.
            print("Try again")

    # Every attempt failed: give up instead of looping forever.
    print('Skip')

### Convert a list of web elements into their text or URLs

In [21]:
def toList(given_list, condition):
    """Turn a sequence of web elements into a list of non-empty strings.

    Parameters
    ----------
    given_list : iterable
        Objects exposing a ``text`` attribute and a ``get_attribute`` method.
    condition : str
        ``'link'`` collects each element's ``href`` attribute; any other value
        collects each element's ``text``.

    Returns
    -------
    list
        The collected values in input order, with falsy entries (``None`` or
        empty strings) removed.
    """
    if condition == 'link':
        values = [item.get_attribute('href') for item in given_list]
    else:
        values = [item.text for item in given_list]
    return [value for value in values if value]

### Activate chrome driver

In [22]:
# Start a Chrome session and open The Star's home page. The module-level
# ``driver`` is the browser handle used by extendPage() and the cells below.
driver = webdriver.Chrome()
driver.get('https://www.thestar.com.my/')
# Fixed pause before the next cell queries the page.
# NOTE(review): presumably gives the page time to finish rendering; an explicit
# WebDriverWait on the navigation bar may be more reliable - confirm.
time.sleep(10)

### Extract all types of news

In [23]:
# Read the labels of the navigation-bar dropdown entries.
types = driver.find_elements(By.XPATH, ".//li[@class = 'dropdown']")

# 'Videos' and 'Photos' are left out of the crawl. Filtering (instead of
# list.remove) does not raise ValueError when an entry is absent from the page.
EXCLUDED_TYPES = {'Videos', 'Photos'}
all_type = [name for name in toList(types, 'text') if name not in EXCLUDED_TYPES]
print(all_type)

['StarPlus', 'News', 'Asean+', 'Business', 'Sport', 'Metro', 'Lifestyle', 'Food', 'Tech', 'Education', 'Opinion', 'StarPicks']


In [24]:
# Accumulators filled by later cells:
#   links_list - article URLs collected from the section pages
#   news_list  - one dict per crawled article
links_list, news_list = [], []

### Browse through every section page, scrape the article links and store them in a list


In [25]:
# Sections for which the second batch of headlines is read with the broader
# ".//main//div[6]//h2/a" XPath; all other sections use the narrower one.
BROAD_XPATH_TYPES = {'starplus', 'business', 'lifestyle', 'opinion'}

try:
    for element in all_type:
        # Open the section through its navigation link.
        xpath = f"//a[contains(text(), '{element}')]"
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))).click()
        print(xpath)
        extendPage()

        # Headline anchors are read from two areas of the page.
        travel1 = driver.find_elements(By.XPATH, ".//main//div[5]//h2/a")
        if element.lower() in BROAD_XPATH_TYPES:
            travel2 = driver.find_elements(By.XPATH, ".//main//div[6]//h2/a")
        else:
            travel2 = driver.find_elements(By.XPATH, ".//main//div[6]/div/div/div[2]//h2/a")

        links_list1 = toList(travel1, 'link')
        links_list2 = toList(travel2, 'link')
        links_list += links_list1 + links_list2
        # Running total of collected links (duplicates included).
        print(len(links_list))
finally:
    # Always end the browser session, even when a click or wait above fails;
    # otherwise a Chrome process is left running.
    driver.quit()

//a[contains(text(), 'StarPlus')]
Skip
27
//a[contains(text(), 'News')]
76
//a[contains(text(), 'Asean+')]
Skip
115
//a[contains(text(), 'Business')]
165
//a[contains(text(), 'Sport')]
214
//a[contains(text(), 'Metro')]
261
//a[contains(text(), 'Lifestyle')]
Skip
303
//a[contains(text(), 'Food')]
342
//a[contains(text(), 'Tech')]
Skip
381
//a[contains(text(), 'Education')]
Skip
391
//a[contains(text(), 'Opinion')]
Skip
418
//a[contains(text(), 'StarPicks')]
Skip
435


### Remove duplicates from list


In [29]:
# Drop repeated URLs while keeping the first-seen order. dict keys are unique
# and preserve insertion order, so this is a single linear pass instead of a
# side-effect comprehension with a quadratic membership test.
filtered_links = list(dict.fromkeys(links_list))
print(f'No. of non-duplicated link: {len(filtered_links)}')

No. of non-duplicated link: 298


### The Food category contains some recipes that link to external websites, so links that do not belong to The Star are removed here


In [30]:
# Keep only URLs hosted on The Star. Building a new list avoids deleting from
# the list while iterating over it, which skipped the element following every
# deletion and could leave external links behind.
filtered_links = [
    link for link in filtered_links
    if link.startswith('https://www.thestar.com.my/')
]
# The count printed is the number of links kept, so the label says so.
print(f'No. of TheStar link: {len(filtered_links)}')
filtered_links

No. of non-TheStar link: 296


['https://www.thestar.com.my/news/nation/2021/09/19/interactive-how-single-dose-covid-19-vaccines-help-protect-malaysia039s-remote-communities',
 'https://www.thestar.com.my/news/nation/2021/09/02/interactive-how-much-are-you-at-risk-from-accidents-at-work',
 'https://www.thestar.com.my/news/nation/2021/09/10/prevent-suicide-together-creating-hope-through-action?utm_medium=thestar&utm_source=rosnawidget&utm_campaign=KKM+10Sep21',
 'https://www.thestar.com.my/news/nation/2021/09/16/interactive-the-bahasa-in-malaysia',
 'https://www.thestar.com.my/news/nation/2021/09/09/interactive-from-the-desks-of-our-journalists',
 'https://www.thestar.com.my/news/nation/2021/09/02/interactive-how-to-boost-vaccinations-in-malaysia039s-six-target-states',
 'https://www.thestar.com.my/news/nation/2021/08/27/interactive-over-two-million-students-eligible-for-vaccination',
 'https://www.thestar.com.my/news/nation/2021/08/19/interactive-hospital-admissions-and-icu-cases-fall-in-some-states-as-vaccinations-

### Retrieve yesterday's news

In [31]:
# Yesterday's date in the YYYY/MM/DD form that appears in the article URLs.
yesterday = datetime.now() - timedelta(days=1)
date_required = yesterday.strftime('%Y/%m/%d')

# Keep the links whose URL contains yesterday's date. A plain substring test
# is enough here; the date contains no pattern syntax.
yesterday_links = [link for link in filtered_links if date_required in link]

# The label previously read "non-TheStar", copied from the cell above.
print(f"No. of yesterday's link: {len(yesterday_links)}")
yesterday_links

No. of non-TheStar link: 86


['https://www.thestar.com.my/starpicks/2021/09/18/mark-your-calendar-for-huawei-wearable-and-audio-day?utm_medium=thestar&utm_source=rosnawidget&utm_campaign=20210918_Huawei',
 'https://www.thestar.com.my/news/nation/2021/09/18/road-leading-to-camerons-via-simpang-pulai-closed-after-part-of-hillslope-collapses',
 'https://www.thestar.com.my/news/nation/2021/09/18/sabah-govt-announces-further-relaxation-of-activities-for-fully-vaccinated-from-monday-sept-20',
 'https://www.thestar.com.my/news/nation/2021/09/18/selangor-police-issue-423-compounds-for-disregarding-sop',
 'https://www.thestar.com.my/news/nation/2021/09/18/over-2500-people-entering-langkawi-screened-one-tested-positive-for-covid-19-says-health-dg',
 'https://www.thestar.com.my/news/nation/2021/09/18/landslide-area-at-kemensah-heights-declared-disaster-site-says-amirudin',
 'https://www.thestar.com.my/news/nation/2021/09/18/bereaved-woman-loses-almost-rm400000-after-falling-for-love-scam',
 'https://www.thestar.com.my/news/n

In [32]:
# checking.txt remembers which of yesterday's links were already crawled.
# Layout: first line is the date (YYYY/MM/DD), every following line is a URL.
try:
    with open('checking.txt', 'r') as f:
        checking_links = f.read().splitlines()
except FileNotFoundError:
    # First run: behave as if the file were empty.
    checking_links = []

if checking_links and checking_links[0] == date_required:
    # Same day as the previous run: crawl only the links not yet recorded and
    # append them to the file.
    already_seen = set(checking_links)
    new_news = [url for url in yesterday_links if url not in already_seen]
    with open('checking.txt', 'a') as f:
        f.writelines(url + '\n' for url in new_news)
else:
    # Missing file, empty file or a different date: start the file afresh and
    # crawl everything. (An empty file used to raise IndexError.)
    new_news = list(yesterday_links)
    with open('checking.txt', 'w') as f:
        f.write(date_required + '\n')
        f.writelines(url + '\n' for url in new_news)

print(f'Links to crawl: {len(new_news)}')
new_news

Links to crawl: 3


['https://www.thestar.com.my/business/smebiz/2021/09/18/getting-smes-on-the-digital-bandwagon',
 'https://www.thestar.com.my/business/business-news/2021/09/18/infineon-opens-austria-plant-early-in-chip-capacity-boost',
 'https://www.thestar.com.my/business/business-news/2021/09/18/oil-price-falls-as-storm-hit-us-supply-trickles-back-into-market']

### Browse through each url and crawl the news content

In [33]:
def _text_or_empty(browser, xpath):
    """Return the text of the first element matching ``xpath``, or '' if there is none."""
    try:
        return browser.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return ''


for link in new_news:
    print(link)
    # One browser session per article; ``finally`` below guarantees it is shut
    # down even when scraping the page fails part-way.
    driver = webdriver.Chrome()
    try:
        driver.get(link)

        try:
            title = driver.find_element(By.TAG_NAME, 'h1').text
        except NoSuchElementException:
            # No headline: not treated as an article.
            print('Skip: no headline found')
            continue

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed; the author is then ''.
        author_names = toList(
            driver.find_elements(By.XPATH, ".//div[@class = 'credit__authors']/a"), 'text')
        # Names are joined with a full-width comma and ASCII commas inside a
        # name are replaced by spaces.
        author = re.sub(",", " ", '，'.join(author_names))

        date = _text_or_empty(driver, ".//p[@class = 'date']")
        timestamp = _text_or_empty(driver, ".//time[@class = 'timestamp']")

        # Extract all the text of the news content.
        try:
            parent = driver.find_element(By.ID, 'story-body')
        except NoSuchElementException:
            # Previously this aborted the whole loop and leaked the browser.
            print('Skip: no story body found')
            continue
        content = parent.text.split('\n')

        # Text of embedded charts and image captions, which is not part of the story.
        chart_text = parent.find_elements(By.XPATH, ".//div[contains(@class, 'flourish')]")
        inline_text = parent.find_elements(By.XPATH, ".//span[contains(@class, 'inline-caption')]")
        caption_text = parent.find_elements(By.XPATH, ".//p[contains(@class, 'caption')]")
        irrelevant = set(toList(chart_text + inline_text + caption_text, 'text'))

        # Drop the irrelevant lines and concatenate the rest without separators.
        real_content = ''.join(line for line in content if line not in irrelevant)

        # Determine the type of the news from the URL. The type stays '' when
        # no section matches (it used to fall back to the last section).
        # NOTE(review): the lowercased name is used as a regex, so 'asean+'
        # matches any URL segment starting with 'asean' - presumably relied on
        # for the Asean+ section; confirm before escaping the pattern.
        category = ''
        for section in all_type:
            if re.search(f"https://www.thestar.com.my/{section.lower()}", link) is not None:
                category = section
                break

        print(date)
        print(timestamp)
        print(title)
        print(category)
        print(author)
        print(real_content)

        news_items = {'Date': date, 'Time': timestamp, 'Type': category,
                      'Author': author, 'Title': title, 'Content': real_content}
        news_list.append(news_items)
        print(len(news_list))
    finally:
        driver.quit()
    # Pause between articles.
    time.sleep(3)

https://www.thestar.com.my/business/smebiz/2021/09/18/getting-smes-on-the-digital-bandwagon
Saturday, 18 Sep 2021

Getting SMEs on the digital bandwagon
Business

MALAYSIA Digital Economy Corp (MDEC) will be collaborating with the Centre of Entrepreneur Development And Research (Cedar), a subsidiary of SME Bank, to enable SMEs to adopt digitalisation.Cedar has been appointed as a partner in MDEC’s SME Digital Accelerator programme, where it aims to train 1,000 businesses and match them with digital solutions and incentive facilities.The programme provides SMEs with a structured approach to kick-start their digital adoption journey and ensure implementation with outcome-based results.Qualified and registered SMEs will participate in a three-day online training workshop where they will be exposed to ‘Lean Model Canvas’ and ‘Design Thinking’ modules, plus technology matching with a choice of over 400 technology solution providers (TSPs).“SMEs are hit hard by the pandemic and need to reloo

### Store the crawled data in a data frame

In [34]:
NEWS_COLUMNS = ['Date', 'Time', 'Type', 'Author', 'Title', 'Content']


def _time_sort_key(times):
    """Convert '6:37 AM MYT'-style strings to datetimes for chronological sorting.

    Values that cannot be parsed (for example the empty string stored when an
    article has no timestamp) become NaT.
    """
    cleaned = times.str.replace(' MYT', '', regex=False).str.strip()
    return pd.to_datetime(cleaned, format='%I:%M %p', errors='coerce')


# Passing the columns explicitly keeps the frame well-formed when no article
# was crawled; without them sort_values('Time') raises KeyError on an empty frame.
data = pd.DataFrame(news_list, columns=NEWS_COLUMNS)
# Sort by time of day rather than by string order ("10:05 AM" < "6:37 AM" as
# text). Rows without a timestamp stay first, as before.
data = data.sort_values('Time', key=_time_sort_key, na_position='first')
data.head()

Unnamed: 0,Date,Time,Type,Author,Title,Content
0,"Saturday, 18 Sep 2021",,Business,,Getting SMEs on the digital bandwagon,MALAYSIA Digital Economy Corp (MDEC) will be c...
2,"Saturday, 18 Sep 2021",6:37 AM MYT,Business,,Oil price falls as storm-hit US supply trickle...,"NEW YORK,: Oil prices fell on Friday as energy..."
1,"Saturday, 18 Sep 2021",6:59 AM MYT,Business,,Infineon opens Austria plant early in chip cap...,"VILLACH, Austria: Germany's Infineon opened a ..."


In [35]:
# Show (rows, columns) of the frame built from this run's crawled articles.
print("Shape    : ", data.shape)

Shape    :  (3, 6)


### Store the news in a CSV file

In [36]:
# Append this run's articles to News.csv. The header row is written only when
# the file is created (or is empty), so the default pd.read_csv call in the
# next cell does not consume the first article as column names.
# NOTE(review): a News.csv produced by earlier runs may have no header row -
# check the existing file once.
# Re-running this cell appends the same rows again.
write_header = not os.path.exists('News.csv') or os.path.getsize('News.csv') == 0
data.to_csv('News.csv', mode='a', index=False, header=write_header)

In [37]:
# Read the accumulated file back to check how many articles it holds in total.
# With default arguments read_csv takes the first line of the file as the
# column header, so that line is not counted as a row.
df = pd.read_csv('News.csv')
print("Shape    : ", df.shape)

Shape    :  (331, 6)
