In [229]:
import re
import time
import pandas as pd
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Click the "Load More" button for scraping more news

In [230]:
def extendPage():
    while True:
        try:
            try:
                # Just to locate to the Load More button to be clicked
                load_more = driver.find_element_by_class_name('quicklinks')
                actions = ActionChains(driver)
                actions.move_to_element(load_more).perform()
            except NoSuchElementException:
                print('Skip')
                break
            except StaleElementReferenceException:
                print('Skip')
                break

            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, "loadMorestories"))).click()
            break
        except ElementClickInterceptedException:
            print("Try again")
            pass
        except TimeoutException:
            print("Try again")
            pass
        except NoSuchElementException:
            print("Try again")
            pass

### Iterate the given list to text or url

In [231]:
def toList(given_list, condition):
    convert = []
    for i in given_list:
        if condition == 'link':
            convert.append(i.get_attribute('href'))
        else:
            convert.append(i.text)
    non_empty_ele = [j for j in convert if j]
    return non_empty_ele

### Activate chrome driver

In [232]:
driver = webdriver.Chrome()
driver.get('https://www.thestar.com.my/')
time.sleep(10)

### Extract all types of news

In [233]:
types = driver.find_elements_by_xpath(".//li[@class = 'dropdown']")
all_type = toList(types, 'text')
all_type.remove('Videos')
all_type.remove('Photos')
print(all_type)

['StarPlus', 'News', 'Asean+', 'Business', 'Sport', 'Metro', 'Lifestyle', 'Food', 'Tech', 'Education', 'Opinion', 'StarPicks']


In [234]:
links_list = []
news_list = []

### Browse through every type of webpage, scrape the links and store into a list


In [235]:
for element in all_type:
    xpath = f"//a[contains(text(), '{element}')]"
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, xpath))).click()
    print(xpath)
    extendPage()

    travel1 = driver.find_elements_by_xpath(".//main//div[5]//h2/a")
    if element.lower() == 'starplus' or element.lower() == 'business' or element.lower() == 'lifestyle' or element.lower() == 'opinion':
        travel2 = driver.find_elements_by_xpath(".//main//div[6]//h2/a")
    else:
        travel2 = driver.find_elements_by_xpath(".//main//div[6]/div/div/div[2]//h2/a")

    links_list1 = toList(travel1, 'link')
    links_list2 = toList(travel2, 'link')
    links_list = links_list + links_list1 + links_list2
    print(len(links_list))

driver.close()
driver.quit()

//a[contains(text(), 'StarPlus')]
Skip
27
//a[contains(text(), 'News')]
76
//a[contains(text(), 'Asean+')]
Skip
115
//a[contains(text(), 'Business')]
165
//a[contains(text(), 'Sport')]
214
//a[contains(text(), 'Metro')]
263
//a[contains(text(), 'Lifestyle')]
Skip
305
//a[contains(text(), 'Food')]
344
//a[contains(text(), 'Tech')]
Skip
383
//a[contains(text(), 'Education')]
Skip
393
//a[contains(text(), 'Opinion')]
Skip
420
//a[contains(text(), 'StarPicks')]
Skip
437


### Remove duplicates from list


In [236]:
filtered_links = []
[filtered_links.append(x) for x in links_list if x not in filtered_links]
print(f'No. of non-duplicated link: {len(filtered_links)}')

No. of non-duplicated link: 295


### Considering the Food category contains some recipes which will lead to another web page, therefore I decided to remove non The Star link here


In [237]:
for index in range(5):
    for i, ele in enumerate(filtered_links):
        if not ele.startswith('https://www.thestar.com.my/'):
            del filtered_links[i]
print(f'No. of non-TheStar link: {len(filtered_links)}')
filtered_links

No. of non-TheStar link: 293


['https://www.thestar.com.my/news/nation/2021/09/16/interactive-the-bahasa-in-malaysia',
 'https://www.thestar.com.my/news/nation/2021/08/27/interactive-over-two-million-students-eligible-for-vaccination',
 'https://www.thestar.com.my/starpicks/2021/09/13/shaping-doctors-of-the-future?utm_medium=thestar&utm_source=rosnawidget&utm_campaign=Monash+Uni+13Sep21',
 'https://www.thestar.com.my/news/nation/2021/09/09/interactive-from-the-desks-of-our-journalists',
 'https://www.thestar.com.my/news/nation/2021/09/02/interactive-how-to-boost-vaccinations-in-malaysia039s-six-target-states',
 'https://www.thestar.com.my/news/nation/2021/09/02/interactive-how-much-are-you-at-risk-from-accidents-at-work',
 'https://www.thestar.com.my/news/nation/2021/08/19/interactive-hospital-admissions-and-icu-cases-fall-in-some-states-as-vaccinations-rise',
 'https://www.thestar.com.my/news/nation/2021/08/10/unvaccinated-form-bulk-of-malaysia039s-new-covid-19-cases',
 'https://www.thestar.com.my/news/nation/2021

### Retrieve yesterday news

In [238]:
# Get yesterday date
yesterday = datetime.now() - timedelta(1)
date_required = f'{yesterday.year}/{yesterday.month:02d}/{yesterday.day:02d}'

yesterday_links = []
[yesterday_links.append(y) for y in filtered_links if re.search(date_required, y)]

print(f'No. of non-TheStar link: {len(yesterday_links)}')
yesterday_links

No. of non-TheStar link: 8


['https://www.thestar.com.my/starpicks/2021/09/15/supplementing-your-childs-nutritional-needs?utm_medium=thestar&utm_source=rosnawidget&utm_campaign=Shyan+15Sep21',
 'https://www.thestar.com.my/food/food-news/2021/09/15/enjoy-mid-autumn-celebratory-feasts-at-home',
 'https://www.thestar.com.my/food/food-news/2021/09/15/dining-amid-hornbills',
 'https://www.thestar.com.my/news/nation/2021/09/15/varsities-open-for-admission',
 'https://www.thestar.com.my/news/nation/2021/09/15/higher-education-sector-to-focus-on-safety-and-health',
 'https://www.thestar.com.my/news/nation/2021/09/15/results-of-public-university-applications-to-be-released-on-wednesday-sept-15',
 'https://www.thestar.com.my/starpicks/2021/09/15/supplementing-your-childs-nutritional-needs',
 'https://www.thestar.com.my/news/nation/2021/09/15/focus-on-high-value-added-activities']

In [239]:
new_news = []
try:
    with open('checking.txt', 'r') as f:
        checking_links = f.read().splitlines()
        
    if checking_links[0] == date_required:
        with open('checking.txt', 'a') as f:
            for url1 in yesterday_links:
                for index, url2 in enumerate(checking_links):
                    if url1 == url2:
                        break
                    elif index == len(checking_links) - 1:
                        new_news.append(url1)
                        f.write(url1 + '\n')
    else:
        with open('checking.txt', 'w') as f:
            f.write(date_required + '\n')
            for url in yesterday_links:
                new_news.append(url)
                f.write(url + '\n')  

except FileNotFoundError:
    with open('checking.txt', 'w') as f:
        f.write(date_required + '\n')
        for url in yesterday_links:
            new_news.append(url)
            f.write(url + '\n')       
                    
print(f'Links to crawl: {len(new_news)}')
new_news

Links to crawl: 0


[]

### Browse through each url and crawl the news content

In [240]:
for link in new_news:
    driver = webdriver.Chrome()
    url = f"{link}"
    print(link)
    driver.get(url)

    try:
        title = driver.find_element_by_tag_name('h1').text
    except NoSuchElementException:
        driver.close()
        driver.quit()
        continue

    try:
        author = driver.find_elements_by_xpath(".//div[@class = 'credit__authors']/a")
        author = toList(author, 'text')
        author = re.sub(",", " ", '，'.join(author))
    except NoSuchElementException:
        author = ''
        pass

    try:
        date = driver.find_element_by_xpath(".//p[@class = 'date']").text
    except NoSuchElementException:
        date = ''
        pass

    try:
        timestamp = driver.find_element_by_xpath(".//time[@class = 'timestamp']").text
    except NoSuchElementException:
        timestamp = ''
        pass

    # Extract all the text of the news content
    parent = driver.find_element_by_id('story-body')
    content = list(parent.text.split('\n'))

    # The following 4 statements are to extract all irrelevant content such as chart and image caption
    chart_text = parent.find_elements_by_xpath(".//div[contains(@class, 'flourish')]")
    inline_text = parent.find_elements_by_xpath(".//span[contains(@class, 'inline-caption')]")
    caption_text = parent.find_elements_by_xpath(".//p[contains(@class, 'caption')]")
    extra_text = chart_text + inline_text + caption_text

    full_content = toList(extra_text, 'text')
    full_content = list(set(full_content))

    # Filter out irrelevant content
    for index, i in enumerate(content):
        for j in full_content:
            if i == j:
                content[index] = ''

    # Remove the enter and empty field from the extracted news content
    real_content = ''
    for element in content:
        real_content += element

    # Determine the type of the news by using the url
    for category in all_type:
        string = f"https://www.thestar.com.my/{category.lower()}"
        result = re.search(string, link)
        if result is not None:
            break

    print(date)
    print(timestamp)
    print(title)
    print(category)
    print(author)
    print(real_content)

    news_items = {'Date': date, 'Time': timestamp, 'Type': category, 'Author': author, 'Title': title, 'Content': real_content}
    news_list.append(news_items)
    print(len(news_list))
    driver.close()
    driver.quit()
    time.sleep(3)

### Store the crawled data into data frame

In [241]:
data = pd.DataFrame(news_list)
data = data.sort_values('Time')
data.head()

KeyError: 'Time'

In [None]:
print("Shape    : ", data.shape)

### Store news to csv file

In [None]:
data.to_csv('News.csv', mode = 'a', index = False, header = False)

In [None]:
df = pd.read_csv('News.csv')
print("Shape    : ", df.shape)

<hr>

### Combine 2 data frames

In [None]:
# # Concatenate current and older data
# data = pd.concat([before, current])
# # Drop duplicates between previous and current data
# data.drop_duplicates(inplace = True, subset = ['Content'])
# # Store the data back to the News csv file
# data.to_csv('News.csv', index=False)
# data.head()

In [None]:
# print("Rows     : ", data.shape[0])
# print("Columns  : ", data.shape[1])
# print("Shape    : ", data.shape)
# print("Features : ", data.columns.tolist())