In [None]:
#URL SCRAPE FOR BOTH VNEXPRESS AND VIETNAMNET
import httplib2
from bs4 import BeautifulSoup, SoupStrainer

# One HTTP client object, reused by web_scrape() for every request.
http = httplib2.Http()

# Search-results URL of the news outlet for each keyword; the page number is
# appended to the end of the string.
# For VIETNAMNET, add another variable (e.g. "crypto_keyword_url") and build
# its page list the same way as below.
blockchain_keyword_url = 'https://timkiem.vnexpress.net/?q=blockchain&media_type=all&fromdate=0&todate=0&latest=on&cate_code=&search_f=title,tag_list&date_format=all&page='
bitcoin_keyword_url = 'https://timkiem.vnexpress.net/?q=bitcoin&media_type=all&fromdate=0&todate=0&latest=on&cate_code=&search_f=title,tag_list&date_format=all&page='

# Result pages are numbered from 1: pages 1-12 for "blockchain", 1-26 for "bitcoin".
blockchain_list_url = [blockchain_keyword_url + str(page) for page in range(1, 13)]
bitcoin_list_url = [bitcoin_keyword_url + str(page) for page in range(1, 27)]

def web_scrape(url):
    """Print and return every link ending in "html" found on one page.

    Args:
        url: Address of the page to download with the module-level ``http``
            client.

    Returns:
        The matching ``href`` values in document order, or an empty list when
        the server does not answer with HTTP 200.
    """
    status, response = http.request(url)
    if status.status != 200:
        # An error page holds no article links; report it instead of parsing it.
        print(f"Skipped {url}: HTTP {status.status}")
        return []

    soup = BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a'))
    # find_all('a', href=True) yields only <a> tags that carry an href, so
    # has_attr() is never called on a non-tag node (e.g. a doctype).
    links = [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith("html")
    ]
    for href in links:
        print(f"{href}")
    return links

# Scrape every blockchain results page first, then every bitcoin results page.
for page_url in blockchain_list_url + bitcoin_list_url:
    web_scrape(page_url)

#WARNING: the outputs are filled with irrelevant links. Manual check is required to obtain the correct article links. 

In [None]:
#VNEXPRESS TITLES EXTRACTION
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Input table for the title extraction; the loop below reads its "url" column.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv") #obtained after applying URL scrape code

def fetch_title(url):
    """Download *url* and return the text of its ``<title>`` element.

    Args:
        url: Address of the article page.

    Returns:
        The stripped title text, "No title found" when the page has no
        ``<title>`` element, or "Error fetching title" when the download or
        parsing fails (including HTTP 4xx/5xx answers).
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures so the title of an error page is
        # not recorded as an article title.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find("title")
        return title_tag.text.strip() if title_tag else "No title found"
    except Exception:
        # Best-effort: one bad URL must not abort the whole batch.
        return "Error fetching title"

# Fetch the title of every URL in the table, pausing between requests.
total = len(df)
titles = []
for position, url in enumerate(df['url'], start=1):
    print(f"Fetching {position}/{total}: {url}")
    titles.append(fetch_title(url))
    time.sleep(1) #Delay to avoid server overload

# Store the titles as the first column and write the table out.
df.insert(0, 'title', titles)
df.to_csv("...csv", index=False)

In [None]:
#VNEXPRESS CONTENT SCRAPE
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import time

# Input table for the content scrape; the loop below reads its "url" column.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv") #obtained after applying the TITLE EXTRACTION code

def extract_vnexpress_data(url):
    """Download an article page and return its publication date and text.

    Args:
        url: Address of the article page.

    Returns:
        A ``(pub_date, content)`` tuple. ``pub_date`` is formatted as
        DD-MM-YYYY, or is "No date" when the ``pubdate`` meta tag is missing
        or empty, or "Unrecognized format" when its value cannot be parsed.
        ``content`` is the text of every ``<p>`` element on the page with
        whitespace collapsed to single spaces. When the download or parsing
        fails (including HTTP 4xx/5xx answers) the tuple is
        ``("Error", "Error: <message>")``.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        meta_tag = soup.find("meta", attrs={"name": "pubdate"})
        if meta_tag and meta_tag.get("content"):
            raw_date = meta_tag["content"][:10]  # e.g. '2025-06-20'
            try:
                pub_date = datetime.strptime(raw_date, "%Y-%m-%d").strftime("%d-%m-%Y")
            except ValueError:
                # A malformed date must not throw away the article text.
                pub_date = "Unrecognized format"
        else:
            pub_date = "No date"

        paragraphs = soup.find_all("p")
        # separator=" " keeps words apart when a paragraph contains inline
        # tags (<b>, <a>, ...); the regex below collapses any extra spaces.
        article_text = " ".join(p.get_text(separator=" ", strip=True) for p in paragraphs)
        content_clean = re.sub(r'\s+', ' ', article_text).strip()

        return pub_date, content_clean

    except Exception as e:
        # Best-effort: one bad URL must not abort the whole batch.
        return "Error", f"Error: {str(e)}"

# Scrape every URL in turn, keeping the (date, content) pair for each row.
scraped = []
total = len(df)
for position, url in enumerate(df["url"], start=1):
    print(f"Fetching {position}/{total}: {url}")
    scraped.append(extract_vnexpress_data(url))
    time.sleep(1) #Delay to avoid server overload

# Split the pairs into the two result columns and write the table out.
dates = [pub_date for pub_date, _ in scraped]
contents = [text for _, text in scraped]
df["date"] = dates
df["content"] = contents
df.to_csv("...csv", index=False)


In [None]:
#TUOITRE
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re

# All the links were collected manually and saved in a CSV; no BeautifulSoup
# link scraping is done for this outlet. The loop below reads the "url" column.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv")

def extract_date(soup):
    """Return the publication date found in a parsed page as DD-MM-YYYY.

    Args:
        soup: Parsed page; it is searched for a ``div`` whose ``data-role``
            attribute is "publishdate".

    Returns:
        The date formatted as DD-MM-YYYY, "No date" when the ``div`` is
        missing, or "Unrecognized format" when its text is empty or does not
        start with a DD/MM/YYYY date.
    """
    date_div = soup.find("div", attrs={"data-role": "publishdate"})
    if not date_div:
        return "No date"

    # split() with no argument breaks on any whitespace run, so a newline or
    # tab between the date and the time is handled like a space.
    tokens = date_div.text.split()
    if not tokens:
        return "Unrecognized format"
    try:
        parsed = datetime.strptime(tokens[0], "%d/%m/%Y")
    except ValueError:
        return "Unrecognized format"
    return parsed.strftime("%d-%m-%Y")

def extract_content(soup):
    """Return the article text held in the page's ``detail-content`` div.

    The text of every ``<p>`` inside that div is joined with spaces and all
    whitespace runs are collapsed to a single space. Returns "No content"
    when the div is not found.
    """
    body = soup.find("div", class_="detail-content")
    if not body:
        return "No content"

    pieces = [p.get_text(separator=" ", strip=True) for p in body.find_all("p")]
    joined = " ".join(pieces)
    return re.sub(r'\s+', ' ', joined).strip()

# Download and parse every article; a failure on one URL marks both of its
# fields as "Error" and the batch carries on.
dates = []
contents = []
total = len(df)
for position, url in enumerate(df["url"], start=1):
    print(f"Fetching {position}/{total}: {url}")
    try:
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.content, "html.parser")
        date, content = extract_date(soup), extract_content(soup)
    except Exception:
        date = content = "Error"
    dates.append(date)
    contents.append(content)
    time.sleep(1) #Delay

# Attach the results and write the table out.
df["date"] = dates
df["content"] = contents
df.to_csv("tuoitre_update.csv", index=False)

In [None]:
#VNECONOMY
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

#All the links were collected manually and saved in a CSV.
"""
Tips:
The URL of a VNECONOMY article can apparently be derived from its title
by merging all the words of the title,
except that certain special characters are converted
to a different form (e.g., "." -> "-").
"""

# Input table of article links; the loop below reads its "url" column.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv")

def extract_content(url):
    """Download *url* and return the text of its ``detail__content`` div.

    The text of every ``<p>`` inside that div is joined with spaces and all
    whitespace runs are collapsed to a single space. Returns
    "No content found" when the div is absent and "Error: <message>" when
    the download or parsing raises.
    """
    try:
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.content, "html.parser")

        body = soup.find("div", class_="detail__content")
        if not body:
            return "No content found"

        pieces = [p.get_text(separator=" ", strip=True) for p in body.find_all("p")]
        joined = " ".join(pieces)
        return re.sub(r'\s+', ' ', joined).strip()
    except Exception as e:
        return f"Error: {str(e)}"

# Scrape the article text for every URL, pausing between requests.
total = len(df)
contents = []
for position, url in enumerate(df["url"], start=1):
    print(f"Fetching {position}/{total}: {url}")
    contents.append(extract_content(url))
    time.sleep(1) #delay 

# Attach the results and write the table out.
df["content"] = contents
df.to_csv("example.csv", index=False)


In [None]:
#VIETNAMNET
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
from datetime import datetime

# Input table of article links; the loop below reads its "url" column.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv") #obtained after applying URL scrape code

def get_time(url):
    """Download *url* and return its publication date as YYYY-MM-DD.

    The date is read from the ``bread-crumb-detail__time`` div: the text after
    the last comma is taken, then the part before the first "-" is parsed as
    DD/MM/YYYY. Returns "No date found" when the div is absent and
    "Error: <message>" when the download or parsing raises.
    """
    try:
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.content, "html.parser")

        time_div = soup.find("div", class_="bread-crumb-detail__time")
        if not time_div:
            return "No date found"

        # e.g. '14/07/2025 - 14:38' once the leading weekday part is dropped
        after_comma = time_div.text.strip().rsplit(",", 1)[-1].strip()
        # keep only the date part, '14/07/2025'
        date_part = after_comma.partition("-")[0].strip()
        return datetime.strptime(date_part, "%d/%m/%Y").strftime("%Y-%m-%d")
    except Exception as e:
        return f"Error: {str(e)}"

def get_content(url):
    """Download *url* and return the text of every ``<p>`` on the page.

    Args:
        url: Address of the article page.

    Returns:
        The paragraph texts joined with spaces and with whitespace runs
        collapsed to a single space, or "Error: <message>" when the download
        or parsing fails (including HTTP 4xx/5xx answers).
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        paragraphs = soup.find_all("p")
        # separator=" " keeps words apart when a paragraph contains inline
        # tags (<b>, <a>, ...); the regex below collapses any extra spaces.
        article_text = " ".join(p.get_text(separator=" ", strip=True) for p in paragraphs)
        content_clean = re.sub(r'\s+', ' ', article_text).strip()
        return content_clean
    except Exception as e:
        # Best-effort: one bad URL must not abort the whole batch.
        return f"Error: {str(e)}"

def get_title(url):
    """Download *url* and return the text of its ``<title>`` element.

    Args:
        url: Address of the article page.

    Returns:
        The stripped title text, "No title found" when the page has no
        ``<title>`` element, or "Error: <message>" when the download or
        parsing fails (including HTTP 4xx/5xx answers).
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures so the title of an error page is
        # not recorded as an article title.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        title_tag = soup.find("title")
        return title_tag.text.strip() if title_tag else "No title found"
    except Exception as e:
        # Best-effort: one bad URL must not abort the whole batch.
        return f"Error: {str(e)}"

# Collect title, date and content for every URL. Each helper downloads the
# page itself, so every URL is requested three times per iteration.
titles, dates, contents = [], [], []
total = len(df)

for position, url in enumerate(df["url"], start=1):
    print(f"Fetching {position}/{total}: {url}")
    titles.append(get_title(url))
    dates.append(get_time(url))
    contents.append(get_content(url))
    time.sleep(1) ##Delay to avoid server overload

# Attach the three result columns, keep only the output columns in their
# final order, and write the table out.
df = df.assign(title=titles, date=dates, content=contents)
df = df[["title", "url", "date", "content"]] 

df.to_csv("BITCOIN.csv", index=False)

In [None]:
#VNECONOMY scrape for "No_content", "Error", skipped rows
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Reload the table that the VNECONOMY scrape wrote; the code below reads its
# "url" column and rewrites its "content" column.
df = pd.read_csv("example.csv")

def fetch_vneconomy_content(url):
    """Download *url* and return all text inside its ``detail__content`` div.

    Unlike a paragraph-only extraction, this takes every text node under the
    div, so articles that do not wrap their body in ``<p>`` tags are still
    captured.

    Args:
        url: Address of the article page.

    Returns:
        The div's text fragments joined with single spaces,
        "No content found" when the div is absent, or "Error: <message>"
        when the download or parsing raises.
    """
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        # Div for VnEconomy content
        content_div = soup.find("div", class_="detail__content")
        if content_div:
            # stripped_strings yields each text fragment already stripped, so
            # the joined result needs no further trimming.
            return " ".join(content_div.stripped_strings)

        return "No content found"
    except Exception as e:
        return f"Error: {e}"

# Re-scrape only the rows whose first pass failed (missing, empty,
# "No content", "No content found", "Error" or "Error: ..."); rows that
# already hold article text are left untouched.
df["content"] = df["content"].astype(object)  # allow text in an all-missing column
content_text = df["content"].astype(str).str.strip()
needs_rescrape = (
    df["content"].isna()
    | content_text.isin(["", "No content", "No content found", "Error"])
    | content_text.str.startswith("Error:")
)

for row in df.index[needs_rescrape]:
    df.at[row, "content"] = fetch_vneconomy_content(df.at[row, "url"])
    time.sleep(1)  # Delay to avoid server overload

df.to_csv("example.csv", index=False)

In [None]:
# Cleaning PIPELINES
# The following code might not apply to every source; it only provides a general framework.
# It depends on the specific issues of a dataset. 

# The code is designed to handle common problems like:
#   - removing quotation marks
#   - filtering out error messages
#   - collecting rows with missing content for potential re-scraping

import pandas as pd

# Scraped dataset to clean.
# NOTE: "...csv" looks like a placeholder - set the real file path before running.
df = pd.read_csv("...csv") 
# Name of the column holding the article text, passed to the functions below
# as language_content: "content" for Vietnamese, "content_en" for English.
# language_content = "content" #for Vietnamese content || #"content_en" #for English content

def quotation_mark_removal(df, language_content):
    """Strip single and double quotation marks from the title and text columns.

    Args:
        df: Table with a "title" column and a ``language_content`` column;
            both columns are overwritten in place.
        language_content: Name of the column holding the article text.

    Returns:
        The same ``df`` object, with quotes removed from both columns.
        Missing values stay missing instead of becoming the text "nan".
    """
    for column in ("title", language_content):
        original = df[column]
        cleaned = original.astype(str).str.replace(r"[\"']", "", regex=True)
        # astype(str) turns missing values into text such as "nan"; put the
        # original missing value back so later steps can still detect it.
        df[column] = cleaned.where(original.notna(), original)
    return df

def filter_content(df, language_content):
    """Drop the rows whose text or URL is a scraper failure marker.

    A row is dropped when its ``language_content`` value, ignoring case and
    surrounding whitespace, is "error", "no content" or "no content found",
    or starts with "error:" (any scraper error message), or when its "url"
    value is "error". Rows with missing content are kept.

    Args:
        df: Table with a "url" column and a ``language_content`` column.
        language_content: Name of the column holding the article text.

    Returns:
        A new table without the failed rows, re-indexed from 0.
    """
    failure_markers = ["error", "no content", "no content found"]

    content = df[language_content].astype(str).str.strip().str.lower()
    url = df["url"].astype(str).str.strip().str.lower()

    failed = content.isin(failure_markers) | content.str.startswith("error:")
    keep = ~failed & (url != "error")
    return df[keep].reset_index(drop=True)

#SPECIAL ISSUES
#Sometimes the content contains double quotation marks that cause the text analysis to malfunction.
#The following function replaces any "text" with 'text'
def replace_quotes(text):
    """Replace every "quoted" span in *text* with a 'quoted' span.

    Args:
        text: A cell value. Anything that is not a string (None, NaN,
            numbers) is returned unchanged.

    Returns:
        The text with each pair of double quotes turned into single quotes;
        an unpaired double quote is left as is.
    """
    # Imported here because this cell's own imports only bring in pandas.
    import re

    if not isinstance(text, str):
        return text
    # Replace any "text" with 'text'
    return re.sub(r'"(.*?)"', r"'\1'", text)

# Run replace_quotes over every value of the content column.
df['content'] = df['content'].map(replace_quotes)


#Collecting rows with 'No content found' to re-scrape for contents
def no_content_df_collection(df, language_content): 
    """Return the rows whose text is exactly 'No content found'.

    Args:
        df: Scraped table.
        language_content: Name of the column holding the article text.

    Returns:
        A copy of the matching rows, keeping their original index so they
        can be matched back to ``df`` after re-scraping. ``df`` itself is
        not modified.
    """
    no_content_df = df[df[language_content] == 'No content found']
    return no_content_df.copy()

# Column that holds the article text: "content" for Vietnamese content,
# "content_en" for English content.
language_content = "content"

# Collect the rows to re-scrape first: filter_content() below drops every
# 'No content found' row, so collecting afterwards would find nothing.
no_content_df = no_content_df_collection(df, language_content)

df = quotation_mark_removal(df, language_content)

df = filter_content(df, language_content)

# Keep one row per title, then write the cleaned table out.
df = df.drop_duplicates(subset=["title"])
df.to_csv("...csv", index=False)