In [5]:
using Cascadia
using HTTP
using Gumbo
using TextAnalysis

"""
    get_stories_urls(pages)

Collect the `"Url"` entry of every story listed under each page's `"StoryList"` key.

The URLs are returned in iteration order (page by page, story by story) as a
`Vector{Any}`.
"""
function get_stories_urls(pages)
    collected = []
    for listing in pages
        # Pull the "Url" field out of each story entry on this page.
        append!(collected, (entry["Url"] for entry in listing["StoryList"]))
    end
    return collected
end

"""
    save_urls(urls, filepath)

Write each element of `urls` to `filepath`, one per line, overwriting any
existing file. Returns `nothing`.
"""
function save_urls(urls, filepath)
    # The do-block form closes the file even if a write throws part-way through.
    open(filepath, "w") do file
        for url in urls
            println(file, url)
        end
    end
    return nothing
end
        

"""
    fetch_story(url)

Issue an HTTP GET for `url` and return the response body parsed with
`parsehtml`.
"""
function fetch_story(url)
    response = HTTP.get(url)
    return parsehtml(String(response.body))
end

"""
    extract_story_text(parsed_html)

Return the content of the second element matching the `.lead` CSS selector in
`parsed_html`, with HTML tags removed.

Indexing fails if the document has fewer than two `.lead` matches.
"""
function extract_story_text(parsed_html)
    lead_nodes = eachmatch(Selector(".lead"), parsed_html.root)
    # The second `.lead` match is taken as the story body.
    # NOTE(review): presumably the first match is a heading/summary — confirm.
    lead_markup = string(lead_nodes[2])
    return remove_html_tags(lead_markup)
end

"""
    clean_story_text(story)

Pass `story` through `TextAnalysis.remove_whitespace` and return the result.
"""
clean_story_text(story) = TextAnalysis.remove_whitespace(story)

"""
    get_story(url)

Download the page at `url`, extract its story text, and return it after
whitespace cleanup.
"""
function get_story(url)
    # fetch -> extract -> clean, each step feeding the next.
    return url |> fetch_story |> extract_story_text |> clean_story_text
end

"""
    get_all_stories(filepath, urls, delay)

Fetch every story in `urls`, write the results to `filepath` (one story per
line, overwriting any existing file), and return them as a vector.

# Arguments
- `filepath`: destination file for the fetched stories.
- `urls`: site-relative story paths; each is appended to
  `https://www.scamalert.sg` before fetching.
- `delay`: seconds to sleep after each fetch attempt.

A story whose fetch or extraction fails is reported on stdout and skipped, so
the result may be shorter than `urls`. A user interrupt is not treated as a
fetch failure: it is rethrown so the run can actually be stopped.
"""
function get_all_stories(filepath, urls, delay)
    stories = []
    home_url = "https://www.scamalert.sg"
    for (i, url) in enumerate(urls)
        println("Fetching story $i from $url")
        try
            push!(stories, get_story(home_url * url))
        catch e
            # Let an interrupt abort the scrape instead of being logged as a
            # per-story error and silently continuing.
            e isa InterruptException && rethrow()
            println("Error occured while fetching story $i: $e")
        end
        println("Sleeping for $delay" * "s")
        sleep(delay)
    end

    # The do-block form closes the file even if a write throws.
    open(filepath, "w") do file
        for story in stories
            println(file, story)
        end
    end
    return stories
end

get_all_stories (generic function with 1 method)

In [6]:
using JSON

# Parse the saved listing pages from disk; `get_stories_urls` later reads the
# "StoryList" entry of each element.
pages = JSON.parsefile("data/pages.json")

590-element Vector{Any}:
 Dict{String, Any}("CurrentScam" => "GetStoryListAjax", "YearList" => Any[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023], "CurrentPageItemsCount" => 6, "CurrentPage" => 1, "TotalPage" => 590, "ScamTypeList" => Any[Dict{String, Any}("Slug" => "car-rental-scam", "Name" => "Car Rental Scam"), Dict{String, Any}("Slug" => "cold-call-supplier-scam", "Name" => "Cold Call Supplier Scam"), Dict{String, Any}("Slug" => "credit-for-sex-scam", "Name" => "Credit-for-Sex Scam"), Dict{String, Any}("Slug" => "cyber-extortion-scam", "Name" => "Cyber Extortion Scam"), Dict{String, Any}("Slug" => "home-room-rental-scam", "Name" => "Home/Room Rental Scam"), Dict{String, Any}("Slug" => "impersonation-scam", "Name" => "Impersonation Scam"), Dict{String, Any}("Slug" => "inheritance-scam", "Name" => "Inheritance Scam"), Dict{String, Any}("Slug" => "internet-love-scam", "Name" => "Internet Love Scam"), Dict{String, Any}("Slug" => "investment-scam", "Name" => "Investment Scam"), Dict{St

In [7]:
# Gather every story URL from the parsed pages, then fetch them all with a
# 7-second pause after each request, saving the results to data/stories.csv.
urls = get_stories_urls(pages)
stories = get_all_stories("data/stories.csv", urls, 7)

Fetching story 1 from /stories-details/Story-10Jul2023151915PM
Sleeping for 7s
Fetching story 2 from /stories-details/Story-10Jul2023134904PM
Sleeping for 7s
Fetching story 3 from /stories-details/Story-10Jul2023112951AM
Sleeping for 7s
Fetching story 4 from /stories-details/Story-09Jul2023200402PM
Sleeping for 7s
Fetching story 5 from /stories-details/Story-08Jul2023224144PM
Sleeping for 7s
Fetching story 6 from /stories-details/Story-07Jul2023170235PM
Sleeping for 7s
Fetching story 7 from /stories-details/Story-07Jul2023125613PM
Sleeping for 7s
Fetching story 8 from /stories-details/Story-06Jul2023205531PM
Sleeping for 7s
Fetching story 9 from /stories-details/Story-06Jul2023151018PM
Sleeping for 7s
Fetching story 10 from /stories-details/Story-06Jul2023012707AM
Sleeping for 7s
Fetching story 11 from /stories-details/Story-04Jul2023205915PM
Sleeping for 7s
Fetching story 12 from /stories-details/Story-04Jul2023153634PM
Sleeping for 7s
Fetching story 13 from /stories-details/Story-04J

3495-element Vector{Any}:
 "I accepted a friend request on" ⋯ 2293 bytes ⋯ "an result in financial losses."
 "Whatsapp message: Good morning," ⋯ 823 bytes ⋯ "k) invitation code are ：G4KUDL"
 "I met Alan on Bumble, where he" ⋯ 1287 bytes ⋯ "that no such position existed."
 "I connected with a person name" ⋯ 2891 bytes ⋯ "t giving you time to consider."
 "A person with the online name " ⋯ 1115 bytes ⋯ "em and not contact them again."
 "I was contacted on Telegram by" ⋯ 1513 bytes ⋯ " they disappear with my money."
 "Scammer: Hello! Just wanted to " ⋯ 319 bytes ⋯ "fer any money to you. Hang up."
 "A few days ago, someone called" ⋯ 1081 bytes ⋯ "service for reporting instead."
 "The caller will somehow know yo" ⋯ 202 bytes ⋯ "d block the number right away."
 "I received calls and messages " ⋯ 1086 bytes ⋯ "ceive hardworking individuals."
 "I met a guy on a social app wh" ⋯ 1748 bytes ⋯ "s and avoid blind investments."
 "Someone contacted me, claiming " ⋯ 799 bytes ⋯ "idn't respond to that 