In [45]:
import platform as python_platform

class Platform:
    '''Operating-system names that the value of platform.system() is compared against.'''
    WINDOWS = "Windows"
    MACOS = "Darwin"

# Name of the operating system the kernel is running on.
platform = python_platform.system()

# Location of the DataCrawling project on each development machine.
windows_root_directory = "D:/Vector A/0. KHTN/Nam 4/HKII/Thesis/Brainstorming/DataCrawling"
mac_root_directory = "/Users/4rr311/Documents/VectorA/KHTN/Nam4/HKII/Thesis/Brainstorming/DataCrawling"

# Every system other than Windows falls back to the macOS location.
if platform == Platform.WINDOWS:
    root_directory = windows_root_directory
else:
    root_directory = mac_root_directory

In [46]:
import os
from os.path import join, getsize
import re

import sys
sys.path.append(root_directory)
import random

from bs4 import BeautifulSoup

import file_utils as fu

In [47]:
def copy_folder_structure(source, destination):
    '''
        Recreate the directory tree of the source directory under the destination directory.

        Only directories are created; files are not copied. Directories that
        already exist under destination are left as they are.

        source: directory whose sub-directory layout is mirrored
        destination: directory under which the same layout is created
    '''
    for current_root, dir_names, _ in os.walk(source):
        # Location of the visited directory relative to the source root.
        # relpath only strips the leading source prefix, unlike str.replace,
        # which would also remove later occurrences of the same text.
        relative_root = os.path.relpath(current_root, source)

        for dir_name in dir_names:
            # normpath drops the "." component produced for the top level
            target_dir = os.path.normpath(
                os.path.join(destination, relative_root, dir_name)
            )

            os.makedirs(target_dir, exist_ok=True)

In [48]:
# Directory that will receive the mirrored folder layout.
song_id_of_song_parts_dir = fu.relative_to_absolute_path(
    "DataPreprocessing/song_id_of_song_parts", root_path=root_directory
)

# Directory holding the raw HTML files, used as the layout template.
raw_html_dir = fu.relative_to_absolute_path(
    "GetAllSongHTML/song_list/raw_html", root_path=root_directory
)

# Recreate the sub-directories of raw_html under song_id_of_song_parts.
copy_folder_structure(source=raw_html_dir, destination=song_id_of_song_parts_dir)

In [49]:
def extract_metrics_and_conclusion_from_html(html_raw_content):
    '''
        Extract the metrics and the conclusion texts from the HTML content of a song page.

        html_raw_content: HTML markup of the song page

        Returns a tuple (metrics, metrics_conclusion, key_conclusion):
            metrics: dict mapping the text of each h5 to the text of the h2
                found in the same div of the metrics section
            metrics_conclusion: text of the first p sibling that follows the
                "Chord And Melody Metrics" heading
            key_conclusion: text of the first p inside the next div sibling of
                the heading's grandparent, or "" when that structure is absent

        Raises AttributeError when the "Chord And Melody Metrics" heading, or
        the div / p / h5 / h2 elements expected after it, cannot be found.
    '''
    metrics = {}
    key_conclusion = ""

    soup = BeautifulSoup(html_raw_content, "html.parser")

    # Find the h3 whose text contains "Chord And Melody Metrics".
    # The predicate can be handed None (the .string of an h3 that has no
    # single text child), so guard before using `in`.
    h3 = soup.find(
        "h3",
        string=lambda text: text is not None and "Chord And Melody Metrics" in text
    )

    # Collect the divs that hold the metric name (h5) and value (h2)
    metric_divs = h3.find_next_sibling("div").find_all("div")

    for div in metric_divs:
        h5 = div.find("h5")
        h2 = div.find("h2")

        metrics[h5.text] = h2.text

    metrics_conclusion_p = h3.find_next_sibling("p")
    metrics_conclusion = metrics_conclusion_p.text

    try:
        key_conclusion_p = h3.parent.parent.find_next_sibling("div").find("p")
        key_conclusion = key_conclusion_p.text
    except AttributeError:
        # A missing link in the lookup chain yields None; the key conclusion is
        # optional, so keep the empty default instead of failing.
        pass

    return metrics, metrics_conclusion, key_conclusion

In [50]:
def extract_all_a_hrefs_from_html(html_raw_content, text_of_a_tag="Open In Hookpad"):
    '''
        Return the href of every a tag whose text contains text_of_a_tag as a substring.

        html_raw_content: HTML markup to search
        text_of_a_tag: text that must occur inside the a tag's string

        Returns a list of href strings in document order. a tags without an
        href attribute are skipped.
    '''
    # Example:
    # <h2 class="margin-0">
    #     <a class="" href="https://hookpad.hooktheory.com?idOfSong=JkmZRne-mqn&amp;enableYouTube=false&amp;openBandEditorOnInit=true">
    #         Open In Hookpad
    #     </a>
    # </h2>

    soup = BeautifulSoup(html_raw_content, "html.parser")

    def _contains_wanted_text(tag_string):
        # tag_string is None for a tags without a single text child.
        # Passing a plain str as `string=` would demand an exact match and miss
        # anchors whose text is padded with whitespace, as in the example above.
        return tag_string is not None and text_of_a_tag in tag_string

    # href=True keeps only a tags that actually carry an href attribute
    a_tags = soup.find_all("a", href=True, string=_contains_wanted_text)

    hrefs = [a["href"] for a in a_tags]

    return hrefs

In [51]:
def extract_song_part_ids_from_hrefs(open_in_hookpad_hrefs):
    '''
        Return the song part ids parsed from the hrefs of "Open In Hookpad" a tags.

        The id is the text between "idOfSong=" and the next "&". When no "&"
        follows, the remainder of the href is used and a notice is printed.
        Hrefs that yield no id are reported with a printed message and skipped.
    '''
    # Example href: https://hookpad.hooktheory.com?idOfSong=Weglnna-orY&amp;enableYouTube=false&amp;openBandEditorOnInit=true

    id_marker = "idOfSong="
    collected_ids = []

    for href in open_in_hookpad_hrefs:
        # Split around the first "idOfSong="; marker_hit is "" when it is absent
        _, marker_hit, after_marker = href.partition(id_marker)

        if marker_hit:
            # Keep what precedes the first "&"; without "&" the whole remainder is kept
            candidate_id, ampersand_hit, _ = after_marker.partition("&")
            if not ampersand_hit:
                print(f"extract_song_part_ids_from_hrefs: Cannot find '&' in {href}")
        else:
            candidate_id = ""
            print(f"extract_song_part_ids_from_hrefs: Cannot find 'idOfSong=' in {href}")

        if candidate_id:
            collected_ids.append(candidate_id)
        else:
            print(f"extract_song_part_ids_from_hrefs: Cannot extract song part id from {href}")

    return collected_ids

# Quick check of the parser on one sample "Open In Hookpad" href
print(
    extract_song_part_ids_from_hrefs(
        [
            "https://hookpad.hooktheory.com?idOfSong=Weglnna-orY&amp;enableYouTube=false&amp;openBandEditorOnInit=true"
        ]
    )
)

['Weglnna-orY']


In [52]:
def extract_song_parts_ids(html_raw_content):
    '''
        Return the song part ids found in the HTML content of a song page.

        The ids are parsed from the hrefs of the "Open In Hookpad" a tags.
    '''
    open_in_hookpad_hrefs = extract_all_a_hrefs_from_html(
        html_raw_content,
        "Open In Hookpad"
    )

    return extract_song_part_ids_from_hrefs(open_in_hookpad_hrefs)

In [53]:
def get_song_json_list_by_alphabet(letter: str):
    '''
        Return the list of songs whose artist name starts with letter, read
        from the song_list_link_by_artist/json directory.

        The data comes from the file named <lower-cased letter>.json in that directory.
    '''
    json_dir = fu.relative_to_absolute_path(
        "GetAllSongHTML/song_list_link_by_artist/json",
        root_path=root_directory
    )

    # One JSON file per letter, named after the lower-cased letter
    return fu.read_data_from_json_file(f"{json_dir}/{letter.lower()}.json")

In [54]:
def song_link_to_raw_html_file_path(song_link: str):
    '''
        Return the absolute path of the raw HTML file that belongs to song_link.

        The relative file path derived from the link is placed under
        GetAllSongHTML/song_list/raw_html and resolved against root_directory.
    '''
    html_path_in_raw_dir = fu.song_link_to_relative_html_file_path(song_link)

    return fu.relative_to_absolute_path(
        f"GetAllSongHTML/song_list/raw_html/{html_path_in_raw_dir}",
        root_path=root_directory
    )

In [55]:
# Try the extractors on one randomly chosen song from the "a" list.
song_json_list = get_song_json_list_by_alphabet("a")

song_index = random.randint(0, len(song_json_list) - 1)

# Reuse the list loaded above instead of reading the JSON file again
song_json = song_json_list[song_index]

link = song_link_to_raw_html_file_path(song_json["link"])

print(song_json)
print(link)

html_raw_content = fu.read_data_from_html_file(link)

metrics, metrics_conclusion, key_conclusion = extract_metrics_and_conclusion_from_html(
    html_raw_content
)

# Keep the ids in their own variable: song_id_of_song_parts_dir holds a
# directory path and must not be rebound to a list of ids.
song_part_ids = extract_song_parts_ids(html_raw_content)

print()
print(f"Metrics: {metrics}")
print()
print(f"Metrics Conclusion: {metrics_conclusion}")
print()
print(f"Key Conclusion: {key_conclusion}")
print()
print(f"Song Parts ID: {song_part_ids}")

{'artist_name': 'April', 'artist_link': 'https://www.hooktheory.com/theorytab/artists/a/april', 'song_name': 'Oh My Mistake', 'link': 'https://www.hooktheory.com/theorytab/view/april/oh-my-mistake'}
/Users/4rr311/Documents/VectorA/KHTN/Nam4/HKII/Thesis/Brainstorming/DataCrawling/GetAllSongHTML/song_list/raw_html/a/april/oh-my-mistake.html

Metrics: {'Chord Complexity': '66', 'Melodic Complexity': '40', 'Chord-Melody Tension': '31', 'Chord Progression Novelty': '39', 'Chord Bass Melody': '0'}

Metrics Conclusion:  Oh My Mistake has higher complexity than the average song  in terms Chord Complexity. 

Key Conclusion:  Oh My Mistake is written in the key of B♭ Major.  

According to the Theorytab database, it is the 8th most popular key among Major keys and     the 12th most popular  among all keys. Major keys, along with minor keys, are a common choice for popular songs.  The three most important chords, built off the 1st, 4th and 5th scale degrees are all major chords (B♭ Major, E♭ Majo