<a href="https://colab.research.google.com/github/Haikoo96/Kpop-Trend-Analysis/blob/master/scraping%20and%20preprocess/K_IDOL_db_scrape_tojson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Objective**
1. Scrape the names of all K-Idol groups and their members, and store them as key-value pairs (group name → member names).
2. These key-value pairs serve as the ground truth against which the keywords extracted by the NER model are matched.

In [2]:
import requests
from bs4 import BeautifulSoup
import string
import pandas as pd
import numpy as np
import json
import re

In [3]:
url = 'https://kpopping.com/profiles/the-groups'

# Send a GET request to the page. The timeout keeps the cell from hanging
# indefinitely if the site never answers.
response = requests.get(url, timeout=30)

# Stop right here on anything other than HTTP 200. Only printing a message
# would leave `elements` undefined, and the failure would resurface in a later
# cell as an unrelated NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage (HTTP {response.status_code}): {url}"
    )

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Every node carrying the `items` CSS class; the following cells read the text
# of each node's parent.
elements = soup.find_all(class_='items')

## **First obtain the raw list of k-idol groups and members**

In [6]:
# Raw list of K-IDOL groups and members: one sub-list of text lines per scraped
# element (taken from the element's parent node), with empty lines dropped.
#
# The empty strings are filtered while the sub-lists are built. Calling
# list.remove() inside a loop over the same list skips the item that follows
# every removal, so consecutive empty strings would survive.
kidol_lst = [
    [line for line in element.parent.text.split('\n') if line != '']
    for element in elements
]

# Peek at the second sub-list
kidol_lst[1]

['A',
 'A SIX',
 '( Park Boeun',
 ', Won Jimin',
 ', J-Min',
 ', Ling Qi',
 ', Lee Han Bin',
 ', Youngseo',
 ')',
 'A Train To Autumn',
 '( Somi',
 ', Lee A Young',
 ', Subin',
 ', Jihyeon',
 ')',
 'A_Label',
 '( Seunga',
 ', Hyeni',
 ', Rina',
 ')',
 'A-Daily',
 '( MUU',
 ', NoA',
 ', ChaeE',
 ', Yunseol',
 ', Youngbi',
 ', Jei',
 ', Cho Jisun',
 ', Sena',
 ')',
 'A-DEAN',
 '( Jimin',
 ', Yuki',
 ', Joohyun',
 ', Haru',
 ')',
 'A-Fati',
 '( Jiyun',
 ', Hyoi',
 ')',
 'A-FEEL',
 '( Haeseon',
 ', Mini',
 ', Jini',
 ', Ssun',
 ', Mina',
 ')',
 'A-Jax',
 '( Jaehyung',
 ', Seungjin',
 ', Jihu',
 ', Sungmin',
 ', Joonghee',
 ', Yunyoung',
 ', Dowoo',
 ', Seungyub',
 ')',
 'A-Plus',
 '( Dayoung',
 ', Hyoeun',
 ', Yeonhee',
 ')',
 'A-plus',
 '( Song Hwanhee',
 ', Lee Heeju',
 ', Byun Subin',
 ', Kim Jiyun',
 ', Kim Jamin',
 ', An Soyeon',
 ', Jihyun',
 ')',
 'A-Prince',
 '( Minhyuk',
 ', J.Vin',
 ', Sun',
 ', J.Jun',
 ')',
 'A-seed',
 '( Jihyun',
 ', Mini',
 ', Soyeon',
 ', Soly',
 ')',
 'A:FA

## **Processing the raw "kidol_lst"**
- `kidol_lst` is a nested list. Apart from the very first sub-list, the sub-lists are in alphabetical order, and the first element of each sub-list is the letter it covers.
- In other words, all the idol groups within one sub-list begin with the same letter.
- Given this structure, it makes sense to organize the data into key-value pairs: the keys are the letters of the alphabet, and each value maps a group's name to the names of its members.

In [7]:
# putting the raw list to two key-value pairs:
# one for special characters and the other for the alphabetically ordered one
def organize_idol_lst(lst, tar_str, letter_to_avoid):
    """Group raw scraped lines into ``{group name: member lines}`` mappings.

    Every sub-list of ``lst`` is scanned for runs that begin with an element
    starting with the opening character of ``tar_str`` and end with the first
    following element that is exactly its closing character. The element right
    before the run is used (stripped) as the group name; the run itself,
    delimiters included, becomes the value.

    Args:
        lst: List of lists of strings (the raw lines).
        tar_str: Delimiter pair; must be one of ``'()'``, ``'{}'``, ``'[]'``.
        letter_to_avoid: Prefix of elements that start with the opening
            character but must not be treated as the start of a member run
            (e.g. a group name such as ``'(G)...'``). An empty string disables
            the check.

    Returns:
        A tuple ``(special_char_dict, alpha_order_dict)``:

        * ``special_char_dict`` is ``{'#': {...}}``. It receives every group
          found in the first sub-list, plus any group from a later sub-list
          whose name does not start with an ASCII letter.
        * ``alpha_order_dict`` has one key per uppercase ASCII letter, each
          mapping to the groups whose name starts with that letter
          (case-insensitive).

        ``({}, {})`` is returned when ``tar_str`` is not an accepted pair.
    """
    if tar_str not in ['()', '{}', '[]']:
        print('Only a pair of parentheses, brackets, or braces is accepted such as (), {}, []')
        return {}, {}  # Early return if tar_str is not valid

    print('Ready to organize')
    opener, closer = tar_str
    special_char_dict = {'#': {}}
    alpha_order_dict = {letter: {} for letter in string.ascii_uppercase}

    for i, sub_lst in enumerate(lst):
        for idx, element in enumerate(sub_lst):
            if not element.startswith(opener):
                continue

            # The avoid-prefix is checked on the element itself: a group name
            # that happens to begin with the opening character is not the
            # start of a member run.
            if letter_to_avoid and element.startswith(letter_to_avoid):
                continue

            # idx - 1 would silently wrap around to the last element when
            # idx == 0, so the missing key is detected explicitly.
            name_key = sub_lst[idx - 1].strip() if idx > 0 else ''
            if not name_key:
                print(f"No key found before {element} at index {idx}.")
                continue

            try:
                end_idx = sub_lst.index(closer, idx)  # Find the matching end index
            except ValueError:
                print(f"Matching {tar_str[1]} not found for {element} at index {idx}.")
                continue

            # Groups whose initial is not A-Z are filed under '#' rather than
            # being written to a throwaway dict and lost.
            initial = name_key[0].upper()
            if i == 0 or initial not in alpha_order_dict:
                target_dict = special_char_dict['#']
            else:
                target_dict = alpha_order_dict[initial]

            target_dict[name_key] = sub_lst[idx:end_idx + 1]  # Include end_idx in the slice

    return special_char_dict, alpha_order_dict


In [8]:
# Split the raw lines into the special-character dictionary and the A-Z dictionary,
# using parentheses as the delimiter pair and '(G)' as the prefix to avoid.
special_char_idol, normal_char_idol = organize_idol_lst(
    lst=kidol_lst,
    tar_str='()',
    letter_to_avoid='(G)',
)

Ready to organize


In [9]:
# Number of groups filed under the letter 'B'
len(normal_char_idol['B'])

109

In [10]:
# Sanity check: count how many group names filed under 'B' also appear verbatim
# in the third raw sub-list (presumably the 'B' list -- verify against the raw data).
raw_third_sublist = kidol_lst[2]
count = sum(1 for key in normal_char_idol['B'] if key in raw_third_sublist)

print(count)

109


In [12]:
# Inspect the group -> member-lines mapping filed under the letter 'B'
normal_char_idol['B']

{'B.A.P': ['( Youngjae',
  ', Bang Yongguk',
  ', Jongup',
  ', Daehyun',
  ', Zelo',
  ', Himchan',
  ')'],
 'B.D.U': ['( Seunghun', ', Jay Chang', ', Kim Minseo', ', Bitsaeon', ')'],
 'B.Dolls': ['( Yewon',
  ', Taeyoon',
  ', Seungjoo',
  ', Seo I',
  ', Hayeon',
  ', Geunyoung',
  ')'],
 'B.I.G': ['( Heedo',
  ', Benji',
  ', Gunmin',
  ', Minpyo',
  ', Jinseok',
  ', J-Hoon',
  ')'],
 'B.O.Y': ['( Kook Heon', ', Yuvin', ')'],
 'B1A4': ['( Jinyoung', ', CNU', ', Baro', ', Sandeul', ', Gongchan', ')'],
 'B2ST (BEAST)': ['( Gikwang',
  ', Yoseop',
  ', Dongwoon',
  ', Doojoon',
  ', Yong Jun Hyung',
  ', Hyunseung',
  ')'],
 'BaBa': ['( Hayun',
  ', Youzin',
  ', Jin Dozin',
  ', Seungha',
  ', Kim Hye-won',
  ', Serin',
  ', Somi',
  ', Bimby',
  ', Byulha',
  ', Dayul',
  ', Hyoah',
  ', Seoae',
  ', Yera',
  ', Sooreem',
  ', Seyul',
  ', Chaeha',
  ', Starlight',
  ', Pureum',
  ', Roha',
  ', Johwa',
  ', Haena',
  ')'],
 'BABY BLUE': ['( Jinaon', ', Do Ayun', ')'],
 'Baby Boo':

In [13]:
# further cleaning the alphabetically ordered dictionary
# For every member line, remove the characters `(`, `)` and `,`, then trim the
# surrounding whitespace. Spaces inside a name are kept ('Park Boeun' stays
# 'Park Boeun'), matching the group-name keys, which also keep their spaces.
# Lines that end up empty (e.g. the lone ')') are dropped.
normal_char_idol = {
    letter: {
        group: [
            name
            for name in (re.sub(r'[(),]', '', raw).strip() for raw in members)
            if name
        ]
        for group, members in groups.items()
    }
    for letter, groups in normal_char_idol.items()
}

In [16]:
# further cleaning the special charactered dictionary
# For every member line, remove the characters `(`, `)` and `,`, then trim the
# surrounding whitespace. Spaces inside a name are kept, matching the
# group-name keys, which also keep their spaces. Lines that end up empty
# (e.g. the lone ')') are dropped.
special_char_idol = {
    bucket: {
        group: [
            name
            for name in (re.sub(r'[(),]', '', raw).strip() for raw in members)
            if name
        ]
        for group, members in groups.items()
    }
    for bucket, groups in special_char_idol.items()
}

In [17]:
# Colab only: mount Google Drive at /content/drive so later cells can write to it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import os

# Output folder on the mounted Drive. The path is absolute, anchored at the
# '/content/drive' mount point, so it still resolves if the working directory
# is no longer /content.
path = '/content/drive/MyDrive/Colab Notebooks/kpop trend analysis/dataset_raw'

# Displays True when the folder exists
os.path.exists(path)

True

## Finally, store them as JSON

In [None]:
# Serialize the A-Z dictionary with 4-space indentation and write it to
# normal_char_idol.json inside the output folder
normal_json_path = os.path.join(path, 'normal_char_idol.json')
with open(normal_json_path, 'w') as out_file:
    out_file.write(json.dumps(normal_char_idol, indent=4))

In [None]:
# Serialize the special-character dictionary with 4-space indentation and write it to
# special_char_idol.json inside the output folder
special_json_path = os.path.join(path, 'special_char_idol.json')
with open(special_json_path, 'w') as out_file:
    out_file.write(json.dumps(special_char_idol, indent=4))