In [1]:
import pandas as pd
import aiohttp
import asyncio
import nest_asyncio
from urllib.parse import quote_plus
import os
import json
import time
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [2]:
# Query table driving the scrape; later cells read its "Song", "Artist" and "Track ID" columns.
df = pd.read_csv('../datasets/rank_1/query.csv')

In [3]:
json_path = '../datasets/rank_1/lyrics.json'

# First run: create the parent directory if needed and seed the results file
# with an empty JSON object so the load below always has something to read.
if not os.path.exists(json_path):
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w') as json_file:
        json_file.write(json.dumps({}))

# Load whatever earlier runs have already stored; used to work out resume progress.
with open(json_path, 'r') as json_file:
    existing_data = json.loads(json_file.read())

In [4]:
# Number of rows sent to the search endpoint per batch.
batch_size = 100
# Resume point: how many batches are already covered by the stored entries.
# Ceiling division (written as -(-a // b) to avoid a new import) is used so that
# a completed final batch that is shorter than batch_size still counts as done;
# flooring would schedule that last batch again on every restart.
batches_done = -(-len(existing_data) // batch_size)
# Consecutive row slices of df, each at most batch_size rows long.
batched_dfs = [df.iloc[start:start + batch_size] for start in range(0, len(df), batch_size)]
total_batches = len(batched_dfs)
total_items = len(df)

In [5]:
# Sanity check of the batching setup: (batch size, batches already done, total batches, total rows).
batch_size, batches_done, total_batches, total_items

(100, 0, 12, 1175)

In [6]:
# Host that serves the search endpoint; search() appends "/search.php?q=..." to it.
base_url = 'http://search.azlyrics.com'
# Value sent as the `x` query parameter of every search URL.
# NOTE(review): looks like a site-issued token that may expire — confirm before a long run.
x_param = '3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9'
# Browser-like headers sent with every request.
# "br" is deliberately not advertised in Accept-Encoding: aiohttp can only decode
# Brotli-compressed bodies when the optional Brotli package is installed, and this
# notebook neither imports nor requires it, so a Brotli response could not be read.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

In [7]:
async def search(session, song, artist):
    """Search for a song and return the link of the first result.

    Builds "<base_url>/search.php?q=<song>+<artist>&x=<x_param>" with both
    terms URL-encoded, fetches it with the shared request headers and looks
    for the first <td class="text-left visitedlyr"> cell in the response.

    Args:
        session: Client session used to issue the GET request.
        song: Song title to search for.
        artist: Artist name to search for.

    Returns:
        The href of the first <a> inside the first result cell, or None when
        the page has no result cell or the cell contains no anchor.

    Raises:
        aiohttp.ClientResponseError: via raise_for_status() when the server
            answers with an error status (for example 403).
    """
    song_param = quote_plus(song)
    artist_param = quote_plus(artist)
    query_string = f"{song_param}+{artist_param}&x={x_param}"
    url = f"{base_url}/search.php?q={query_string}"
    print(url)

    async with session.get(url, headers=headers) as response:
        response.raise_for_status()
        html = await response.text()

    # The body is fully read, so parsing happens after the response is released.
    soup = BeautifulSoup(html, 'html.parser')

    # Explicit None checks instead of a blanket `except Exception`: a missing
    # result is an expected outcome, while genuine bugs should still surface.
    result_cell = soup.find("td", class_="text-left visitedlyr")
    anchor = result_cell.find('a') if result_cell is not None else None
    if anchor is None:
        print(f"No search result found for: {url}")
        return None

    return anchor.get('href')

In [8]:
async def get_link(song, artist):
    """Run one search inside its own client session.

    Args:
        song: Song title passed through to search().
        artist: Artist name passed through to search().

    Returns:
        A single-element list holding whatever search() returned, because the
        lookup is awaited through asyncio.gather().
    """
    async with aiohttp.ClientSession() as http_session:
        lookup = search(http_session, song, artist)
        gathered = await asyncio.gather(lookup)
    return gathered

In [9]:
async def get_all_links(df):
    """Look up every row of a batch concurrently.

    Args:
        df: Table-like object whose "Song" and "Artist" columns are iterated
            in parallel.

    Returns:
        A list with one get_link() result per row, in row order.
    """
    pending = []
    for song, artist in zip(df["Song"], df["Artist"]):
        pending.append(get_link(song, artist))
    return await asyncio.gather(*pending)

In [10]:
def parse_data(track_ids, batch_results):
    """Pair each track id with its search result for one batch.

    Args:
        track_ids: Iterable of track identifiers, in batch order.
        batch_results: Iterable of search results aligned with track_ids.

    Returns:
        A dict mapping the 0-based position within the batch to
        {"track_id": <id>, "link": <result>}. Pairing stops at the shorter of
        the two iterables.
    """
    # A second, unreachable `return` that keyed the dict by track id used to
    # follow this statement; it was dead code and has been removed.
    return {
        index: {"track_id": track_id, "link": link}
        for index, (track_id, link) in enumerate(zip(track_ids, batch_results))
    }

In [11]:
def append_to_file(data):
    """Merge one batch of parsed results into the JSON file at json_path.

    The file is read, extended and rewritten in full (indent=4).

    Args:
        data: One batch of entries, normally the dict returned by
            parse_data(). Only its values are stored when the file holds a
            JSON object; a non-dict iterable is treated as a sequence of
            entries.

    Behaviour by file content:
        * JSON object (the file is seeded with `{}`): every entry is stored
          under the next free sequential key ("0", "1", ...), so the object's
          length equals the number of entries stored so far. Calling
          `.append` on this dict used to raise AttributeError.
        * JSON array: `data` is appended as a single element, as before.
    """
    with open(json_path, 'r') as json_file:
        existing_data = json.load(json_file)

    if isinstance(existing_data, dict):
        entries = data.values() if isinstance(data, dict) else data
        # Continue numbering after the entries already stored; keys are
        # strings because that is what JSON object keys become on reload.
        offset = len(existing_data)
        for position, entry in enumerate(entries):
            existing_data[str(offset + position)] = entry
    else:
        existing_data.append(data)

    with open(json_path, 'w') as json_file:
        json.dump(existing_data, json_file, indent=4)

In [12]:
async def process_batches():
    """Interactively work through the remaining batches.

    Before each batch the user is prompted; anything other than "y"
    (case-insensitive) stops the loop. For every confirmed batch the links
    are fetched, parsed together with the batch's 'Track ID' column, written
    to disk, and the global batches_done counter is advanced. An empty
    result from get_all_links() is reported as a failure and ends the loop
    without advancing the counter.
    """
    global batches_done
    while batches_done < total_batches:
        if input('Continue? (y/n): ').lower() != 'y':
            break

        batch_frame = batched_dfs[batches_done]
        links = await get_all_links(batch_frame)
        if not links:
            print(f'Failed to process batch {batches_done+1}')
            break

        append_to_file(parse_data(batch_frame['Track ID'], links))
        print(f'Processed batch {batches_done+1}/{total_batches}')
        batches_done += 1

In [13]:
# Drive the batch loop from this cell and report how much work this run completed.
# NOTE(review): nest_asyncio.apply() is presumably needed so run_until_complete() can be
# called while the notebook's own event loop is already running — confirm.
nest_asyncio.apply()
start_time = time.time()  # wall-clock start; the measured span includes time spent waiting at the input() prompt
init_batches_done = batches_done  # snapshot so this run's progress can be computed afterwards
asyncio.get_event_loop().run_until_complete(process_batches())
round_batches_done = batches_done - init_batches_done
# Wipes everything printed so far in this cell (request URLs and per-batch messages).
# If process_batches() raises, the lines from here on never run, so the log and the
# traceback remain visible instead of the summary.
clear_output()
print(f'Batches processed: {round_batches_done}')
print(f"Execution Time: {time.time() - start_time:.2f} seconds")

Continue? (y/n):  y


http://search.azlyrics.com/search.php?q=poor+little+fool+ricky+nelson&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=nel+blu+dipinto+di+domenico+modugno&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=little+star+the+elegants&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=it%27s+all+in+the+tommy+edwards&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=it%27s+only+make+believe+conway+twitty&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=tom+dooley+the+kingston&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=to+know+him+is+the+teddy&x=3d8668771b0c5d2c207aae1e914bd019a30dd1633e66b7f78328f040c3851ab9
http://search.azlyrics.com/search.php?q=

ClientResponseError: 403, message='Forbidden', url='https://b.azlyrics.com/?u=/search.php'