In [195]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd

In [196]:
day_idx = 2

In [197]:
def scrape_rockfm_page(base_url='https://onlineradiobox.com/es/rockfm/playlist/'):
    # Set headers to mimic a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    songs = []
    today = datetime.today().date()
    
    for page_idx in [day_idx]:
        
        idx = '' if page_idx == 0 else str(page_idx)
        
        date = today - timedelta(days=page_idx)
        date = date.strftime("%Y-%m-%d")
     
        page_url = base_url + idx + '?cs=es.rockfm'
    
        # Fetch the content of the webpage
        response = requests.get(page_url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the table
            table = soup.find("table", class_="tablelist-schedule")

            # Extract rows from the table
            rows = table.find_all("tr")
 
            for row in rows:
                # Extract time
                time_cell = row.find("td", class_="tablelist-schedule__time")
                time_text = time_cell.find("span", class_="time--schedule").text.strip() if time_cell else "N/A"

                # Extract track info
                track_cell = row.find("td", class_="track_history_item")
                try:
                    track_text = track_cell.find("a").text.strip() if track_cell else ''
                except:
                    track_text = track_cell.text.strip() if track_cell else ''
                    
                artist = track_text.split(' - ')[0]
                song = track_text.split(' - ')[1]
                
                # List with one song's data
                song = [date, time_text, artist, song]
                songs.append(song)
                songs = songs[::-1]
        
        # Dataframe with all songs data
        songs_df = pd.DataFrame(songs, columns=["Date", "Time", "Artist", "Song"])
                      
    return songs_df


# URL to scrape
base_url = 'https://onlineradiobox.com/es/rockfm/playlist/'

# Scrape the songs data
songs_df = scrape_rockfm_page(base_url)
songs_df

Unnamed: 0,Date,Time,Artist,Song
0,2024-12-25,00:00,BILLY IDOL,REBEL YELL
1,2024-12-25,00:11,IRON MAIDEN,CAN I PLAY WITH MADNESS
2,2024-12-25,00:21,VOLBEAT,LOLA MONTEZ
3,2024-12-25,00:29,DEF LEPPARD,MAKE LOVE LIKE A MAN
4,2024-12-25,00:36,THE SWEET,BALLROOM BLITZ
...,...,...,...,...
323,2024-12-25,00:40,FOO FIGHTERS,BEST OF YOU
324,2024-12-25,00:33,PAT BENATAR,WE BELONG
325,2024-12-25,00:25,ALICE COOPER,SCHOOL'S OUT
326,2024-12-25,00:15,GERRY RAFFERTY,BAKER STREET


In [198]:
songs_df['Datetime'] = pd.to_datetime(songs_df['Date'] + ' ' + songs_df['Time'] + ':00')
songs_df = songs_df.sort_values(by='Datetime')
songs_df

Unnamed: 0,Date,Time,Artist,Song,Datetime
0,2024-12-25,00:00,BILLY IDOL,REBEL YELL,2024-12-25 00:00:00
327,2024-12-25,00:05,METALLICA,NOTHING ELSE MATTERS,2024-12-25 00:05:00
1,2024-12-25,00:11,IRON MAIDEN,CAN I PLAY WITH MADNESS,2024-12-25 00:11:00
326,2024-12-25,00:15,GERRY RAFFERTY,BAKER STREET,2024-12-25 00:15:00
2,2024-12-25,00:21,VOLBEAT,LOLA MONTEZ,2024-12-25 00:21:00
...,...,...,...,...,...
166,2024-12-25,23:42,SIMPLE MINDS,DON'T YOU (FORGET ABOUT ME),2024-12-25 23:42:00
162,2024-12-25,23:46,QUEEN,HAMMER TO FALL,2024-12-25 23:46:00
165,2024-12-25,23:50,DEREK & THE DOMINOS,LAYLA,2024-12-25 23:50:00
163,2024-12-25,23:53,JON BON JOVI,BLAZE OF GLORY,2024-12-25 23:53:00


In [199]:
def scrape_rockfm_programs(page_url='https://www.rockfm.fm/programacion'):
    # Set headers to mimic a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    yesterday = datetime.today().date() - timedelta(days=day_idx)
        
    # Fetch the content of the webpage
    response = requests.get(page_url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        #print(soup)
            
        # Find the div for yesterday
        day_div = soup.find("div", class_="wednesday")
        print(day_div)

        # Find all divs with class='c-schedule-thumb_txt' inside the wednesday div
        ##schedule_thumbs = day_div.find_all('div', class_='c-schedule-thumb__txt')

        # Print each schedule found
        ##for schedule in schedule_thumbs:
            ##span = schedule.find("span", class_="title")
            ##title = span.get_text()
            ##print(title)
                  
    #return songs_df

scrape_rockfm_programs('https://www.rockfm.fm/programacion')

None


In [200]:
import requests
import json

# Make a GET request to the API endpoint you found in the Network tab
api_url = 'https://www.rockfm.fm/ply/prg/37' #?0.2481731647902638'
response = requests.get(api_url)

# The response is in JSON format
data = response.json()

# Prettyfy the JSON
pretty_json = json.dumps(data, indent=4)

# Print or process the data
#print(pretty_json)



In [201]:
yesterday = datetime.today().date() - timedelta(days=day_idx)
print(yesterday)
yesterday_weekday = yesterday.weekday()
print(yesterday_weekday)

2024-12-25
2


In [202]:
weekday_formated = f'd{yesterday_weekday}'
yesterday_obj = datetime.strptime(str(yesterday), "%Y-%m-%d")
programs_lt = []

for item in data["prg"][weekday_formated]["es"]:

    from_time = int(item["from"])
    from_time_datetime = yesterday_obj  + timedelta(minutes=from_time)
    to_time = int(item["to"])
    to_time_datetime = yesterday_obj  + timedelta(minutes=to_time+1)
    title = item["title"]
    timetable = item["horario"]
    program = [from_time_datetime, to_time_datetime, title]
    programs_lt.append(program)

columns = ['From', 'To', 'Title']
programs_df = pd.DataFrame(programs_lt, columns=columns)
programs_df = programs_df.sort_values(by='From')
programs_df


Unnamed: 0,From,To,Title
0,2024-12-25 00:00:00,2024-12-25 06:00:00,RockFM noche
1,2024-12-25 06:00:00,2024-12-25 10:00:00,El Pirata y su banda
2,2024-12-25 10:00:00,2024-12-25 14:30:00,Marta Vázquez
3,2024-12-25 14:30:00,2024-12-25 18:00:00,Nano Jaquotot
4,2024-12-25 18:00:00,2024-12-25 21:00:00,Rodrigo Garcinuño
5,2024-12-25 21:00:00,2024-12-26 00:00:00,RockFM Motel


In [203]:
# Merge songs with programs based on closest prior start time
merged_df = pd.merge_asof(
    songs_df,
    programs_df,
    left_on='Datetime',
    right_on='From',
    direction='backward'
)

# Filter rows where song time is within the program's duration
merged_df = merged_df[merged_df['Datetime'] < merged_df['To']]

# Drop two columns
merged_df = merged_df.drop(columns=['From', 'To'])

merged_df

Unnamed: 0,Date,Time,Artist,Song,Datetime,Title
0,2024-12-25,00:00,BILLY IDOL,REBEL YELL,2024-12-25 00:00:00,RockFM noche
1,2024-12-25,00:05,METALLICA,NOTHING ELSE MATTERS,2024-12-25 00:05:00,RockFM noche
2,2024-12-25,00:11,IRON MAIDEN,CAN I PLAY WITH MADNESS,2024-12-25 00:11:00,RockFM noche
3,2024-12-25,00:15,GERRY RAFFERTY,BAKER STREET,2024-12-25 00:15:00,RockFM noche
4,2024-12-25,00:21,VOLBEAT,LOLA MONTEZ,2024-12-25 00:21:00,RockFM noche
...,...,...,...,...,...,...
323,2024-12-25,23:42,SIMPLE MINDS,DON'T YOU (FORGET ABOUT ME),2024-12-25 23:42:00,RockFM Motel
324,2024-12-25,23:46,QUEEN,HAMMER TO FALL,2024-12-25 23:46:00,RockFM Motel
325,2024-12-25,23:50,DEREK & THE DOMINOS,LAYLA,2024-12-25 23:50:00,RockFM Motel
326,2024-12-25,23:53,JON BON JOVI,BLAZE OF GLORY,2024-12-25 23:53:00,RockFM Motel


In [204]:
import os

file_path = 'songs.csv'

# Append new data to the songs.csv file
#merged_df.to_csv(file_path, mode='a', index=False, header=False)

# Check if the file exists
if os.path.isfile(file_path):
    # Read the existing CSV file
    #existing_df = pd.read_csv(file_path)

    # Find rows that are not in the existing CSV
    #combined_df = pd.concat([existing_df, merged_df]).drop_duplicates(keep=False)
    
    # Write only new rows to the CSV file
    #if not combined_df.empty:
    merged_df.to_csv(file_path, mode='a', index=False, header=False)
    print("New rows appended to the CSV file.")
else:
    # If the file doesn't exist, write the new DataFrame with headers
    merged_df.to_csv(file_path, index=False)
    print("CSV file created with the new data.")

New rows appended to the CSV file.
