In [79]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd

In [80]:
def scrape_rockfm_page(base_url='https://onlineradiobox.com/es/rockfm/playlist/'):
    # Set headers to mimic a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    songs = []
    today = datetime.today().date()
    
    for page_idx in [1]:
        
        idx = '' if page_idx == 0 else str(page_idx)
        
        date = today - timedelta(days=page_idx)
        date = date.strftime("%Y-%m-%d")
     
        page_url = base_url + idx + '?cs=es.rockfm'
    
        # Fetch the content of the webpage
        response = requests.get(page_url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find the table
            table = soup.find("table", class_="tablelist-schedule")

            # Extract rows from the table
            rows = table.find_all("tr")
 
            for row in rows:
                # Extract time
                time_cell = row.find("td", class_="tablelist-schedule__time")
                time_text = time_cell.find("span", class_="time--schedule").text.strip() if time_cell else "N/A"

                # Extract track info
                track_cell = row.find("td", class_="track_history_item")
                try:
                    track_text = track_cell.find("a").text.strip() if track_cell else ''
                except:
                    track_text = track_cell.text.strip() if track_cell else ''
                    
                artist = track_text.split(' - ')[0]
                song = track_text.split(' - ')[1]
                
                # List with one song's data
                song = [date, time_text, artist, song]
                songs.append(song)
                songs = songs[::-1]
        
        # Dataframe with all songs data
        songs_df = pd.DataFrame(songs, columns=["Date", "Time", "Artist", "Song"])
                      
    return songs_df


# URL to scrape
base_url = 'https://onlineradiobox.com/es/rockfm/playlist/'

# Scrape the songs data
songs_df = scrape_rockfm_page(base_url)
songs_df

Unnamed: 0,Date,Time,Artist,Song
0,2024-12-26,00:04,EUROPE,ROCK THE NIGHT
1,2024-12-26,00:11,JANIS JOPLIN,PIECE OF MY HEART
2,2024-12-26,00:20,RAMONES,SHEENA IS A PUNK ROCKER
3,2024-12-26,00:28,KANSAS,DUST IN THE WIND
4,2024-12-26,00:35,U2,ELEVATION
...,...,...,...,...
309,2024-12-26,00:39,BILLY JOEL,WE DIDN'T START THE FIRE
310,2024-12-26,00:31,ADAMS BRYAN,18 TILL I DIE
311,2024-12-26,00:23,NEIL YOUNG,ROCKIN IN THE FREE WORLD
312,2024-12-26,00:15,COLDPLAY,CLOCKS


In [81]:
songs_df['Datetime'] = pd.to_datetime(songs_df['Date'] + ' ' + songs_df['Time'] + ':00')
songs_df = songs_df.sort_values(by='Datetime')
songs_df

Unnamed: 0,Date,Time,Artist,Song,Datetime
0,2024-12-26,00:04,EUROPE,ROCK THE NIGHT,2024-12-26 00:04:00
313,2024-12-26,00:07,FRANZ FERDINAND,DO YOU WANT TO,2024-12-26 00:07:00
1,2024-12-26,00:11,JANIS JOPLIN,PIECE OF MY HEART,2024-12-26 00:11:00
312,2024-12-26,00:15,COLDPLAY,CLOCKS,2024-12-26 00:15:00
2,2024-12-26,00:20,RAMONES,SHEENA IS A PUNK ROCKER,2024-12-26 00:20:00
...,...,...,...,...,...
159,2024-12-26,23:41,DOVER,CHERRY LEE,2024-12-26 23:41:00
155,2024-12-26,23:44,PATTI SMITH,PEOPLE HAVE THE POWER,2024-12-26 23:44:00
158,2024-12-26,23:50,SUPERTRAMP,GOODBYE STRANGER,2024-12-26 23:50:00
156,2024-12-26,23:55,THE STRANGLERS,GOLDEN BROWN,2024-12-26 23:55:00


In [82]:
def scrape_rockfm_programs(page_url='https://www.rockfm.fm/programacion'):
    # Set headers to mimic a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    yesterday = datetime.today().date() - timedelta(days=1)
        
    # Fetch the content of the webpage
    response = requests.get(page_url, headers=headers)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        #print(soup)
            
        # Find the div for yesterday
        day_div = soup.find("div", class_="wednesday")
        print(day_div)

        # Find all divs with class='c-schedule-thumb_txt' inside the wednesday div
        ##schedule_thumbs = day_div.find_all('div', class_='c-schedule-thumb__txt')

        # Print each schedule found
        ##for schedule in schedule_thumbs:
            ##span = schedule.find("span", class_="title")
            ##title = span.get_text()
            ##print(title)
                  
    #return songs_df

scrape_rockfm_programs('https://www.rockfm.fm/programacion')

None


In [83]:
import requests
import json

# Make a GET request to the API endpoint you found in the Network tab
api_url = 'https://www.rockfm.fm/ply/prg/37' #?0.2481731647902638'
response = requests.get(api_url)

# The response is in JSON format
data = response.json()

# Prettyfy the JSON
pretty_json = json.dumps(data, indent=4)

# Print or process the data
#print(pretty_json)



In [84]:
yesterday = datetime.today().date() - timedelta(days=1)
print(yesterday)
yesterday_weekday = yesterday.weekday()
print(yesterday_weekday)

2024-12-26
3


In [85]:
weekday_formated = f'd{yesterday_weekday}'
yesterday_obj = datetime.strptime(str(yesterday), "%Y-%m-%d")
programs_lt = []

for item in data["prg"][weekday_formated]["es"]:

    from_time = int(item["from"])
    from_time_datetime = yesterday_obj  + timedelta(minutes=from_time)
    to_time = int(item["to"])
    to_time_datetime = yesterday_obj  + timedelta(minutes=to_time+1)
    title = item["title"]
    timetable = item["horario"]
    program = [from_time_datetime, to_time_datetime, title]
    programs_lt.append(program)

columns = ['From', 'To', 'Title']
programs_df = pd.DataFrame(programs_lt, columns=columns)
programs_df = programs_df.sort_values(by='From')
programs_df


Unnamed: 0,From,To,Title
0,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
1,2024-12-26 01:00:00,2024-12-26 06:00:00,RockFM noche
2,2024-12-26 06:00:00,2024-12-26 10:00:00,El Pirata y su banda
3,2024-12-26 10:00:00,2024-12-26 14:30:00,Marta Vázquez
4,2024-12-26 14:30:00,2024-12-26 18:00:00,Nano Jaquotot
5,2024-12-26 18:00:00,2024-12-26 21:00:00,Rodrigo Garcinuño
6,2024-12-26 21:00:00,2024-12-27 00:00:00,RockFM Motel


In [90]:
# Merge songs with programs based on closest prior start time
merged_df = pd.merge_asof(
    songs_df,
    programs_df,
    left_on='Datetime',
    right_on='From',
    direction='backward'
)

# Filter rows where song time is within the program's duration
merged_df = merged_df[merged_df['Datetime'] < merged_df['To']]
merged_df

Unnamed: 0,Date,Time,Artist,Song,Datetime,From,To,Title
0,2024-12-26,00:04,EUROPE,ROCK THE NIGHT,2024-12-26 00:04:00,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
1,2024-12-26,00:07,FRANZ FERDINAND,DO YOU WANT TO,2024-12-26 00:07:00,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
2,2024-12-26,00:11,JANIS JOPLIN,PIECE OF MY HEART,2024-12-26 00:11:00,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
3,2024-12-26,00:15,COLDPLAY,CLOCKS,2024-12-26 00:15:00,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
4,2024-12-26,00:20,RAMONES,SHEENA IS A PUNK ROCKER,2024-12-26 00:20:00,2024-12-26 00:00:00,2024-12-26 01:00:00,El decálogo de Mariskal
...,...,...,...,...,...,...,...,...
309,2024-12-26,23:41,DOVER,CHERRY LEE,2024-12-26 23:41:00,2024-12-26 21:00:00,2024-12-27 00:00:00,RockFM Motel
310,2024-12-26,23:44,PATTI SMITH,PEOPLE HAVE THE POWER,2024-12-26 23:44:00,2024-12-26 21:00:00,2024-12-27 00:00:00,RockFM Motel
311,2024-12-26,23:50,SUPERTRAMP,GOODBYE STRANGER,2024-12-26 23:50:00,2024-12-26 21:00:00,2024-12-27 00:00:00,RockFM Motel
312,2024-12-26,23:55,THE STRANGLERS,GOLDEN BROWN,2024-12-26 23:55:00,2024-12-26 21:00:00,2024-12-27 00:00:00,RockFM Motel
