# Imports

In [9]:
from ytmusicapi import YTMusic
from pprint import pprint
import os
from pytube import YouTube
import urllib.request
import requests
import re
import string
from tkinter import Tk, ttk, filedialog, StringVar
from dotenv import load_dotenv
from spotipy import SpotifyOAuth
from spotipy.oauth2 import SpotifyOauthError
import pandas as pd
import numpy as np
import json

# Scraper Classes

In [26]:
class YTScraper:
    """Find songs on YouTube Music and fetch their lyrics and audio.

    Search and lyrics go through a ``YTMusic`` client; the audio itself is
    downloaded with ``pytube``'s ``YouTube`` object.
    """

    def __init__(
        self,
        token_file_path='oauth.json',
    ):
        """Create the YouTube Music client.

        Args:
            token_file_path: Path passed to ``YTMusic`` as its auth file.
        """
        self.yt_music = YTMusic(token_file_path)

    def get_search_results(self, search_str):
        """Return the YouTube Music search results for *search_str* (songs filter)."""
        return self.yt_music.search(search_str, filter='songs')

    def get_lyrics(self, video_id):
        """Return the lyrics payload for *video_id*, or ``None`` when it has none.

        The watch playlist of the video carries a lyrics id; only when that id
        is present is the second request for the lyrics themselves made.
        """
        video = self.yt_music.get_watch_playlist(videoId=video_id)
        lyrics_id = video.get('lyrics')
        if not lyrics_id:
            return None
        return self.yt_music.get_lyrics(lyrics_id)

    def download_song_by_video_id(
        self,
        video_id,
        output_path=None,
    ):
        """Download the first audio-only stream of *video_id* as ``<video_id>.mp4``.

        Args:
            video_id: YouTube video id.
            output_path: Directory to download into; created if missing.
                ``None`` (or an empty string) means no directory prefix.

        Returns:
            The path of the downloaded file, ``output_path`` joined with
            ``<video_id>.mp4``.

        Raises:
            RuntimeError: If the video exposes no audio-only stream.
        """
        if output_path:
            os.makedirs(output_path, exist_ok=True)
        else:
            # os.makedirs("") raises, so an empty path is treated like None.
            output_path = ""

        file_name = f"{video_id}.mp4"
        target_path = os.path.join(output_path, file_name)

        yt = YouTube(f"https://youtube.com/watch?v={video_id}", use_oauth=True, allow_oauth_cache=True)
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(f"No audio-only stream available for video {video_id}")
        audio_stream.download(output_path=output_path, filename=file_name)

        if os.path.exists(target_path):
            print(f"Downloaded successfully: {target_path}")
        return target_path

    @staticmethod
    def _first_song_video_id(search_result):
        """Return the video id of the first usable song entry, or ``None``."""
        for item in search_result:
            is_song = item.get('resultType') == 'song' and item.get('category') == 'Songs'
            if is_song and item.get('videoId'):
                return item['videoId']
        return None

    def scrap_by_search(self, search_str, download_output_file_path=None):
        """Search for a song, then fetch its lyrics and download its audio.

        Args:
            search_str: Free-text query sent to YouTube Music.
            download_output_file_path: Directory for the downloaded audio.

        Returns:
            ``{"lyrics": <text or None>, "audio_file_path": <path>}``.

        Raises:
            Exception: If no song result with a video id is found.
        """
        search_result = self.get_search_results(search_str=search_str)
        video_id = self._first_song_video_id(search_result)
        if not video_id:
            raise Exception("Null Video Id")

        # Missing lyrics are reported as None rather than aborting the download.
        lyrics = self.get_lyrics(video_id=video_id) or {'lyrics': None, 'source': None}
        download_path = self.download_song_by_video_id(
            video_id=video_id,
            output_path=download_output_file_path,
        )
        return {
            "lyrics": lyrics.get('lyrics'),
            "audio_file_path": download_path,
        }
  
class SpotifyScraper:
    """Collect Spotify track metadata and pair it with YouTube Music lyrics/audio.

    Spotify data is read through the Web API with a bearer token obtained via
    ``SpotifyOAuth``; lyrics and audio come from a ``YTScraper`` instance.
    """

    # Seconds to wait for a Spotify API response before giving up.
    REQUEST_TIMEOUT = 30
    # Number of ids sent per batched request in get_several_track_data.
    MAX_IDS_PER_REQUEST = 50
    # Audio-feature fields copied into each track record, in output order.
    AUDIO_FEATURE_KEYS = (
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "key",
        "liveness",
        "loudness",
        "mode",
        "speechiness",
        "tempo",
        "time_signature",
        "valence",
    )

    def __init__(
        self,
        client_id,
        client_secret,
        redirect_uri,
        scope="user-library-read playlist-read-private playlist-read-collaborative"
    ):
        """Set up OAuth and obtain an access token.

        Args:
            client_id: Spotify application client id.
            client_secret: Spotify application client secret.
            redirect_uri: Redirect URI registered for the application.
            scope: Space-separated OAuth scopes to request.

        Raises:
            SystemExit: If ``SpotifyOAuth`` raises ``SpotifyOauthError``.
        """
        self.yt_scraper = YTScraper()
        try:
            self.sp_oauth = SpotifyOAuth(
                client_id=client_id,
                client_secret=client_secret,
                redirect_uri=redirect_uri,
                # Honour the caller's scope instead of a hard-coded literal.
                scope=scope,
            )
        except SpotifyOauthError as e:
            print(f"Spotify OAuth setup error: {e}")
            # Same effect as exit(1) without relying on the site builtin.
            raise SystemExit(1) from e

        # NOTE(review): the token is fetched once here and never refreshed by
        # this class; confirm long runs do not outlive it.
        self.access_token = self.get_access_token()

    def get_access_token(self):
        """Return an access token, prompting for authorization if none is cached.

        The full token info is kept on ``self.token_info``.
        """
        self.token_info = self.sp_oauth.get_cached_token()
        if not self.token_info:
            auth_url = self.sp_oauth.get_authorize_url()
            print("Please go to this URL and authorize the app:", auth_url)
            auth_code = input("Enter the authorization code: ")
            self.token_info = self.sp_oauth.get_access_token(auth_code)
        return self.token_info["access_token"]

    def get_auth_header(self):
        """Return the ``Authorization`` header carrying the bearer token."""
        return {"Authorization": "Bearer " + self.access_token}

    def _get_json(self, url):
        """GET *url* with the auth header and return the decoded JSON body.

        Raises:
            Exception: With status code and response text on a non-OK response.
        """
        response = requests.get(
            url,
            headers=self.get_auth_header(),
            timeout=self.REQUEST_TIMEOUT,
        )
        if not response.ok:
            raise Exception(f"{response.status_code} : {response.text}")
        return response.json()

    def get_user_playlists(self):
        """Return the raw JSON response of the current user's playlists.

        Raises:
            Exception: On a non-OK response, like the other API helpers.
        """
        print("Retrieving user playlists...")
        response_json = self._get_json("https://api.spotify.com/v1/me/playlists")
        print("Playlists retrieved successfully.")
        return response_json

    @staticmethod
    def sanitize_filename(filename):
        """Return *filename* with every character outside ``-_.() A-Za-z0-9`` removed."""
        valid_chars = set("-_.() " + string.ascii_letters + string.digits)
        return ''.join(c for c in filename if c in valid_chars)

    def get_track_info_by_id(self, track_id):
        """Return the JSON track object for a single track id."""
        return self._get_json(f"https://api.spotify.com/v1/tracks/{track_id}")

    def get_playlist_info_by_id(self, playlist_id):
        """Return the JSON response of the playlist's ``/tracks`` endpoint."""
        return self._get_json(f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks")

    def get_several_track_info_by_id(self, track_ids_str):
        """Return the JSON response for a comma-separated string of track ids."""
        return self._get_json(f"https://api.spotify.com/v1/tracks?ids={track_ids_str}")

    def get_several_artist_info_by_id(self, artist_ids_str):
        """Return the JSON response for a comma-separated string of artist ids."""
        return self._get_json(f"https://api.spotify.com/v1/artists?ids={artist_ids_str}")

    def get_several_audio_feature_by_id(self, track_ids_str):
        """Return the audio-features JSON for a comma-separated string of track ids."""
        return self._get_json(f"https://api.spotify.com/v1/audio-features?ids={track_ids_str}")

    def destructure_artist_data(self, artists_list):
        """Flatten a list of artist objects into parallel summary lists.

        ``None`` entries in *artists_list* are skipped.

        Returns:
            A dict with ``artist_infos`` (one summary dict per artist),
            ``artist_genres`` (all genres concatenated, duplicates kept),
            ``artist_popularities`` and ``artist_followers``.
        """
        artist_infos = [
            {
                "name": artist_obj["name"],
                "id": artist_obj["id"],
                "genres": artist_obj.get("genres", []),
                "popularity": artist_obj["popularity"],
                "followers": artist_obj["followers"]["total"],
            }
            for artist_obj in artists_list
            if artist_obj is not None
        ]
        return {
            'artist_infos': artist_infos,
            'artist_genres': [genre for info in artist_infos for genre in info["genres"]],
            'artist_popularities': [info["popularity"] for info in artist_infos],
            'artist_followers': [info["followers"] for info in artist_infos],
        }

    def construct_track_info_dict(self, track_json=None, audio_features_json=None, artist_json=None):
        """Build one flat record for a track, including lyrics and audio path.

        Side effect: searches YouTube Music for "<track name> <most popular
        artist>" and downloads the audio into the ``songs`` directory.

        Args:
            track_json: Spotify track object.
            audio_features_json: Audio-features object for the track, or
                ``None``; missing features are recorded as ``None``.
            artist_json: Result of :meth:`destructure_artist_data`.

        Raises:
            ValueError: If *artist_json* contains no artist info.
        """
        artist_infos = artist_json['artist_infos'] if artist_json else []
        if not artist_infos:
            raise ValueError("construct_track_info_dict needs at least one artist in artist_json")
        most_popular_artist = max(artist_infos, key=lambda x: x["popularity"])

        search_str = f"{track_json['name']} {most_popular_artist['name']}"
        lyrics_audio_data = self.yt_scraper.scrap_by_search(
            search_str=search_str,
            download_output_file_path="songs",
        )

        album = track_json["album"]
        track_info = {
            "track_name": track_json["name"],
            "track_id": track_json["id"],
            "track_number": track_json["track_number"],
            "disc_number": track_json["disc_number"],
            "duration_ms": track_json["duration_ms"],
            "explicit": track_json["explicit"],
            "popularity": track_json["popularity"],
            "preview_url": track_json["preview_url"],
            "isrc": track_json["external_ids"].get("isrc"),
            "album_name": album["name"],
            "album_id": album["id"],
            "album_type": album["album_type"],
            "album_total_tracks": album["total_tracks"],
            "album_release_date": album["release_date"],
            "album_release_date_precision": album["release_date_precision"],
            "album_images": album["images"],
            "popular_artist": most_popular_artist["name"],
            "popular_artist_id": most_popular_artist["id"],
            "artist_names": [artist["name"] for artist in track_json["artists"]],
            "artist_ids": [artist["id"] for artist in track_json["artists"]],
            "combined_genres": list(set(artist_json['artist_genres'])),
            "artist_popularity": most_popular_artist["popularity"],
            "artist_followers": most_popular_artist["followers"],
            "external_url": track_json["external_urls"]["spotify"],
        }
        # A track without audio features still yields a row, with None values.
        audio_features = audio_features_json or {}
        track_info.update({key: audio_features.get(key) for key in self.AUDIO_FEATURE_KEYS})
        track_info.update(lyrics_audio_data)
        return track_info

    def get_several_track_data(self, track_ids_list: list[str]):
        """Return one record per track id, fetched in batches.

        For every track this requests the track object, its audio features and
        its artists, then builds the record with
        :meth:`construct_track_info_dict` (which also downloads the audio).
        Track ids for which the API returns no track object are skipped with a
        printed notice.

        Raises:
            ValueError: If the track and audio-feature responses differ in length.
        """
        data = []
        batch_size = self.MAX_IDS_PER_REQUEST
        for idx in range(0, len(track_ids_list), batch_size):
            batch_ids = track_ids_list[idx:idx + batch_size]
            track_id_str = ",".join(batch_ids)
            tracks_json = self.get_several_track_info_by_id(track_id_str)["tracks"]
            audio_features_json = self.get_several_audio_feature_by_id(track_ids_str=track_id_str)["audio_features"]

            # Explicit check instead of assert, which is stripped under -O.
            if len(tracks_json) != len(audio_features_json):
                raise ValueError(
                    f"Got {len(tracks_json)} tracks but {len(audio_features_json)} audio feature entries"
                )

            for track_id, track_obj, audio_feat_obj in zip(batch_ids, tracks_json, audio_features_json):
                if track_obj is None:
                    print(f"Skipping track id with no track data: {track_id}")
                    continue
                artist_ids_list = [artist['id'] for artist in track_obj["artists"]]
                artists_id_str = ','.join(artist_ids_list[:batch_size])
                artists_data = self.get_several_artist_info_by_id(artists_id_str)["artists"]
                destructured_artist_data = self.destructure_artist_data(artists_list=artists_data)
                data.append(self.construct_track_info_dict(track_obj, audio_feat_obj, destructured_artist_data))
        return data
      

## Start Scraping

In [27]:
# Load the .env file, then read the three Spotify API settings from the
# environment in one pass (each is None when the variable is not set).
load_dotenv(dotenv_path='.env')
client_id, client_secret, redirect_uri = (
    os.getenv(env_name)
    for env_name in ("CLIENT_ID", "CLIENT_SECRET", "REDIRECT_URL")
)

### Scraper Instance

### YouTube Music login for lyrics (run this command in a console)

In [17]:
!ytmusicapi oauth

^C


### Scraper Instantiation

In [28]:
# Build the scraper from the Spotify credentials; the constructor fetches an
# access token and prompts for authorization when no cached token exists.
scrapper = SpotifyScraper(client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri)

Please go to this URL and authorize the app: https://accounts.spotify.com/authorize?client_id=fd83c27adc7e41f799e11bbaaab23d34&response_type=code&redirect_uri=http%3A%2F%2Flocalhost.com%3A8888%2Fcallback&scope=user-library-read+playlist-read-private+playlist-read-collaborative


  self.token_info = self.sp_oauth.get_access_token(auth_code)


In [29]:
# Put your own list of Spotify track ids here.
track_ids_list = []
# Sample track ids; this assignment replaces the empty placeholder above.
track_ids_list = [
    "7MJQ9Nfxzh8LPZ9e9u68Fq",
    "1GEBsLDvJGw7kviySRI6GX",
    "0gplL1WMoJ6iYaPgMCL0gX",
    "698ItKASDavgwZ3WjaWjtz",
    "3LlmKSHR3Rs0Y3KHQLAYDk",
    "20R4HfKloPKgXDqU7UKk3x",
    "4RVwu0g32PAqgUiJoXsdF8",
    "6habFhsOp2NvshLv26DqMb",
    "18A7ha5BitZjmdHTCwXFbU",
    "75MNhvTCCKsST3YqqUiU9r",
    "1rfofaqEpACxVEHIZBJe6W",
    "5eGEc27nnhtmcOh6RC890a",
    "5PjdY0CKGZdEuoNab3yDmX",
    "6ocbgoVGwYJhOv1GgI9NsF",
    "1mavQ4WCzXSeL2Dm5DS4GQ",
    "3ebXMykcMXOcLeJ9xZ17XH",
    "36jnG0GLshZiH7oWkOq7gV",
    "3z8h0TU7ReDPLIbEnYhWZb",
    "5SDzo5YMWly9n6hVHvxPwp",
    "0ct6r3EGTcMLPtrXHDvVjc",
    "7BMO7O7ImjV8HNTH74Tshv",
    "561jH07mF1jHuk7KlaeF0s",
    "6or1bKJiZ06IlK0vFvY75k",
    "6M47gaKejso9772SKTa3yH",
    "630sXRhIcfwr2e4RdNtjKN",
    "5QDLhrAOJJdNAmCTJ8xMyW",
    "0lYBSQXN6rCTvUZvg9S0lU",
    "5wANPM4fQCJwkGd4rN57mH",
    "4ZtFanR9U6ndgddUvNcjcG",
    "6RUKPb4LETWmmr3iAEQktW",
    "1P17dC1amhFzptugyAO7Il",
    "2DB4DdfCFMw1iaR6JaR03a",
    "3o9kpgkIcffx0iSwxhuNI2",
    "0QLb1y64s617SAnnDoUZLN",
    "5amAJIEdIVtWYEi4wGp7Fn",
    "5flerg6aEao2VayZezVlgu",
    "39LLxExYz6ewLAcYrzQQyP",
]



In [30]:
# track_data is a list of dicts, one record per track; building each record
# also downloads that track's audio.
track_data = scrapper.get_several_track_data(track_ids_list)

Downloaded successfully: songs\soNLLPokjC4.mp4
Downloaded successfully: songs\LYyUr75fs_o.mp4
Downloaded successfully: songs\H9NJenpBV2I.mp4
Downloaded successfully: songs\pIWaVJPl0-c.mp4
Downloaded successfully: songs\aZwklvDdaVw.mp4
Downloaded successfully: songs\oDn4eKyhSH4.mp4
Downloaded successfully: songs\NSTUVHsb9xw.mp4
Downloaded successfully: songs\FXovf5dsRTw.mp4
Downloaded successfully: songs\8_Cw99SiHvc.mp4
Downloaded successfully: songs\G1ej5up7JG0.mp4
Downloaded successfully: songs\mJLDjIWuPXU.mp4
Downloaded successfully: songs\soNLLPokjC4.mp4
Downloaded successfully: songs\XfEMj-z3TtA.mp4
Downloaded successfully: songs\XZ868t23Pb4.mp4
Downloaded successfully: songs\64jjsEtvkdM.mp4
Downloaded successfully: songs\vFFT1iAUNDE.mp4
Downloaded successfully: songs\hJWSZDJb-W4.mp4
Downloaded successfully: songs\yl3TsqL0ZPw.mp4
Downloaded successfully: songs\9rj1NPEWOH4.mp4
Downloaded successfully: songs\W4C-NEWrnSQ.mp4
Downloaded successfully: songs\Zxnnuj4Vp_g.mp4
Downloaded su

In [32]:
# Turn the list of per-track records into a DataFrame (one row per track).
track_data_df = pd.DataFrame(track_data)

In [33]:
# Bare expression: the notebook renders the DataFrame as the cell output.
track_data_df

Unnamed: 0,track_name,track_id,track_number,disc_number,duration_ms,explicit,popularity,preview_url,isrc,album_name,...,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,lyrics,audio_file_path
0,Lose Yourself,7MJQ9Nfxzh8LPZ9e9u68Fq,3,2,320626,True,68,,USIR10211559,SHADYXV,...,2,0.365,-4.545,1,0.267,171.403,4,0.059,"Hey, Em, it's Paul\r\nUh, I was listening to t...",songs\soNLLPokjC4.mp4
1,You Belong With Me,1GEBsLDvJGw7kviySRI6GX,12,1,232120,False,68,,USCJY0803328,Fearless (Big Machine Radio Release Special),...,6,0.111,-4.432,1,0.0379,129.966,4,0.435,"You're on the phone with your girlfriend, she'...",songs\LYyUr75fs_o.mp4
2,Easy On Me,0gplL1WMoJ6iYaPgMCL0gX,1,1,224694,False,76,https://p.scdn.co/mp3-preview/a0cd8077c79a4aa3...,USSM12105970,Easy On Me,...,5,0.133,-7.519,1,0.0282,141.981,4,0.13,There ain't no gold in this river\nThat I've b...,songs\H9NJenpBV2I.mp4
3,Faded,698ItKASDavgwZ3WjaWjtz,15,1,212106,False,78,https://p.scdn.co/mp3-preview/dd79198f4b4c43ae...,NOG841549010,Different World,...,6,0.11,-5.085,1,0.0476,179.642,4,0.159,"You were the shadow to my light, did you feel ...",songs\pIWaVJPl0-c.mp4
4,Alone,3LlmKSHR3Rs0Y3KHQLAYDk,12,1,160426,False,72,https://p.scdn.co/mp3-preview/ab66780b7a2f2245...,NOG841617010,Different World,...,10,0.186,-3.962,1,0.0496,97.021,4,0.183,"Lost in your mind, I wanna know\nAm I losin' m...",songs\aZwklvDdaVw.mp4
5,Therefore I Am,20R4HfKloPKgXDqU7UKk3x,14,1,173539,False,70,,USUM72021500,Happier Than Ever,...,2,0.0583,-7.781,1,0.0924,94.016,4,0.663,"I'm not your friend or anything, damn\nYou thi...",songs\oDn4eKyhSH4.mp4
6,Happier Than Ever,4RVwu0g32PAqgUiJoXsdF8,15,1,298899,True,80,,USUM72105936,Happier Than Ever,...,0,0.128,-8.697,1,0.0348,81.055,3,0.297,"When I'm away from you, I'm happier than ever\...",songs\NSTUVHsb9xw.mp4
7,Despacito,6habFhsOp2NvshLv26DqMb,9,1,229360,False,77,,USUM71607007,VIDA,...,2,0.067,-4.787,1,0.153,177.928,4,0.839,"¡Ay!\r\nFonsi, DY\r\nOh, oh no, oh no (oh)\r\n...",songs\FXovf5dsRTw.mp4
8,Cradles,18A7ha5BitZjmdHTCwXFbU,1,1,209829,False,0,https://p.scdn.co/mp3-preview/584c63beb79d87d1...,GB2LD1800949,Cradles,...,1,0.179,-1.87,1,0.369,67.443,3,0.63,I live inside my own world of make-believe\nKi...,songs\8_Cw99SiHvc.mp4
9,Shivers,75MNhvTCCKsST3YqqUiU9r,1,1,207853,False,15,https://p.scdn.co/mp3-preview/08cec59d36ac30ae...,GBAHS2100671,Shivers,...,2,0.0424,-2.724,1,0.0856,141.02,4,0.822,I took an arrow to the heart\nI never kissed a...,songs\G1ej5up7JG0.mp4


In [35]:
# Pretty-print the lyrics stored in the row labelled 12 as a spot check.
pprint(track_data_df.loc[12]['lyrics'])

('I do the same thing I told you that I never would\n'
 'I told you I changed, even when I knew I never could\n'
 "Know that I can't find nobody else as good as you\n"
 'I need you to stay, need you to stay, hey\n'
 '\n'
 "I get drunk, wake up, I'm wasted still\n"
 'I realize the time that I wasted here\n'
 "I feel like you can't feel the way I feel\n"
 "I'll be fucked up if you can't be right here\n"
 '\n'
 'Oh, whoa (oh, whoa, whoa)\n'
 'Oh, whoa (oh, whoa)\n'
 'Oh, whoa (oh, whoa)\n'
 "I'll be fucked up if you can't be right here\n"
 '\n'
 'I do the same thing I told you that I never would\n'
 'I told you I changed, even when I knew I never could\n'
 "Know that I can't find nobody else as good as you\n"
 'I need you to stay, need you to stay, hey\n'
 '\n'
 'I do the same thing I told you that I never would\n'
 'I told you I changed, even when I knew I never could\n'
 "Know that I can't find nobody else as good as you\n"
 'I need you to stay, need you to stay, yeah\n'
 '\n'
 "When I'

In [36]:
# Save the DataFrame as a pickle file.
track_data_df.to_pickle('Track_data_complete.pkl')

In [37]:
# Save the DataFrame as CSV.
# NOTE(review): to_csv writes the index as an unnamed first column by default,
# which shows up as "Unnamed: 0" when the file is read back; pass index=False
# if that column is unwanted -- confirm with whatever reads this file.
track_data_df.to_csv('Track_data_complete_csv.csv')