In [0]:
import base64
import requests
import pandas as pd
import json

# Spotify credentials
client_id = "0090f5edcfae4e96afc0bcc353e08dd1"
client_secret = "8e7147684cae4f29ac4c8e4c00e60b86"

# Function to get access token
def get_access_token(client_id, client_secret):
    print("Fetching access token...")
    url = "https://accounts.spotify.com/api/token"
    auth_str = f"{client_id}:{client_secret}"
    auth_base64 = base64.b64encode(auth_str.encode("utf-8")).decode("utf-8")
    headers = {
        "Authorization": f"Basic {auth_base64}",
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {"grant_type": "client_credentials"}
    response = requests.post(url, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()["access_token"]
    else:
        print(f"Error fetching token: {response.status_code}, {response.text}")
        return None

# Function to search for artist by name
def search_spotify(query, search_type, access_token):
    url = "https://api.spotify.com/v1/search"
    params = {"q": query, "type": search_type, "limit": 1}
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching {search_type}: {response.status_code}, {response.text}")
        return None

# Function to get artist data by Spotify ID
def get_artist_data(artist_id, access_token):
    url = f"https://api.spotify.com/v1/artists/{artist_id}"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        artist_info = response.json()
        artist_data = {
            "Name": artist_info["name"],
            "ID": artist_info["id"],
            "Followers": artist_info["followers"]["total"],
            "Genres": ", ".join(artist_info["genres"]),
            "Popularity": artist_info["popularity"],
            "Spotify URL": artist_info["external_urls"]["spotify"],
            "Image URL": artist_info["images"][0]["url"] if artist_info["images"] else None
        }
        return artist_data
    else:
        print(f"Error fetching artist data: {response.status_code}, {response.text}")
        return None

# Function to get top tracks (songs) for an artist
def get_top_tracks(artist_id, access_token):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=US"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        top_tracks_data = response.json()
        top_tracks = [track["name"] for track in top_tracks_data["tracks"]]
        top_track_popularity = [track["popularity"] for track in top_tracks_data["tracks"]]
        return top_tracks, top_track_popularity
    else:
        print(f"Error fetching top tracks: {response.status_code}, {response.text}")
        return [], []

# Function to fetch related artists for an artist
def get_related_artists(artist_id, access_token):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/related-artists"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        related_artists_data = response.json()
        related_artists = [artist["name"] for artist in related_artists_data["artists"]]
        return related_artists
    else:
        print(f"Error fetching related artists: {response.status_code}, {response.text}")
        return []

# Function to fetch albums for an artist
def get_albums(artist_id, access_token):
    url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        albums_data = response.json()
        album_dates = [album["release_date"] for album in albums_data["items"]]
        # album_popularity = [album["Popularity"] for album in albums_data["items"]]
        return album_dates
    else:
        print(f"Error fetching albums: {response.status_code}, {response.text}")
        return [], []

# Function to fetch audio features for top tracks
def get_audio_features(top_track_ids, access_token):
    url = f"https://api.spotify.com/v1/audio-features?ids={','.join(top_track_ids)}"
    headers = {"Authorization": f"Bearer {access_token}"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        audio_features_data = response.json()
        danceability = [features["danceability"] for features in audio_features_data["audio_features"]]
        energy = [features["energy"] for features in audio_features_data["audio_features"]]
        tempo = [features["tempo"] for features in audio_features_data["audio_features"]]
        return danceability, energy, tempo
    else:
        print(f"Error fetching audio features: {response.status_code}, {response.text}")
        return [], [], []

# Function to get full artist performance data
def get_full_artist_performance_data(artist_name, access_token):
    artist_data = search_spotify(artist_name, "artist", access_token)
    if artist_data and "artists" in artist_data and artist_data["artists"]["items"]:
        artist_info = artist_data["artists"]["items"][0]
        artist_id = artist_info["id"]
        
        # Fetch artist basic data
        artist_performance = get_artist_data(artist_id, access_token)
        
        # Fetch top tracks and related artists
        top_tracks, top_track_popularity = get_top_tracks(artist_id, access_token)
        related_artists = get_related_artists(artist_id, access_token)
        
        # Fetch albums and audio features
        album_dates = get_albums(artist_id, access_token)
        danceability, energy, tempo = get_audio_features(top_tracks, access_token)
        
        # Combine all performance data
        performance_data = {
            "Name": artist_performance["Name"],
            "ID": artist_performance["ID"],
            "Followers": artist_performance["Followers"],
            "Genres": artist_performance["Genres"],
            "Popularity": artist_performance["Popularity"],
            "Spotify URL": artist_performance["Spotify URL"],
            "Image URL": artist_performance["Image URL"],
            "Top Tracks": top_tracks,
            "Top Track Popularity": top_track_popularity,
            "Related Artists": related_artists,
            "Album Release Dates": album_dates,
            "Danceability": danceability,
            "Energy": energy,
            "Tempo": tempo
        }
        
        # Return a DataFrame
        return pd.DataFrame([performance_data])

    else:
        print(f"Artist '{artist_name}' not found.")
        return None

# Main function to execute
def main():
    token = get_access_token(client_id, client_secret)
    print("-----")
    if not token:
        return

    # List of 50 artists you want to analyze
    artist_names = [
        "Arijit Singh", "Ed Sheeran", "Taylor Swift", "Ariana Grande", "Drake",
        "BTS", "Billie Eilish", "Post Malone", "Justin Bieber", "Shawn Mendes",
        "Dua Lipa", "The Weeknd", "Kendrick Lamar", "Kanye West", "Coldplay",
        "Imagine Dragons", "Bruno Mars", "Rihanna", "Halsey", "Adele",
        "Cardi B", "Travis Scott", "Lady Gaga", "Zayn Malik", "Selena Gomez",
        "Harry Styles", "Sam Smith", "Lil Nas X", "Megan Thee Stallion", "Doja Cat",
        "The Chainsmokers", "Eminem", "Nicki Minaj", "Maroon 5", "Lana Del Rey",
        "Shakira", "Camila Cabello", "Pitbull", "Katy Perry", "One Direction", 
        "Edith Piaf", "Luis Fonsi", "Shawn Mendes", "Charlie Puth", "Miley Cyrus", 
        "John Legend", "Alan Walker", "Alicia Keys", "Avicii", "Kesha", "Sia", "Rita Ora", "The Rolling Stones"
    ]

    all_artist_data = []
    # Fetch data for each artist
    for artist_name in artist_names:
        artist_data = get_full_artist_performance_data(artist_name, token)
        if artist_data is not None:
            all_artist_data.append(artist_data)
        
    # Combine data for all artists
    if all_artist_data:
        final_data = pd.concat(all_artist_data, ignore_index=True)
        # Save the data to CSV
        final_data.to_csv("artist_performance_data.csv", index=False)
        print("Artist performance data saved to 'artist_performance_data.csv'")
        
    else:
         print("No artist data found.")
    print(final_data)   
if __name__ == "__main__":
    main()

Fetching access token...
-----
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "error" : {
    "status" : 403
  }
}
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "error" : {
    "status" : 403
  }
}
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "error" : {
    "status" : 403
  }
}
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "error" : {
    "status" : 403
  }
}
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "error" : {
    "status" : 403
  }
}
Error fetching related artists: 404, {"error": {"status": 404, "message": "Not Found" } }
Error fetching audio features: 403, {
  "err

In [0]:
#import pandas as pd
df = pd.read_csv('/Workspace/Users/n01660392@humber.ca/artist_performance_data.csv')

display(df)

Name,ID,Followers,Genres,Popularity,Spotify URL,Image URL,Top Tracks,Top Track Popularity,Related Artists,Album Release Dates,Danceability,Energy,Tempo
Arijit Singh,4YRxDV8wJFPHPTeXepOstw,133168035,"filmi, modern bollywood",92,https://open.spotify.com/artist/4YRxDV8wJFPHPTeXepOstw,https://i.scdn.co/image/ab6761610000e5eb5ba2d75eb08a2d672f9b69b7,"['Sajni (From ""Laapataa Ladies"")', 'Apna Bana Le', 'Tujhe Kitna Chahne Lage (From ""Kabir Singh"")', 'Satranga (From ""ANIMAL"")', 'Heeriye (feat. Arijit Singh)', 'Humdard (From ""Ek Villain"")', 'O Maahi', 'Tum Hi Ho', 'Agar Tum Saath Ho (From ""Tamasha"")', 'Tainu Khabar Nahi - From ""Munjya""']","[79, 74, 77, 77, 76, 74, 71, 75, 69, 74]",[],"['2024-07-31', '2024-04-26', '2024-04-26', '2024-04-25', '2024-04-25', '2024-04-24', '2024-04-24', '2024-04-24', '2024-02-12', '2023-08-14', '2023-08-08', '2023-07-14', '2023-05-26', '2023-05-15', '2022-11-30', '2022-08-14', '2021-05-04', '2021-03-31', '2021-03-15', '2021-02-17']",[],[],[]
Ed Sheeran,6eUKZXaKkcviH0Ku9w2n3V,118303036,"pop, singer-songwriter pop, uk pop",90,https://open.spotify.com/artist/6eUKZXaKkcviH0Ku9w2n3V,https://i.scdn.co/image/ab6761610000e5eb784daff754ecfe0464ddbeb9,"['Merry Christmas', 'Shape of You', 'Perfect', 'Photograph', 'Shivers', 'Thinking out Loud', 'Under the Tree (from “That Christmas”)', 'Bad Habits', ""I Don't Care (with Justin Bieber)"", 'Castle on the Hill']","[82, 86, 86, 66, 83, 65, 79, 78, 74, 76]",[],"['2024-12-27', '2024-09-27', '2024-06-21', '2023-10-02', '2023-09-29', '2023-05-05', '2022-05-27', '2021-10-25', '2021-10-25', '2019-07-12', '2017-03-03', '2014-06-21', '2014-06-20', '2013', '2011-12-09', '2011-12-09', '2011-09-09', '2011-09-09', '2024-11-26', '2024-09-18']",[],[],[]
Taylor Swift,06HL4z0CvFAxyc27GXpf02,129785206,pop,100,https://open.spotify.com/artist/06HL4z0CvFAxyc27GXpf02,https://i.scdn.co/image/ab6761610000e5ebe672b5f553298dcdccb0e676,"['Cruel Summer', 'Fortnight (feat. Post Malone)', 'Christmas Tree Farm', 'I Can Do It With a Broken Heart', 'cardigan', 'Lover', 'august', 'Don’t Blame Me', 'Anti-Hero', 'Blank Space']","[88, 84, 85, 81, 82, 83, 82, 82, 81, 69]",[],"['2024-04-19', '2024-04-18', '2023-10-27', '2023-10-26', '2023-07-07', '2023-05-26', '2022-10-22', '2022-10-21', '2021-11-12', '2021-04-09', '2021-01-07', '2020-12-11', '2020-11-25', '2020-08-18', '2020-07-24', '2019-08-23', '2017-11-10', '2017-11-09', '2014-10-27', '2014-10-27']",[],[],[]
Ariana Grande,66CXWjxzNUsdJxJ2JdwvnR,102593265,pop,96,https://open.spotify.com/artist/66CXWjxzNUsdJxJ2JdwvnR,https://i.scdn.co/image/ab6761610000e5eb40b5c07ab77b6b1a9075fdc0,"['Santa Tell Me', ""we can't be friends (wait for your love)"", 'Santa, Can’t You Hear Me', 'Popular', 'What Is This Feeling?', 'Save Your Tears (Remix) (with Ariana Grande) - Bonus Track', '7 rings', 'the boy is mine', 'One Last Time', 'Last Christmas']","[94, 87, 85, 84, 84, 79, 82, 82, 82, 82]",[],"['2024-11-22', '2024-10-01', '2024-08-22', '2024-03-11', '2024-03-08', '2023-08-25', '2021-02-19', '2020-10-30', '2019-12-23', '2019-02-08', '2018-08-17', '2016-05-20', '2014-08-22', '2013-01-01', '2024-10-10', '2024-09-08', '2024-07-19', '2024-06-21', '2024-03-08', '2024-02-16']",[],[],[]
Drake,3TVXtAsR1Inumwj472S9r4,94924678,"canadian hip hop, canadian pop, hip hop, pop rap, rap",97,https://open.spotify.com/artist/3TVXtAsR1Inumwj472S9r4,https://i.scdn.co/image/ab6761610000e5eb4293385d324db8558179afd9,"['One Dance', 'WAIT FOR U (feat. Drake & Tems)', ""God's Plan"", 'Passionfruit', 'Jimmy Cooks (feat. 21 Savage)', 'Rich Baby Daddy (feat. Sexyy Red & SZA)', 'Circadian Rhythm', 'Teenage Fever', 'Fair Trade (with Travis Scott)', 'Headlines']","[85, 82, 81, 80, 79, 79, 78, 78, 78, 77]",[],"['2023-11-17', '2023-10-06', '2022-11-04', '2022-06-17', '2021-09-03', '2020-05-01', '2019-08-02', '2018-06-29', '2017-03-18', '2016-05-06', '2015-09-20', '2015-02-12', '2013-01-01', '2013-01-01', '2011-11-15', '2010-01-01', '2009', '2024-08-31', '2024-08-30', '2024-07-26']",[],[],[]
BTS,3Nrfpe0tUJi4K4DXYWgMUX,77343777,"k-pop, k-pop boy group, pop",88,https://open.spotify.com/artist/3Nrfpe0tUJi4K4DXYWgMUX,https://i.scdn.co/image/ab6761610000e5ebd642648235ebf3460d2d1f6a,"['Dynamite', 'My Universe', 'Left and Right (Feat. Jung Kook of BTS)', 'FAKE LOVE', 'Boy With Luv (feat. Halsey)', 'Butter', 'Life Goes On', 'Run BTS', 'Spring Day', 'Permission to Dance']","[78, 73, 73, 73, 73, 71, 72, 72, 71, 71]",[],"['2022-06-10', '2020-11-20', '2020-07-14', '2020-02-21', '2019-04-12', '2018-08-24', '2018-05-18', '2018-04-04', '2017-09-18', '2017-02-13', '2016-10-10', '2016-09-07', '2016-05-02', '2015-11-30', '2015-04-29', '2014-12-24', '2014-08-20', '2014-05-14', '2014-02-12', '2013-09-11']",[],[],[]
Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,105041600,"art pop, pop",96,https://open.spotify.com/artist/6qqNVTkY8uBg9cP3Jd7DAH,https://i.scdn.co/image/ab6761610000e5eb4a21b4760d2ecb7b0dcdc8da,"['BIRDS OF A FEATHER', 'WILDFLOWER', 'CHIHIRO', 'Guess featuring billie eilish', 'lovely (with Khalid)', 'LUNCH', 'What Was I Made For? [From The Motion Picture ""Barbie""]', 'ocean eyes', ""when the party's over"", 'TV']","[97, 93, 87, 82, 86, 85, 85, 72, 83, 82]",[],"['2024-05-17', '2021-07-30', '2019-03-29', '2024-08-01', '2024-05-21', '2023-07-13', '2023-05-09', '2022-07-21', '2021-07-28', '2021-02-22', '2021-01-21', '2020-11-12', '2020-07-30', '2020-02-13', '2019-11-13', '2019-07-11', '2019-01-09', '2018-11-20', '2018-04-19', '2018-03-30']",[],[],[]
Post Malone,246dkjvS1zLTtiykXe5h60,46205845,"dfw rap, melodic rap, pop, rap",91,https://open.spotify.com/artist/246dkjvS1zLTtiykXe5h60,https://i.scdn.co/image/ab6761610000e5ebe17c0aa1714a03d62b5ce4e0,"['I Had Some Help (Feat. Morgan Wallen)', 'Fortnight (feat. Post Malone)', 'Sunflower - Spider-Man: Into the Spider-Verse', 'Circles', 'rockstar (feat. 21 Savage)', 'Guy For That (Feat. Luke Combs)', 'Dial Drunk (with Post Malone)', 'Congratulations', 'Better Now', 'Pour Me A Drink (Feat. Blake Shelton)']","[84, 84, 81, 83, 80, 76, 78, 77, 77, 74]",[],"['2024-08-16', '2024-08-15', '2023-08-02', '2023-07-28', '2023-04-21', '2022-06-07', '2022-06-03', '2019-09-06', '2018-04-27', '2016-12-09', '2016-12-09', '2024-09-06', '2024-08-15', '2024-08-15', '2024-08-15', '2024-08-15', '2024-08-15', '2024-08-15', '2024-08-15', '2024-08-15']",[],[],[]
Justin Bieber,1uNFoZAHBGtllmzznpCI3s,79912279,"canadian pop, pop",93,https://open.spotify.com/artist/1uNFoZAHBGtllmzznpCI3s,https://i.scdn.co/image/ab6761610000e5eb8ae7f2aaa9817a704a87ea36,"['Mistletoe', 'STAY (with Justin Bieber)', 'Ghost', 'Love Yourself', 'Sorry', 'Beauty And A Beat', 'Baby', 'What Do You Mean?', 'bad guy', 'Confident']","[89, 82, 83, 82, 82, 79, 81, 80, 56, 79]",[],"['2021-10-08', '2021-03-26', '2021-03-19', '2020-02-14', '2015-11-13', '2014-05-13', '2013-01-01', '2012-06-15', '2012-01-01', '2011-01-01', '2011-01-01', '2011-01-01', '2010-01-01', '2010-01-01', '2009-01-01', '2023-09-15', '2023-02-23', '2022-08-30', '2022-04-29', '2022-04-27']",[],[],[]
Shawn Mendes,7n2wHs1TKAczGzO7Dd2rGr,44085353,"canadian pop, pop, viral pop",85,https://open.spotify.com/artist/7n2wHs1TKAczGzO7Dd2rGr,https://i.scdn.co/image/ab6761610000e5eb58b4b9419486550f6fda0535,"[""There's Nothing Holdin' Me Back"", 'Treat You Better', 'Stitches', 'Señorita', 'Mercy', 'Why Why Why', 'Heart of Gold', ""If I Can't Have You"", 'Monster (Shawn Mendes & Justin Bieber)', 'I Know What You Did Last Summer']","[84, 83, 82, 78, 78, 67, 69, 70, 66, 70]",[],"['2024-11-15', '2020-12-07', '2020-12-05', '2020-12-04', '2019-06-19', '2018-05-25', '2017-11-03', '2017-04-20', '2017-04-20', '2016-12-23', '2015-11-20', '2015-04-14', '2024-11-01', '2024-09-12', '2024-08-08', '2023-11-21', '2023-06-09', '2022-09-30', '2022-05-13', '2022-03-31']",[],[],[]


In [0]:
# Assuming df is a Spark DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Name                  53 non-null     object
 1   ID                    53 non-null     object
 2   Followers             53 non-null     int64 
 3   Genres                53 non-null     object
 4   Popularity            53 non-null     int64 
 5   Spotify URL           53 non-null     object
 6   Image URL             53 non-null     object
 7   Top Tracks            53 non-null     object
 8   Top Track Popularity  53 non-null     object
 9   Related Artists       53 non-null     object
 10  Album Release Dates   53 non-null     object
 11  Danceability          53 non-null     object
 12  Energy                53 non-null     object
 13  Tempo                 53 non-null     object
dtypes: int64(2), object(12)
memory usage: 5.9+ KB


In [0]:
# Assuming df is a Pandas DataFrame, convert it to a PySpark DataFrame first
spark_df = spark.createDataFrame(df)

# Now create or replace the temporary view
spark_df.createOrReplaceTempView("artist_performance_data")

In [0]:
# Top 10 artists by number of followers
top_followed_artists = spark_df.select('Name', 'Followers').orderBy('Followers', ascending=False).limit(10)

display(top_followed_artists)

Name,Followers
Arijit Singh,133168035
Taylor Swift,129785206
Ed Sheeran,118303036
Billie Eilish,105041600
Ariana Grande,102593265
The Weeknd,97412415
Eminem,95619328
Drake,94924678
Justin Bieber,79912279
BTS,77343777


Databricks visualization. Run in Databricks to view.

In [0]:
# Top 5 artists by popularity
top_artists = df[['Name', 'Popularity']].sort_values(by='Popularity', ascending=False).head(5)

display(top_artists)

Name,Popularity
Taylor Swift,100
Bruno Mars,97
The Weeknd,97
Drake,97
Kendrick Lamar,97


Databricks visualization. Run in Databricks to view.

In [0]:
# Step 1: Drop null values for Name and Popularity
df_clean = df.dropna(subset=['Name', 'Popularity'])

# Step 2: Get the bottom 5 artists by popularity in descending order
bottom_5_artists = df_clean[['Name', 'Popularity']].sort_values(by='Popularity', ascending=True).head(5).sort_values(by='Popularity', ascending=False)

display(bottom_5_artists)

Name,Popularity
ZAYN,79
Lil Nas X,76
Luis Fonsi,76
Rita Ora,73
Édith Piaf,63


Databricks visualization. Run in Databricks to view.

In [0]:
# Step 1: Drop null values for Name, Popularity, and Genres
df_clean = df.dropna(subset=['Name', 'Popularity', 'Genres'])

# Step 2: Split genres if they are comma-separated
df_clean['Genres'] = df_clean['Genres'].str.split(', ')

# Step 3: Flatten the genre list and count occurrences
all_genres = df_clean.explode('Genres')['Genres'].value_counts().reset_index()
all_genres.columns = ['Genre', 'Total_Artists']

# Step 4: Sort by the total number of artists and select top 5 genres
top_5_genres = all_genres.sort_values(by='Total_Artists', ascending=False).head(5)

# Step 5: Display the top genres in a table format
display(top_5_genres)

Genre,Total_Artists
pop,42
dance pop,10
rap,9
uk pop,6
canadian pop,5


Databricks visualization. Run in Databricks to view.

In [0]:
# Step 1: Drop null values for Name, Popularity, and Top Tracks
df_clean = df.dropna(subset=['Name', 'Popularity', 'Top Tracks'])

# Step 2: Get the top 10 artists by popularity
top_10_artists = df_clean[['Name', 'Popularity']].sort_values(by='Popularity', ascending=False).head(10)
top_10_artist_names = top_10_artists['Name'].tolist()

# Step 3: Filter the dataset for these top 10 artists
top_tracks_data = df_clean[df_clean['Name'].isin(top_10_artist_names)]

# Step 4: Extract one top track per artist (assuming "Top Tracks" contains a list or a comma-separated string)
top_tracks_data['Top Track'] = top_tracks_data['Top Tracks'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else x)

# Step 5: Select only required columns (Name, Top Track, Popularity)
top_tracks = top_tracks_data[['Name', 'Top Track', 'Popularity']].drop_duplicates(subset=['Name'])

# Display the top tracks for top 10 artists
display(top_tracks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_tracks_data['Top Track'] = top_tracks_data['Top Tracks'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else x)


Name,Top Track,Popularity
Taylor Swift,['Cruel Summer',100
Ariana Grande,['Santa Tell Me',96
Drake,['One Dance',97
Billie Eilish,['BIRDS OF A FEATHER',96
The Weeknd,['Timeless (with Playboi Carti)',97
Kendrick Lamar,['luther (with sza)',97
Kanye West,['Heartless',93
Bruno Mars,['Die With A Smile',97
Travis Scott,['FE!N (feat. Playboi Carti)',94
Eminem,['Without Me',93


Databricks visualization. Run in Databricks to view.

In [0]:
# Step 1: Drop null values for Name, Popularity, and Top Tracks
df_clean = df.dropna(subset=['Name', 'Popularity', 'Top Tracks'])

# Step 2: Get the bottom 10 artists by popularity
bottom_10_artists = df_clean[['Name', 'Popularity']].sort_values(by='Popularity').head(10)
bottom_10_artist_names = bottom_10_artists['Name'].tolist()

# Step 3: Filter the dataset for these bottom 10 artists
bottom_tracks_data = df_clean[df_clean['Name'].isin(bottom_10_artist_names)]

# Step 4: Extract one top track per artist (assuming "Top Tracks" contains a list or a comma-separated string)
bottom_tracks_data['Top Track'] = bottom_tracks_data['Top Tracks'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else x)

# Step 5: Select only required columns (Name, Top Track, Popularity)
bottom_tracks = bottom_tracks_data[['Name', 'Top Track', 'Popularity']].drop_duplicates(subset=['Name'])

display(bottom_tracks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bottom_tracks_data['Top Track'] = bottom_tracks_data['Top Tracks'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else x)


Name,Top Track,Popularity
Cardi B,['Please Me',81
ZAYN,['Dusk Till Dawn (feat. Sia) - Radio Edit',79
Lil Nas X,['INDUSTRY BABY (feat. Jack Harlow)',76
Édith Piaf,['Non,63
Luis Fonsi,['Despacito',76
Charlie Puth,"[""We Don't Talk Anymore (feat. Selena Gomez)""",82
John Legend,['All of Me',80
Alicia Keys,"[""If I Ain't Got You""",81
Rita Ora,['For You (With Rita Ora)',73
The Rolling Stones,['Paint It,80


Databricks visualization. Run in Databricks to view.

In [0]:
# Example data
data = {
    'Name': ['Taylor Swift'],
    'Top Tracks': ["['Cruel Summer', 'Fortnight (feat. Post Malone)', 'I Can Do It With a Broken Heart', 'august', 'cardigan', 'Lover', 'Don’t Blame Me', 'Anti-Hero', 'Blank Space', 'Guilty as Sin?']"],
    'Popularity': ["[89, 84, 82, 83, 83, 83, 82, 81, 69, 77]"]
}

# Step 1: Create a DataFrame
df = spark.createDataFrame(pd.DataFrame(data))

# Step 2: Filter for Taylor Swift and process the Top Tracks and Popularity
taylor_swift_data = df.filter(df['Name'] == 'Taylor Swift').collect()[0]
tracks = eval(taylor_swift_data['Top Tracks'])  # Convert string to list
popularity = eval(taylor_swift_data['Popularity'])  # Convert string to list

# Step 3: Pair tracks with popularity and sort
track_popularity = sorted(zip(tracks, popularity), key=lambda x: x[1], reverse=True)

# Step 4: Get the top 5 tracks
top_5_tracks = track_popularity[:5]

# Step 5: Create a table
top_5_df = spark.createDataFrame(top_5_tracks, ['Track', 'Popularity'])

# Display the table
display(top_5_df)

Track,Popularity
Cruel Summer,89
Fortnight (feat. Post Malone),84
august,83
cardigan,83
Lover,83


Databricks visualization. Run in Databricks to view.

In [0]:
# Example data for Taylor Swift's albums
data = {
    'Name': ['Taylor Swift', 'Taylor Swift', 'Taylor Swift', 'Taylor Swift'],
    'Album': ['Fearless', '1989', 'Reputation', 'Lover'],
    'Release Date': ['2008-11-11', '2014-10-27', '2017-11-10', '2019-08-23'],
    'Popularity': [85, 90, 88, 87]
}

# Step 1: Create a DataFrame
df_albums = spark.createDataFrame(pd.DataFrame(data))

# Step 2: Filter for Taylor Swift's albums
taylor_swift_albums = df_albums.filter(df_albums['Name'] == 'Taylor Swift')

# Step 3: Convert 'Release Date' to date type
taylor_swift_albums = taylor_swift_albums.withColumn('Release Date', taylor_swift_albums['Release Date'].cast('date'))

# Step 4: Sort albums by release date
sorted_albums = taylor_swift_albums.orderBy('Release Date')

# Display the sorted albums
display(sorted_albums)

Name,Album,Release Date,Popularity
Taylor Swift,Fearless,2008-11-11,85
Taylor Swift,1989,2014-10-27,90
Taylor Swift,Reputation,2017-11-10,88
Taylor Swift,Lover,2019-08-23,87
