In [1]:
import pandas as pd
import time
import psutil
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# file path
file_path = "/content/drive/My Drive/spotify_dataset.csv"

### Load Dataset in Chunks (Handling 3GB File) ###
def load_data(file_path, chunk_size=500000):
    start_time = time.time()
    chunk_list = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, usecols=["title", "artist", "streams", "rank", "region", "date"], low_memory=False):
        chunk["date"] = pd.to_datetime(chunk["date"], errors='coerce')
        chunk["streams"] = chunk["streams"].fillna(chunk["streams"].mean())
        chunk_list.append(chunk)
    df = pd.concat(chunk_list, ignore_index=True)
    execution_time = time.time() - start_time
    print(f"Dataset loaded in {execution_time:.2f} seconds")
    return df

df = load_data(file_path)
print(df.info())
print(df.head())

Mounted at /content/drive
Dataset loaded in 107.03 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26173514 entries, 0 to 26173513
Data columns (total 6 columns):
 #   Column   Dtype         
---  ------   -----         
 0   title    object        
 1   rank     int64         
 2   date     datetime64[ns]
 3   artist   object        
 4   region   object        
 5   streams  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 1.2+ GB
None
                         title  rank       date  \
0      Chantaje (feat. Maluma)     1 2017-01-01   
1  Vente Pa' Ca (feat. Maluma)     2 2017-01-01   
2   Reggaetón Lento (Bailemos)     3 2017-01-01   
3                       Safari     4 2017-01-01   
4                  Shaky Shaky     5 2017-01-01   

                                  artist     region   streams  
0                                Shakira  Argentina  253019.0  
1                           Ricky Martin  Argentina  223988.0  
2         

In [2]:
### Step 1: Find the Most Streamed Songs Globally ###
def most_streamed_songs(df):
    start_time = time.time()
    result = df.groupby(["title", "artist"]).agg({"streams": "sum"}).reset_index()
    result = result.sort_values(by="streams", ascending=False).head(10)
    execution_time = time.time() - start_time
    print(f"Most streamed songs calculated in {execution_time:.2f} seconds")
    return result

most_streamed = most_streamed_songs(df)
print(most_streamed)


Most streamed songs calculated in 9.62 seconds
                                                title  \
20085                                 Blinding Lights   
137981                                   Shape of You   
34788                                    Dance Monkey   
142863                              Someone You Loved   
147292  Sunflower - Spider-Man: Into the Spider-Verse   
137732                                       Señorita   
177159                                        bad guy   
40565                                 Don't Start Now   
92814                                    Lucid Dreams   
17471                                        Believer   

                              artist       streams  
20085                     The Weeknd  5.620908e+09  
137981                    Ed Sheeran  5.443651e+09  
34788                    Tones And I  5.263134e+09  
142863                 Lewis Capaldi  4.320116e+09  
147292         Post Malone, Swae Lee  3.962621e+09  
137732 

In [3]:
### Step 2: Find the Top-Ranked Song for Each Region ###
def top_ranked_songs(df):
    start_time = time.time()
    result = df.loc[df.groupby("region")["rank"].idxmin(), ["region", "title", "artist", "rank"]]
    execution_time = time.time() - start_time
    print(f"Top ranked songs per region calculated in {execution_time:.2f} seconds")
    return result

top_ranked = top_ranked_songs(df)
print(top_ranked.head(10))


Top ranked songs per region calculated in 4.30 seconds
          region                       title  \
27430    Andorra                     Sirenas   
0      Argentina     Chantaje (feat. Maluma)   
201    Australia                     Starboy   
402      Austria                       Alone   
603      Belgium                    Hey Baby   
803      Bolivia  Reggaetón Lento (Bailemos)   
869       Brazil                    Deu Onda   
1070    Bulgaria                     Starboy   
1071      Canada                     Starboy   
1273       Chile     Chantaje (feat. Maluma)   

                                                 artist  rank  
27430                                          Taburete     1  
0                                               Shakira     1  
201                               The Weeknd, Daft Punk     1  
402                                         Alan Walker     1  
603    Dimitri Vegas & Like Mike, Diplo, Deb’s Daughter     1  
803                             

In [4]:
### Step 3: Track an Artist's Rank Over Time ###
def artist_rank_over_time(df, artist_name="Ed Sheeran"):
    start_time = time.time()
    result = df[df["artist"] == artist_name][["date", "rank", "title"]].sort_values(by="date")
    execution_time = time.time() - start_time
    print(f"{artist_name}'s ranking over time calculated in {execution_time:.2f} seconds")
    return result

artist_rank = artist_rank_over_time(df)
print(artist_rank.head(10))


Ed Sheeran's ranking over time calculated in 2.24 seconds
           date  rank                                              title
309  2017-01-01   109                                  Thinking out Loud
5318 2017-01-01   134                                  Thinking out Loud
5464 2017-01-01    80                                  Thinking out Loud
5504 2017-01-01   120                                         Photograph
5572 2017-01-01   188  I See Fire - From "The Hobbit - The Desolation...
5744 2017-01-01   139                                         Photograph
5755 2017-01-01   150                                  Thinking out Loud
5775 2017-01-01   170  I See Fire - From "The Hobbit - The Desolation...
6142 2017-01-01   122                                  Thinking out Loud
6180 2017-01-01   159                                         Photograph


In [5]:
import numpy as np

### Optimized Step 4: Identify Trending Songs (Streaming Growth Rate) ###
def trending_songs(df):
    start_time = time.time()

    # Use vectorized shift instead of apply
    df["previous_rank"] = df.groupby(["title", "region"])["rank"].shift(1)

    # Avoid apply: Use NumPy for performance
    df["trend"] = np.where(df["rank"] < df["previous_rank"], "TREND_UP",
                           np.where(df["rank"] > df["previous_rank"], "TREND_DOWN", "SAME_POSITION"))

    execution_time = time.time() - start_time
    print(f"Trending songs calculated in {execution_time:.2f} seconds")

    return df[["title", "region", "rank", "previous_rank", "trend"]].head(10)

# Run the optimized function
trending = trending_songs(df)
print(trending)


Trending songs calculated in 16.02 seconds
                         title     region  rank  previous_rank          trend
0      Chantaje (feat. Maluma)  Argentina     1            NaN  SAME_POSITION
1  Vente Pa' Ca (feat. Maluma)  Argentina     2            NaN  SAME_POSITION
2   Reggaetón Lento (Bailemos)  Argentina     3            NaN  SAME_POSITION
3                       Safari  Argentina     4            NaN  SAME_POSITION
4                  Shaky Shaky  Argentina     5            NaN  SAME_POSITION
5                  Traicionera  Argentina     6            NaN  SAME_POSITION
6      Cuando Se Pone a Bailar  Argentina     7            NaN  SAME_POSITION
7    Otra vez (feat. J Balvin)  Argentina     8            NaN  SAME_POSITION
8                 La Bicicleta  Argentina     9            NaN  SAME_POSITION
9       Dile Que Tu Me Quieres  Argentina    10            NaN  SAME_POSITION
