In [3]:
# 📦 Install Dask (if not already installed)
!pip install dask

# 🧠 Import libraries
import dask.dataframe as dd
import time
import psutil
import os

# Get process for memory tracking
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss / 1024 ** 2
start_time = time.time()

# ✅ Define required columns
required_columns = ['Artist(s)', 'song', 'Genre', 'Tempo', 'Popularity', 'Energy', 'Danceability']

# ✅ Load only selected columns using Dask
df_dask = dd.read_csv("spotify_dataset.csv", usecols=required_columns)

# 🧾 View structure
print("🔹 Shape (approx):", df_dask.shape)
print("🔹 Columns:", df_dask.columns)
print(f"🔸 Number of Partitions: {df_dask.npartitions}")

# Optimize data type
df_dask['Genre'] = df_dask['Genre'].astype('category')
df_dask['Popularity'] = df_dask['Popularity'].astype('int32')
df_dask['Energy'] = df_dask['Energy'].astype('int32')
df_dask['Danceability'] = df_dask['Danceability'].astype('int32')
df_dask['Tempo'] = df_dask['Tempo'].astype('float32')


# Take 1% sample
sample_df = df_dask.sample(frac=0.01, random_state=42)
print("📊 Sample Shape:", sample_df.shape)

# Get statistics (triggers computation)
result = df_dask.describe().compute()
print("📉 Summary Statistics:\n", result)


🔹 Shape (approx): (<dask_expr.expr.Scalar: expr=ArrowStringConversion(frame=FromMapProjectable(ddbf4d7)).size() // 7, dtype=int64>, 7)
🔹 Columns: Index(['Artist(s)', 'song', 'Genre', 'Tempo', 'Popularity', 'Energy',
       'Danceability'],
      dtype='object')
🔸 Number of Partitions: 1
📊 Sample Shape: (<dask_expr.expr.Scalar: expr=(Sample(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=Assign(frame=ArrowStringConversion(frame=FromMapProjectable(ddbf4d7))))))), state_data=[array([1608637542, 3421126067, 4083286876,  787846414, 3143890026,
       3348747335, 2571218620, 2563451924,  670094950, 1914837113,
        669991378,  429389014,  249467210, 1972458954, 3720198231,
       1433267572, 2581769315,  613608295, 3041148567, 2795544706,
         88409749,  242285876, 4165731073, 3100961111, 3575313899,
       4031053213,  911989541,    3344769,  780932287, 4261516219,
        787716372, 2652062880, 1306710475, 2627030329, 2253811733,
         30349564, 1855189739,   99052376, 

In [None]:
mem_after = process.memory_info().rss / 1024 ** 2
end_time = time.time()

print("\n📈 Performance Measurement (With Dask Optimization)")
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"Memory Used: {mem_after - mem_before:.2f} MB")



📈 Performance Measurement (With Dask Optimization)
Execution Time: 248.10 seconds
Memory Used: 39.59 MB
