
## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/data/Most_Streamed_Spotify_Songs_2024.csv"
file_type = "csv"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,2024-04-26,QM24S2402528,1,725.4,390470936.0,30716.0,196631588.0,92.0,84274754.0,1713126.0,5767700.0,651565900.0,5332281936.0,150597040.0,210.0,40975.0,684.0,62.0,17598718.0,114.0,18004655.0,22931.0,4818457.0,2669262.0,,0
Not Like Us,Not Like Us,Kendrick Lamar,2024-05-04,USUG12400910,2,545.9,323703884.0,28113.0,174597137.0,92.0,116347040.0,3486739.0,674700.0,35223547.0,208339025.0,156380351.0,188.0,40778.0,3.0,67.0,10422430.0,111.0,7780028.0,28444.0,6623075.0,1118279.0,,1
i like the way you kiss me,I like the way you kiss me,Artemas,2024-03-19,QZJ842400387,3,538.4,601309283.0,54331.0,211607669.0,92.0,122599116.0,2228730.0,3025400.0,275154237.0,3369120610.0,373784955.0,190.0,74333.0,536.0,136.0,36321847.0,172.0,5022621.0,5639.0,7208651.0,5285340.0,,0
Flowers,Flowers - Single,Miley Cyrus,2023-01-12,USSM12209777,4,444.9,2031280633.0,269802.0,136569078.0,85.0,1096100899.0,10629796.0,7189811.0,1078757968.0,14603725994.0,3351188582.0,394.0,1474799.0,2182.0,264.0,24684248.0,210.0,190260277.0,203384.0,,11822942.0,,0
Houdini,Houdini,Eminem,2024-05-31,USUG12403398,5,423.3,107034922.0,7223.0,151469874.0,88.0,77373957.0,3670188.0,16400.0,,,112763851.0,182.0,12185.0,1.0,82.0,17660624.0,105.0,4493884.0,7006.0,207179.0,457017.0,,1
Lovin On Me,Lovin On Me,Jack Harlow,2023-11-10,USAT22311371,6,410.1,670665438.0,105892.0,175421034.0,83.0,131148091.0,1392593.0,4202367.0,214943489.0,2938686633.0,2867222632.0,138.0,522042.0,4654.0,86.0,17167254.0,152.0,138529362.0,50982.0,9438601.0,4517131.0,,1
Beautiful Things,Beautiful Things,Benson Boone,2024-01-18,USWB12307016,7,407.2,900158751.0,73118.0,201585714.0,86.0,308723145.0,4120760.0,,29584940.0,534915313.0,4601579812.0,280.0,383478.0,429.0,168.0,48197850.0,154.0,65447476.0,57372.0,,9990302.0,,0
Gata Only,Gata Only,FloyyMenor,2024-02-02,QZL382406049,8,375.8,675079153.0,40094.0,211236940.0,92.0,228382568.0,1439495.0,3500000.0,338546668.0,3804584163.0,2112581620.0,160.0,17221.0,30.0,87.0,33245595.0,53.0,3372428.0,5762.0,,6063523.0,,1
Danza Kuduro - Cover,��������������������� - ������������������ -,MUSIC LAB JPN,2024-06-09,TCJPA2463708,9,355.7,1653018119.0,1.0,15.0,,,,,,,,,,,,,,,,,,,1
BAND4BAND (feat. Lil Baby),BAND4BAND (feat. Lil Baby),Central Cee,2024-05-23,USSM12404354,10,330.6,90676573.0,10400.0,184199419.0,86.0,32735244.0,988682.0,325800.0,121574500.0,974656200.0,174706874.0,191.0,3823.0,117.0,78.0,10800098.0,92.0,1005626.0,842.0,3679709.0,666302.0,,1


In [0]:

# Create a view or table

temp_table_name = "Most_Streamed_Spotify_Songs_2024_csv"

df.createOrReplaceTempView(temp_table_name)

EXPLORATORY ANALYSIS 

In [0]:
#column data types
print(df.dtypes)

df.printSchema()

#this format of reading in data does not recognize the appropriate data types

[('Track', 'string'), ('Album Name', 'string'), ('Artist', 'string'), ('Release Date', 'date'), ('ISRC', 'string'), ('All Time Rank', 'string'), ('Track Score', 'double'), ('Spotify Streams', 'string'), ('Spotify Playlist Count', 'string'), ('Spotify Playlist Reach', 'string'), ('Spotify Popularity', 'int'), ('YouTube Views', 'string'), ('YouTube Likes', 'string'), ('TikTok Posts', 'string'), ('TikTok Likes', 'string'), ('TikTok Views', 'string'), ('YouTube Playlist Reach', 'string'), ('Apple Music Playlist Count', 'int'), ('AirPlay Spins', 'string'), ('SiriusXM Spins', 'string'), ('Deezer Playlist Count', 'int'), ('Deezer Playlist Reach', 'string'), ('Amazon Playlist Count', 'int'), ('Pandora Streams', 'string'), ('Pandora Track Stations', 'string'), ('Soundcloud Streams', 'string'), ('Shazam Counts', 'string'), ('TIDAL Popularity', 'string'), ('Explicit Track', 'int')]
root
 |-- Track: string (nullable = true)
 |-- Album Name: string (nullable = true)
 |-- Artist: string (nullable = 

In [0]:
#Second Read-In
from pyspark.sql.types import *

schema = StructType([
    StructField("Track", StringType(),True),
    StructField("Album Name", StringType(), True),
    StructField("Artist", StringType(), True),
    StructField("Release Date", DateType(), True),
    StructField("ISRC", IntegerType(), True),
    StructField("All Time Rank", IntegerType(), True),
    StructField("Track Score", IntegerType(), True),
    StructField("Spotify Streams", IntegerType(), True),
    StructField("Spotify Playlist Count", IntegerType(), True),
    StructField("Spotify Playlist Reach", IntegerType(), True),
    StructField("Spotify Popularity", IntegerType(), True),
    StructField("YouTube Views", IntegerType(), True),
    StructField("YouTube Likes", IntegerType(), True),
    StructField("TikTok Posts", IntegerType(), True),
    StructField("TikTok Likes", IntegerType(), True),
    StructField("TikTok Views", IntegerType(), True),
    StructField("YouTube Playlist Reach", IntegerType(), True),
    StructField("Apple Music Playlist Count", IntegerType(), True),
    StructField("AirPlay Spins", IntegerType(), True),
    StructField("SiriusXM Spins", IntegerType(), True),
    StructField("Deezer Playlist Count", IntegerType(), True),         
    StructField("Deezer Playlist Reach", IntegerType(), True),
     StructField("Amazon Playlist Count", IntegerType(), True),
    StructField("Pandora Streams", IntegerType(), True),
    StructField("Pandora Track Stations", IntegerType(), True),
    StructField("Soundcloud Streams", IntegerType(), True),
    StructField("Shazam Counts", IntegerType(), True),
    StructField("TIDAL Popularity", IntegerType(), True),
    StructField("Explicit Track", IntegerType(), True)
    ])

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
stream_df = spark.read.csv(file_location, header=True, schema =schema, dateFormat="dd/MM/yyyy")

stream_df.printSchema()

root
 |-- Track: string (nullable = true)
 |-- Album Name: string (nullable = true)
 |-- Artist: string (nullable = true)
 |-- Release Date: date (nullable = true)
 |-- ISRC: integer (nullable = true)
 |-- All Time Rank: integer (nullable = true)
 |-- Track Score: integer (nullable = true)
 |-- Spotify Streams: integer (nullable = true)
 |-- Spotify Playlist Count: integer (nullable = true)
 |-- Spotify Playlist Reach: integer (nullable = true)
 |-- Spotify Popularity: integer (nullable = true)
 |-- YouTube Views: integer (nullable = true)
 |-- YouTube Likes: integer (nullable = true)
 |-- TikTok Posts: integer (nullable = true)
 |-- TikTok Likes: integer (nullable = true)
 |-- TikTok Views: integer (nullable = true)
 |-- YouTube Playlist Reach: integer (nullable = true)
 |-- Apple Music Playlist Count: integer (nullable = true)
 |-- AirPlay Spins: integer (nullable = true)
 |-- SiriusXM Spins: integer (nullable = true)
 |-- Deezer Playlist Count: integer (nullable = true)
 |-- Deezer 

In [0]:
display(stream_df)

Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,2026-02-04,,1.0,,,,,92.0,,,,,,,210.0,,684.0,62.0,,114.0,,,,,,0
Not Like Us,Not Like Us,Kendrick Lamar,2024-04-05,,2.0,,,,,92.0,,,,,,,188.0,,3.0,67.0,,111.0,,,,,,1
i like the way you kiss me,I like the way you kiss me,Artemas,2025-07-03,,3.0,,,,,92.0,,,,,,,190.0,,536.0,136.0,,172.0,,,,,,0
Flowers,Flowers - Single,Miley Cyrus,2023-12-01,,4.0,,,,,85.0,,,,,,,394.0,,,264.0,,210.0,,,,,,0
Houdini,Houdini,Eminem,2026-07-05,,5.0,,,,,88.0,,,,,,,182.0,,1.0,82.0,,105.0,,,,,,1
Lovin On Me,Lovin On Me,Jack Harlow,2023-10-11,,6.0,,,,,83.0,,,,,,,138.0,,,86.0,,152.0,,,,,,1
Beautiful Things,Beautiful Things,Benson Boone,2025-06-01,,7.0,,,,,86.0,,,,,,,280.0,,429.0,168.0,,154.0,,,,,,0
Gata Only,Gata Only,FloyyMenor,2024-02-02,,8.0,,,,,92.0,,,,,,,160.0,,30.0,87.0,,53.0,,,,,,1
Danza Kuduro - Cover,��������������������� - ������������������ -,MUSIC LAB JPN,2024-09-06,,9.0,,,1.0,15.0,,,,,,,,,,,,,,,,,,,1
BAND4BAND (feat. Lil Baby),BAND4BAND (feat. Lil Baby),Central Cee,2025-11-05,,10.0,,,,,86.0,,,,,,,191.0,,117.0,78.0,,92.0,,842.0,,,,1


In [0]:
stream_df.dtypes

Out[15]: [('Track', 'string'),
 ('Album Name', 'string'),
 ('Artist', 'string'),
 ('Release Date', 'date'),
 ('ISRC', 'int'),
 ('All Time Rank', 'int'),
 ('Track Score', 'int'),
 ('Spotify Streams', 'int'),
 ('Spotify Playlist Count', 'int'),
 ('Spotify Playlist Reach', 'int'),
 ('Spotify Popularity', 'int'),
 ('YouTube Views', 'int'),
 ('YouTube Likes', 'int'),
 ('TikTok Posts', 'int'),
 ('TikTok Likes', 'int'),
 ('TikTok Views', 'int'),
 ('YouTube Playlist Reach', 'int'),
 ('Apple Music Playlist Count', 'int'),
 ('AirPlay Spins', 'int'),
 ('SiriusXM Spins', 'int'),
 ('Deezer Playlist Count', 'int'),
 ('Deezer Playlist Reach', 'int'),
 ('Amazon Playlist Count', 'int'),
 ('Pandora Streams', 'int'),
 ('Pandora Track Stations', 'int'),
 ('Soundcloud Streams', 'int'),
 ('Shazam Counts', 'int'),
 ('TIDAL Popularity', 'int'),
 ('Explicit Track', 'int')]

In [0]:
stream_df.select("Track").show()

+--------------------+
|               Track|
+--------------------+
| MILLION DOLLAR BABY|
|         Not Like Us|
|i like the way yo...|
|             Flowers|
|             Houdini|
|         Lovin On Me|
|    Beautiful Things|
|           Gata Only|
|Danza Kuduro - Cover|
|BAND4BAND (feat. ...|
|I Had Some Help (...|
|            The Door|
|               LUNCH|
|           Like That|
|      bathroom floor|
|                LALA|
|Fortnight (feat. ...|
|              greedy|
|                BLUE|
|           As It Was|
+--------------------+
only showing top 20 rows



In [0]:
stream_df.select(['Track', "Artist", "Album Name", "Release Date"]).show()

+--------------------+--------------+--------------------+------------+
|               Track|        Artist|          Album Name|Release Date|
+--------------------+--------------+--------------------+------------+
| MILLION DOLLAR BABY| Tommy Richman|Million Dollar Ba...|  2026-02-04|
|         Not Like Us|Kendrick Lamar|         Not Like Us|  2024-04-05|
|i like the way yo...|       Artemas|I like the way yo...|  2025-07-03|
|             Flowers|   Miley Cyrus|    Flowers - Single|  2023-12-01|
|             Houdini|        Eminem|             Houdini|  2026-07-05|
|         Lovin On Me|   Jack Harlow|         Lovin On Me|  2023-10-11|
|    Beautiful Things|  Benson Boone|    Beautiful Things|  2025-06-01|
|           Gata Only|    FloyyMenor|           Gata Only|  2024-02-02|
|Danza Kuduro - Cover| MUSIC LAB JPN|�����������������...|  2024-09-06|
|BAND4BAND (feat. ...|   Central Cee|BAND4BAND (feat. ...|  2025-11-05|
|I Had Some Help (...|   Post Malone|     I Had Some Help|  2024

In [0]:
stream_df.select(['Spotify Streams', "Tiktok Views", "Apple Music Playlist Count", "YouTube Views"]).show()

#INCONSISTENCY IN SCHEMA PROCESSING CONTINUES TO RETURN NULL VALUES

+---------------+------------+--------------------------+-------------+
|Spotify Streams|Tiktok Views|Apple Music Playlist Count|YouTube Views|
+---------------+------------+--------------------------+-------------+
|           null|        null|                       210|         null|
|           null|        null|                       188|         null|
|           null|        null|                       190|         null|
|           null|        null|                       394|         null|
|           null|        null|                       182|         null|
|           null|        null|                       138|         null|
|           null|        null|                       280|         null|
|           null|        null|                       160|         null|
|           null|        null|                      null|         null|
|           null|        null|                       191|         null|
|           null|        null|                       157|       