# Spotify Streaming History Analysis

## Data Loading

In [None]:

import json
import pandas as pd

# Streaming-history JSON files to read, in order
files = [
    "StreamingHistory0.json",
    "StreamingHistory1.json",
    "StreamingHistory2.json",
    "StreamingHistory3.json"
]

# Parse each file and append its records to one flat list
data = []
for path in files:
    with open(path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    data += records

# Build the DataFrame and parse the endTime strings into datetimes
df = pd.DataFrame(data).assign(
    endTime=lambda frame: pd.to_datetime(frame['endTime'])
)

# Preview the first rows
df.head()


## Analysis

### Most Listened-to Artists

In [None]:

from collections import defaultdict

# Total msPlayed per artist, largest first
artist_time_played = (
    df.groupby('artistName')['msPlayed']
    .sum()
    .sort_values(ascending=False)
)
# Show the ten artists with the highest totals
artist_time_played.head(10)


### Most Played Tracks

In [None]:

# Number of history rows per (artist, track) pair, largest first
track_groups = df.groupby(['artistName', 'trackName'])
most_played_tracks = track_groups.size().sort_values(ascending=False)
# Show the ten pairs with the most rows
most_played_tracks.head(10)


### Total Listening Time

In [None]:

# Sum of the msPlayed column over the whole history
ms_played = df['msPlayed']
total_listening_time = ms_played.sum()
total_listening_time


### Distribution of Listening by Date

In [None]:

# Total msPlayed per calendar date, keyed by the date part of endTime
play_dates = df['endTime'].dt.date
listening_by_date = df.groupby(play_dates)['msPlayed'].sum()
# Preview the first few dates
listening_by_date.head()


### Hourly Listening Activity by Day of the Week

In [None]:
# Label every row with the weekday name of its endTime
df['day_of_week'] = df['endTime'].dt.day_name()

# Total msPlayed per weekday, listed Monday through Sunday.
# Weekdays with no rows become NaN after the reindex.
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
listening_by_day = (
    df.groupby('day_of_week')['msPlayed']
    .sum()
    .reindex(ordered_days)
)

# Elapsed span between the earliest and latest endTime, expressed in weeks.
# This approximates how many times each weekday occurs; it is not a count of
# distinct calendar weeks. The value is clamped to at least one week because
# a history spanning less than a day would give a divisor of 0, and a span
# shorter than a week would give a fraction below 1 that inflates the
# averages.
span_days = (df['endTime'].max() - df['endTime'].min()).days
num_weeks = max(span_days / 7, 1)
average_listening_by_day = listening_by_day / num_weeks

average_listening_by_day


In [None]:
# Label every row with the hour of its endTime
df['hour'] = df['endTime'].dt.hour

# Total msPlayed per (weekday, hour), pivoted so that weekdays are rows and
# hours are columns, with the rows listed in ordered_days order
listening_by_day_hour = (
    df.groupby(['day_of_week', 'hour'])['msPlayed']
    .sum()
    .unstack()
    .reindex(ordered_days)
)

# Scale every cell by the number of weeks to get a per-week average
average_listening_by_day_hour = listening_by_day_hour.div(num_weeks)

average_listening_by_day_hour


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the weekday-by-hour averages: one row per weekday, one column
# per hour
plt.figure(figsize=(14, 7))
heatmap_ax = sns.heatmap(
    average_listening_by_day_hour,
    cmap="YlGnBu",
    linewidths=.5,
)
heatmap_ax.set_title('Average Hourly Listening Activity by Day of the Week')
heatmap_ax.set_xlabel('Hour of the Day')
heatmap_ax.set_ylabel('Day of the Week')
plt.tight_layout()
plt.show()


## Visualizations

In [None]:

# 1. Top 10 artists by total msPlayed, drawn as a bar chart
top_artists = artist_time_played.head(10)
plt.figure(figsize=(12, 6))
artists_ax = top_artists.plot.bar(color='skyblue')
artists_ax.set_title('Top 10 Most Listened-to Artists')
artists_ax.set_ylabel('Total Time Played (ms)')
artists_ax.set_xlabel('Artist Name')
# Slant the artist names so long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Top 10 (artist, track) pairs by number of history rows, as a bar chart
top_tracks = most_played_tracks.head(10)
plt.figure(figsize=(12, 6))
tracks_ax = top_tracks.plot.bar(color='coral')
tracks_ax.set_title('Top 10 Most Played Tracks')
tracks_ax.set_ylabel('Number of Plays')
tracks_ax.set_xlabel('Artist - Track Name')
# Slant the tick labels so long names do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 3. Share of msPlayed among the top 10 artists, drawn as a pie chart
top_artist_share = artist_time_played.head(10)
plt.figure(figsize=(10, 10))
pie_ax = top_artist_share.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    colors=plt.cm.Paired.colors,
)
pie_ax.set_title('Total Listening Time by Top 10 Artists')
# Blank out the y-axis label on the pie
pie_ax.set_ylabel('')
plt.show()

# 4. Total msPlayed per calendar date, drawn as a line chart
plt.figure(figsize=(14, 6))
dates_ax = listening_by_date.plot.line(color='green')
dates_ax.set_title('Distribution of Listening by Date')
dates_ax.set_ylabel('Total Time Played (ms)')
dates_ax.set_xlabel('Date')
plt.tight_layout()
plt.show()
