<a href="https://colab.research.google.com/github/Helemz-data/Spotify-Data-Analysis/blob/main/Copy_of_Spotify_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import pandas and Matplotlib

In [None]:
import ast

import pandas as pd
import matplotlib.pyplot as plt

Load and Clean data

In [None]:
# Read the Spotify CSV export; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="spotify_data clean.csv")

Clean column names

In [None]:
# Remove leading/trailing whitespace from every column header so later
# lookups such as df["artist_genres"] match exactly.
df.columns = [column_name.strip() for column_name in df.columns]

Convert date

In [None]:
# Parse the release date text into datetimes. With errors="coerce", values
# that cannot be parsed become NaT instead of raising; those rows are removed
# by the dropna step that follows.
# NOTE(review): if the column mixes date precisions (e.g. "1999" next to
# "1999-05-01"), check how many rows end up as NaT here.
df["album_release_date"] = df["album_release_date"].pipe(
    pd.to_datetime, errors="coerce"
)

Drop missing values in critical columns

In [None]:
# Keep only rows that have a value in every column the analysis relies on:
# genres, a parsed release date and a popularity score.
df = df.dropna(
    axis=0,
    how="any",
    subset=["artist_genres", "album_release_date", "track_popularity"],
)

Convert list-like strings into actual lists

In [None]:
def _parse_genre_list(raw):
    """Turn one list-like genre string into a list of genre names.

    The text is first parsed as a Python literal, so quoted entries keep any
    apostrophe they contain (e.g. "children's music") and do not pick up
    stray double quotes. Text that is not a valid literal falls back to the
    plain strip/replace/split approach.

    Parameters
    ----------
    raw : object
        One cell of the ``artist_genres`` column.

    Returns
    -------
    list of str
        The genre names; an empty list for non-string input or for "[]".
    """
    if not isinstance(raw, str):
        # An empty list explodes to NaN, which the empty-genre filter removes.
        return []
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. unquoted "pop, rock"): split it by hand.
        return raw.strip("[]").replace("'", "").split(", ")
    if isinstance(parsed, (list, tuple, set)):
        return [str(genre) for genre in parsed]
    # A single scalar literal such as 'pop' becomes a one-element list.
    return [str(parsed)]


# Replace each list-like string with a real Python list so that the column
# can be exploded into one row per genre.
df["artist_genres"] = df["artist_genres"].map(_parse_genre_list)

Explode so each genre is its own row

In [None]:
# One row per (track, genre) pair: expand the genre lists, then give the
# column a singular name because each cell now holds a single genre.
df_exploded = (
    df
    .explode("artist_genres")
    .rename(columns={"artist_genres": "genre"})
)

Filter out empty genres

In [None]:
# Discard rows whose genre is missing or an empty string; both can result
# from tracks that had an empty genre list.
df_exploded = df_exploded.loc[
    ~(df_exploded["genre"].isna() | df_exploded["genre"].eq(""))
]

Add year & month columns for trend analysis

In [None]:
# Derive the calendar year and month of each release; these are the grouping
# keys for the yearly and monthly trend tables.
df_exploded = df_exploded.assign(
    year=lambda frame: frame["album_release_date"].dt.year,
    month=lambda frame: frame["album_release_date"].dt.month,
)

GENRE POPULARITY OVER TIME

In [None]:
# Mean track popularity for every (year, genre) combination, returned as a
# flat table with columns year, genre and track_popularity.
genre_trend = df_exploded.groupby(["year", "genre"], as_index=False)[
    "track_popularity"
].mean()

Example: plot the top 5 genres with the most data

In [None]:
# value_counts() sorts by frequency, so the first five index labels are the
# five genres with the most rows.
top_genres = df_exploded["genre"].value_counts().index[:5]

In [None]:
# Yearly average popularity for the five most frequent genres.
# The per-line labels only become visible once a legend is drawn, and the
# figure needs a title and axis labels to be readable on its own.
plt.figure(figsize=(12, 6))
for g in top_genres:
    subset = genre_trend[genre_trend["genre"] == g]
    plt.plot(subset["year"], subset["track_popularity"], label=g)

plt.title("Genre Popularity Over Time (Top 5 Genres)")
plt.xlabel("Year")
plt.ylabel("Average Track Popularity")
plt.legend(title="Genre")
plt.show()

In [None]:
# Number of rows per genre, most frequent first.
genre_counts = df_exploded["genre"].value_counts(sort=True, ascending=False)

In [None]:
# Genres with fewer rows than this threshold are left out of the filtered
# trend plot.
MIN_TRACKS = 50
valid_genres = genre_counts.loc[genre_counts.ge(MIN_TRACKS)].index

In [None]:
# Yearly popularity trend for the five most frequent genres that meet the
# MIN_TRACKS threshold. The whole plot is built in a single cell so that the
# lines, title, ticks and legend all belong to the same figure.
top_genres = valid_genres[:5]

fig, ax = plt.subplots(figsize=(12, 6))
plotted_years = set()  # every year that appears in at least one plotted line

for g in top_genres:
    # Sort by year so the line is drawn left to right.
    subset = genre_trend[genre_trend["genre"] == g].sort_values("year")
    if subset.empty:
        continue
    ax.plot(subset["year"], subset["track_popularity"], marker="o", label=g)
    plotted_years.update(subset["year"].tolist())

ax.set_title("Genre Popularity Over Time (Top 5 Genres with Enough Data)")
ax.set_xlabel("Year")
ax.set_ylabel("Average Track Popularity")
# Tick every year present in any plotted genre, not just the last one drawn.
ax.set_xticks(sorted(plotted_years))
ax.grid(alpha=0.3)
if plotted_years:
    # Only draw a legend when at least one line was plotted.
    ax.legend(title="Genre")
fig.tight_layout()
plt.show()

In [None]:
# Line chart of yearly average popularity, one line per genre in top_genres.
fig, ax = plt.subplots(figsize=(12, 6))
for genre_name in top_genres:
    yearly = genre_trend.loc[genre_trend["genre"].eq(genre_name)]
    ax.plot(yearly["year"], yearly["track_popularity"], label=genre_name)

ax.set_title("Genre Popularity Over Time (Top 5 Genres)")
ax.set_xlabel("Year")
ax.set_ylabel("Average Track Popularity")
ax.legend()
plt.show()

SEASONAL (MONTHLY) TRENDS

In [None]:
# Mean track popularity for every (release month, genre) combination,
# flattened to columns month, genre and track_popularity.
monthly_genre = (
    df_exploded
    .groupby(["month", "genre"])
    .agg(track_popularity=("track_popularity", "mean"))
    .reset_index()
)

In [None]:
# Line chart of average popularity by release month, one line per genre in
# top_genres.
fig, ax = plt.subplots(figsize=(12, 6))
for genre_name in top_genres:
    by_month = monthly_genre.loc[monthly_genre["genre"].eq(genre_name)]
    ax.plot(by_month["month"], by_month["track_popularity"], label=genre_name)

ax.set_title("Seasonal Genre Performance (Monthly Avg Popularity)")
ax.set_xlabel("Month")
ax.set_ylabel("Popularity")
ax.legend()
plt.show()

CORRELATION BETWEEN GENRE & POPULARITY

In [None]:
# Average track popularity per genre, highest first.
# NOTE(review): no minimum row count is applied here, so a genre with very
# few tracks can rank at the top.
genre_popularity = (
    df_exploded
    .groupby("genre")["track_popularity"]
    .agg("mean")
    .sort_values(ascending=False)
)

In [None]:
# Show a heading followed by the ten genres with the highest average
# popularity.
print("Top Genres by Average Popularity:", genre_popularity.head(10), sep="\n")

In [None]:
# Bar chart of the twenty genres with the highest average popularity.
fig, ax = plt.subplots(figsize=(10, 8))
genre_popularity.head(20).plot.bar(ax=ax)
ax.set_title("Top 20 Genres by Average Popularity")
ax.set_xlabel("Genre")
ax.set_ylabel("Avg Popularity")
fig.tight_layout()  # keep the rotated genre labels inside the figure
plt.show()

In [None]:
# Bar chart of the ten genres with the highest average popularity.
# The number of bars now matches the "Top 10" title.
ax = genre_popularity.head(10).plot(kind="bar", figsize=(10, 6))
ax.set_title("Top 10 Genres by Average Popularity")
ax.set_xlabel("Genre")
ax.set_ylabel("Track Popularity")
plt.tight_layout()  # keep the rotated genre labels inside the figure
plt.show()