In [1]:
# Install the project requirements for this notebook.
# The Jupyter %pip magic (rather than !pip) installs into the active kernel's environment.
# The requirements path is relative to the kernel's working directory, so this cell
# expects requirements.txt to sit one level above the directory the notebook runs from.
# After pip changes packages, a kernel restart may be needed before they can be used.
%pip install -q -r ../requirements.txt


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: 'c:\\Python313\\Lib\\site-packages\\websocket\\tests\\test_websocket.py'
Consider using the `--user` option or check the permissions.



# Spotify Churn Dataset — EDA and Assignment Summary

This notebook inspects `spotify_churn_dataset.csv` and its metadata to verify assignment criteria and produce basic exploratory analysis and visuals you can reuse in your PDF presentation.


In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter

%matplotlib inline

# Locate the folder holding the data files: when the kernel's working directory is
# named 'notebooks' the files are looked up one level above it, otherwise in the
# working directory itself.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    DATA_DIR = _cwd.parent
else:
    DATA_DIR = _cwd
CSV_PATH = DATA_DIR.joinpath('spotify_churn_dataset.csv')
META_PATH = DATA_DIR.joinpath('spotify-dataset-for-churn-analysis-metadata.json')

# Report whether both input files are present (plus the CSV size in bytes) before loading.
csv_found = CSV_PATH.exists()
csv_size = CSV_PATH.stat().st_size if csv_found else 'NA'
print('CSV exists:', csv_found, 'size:', csv_size)
print('Metadata exists:', META_PATH.exists())

# Load the dataset; the cell's displayed result is a preview of the first rows.
df = pd.read_csv(CSV_PATH)
print('Rows, Columns:', df.shape)
df.head()


In [None]:
# Structure overview: column names followed by the dtype pandas assigned to each column
print('Columns:', [*df.columns])
print('\nDtypes:', df.dtypes, sep='\n')

# Class balance of the target; dropna=False keeps missing target values in the tally
target_counts = df['is_churned'].value_counts(dropna=False)
print('\nTarget value counts:', target_counts, sep='\n')

# Missing-value count per column, largest first; only columns with at least one
# missing value are displayed as the cell result
missing = df.isna().sum().sort_values(ascending=False)
missing.loc[missing.gt(0)]


In [None]:
# Load the dataset's metadata JSON and condense it into a small summary dict
with META_PATH.open('r', encoding='utf-8') as meta_file:
    meta = json.load(meta_file)

# Nested entries may be absent or null; fall back to an empty dict so the
# .get() lookups below yield None instead of raising.
creator_info = meta.get('creator') or {}
catalog_info = meta.get('includedInDataCatalog') or {}
license_info = meta.get('license') or {}

meta_summary = {
    'name': meta.get('name'),
    'creator': creator_info.get('name'),
    'catalog': catalog_info.get('name'),
    'url': meta.get('url'),
    # (license name, license url) pair
    'license': (license_info.get('name'), license_info.get('url')),
    'datePublished': meta.get('datePublished'),
    'dateModified': meta.get('dateModified'),
    # Filled from the metadata's 'alternateName' field
    'synthetic': meta.get('alternateName'),
}
meta_summary


In [None]:
# Descriptive statistics for the numeric columns, transposed so that each
# row is a column of the dataset and each column is a statistic
numeric_df = df.select_dtypes(include=[np.number])
num_cols = numeric_df.columns
summary = numeric_df.describe().transpose()
summary


In [None]:
# Overview figure: a 2x2 grid relating the churn target to usage and plan type
sns.set_theme(style='whitegrid')

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
(ax_target, ax_listening), (ax_songs, ax_subscription) = axes

# Top-left: how many rows fall into each target class
sns.countplot(data=df, x='is_churned', ax=ax_target)
ax_target.set_title('Target distribution (is_churned)')

# Top-right and bottom-left: distribution of a usage column per target class
for column, panel, title in (
    ('listening_time', ax_listening, 'Listening time by churn'),
    ('songs_played_per_day', ax_songs, 'Songs per day by churn'),
):
    sns.boxplot(data=df, x='is_churned', y=column, ax=panel)
    panel.set_title(title)

# Bottom-right: mean of is_churned within each subscription type
sns.barplot(data=df, x='subscription_type', y='is_churned', estimator=np.mean, ax=ax_subscription)
ax_subscription.set_title('Churn rate by subscription type')
ax_subscription.set_ylabel('Mean churn rate')
plt.tight_layout();


In [None]:
# Churn by subscription type: counts (left) and percentage share (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_counts, ax_share = axes

# Counts per subscription_type split by churn status
sns.countplot(data=df, x='subscription_type', hue='is_churned', ax=ax_counts)
ax_counts.set_title('Counts by subscription type and churn status')
ax_counts.set_xlabel('subscription_type')
ax_counts.set_ylabel('count')
# Relabel legend for readability; the map accepts both string and integer labels,
# and any label it does not know is left unchanged.
handles, labels = ax_counts.get_legend_handles_labels()
label_map = {'0': 'Active', '1': 'Churned', 0: 'Active', 1: 'Churned'}
ax_counts.legend(
    handles=handles,
    labels=[label_map.get(label, label) for label in labels],
    title='Status',
)

# Percentage (share) per subscription_type (stacked) using crosstab;
# normalize='index' makes every subscription type's row sum to 1.
share = pd.crosstab(df['subscription_type'], df['is_churned'], normalize='index')
# Fail loudly on an unexpected target coding rather than plotting empty bars.
assert set(share.columns) <= {0, 1}, 'is_churned is expected to be coded as 0/1'
# Guarantee both status columns exist, in a fixed order, even when one class
# is absent from the data (a missing class gets a 0 share instead of a KeyError).
share = (
    share
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'Active', 1: 'Churned'})
    .sort_index()
)

share.plot(kind='bar', stacked=True, ax=ax_share, color=['#4dac26', '#d01c8b'])
ax_share.set_title('Churn share per subscription type')
ax_share.set_xlabel('subscription_type')
ax_share.set_ylabel('share')
# Shares are fractions in [0, 1]; render the axis ticks as percentages.
ax_share.yaxis.set_major_formatter(PercentFormatter(1.0))
ax_share.legend(title='Status', loc='upper right')

plt.tight_layout();


## Notes for the presentation
- Source: Kaggle — "Spotify Analysis Dataset 2025" (Apache 2.0)
- Data type: synthetic, tabular; mixed numeric + categorical
- Rows represent users; target column: `is_churned` (0/1)
- Brief findings: include the target balance, key descriptive statistics, and 2–4 of the visuals above.
