# 1. **Introduction**

The aim of this project is predict whether a Spotify user will churn (cancel subscription) or remain active.


# 2. **Configuration**

Set libraries and download data

In [8]:
import kagglehub
import os
import polars as pl

In [35]:
# Download data
path = kagglehub.dataset_download("nabihazahid/spotify-dataset-for-churn-analysis")

df_raw = pl.read_csv(f"{path}/spotify_churn_dataset.csv")
df_raw.head()

user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
i64,str,i64,str,str,i64,i64,f64,str,i64,i64,i64
1,"""Female""",54,"""CA""","""Free""",26,23,0.2,"""Desktop""",31,0,1
2,"""Other""",33,"""DE""","""Family""",141,62,0.34,"""Web""",0,1,0
3,"""Male""",38,"""AU""","""Premium""",199,38,0.04,"""Mobile""",0,1,1
4,"""Female""",22,"""CA""","""Student""",36,2,0.31,"""Mobile""",0,1,0
5,"""Other""",29,"""US""","""Family""",250,57,0.36,"""Mobile""",0,1,1


In [41]:
# Set the correct type for each column
df_raw = df_raw.with_columns(
    pl.col("user_id").cast(pl.String),
    pl.col("gender","country", "subscription_type","device_type").cast(pl.Categorical),
    pl.col("offline_listening", "is_churned").cast(pl.Boolean),
)

quantitative_cols = [c for c, dt in zip(df_raw.columns, df_raw.dtypes) if dt == pl.Int64]
string_cols = [c for c, dt in zip(df_raw.columns, df_raw.dtypes) if dt == pl.String]
categorical_cols = [c for c, dt in zip(df_raw.columns, df_raw.dtypes) if dt == pl.Categorical]
boolean_cols = [c for c, dt in zip(df_raw.columns, df_raw.dtypes) if dt == pl.Boolean]



# 3. **Exploratory Data Analysis (EDA)**

## A. **General**

All rows have information in all columns. 2071 users (of 8000) are target as churned.

In [14]:
# Check nulls
df_raw.null_count()

user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target variable distribution
df_raw["is_churned"].value_counts()

is_churned,count
i64,u32
1,2071
0,5929


## B. Numerical variables

In [None]:
df_raw.select(quantitative_cols).describe()

statistic,age,listening_time,songs_played_per_day,ads_listened_per_week
str,f64,f64,f64,f64
"""count""",8000.0,8000.0,8000.0,8000.0
"""null_count""",0.0,0.0,0.0,0.0
"""mean""",37.662125,154.06825,50.12725,6.943875
"""std""",12.740359,84.015596,28.449762,13.617953
"""min""",16.0,10.0,1.0,0.0
"""25%""",26.0,81.0,25.0,0.0
"""50%""",38.0,154.0,50.0,0.0
"""75%""",49.0,227.0,75.0,5.0
"""max""",59.0,299.0,99.0,49.0


In [46]:
df_raw.group_by("is_churned").agg(
    [
        *[pl.col(c).mean().round(3).alias(f"{c}_mean") for c in quantitative_cols],
        *[pl.col(c).median().round(3).alias(f"{c}_median") for c in quantitative_cols],
    ]
)

is_churned,age_mean,listening_time_mean,songs_played_per_day_mean,ads_listened_per_week_mean,age_median,listening_time_median,songs_played_per_day_median,ads_listened_per_week_median
bool,f64,f64,f64,f64,f64,f64,f64,f64
False,37.632,154.447,49.971,6.962,38.0,155.0,50.0,0.0
True,37.748,152.985,50.576,6.891,38.0,153.0,52.0,0.0


In [49]:
import matplotlib.pyplot as plt

# Ejemplo con minutes_played
df_raw.select("age").to_pandas().boxplot()
plt.title("Outliers in minutes_played")
plt.show()

ModuleNotFoundError: pa.Table requires 'pyarrow' module to be installed