# Analisi dati Spotify

## import delle librerie

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## caricamento del dataset

In [62]:
# Caricare il dataset
df = pd.read_csv("spotify_2015_2025.csv")
print(f"Dataset caricato: {df.shape[0]} righe, {df.shape[1]} colonne.")
print("Colonne presenti:", list(df.columns))

Dataset caricato: 85000 righe, 19 colonne.
Colonne presenti: ['track_id', 'track_name', 'artist_name', 'album_name', 'release_date', 'genre', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'instrumentalness', 'tempo', 'stream_count', 'country', 'explicit', 'label']


In [63]:
# Panoramica dei dati
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85000 entries, 0 to 84999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          85000 non-null  object 
 1   track_name        84979 non-null  object 
 2   artist_name       85000 non-null  object 
 3   album_name        84954 non-null  object 
 4   release_date      85000 non-null  object 
 5   genre             85000 non-null  object 
 6   duration_ms       85000 non-null  int64  
 7   popularity        85000 non-null  int64  
 8   danceability      85000 non-null  float64
 9   energy            85000 non-null  float64
 10  key               85000 non-null  int64  
 11  loudness          85000 non-null  float64
 12  mode              85000 non-null  int64  
 13  instrumentalness  85000 non-null  float64
 14  tempo             85000 non-null  float64
 15  stream_count      85000 non-null  int64  
 16  country           85000 non-null  object

Unnamed: 0,duration_ms,popularity,danceability,energy,key,loudness,mode,instrumentalness,tempo,stream_count,explicit
count,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0
mean,254913.824294,48.162894,0.52071,0.505412,5.515988,-27.9993,0.500141,0.399781,129.948351,214354.7,0.201329
std,95300.233207,14.829752,0.270799,0.279774,3.454069,15.598365,0.500003,0.23147,40.444321,1680637.0,0.400996
min,90004.0,0.0,0.05,0.02,0.0,-55.0,0.0,0.0,60.0,1000.0,0.0
25%,171871.5,38.0,0.29,0.26,2.75,-41.47,0.0,0.198,94.84,1000.0,0.0
50%,254920.5,47.0,0.52,0.5,6.0,-27.97,1.0,0.399,129.99,2000.0,0.0
75%,337203.25,57.0,0.76,0.75,9.0,-14.5,1.0,0.6,165.03,9000.0,0.0
max,420000.0,100.0,0.99,0.99,11.0,-1.0,1.0,0.8,200.0,20000000.0,1.0


## Cleaning

### Gestione valori mancanti

In [64]:
# Percentuale di missing
missing_pct = df.isnull().sum() / len(df) * 100
cols_to_drop = missing_pct[missing_pct > 50].index.tolist()

if cols_to_drop:
    print(f"Rimozione colonne con >50% missing: {cols_to_drop}")
    df.drop(columns=cols_to_drop, inplace=True)

# Mostra colonne con almeno un missing
missing_per_col = df.isnull().sum()
missing_per_col = missing_per_col[missing_per_col > 0]
print(missing_per_col)


track_name    21
album_name    46
dtype: int64


### Imputazione valori mancanti

In [65]:
# Numeri → media
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)

# Oggetti → moda
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
        df[col].fillna(mode_val, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_val, inplace=True)


### Rimozione duplicati

In [66]:
df.drop_duplicates(subset=['track_name', 'artist_name'], keep='first', inplace=True)


### gestione degli outlier

In [67]:
col_num = df.select_dtypes(include=[np.number]).columns
for col in col_num:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    df[col] = df[col].clip(lower, upper)


### validazione range numerici

In [68]:
for f in ['danceability', 'energy', 'instrumentalness']:
    if f in df.columns:
        if df[f].max() > 1:
            df[f] = df[f] / 100
        df[f] = df[f].clip(0, 1).astype('float32')

if 'popularity' in df.columns:
    df['popularity'] = df['popularity'].clip(0, 100).astype('float32')
if 'tempo' in df.columns:
    df['tempo'] = df['tempo'].clip(50, 250).astype('float32')
if 'loudness' in df.columns:
    df['loudness'] = df['loudness'].clip(-60, 5).astype('float32')


### standardizzazione stringhe e gestione categorie

In [69]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.strip().replace(['', 'N/A', 'null', 'nan'], np.nan)
    df[col] = df[col].str.title()
    df[col].fillna('Unknown', inplace=True)
    if df[col].nunique() / len(df[col]) < 0.5:
        df[col] = df[col].astype('category')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

In [70]:
# Gestione release_date e durata
# release_date
if 'release_date' in df.columns:
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
    median_year = df['release_date'].dt.year.median()
    df['release_year'] = df['release_date'].dt.year.fillna(median_year).astype('uint16')
    df['release_month'] = df['release_date'].dt.month.fillna(1).astype('uint8')

# durata ms → secondi
if 'duration_ms' in df.columns:
    df['duration_s'] = (df['duration_ms'] / 1000).astype('float32')
    df.drop('duration_ms', axis=1, inplace=True)


In [71]:
# coerenze tipi di dato 
# Interi
for col in df.select_dtypes(include=['int64', 'int32']).columns:
    col_min, col_max = df[col].min(), df[col].max()
    if col_min >= 0:
        if col_max <= 255:
            df[col] = df[col].astype('uint8')
        elif col_max <= 65535:
            df[col] = df[col].astype('uint16')
        else:
            df[col] = df[col].astype('uint32')
    else:
        if col_min >= -128 and col_max <= 127:
            df[col] = df[col].astype('int8')
        elif col_min >= -32768 and col_max <= 32767:
            df[col] = df[col].astype('int16')
        else:
            df[col] = df[col].astype('int32')

# Float
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype('float32')

# Conversione esplicita di 'key' e 'mode' a category
df['key'] = df['key'].astype('category')
df['mode'] = df['mode'].astype('category')


## Salvataggio dataset pulito

In [72]:
out_file = 'spotify_clean.csv'
df.to_csv(out_file, index=False)


In [73]:
# report statistico 
print("\n--- TIPI DATI ---")
print(df.dtypes)
print(f"Righe finali: {df.shape[0]}")

print("\n--- STATISTICHE DESCRITTIVE ---")
print(df[['popularity', 'danceability', 'energy', 'loudness', 'tempo', 'duration_s']].describe().T)



--- TIPI DATI ---
track_id                    object
track_name                  object
artist_name                 object
album_name                  object
release_date        datetime64[ns]
genre                     category
popularity                 float32
danceability               float32
energy                     float32
key                       category
loudness                   float32
mode                      category
instrumentalness           float32
tempo                      float32
stream_count                uint16
country                   category
explicit                     uint8
label                     category
release_year                uint16
release_month                uint8
duration_s                 float32
dtype: object
Righe finali: 84997

--- STATISTICHE DESCRITTIVE ---
                count        mean        std        min         25%  \
popularity    84997.0   48.005966  14.359772   9.500000   38.000000   
danceability  84997.0    0.520710   0