# Analisi dati Spotify

## import delle librerie

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## caricamento del dataset

In [34]:
# Caricare il dataset
df = pd.read_csv("spotify_2015_2025.csv")
print(f"Dataset caricato: {df.shape[0]} righe, {df.shape[1]} colonne.")
print("Colonne presenti:", list(df.columns))

Dataset caricato: 1500 righe, 12 colonne.
Colonne presenti: ['track_name', 'artist_name', 'release_date', 'popularity', 'danceability', 'energy', 'loudness', 'tempo', 'duration_ms', 'instrumentalness', 'key', 'mode']


In [35]:
# Panoramica dei dati
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_name        1500 non-null   object 
 1   artist_name       1500 non-null   object 
 2   release_date      1500 non-null   object 
 3   popularity        1489 non-null   float64
 4   danceability      1500 non-null   float64
 5   energy            1500 non-null   float64
 6   loudness          1500 non-null   float64
 7   tempo             1500 non-null   float64
 8   duration_ms       1500 non-null   int64  
 9   instrumentalness  1500 non-null   float64
 10  key               1500 non-null   int64  
 11  mode              1500 non-null   int64  
dtypes: float64(6), int64(3), object(3)
memory usage: 140.8+ KB


Unnamed: 0,popularity,danceability,energy,loudness,tempo,duration_ms,instrumentalness,key,mode
count,1489.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,61.010052,0.596589,0.696302,-7.041317,120.115368,225099.425333,0.050132,5.347333,0.510667
std,16.265563,0.173084,0.171969,4.08129,32.495307,44427.866884,0.028963,3.416793,0.500053
min,-50.0,0.300007,0.400018,-16.530111,23.369509,150005.0,0.000109,0.0,0.0
25%,50.88267,0.447987,0.549153,-9.051546,98.097301,186819.5,0.02535,2.0,0.0
50%,60.732901,0.593705,0.69041,-7.094712,119.523207,225891.0,0.050268,5.0,1.0
75%,70.624558,0.745446,0.843266,-5.057517,141.463974,265073.25,0.075293,8.0,1.0
max,200.0,0.899735,0.999276,100.0,500.0,299967.0,0.099964,11.0,1.0


## Cleaning

### Gestione valori mancanti

In [36]:
# Percentuale di missing
missing_pct = df.isnull().sum() / len(df) * 100
cols_to_drop = missing_pct[missing_pct > 50].index.tolist()

if cols_to_drop:
    print(f"Rimozione colonne con >50% missing: {cols_to_drop}")
    df.drop(columns=cols_to_drop, inplace=True)

# Mostra colonne con almeno un missing
missing_per_col = df.isnull().sum()
missing_per_col = missing_per_col[missing_per_col > 0]
print(missing_per_col)


popularity    11
dtype: int64


### Imputazione valori mancanti

In [37]:
# Numeri → media
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)

# Oggetti → moda
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
        df[col].fillna(mode_val, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


### Rimozione duplicati

In [38]:
df.drop_duplicates(subset=['track_name', 'artist_name'], keep='first', inplace=True)


### gestione degli outlier

In [39]:
col_num = df.select_dtypes(include=[np.number]).columns
for col in col_num:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    df[col] = df[col].clip(lower, upper)


### validazione range numerici

In [40]:
for f in ['danceability', 'energy', 'instrumentalness']:
    if f in df.columns:
        if df[f].max() > 1:
            df[f] = df[f] / 100
        df[f] = df[f].clip(0, 1).astype('float32')

if 'popularity' in df.columns:
    df['popularity'] = df['popularity'].clip(0, 100).astype('float32')
if 'tempo' in df.columns:
    df['tempo'] = df['tempo'].clip(50, 250).astype('float32')
if 'loudness' in df.columns:
    df['loudness'] = df['loudness'].clip(-60, 5).astype('float32')


### standardizzazione stringhe e gestione categorie

In [41]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.strip().replace(['', 'N/A', 'null', 'nan'], np.nan)
    df[col] = df[col].str.title()
    df[col].fillna('Unknown', inplace=True)
    if df[col].nunique() / len(df[col]) < 0.5:
        df[col] = df[col].astype('category')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

In [42]:
# Gestione release_date e durata
# release_date
if 'release_date' in df.columns:
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
    median_year = df['release_date'].dt.year.median()
    df['release_year'] = df['release_date'].dt.year.fillna(median_year).astype('uint16')
    df['release_month'] = df['release_date'].dt.month.fillna(1).astype('uint8')

# durata ms → secondi
if 'duration_ms' in df.columns:
    df['duration_s'] = (df['duration_ms'] / 1000).astype('float32')
    df.drop('duration_ms', axis=1, inplace=True)


In [43]:
# coerenze tipi di dato 
# Interi
for col in df.select_dtypes(include=['int64', 'int32']).columns:
    col_min, col_max = df[col].min(), df[col].max()
    if col_min >= 0:
        if col_max <= 255:
            df[col] = df[col].astype('uint8')
        elif col_max <= 65535:
            df[col] = df[col].astype('uint16')
        else:
            df[col] = df[col].astype('uint32')
    else:
        if col_min >= -128 and col_max <= 127:
            df[col] = df[col].astype('int8')
        elif col_min >= -32768 and col_max <= 32767:
            df[col] = df[col].astype('int16')
        else:
            df[col] = df[col].astype('int32')

# Float
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype('float32')

# Conversione esplicita di 'key' e 'mode' a category
df['key'] = df['key'].astype('category')
df['mode'] = df['mode'].astype('category')


## Salvataggio dataset pulito

In [44]:
out_file = 'spotify_clean.csv'
df.to_csv(out_file, index=False)


In [45]:
# report statistico 
print("\n--- TIPI DATI ---")
print(df.dtypes)
print(f"Righe finali: {df.shape[0]}")

print("\n--- STATISTICHE DESCRITTIVE ---")
print(df[['popularity', 'danceability', 'energy', 'loudness', 'tempo', 'duration_s']].describe().T)



--- TIPI DATI ---
track_name            object
artist_name           object
release_date        category
popularity           float32
danceability         float32
energy               float32
loudness             float32
tempo                float32
instrumentalness     float32
key                 category
mode                category
release_year          uint16
release_month          uint8
duration_s           float32
dtype: object
Righe finali: 1500

--- STATISTICHE DESCRITTIVE ---
               count        mean        std         min         25%  \
popularity    1500.0   60.938885  14.903061   21.458715   50.954057   
danceability  1500.0    0.596589   0.173084    0.300007    0.447987   
energy        1500.0    0.696302   0.171969    0.400018    0.549153   
loudness      1500.0   -7.108552   2.982905  -15.042589   -9.051546   
tempo         1500.0  119.995865  30.756939   50.000000   98.097303   
duration_s    1500.0  225.099442  44.427868  150.005005  186.819504   

           