## IMPORT


In [60]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from langdetect import detect

# Load the Italian and English SpaCy pipelines.
# On any failure both pipelines are reset to None, nlp_available is set to False
# and installation hints are printed, so later cells can skip the SpaCy-based steps.
try:
    nlp_it, nlp_en = (spacy.load(model_name) for model_name in ("it_core_news_sm", "en_core_web_sm"))
except Exception as e:
    nlp_available = False
    nlp_it = nlp_en = nlp = None
    for message in (
        f"ATTENZIONE: Errore nel caricamento dei modelli SpaCy: {e}",
        "Installa con:",
        "  python -m spacy download it_core_news_sm",
        "  python -m spacy download en_core_web_sm",
        "  pip install langdetect",
    ):
        print(message)
else:
    print("Modelli SpaCy italiano e inglese caricati con successo")
    nlp_available = True

Modelli SpaCy italiano e inglese caricati con successo


## MODELLO RICONOSCIMENTO LINGUA

In [59]:
def get_nlp_model(text):
    """Detect the language of ``text`` and return the matching SpaCy pipeline.

    Only the first 500 characters are handed to ``detect``.

    Args:
        text: Lyrics (or any text) whose language should be detected.

    Returns:
        ``nlp_it`` when the detected language code is ``'it'``, when ``text``
        is not a non-blank string, or when detection raises an exception;
        ``nlp_en`` for every other detected language.
    """
    # Nothing to detect on: fall back to the Italian default without calling detect()
    if not isinstance(text, str) or not text.strip():
        return nlp_it
    try:
        lang = detect(text[:500])
    except Exception:
        # Detection failed: keep the Italian default. Unlike a bare `except:`,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return nlp_it
    return nlp_it if lang == 'it' else nlp_en

## DATASET LOADING


In [66]:
# Folder holding the CSV inputs; the trailing slash matters because file paths
# are built by plain string concatenation.
data = "./data/"
# artists.csv is semicolon-separated, tracks.csv comma-separated;
# in both files the first column becomes the DataFrame index.
artists = pd.read_csv(data + 'artists.csv', index_col=0, sep=';')
tracks = pd.read_csv(data + 'tracks.csv', index_col=0, sep=',')

## ARTISTS


### DROP LONGITUDE, LATITUDE AND ACTIVE_END

In [47]:
# Drop the geographic coordinates and the activity end date from the artists table.
# Only the columns still present are dropped, so this cell can be re-run, or run on an
# artists.csv already rewritten by the save cell of this notebook, without a KeyError.
print(f'Numero di colonne prima del drop: {len(artists.columns)}')
print(f'Colonne: {list(artists.columns)}')

columns_to_drop = ['longitude', 'latitude', 'active_end']
columns_present = [column for column in columns_to_drop if column in artists.columns]
artists = artists.drop(columns=columns_present)

# Report what was actually removed and what is left
print(f'\nNumero di colonne dopo il drop: {len(artists.columns)}')
print(f'Colonne rimosse: {columns_present}')
print(f'Colonne rimanenti: {list(artists.columns)}')

Numero di colonne prima del drop: 13
Colonne: ['name', 'gender', 'birth_date', 'birth_place', 'nationality', 'description', 'active_start', 'active_end', 'province', 'region', 'country', 'latitude', 'longitude']

Numero di colonne dopo il drop: 10
Colonne rimosse: ['longitude', 'latitude', 'active_end']
Colonne rimanenti: ['name', 'gender', 'birth_date', 'birth_place', 'nationality', 'description', 'active_start', 'province', 'region', 'country']


## TRACKS


### Drop songs without lyrics

In [50]:
# Drop tracks whose lyrics are missing (NaN) and report how many rows were removed.

# Counts before the drop
initial_count = len(tracks)
null_lyrics_count = tracks['lyrics'].isna().sum()

tracks = tracks.dropna(subset=['lyrics'])

# Counts after the drop
final_count = len(tracks)
dropped_count = initial_count - final_count
# The rows removed must be exactly the rows that had null lyrics
assert dropped_count == null_lyrics_count, "dropna removed an unexpected number of rows"
# Guard the percentage against an empty input table
remaining_pct = (final_count / initial_count) * 100 if initial_count else 0.0
print(f'\nTracce rimosse: {dropped_count}')
print(f'Tracce rimanenti: {final_count} ({remaining_pct:.2f}% del totale originale)')



Tracce rimosse: 0
Tracce rimanenti: 11163 (100.00% del totale originale)


### FIX OUT OF RANGE YEARS OF THE TRACK


In [49]:
# Fix out-of-range years in tracks: blank out `year` when it is < YEAR_MIN or > YEAR_MAX.
col = 'year'
YEAR_MIN, YEAR_MAX = 1992, 2025
total = len(tracks)

# Parse the column once; values that cannot be read as numbers become NaN
years = pd.to_numeric(tracks[col], errors='coerce')

# Flag the out-of-range values (NaN compares False on both sides, so it is not counted here)
mask_out = (years < YEAR_MIN) | (years > YEAR_MAX)
out_count = int(mask_out.sum())
out_pct = (out_count / total) * 100 if total else 0.0  # guard against an empty table
print(f'Valori fuori range (<{YEAR_MIN} o >{YEAR_MAX}): {out_count} su {total} ({out_pct:.2f}%)')

# Blank the flagged values on the parsed series and store the result as nullable Int64,
# instead of writing pd.NA into the raw column and parsing it again
tracks[col] = years.mask(mask_out).astype('Int64')

# Statistics after the cleanup
years_after = tracks[col]
print('\nStatistiche dopo la pulizia:')
print(years_after.describe())

# Rows left without a year: out-of-range values plus values that were already missing/unparseable
final_empty = tracks[col].isna().sum()
empty_pct = (final_empty / total) * 100 if total else 0.0
print(f'\nNumero di righe con year vuoto dopo pulizia: {final_empty} su {total} ({empty_pct:.2f}%)')


Valori fuori range (<1992 o >2025): 2152 su 11163 (19.28%)

Statistiche dopo la pulizia:
count         8573.0
mean     2015.151172
std          7.04169
min           1992.0
25%           2011.0
50%           2016.0
75%           2021.0
max           2025.0
Name: year, dtype: Float64

Numero di righe con year vuoto dopo pulizia: 2590 su 11163 (23.20%)


### FIX MISSING LYRICS STATISTICS


In [62]:
# ============================================================================
# FIX n_sentences
# ============================================================================
print("\n--- FIX n_sentences ---")
# Flag the rows whose sentence count is absent
mask_missing = tracks['n_sentences'].isna()
missing_before = mask_missing.sum()
print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

# Rebuild the count from the lyrics: split on runs of '.', '!' or '?' and
# count only the chunks that still contain text once trimmed
sentence_splitter = re.compile(r'[.!?]+')
for row_id in tracks.index[mask_missing]:
    chunks = sentence_splitter.split(str(tracks.loc[row_id, 'lyrics']))
    tracks.loc[row_id, 'n_sentences'] = sum(1 for chunk in chunks if chunk.strip())

# Summary after the fix
missing_after = tracks['n_sentences'].isna().sum()
fixed = missing_before - missing_after
print(f"Valori fixati: {fixed}")
print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")


# ============================================================================
# FIX n_tokens
# ============================================================================
print("\n--- FIX n_tokens ---")
# Flag the rows whose token count is absent
mask_missing = tracks['n_tokens'].isna()
missing_before = mask_missing.sum()
print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

# Rebuild the count from the lyrics: a token is a run of word characters
# in the lower-cased text
word_pattern = re.compile(r'\b\w+\b')
for row_id in tracks.index[mask_missing]:
    lowered = str(tracks.loc[row_id, 'lyrics']).lower()
    tracks.loc[row_id, 'n_tokens'] = len(word_pattern.findall(lowered))

# Summary after the fix
missing_after = tracks['n_tokens'].isna().sum()
fixed = missing_before - missing_after
print(f"Valori fixati: {fixed}")
print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")


# ============================================================================
# FIX tokens_per_sent
# ============================================================================
print("\n--- FIX tokens_per_sent ---")
# Flag the rows whose tokens-per-sentence ratio is absent
mask_missing = tracks['tokens_per_sent'].isna()
missing_before = mask_missing.sum()
print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

# Derive the ratio from the n_tokens and n_sentences columns of the same row
for row_id in tracks.index[mask_missing]:
    token_count = tracks.loc[row_id, 'n_tokens']
    sentence_count = tracks.loc[row_id, 'n_sentences']
    # Leave the value missing when either count is absent or there is nothing to divide by
    if pd.isna(token_count) or pd.isna(sentence_count) or not (sentence_count > 0):
        continue
    tracks.loc[row_id, 'tokens_per_sent'] = token_count / sentence_count

# Summary after the fix
missing_after = tracks['tokens_per_sent'].isna().sum()
fixed = missing_before - missing_after
print(f"Valori fixati: {fixed}")
print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")


# ============================================================================
# FIX char_per_tok
# ============================================================================
print("\n--- FIX char_per_tok ---")
# Flag the rows whose characters-per-token average is absent
mask_missing = tracks['char_per_tok'].isna()
missing_before = mask_missing.sum()
print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

# Rebuild the average from the lyrics: mean length of the word tokens
# found in the lower-cased text
for row_id in tracks.index[mask_missing]:
    words = re.findall(r'\b\w+\b', str(tracks.loc[row_id, 'lyrics']).lower())
    if not words:
        # No token at all: the value stays missing
        continue
    tracks.loc[row_id, 'char_per_tok'] = sum(map(len, words)) / len(words)

# Summary after the fix
missing_after = tracks['char_per_tok'].isna().sum()
fixed = missing_before - missing_after
print(f"Valori fixati: {fixed}")
print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")


# ============================================================================
# FIX lexical_density
# ============================================================================
print("\n--- FIX lexical_density ---")
# Flag the rows whose lexical density is absent
mask_missing = tracks['lexical_density'].isna()
missing_before = mask_missing.sum()
print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

# Rebuild the density from the lyrics: distinct word tokens divided by
# total word tokens of the lower-cased text
for row_id in tracks.index[mask_missing]:
    words = re.findall(r'\b\w+\b', str(tracks.loc[row_id, 'lyrics']).lower())
    if words:  # with no token the value stays missing
        tracks.loc[row_id, 'lexical_density'] = len(set(words)) / len(words)

# Summary after the fix
missing_after = tracks['lexical_density'].isna().sum()
fixed = missing_before - missing_after
print(f"Valori fixati: {fixed}")
print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")


# ============================================================================
# FIX avg_token_per_clause
# ============================================================================
print("\n--- FIX avg_token_per_clause ---")

# Dependency labels under which a VERB token is counted as one extra clause
CLAUSE_DEPS = ('ccomp', 'xcomp', 'advcl', 'relcl', 'acl', 'csubj', 'csubjpass')
# Maximum number of characters of a song handed to SpaCy
# NOTE(review): presumably chosen to match SpaCy's default max_length - confirm
MAX_LYRICS_CHARS = 1000000
# How many processed songs between two progress messages
PROGRESS_EVERY = 500


def _avg_tokens_per_clause(doc):
    """Return alphabetic tokens per clause for a parsed doc, or None if no clause is counted.

    The clause count is one per sentence plus one for every VERB token whose
    dependency label is listed in CLAUSE_DEPS.
    """
    sentences = list(doc.sents)
    n_clauses = len(sentences)
    for sent in sentences:
        n_clauses += sum(1 for token in sent if token.pos_ == 'VERB' and token.dep_ in CLAUSE_DEPS)
    if n_clauses == 0:
        return None
    n_alpha = sum(1 for token in doc if token.is_alpha)
    return n_alpha / n_clauses


def _fill_avg_token_per_clause(items, model, label):
    """Parse every (index, lyrics) pair with `model` and store the metric in `tracks`.

    Processing is best-effort: a song that raises is skipped and recorded.
    Only Exception subclasses are caught, so a kernel interrupt stops the loop.

    Returns:
        List of (index, exception) pairs for the songs that could not be processed.
    """
    failures = []
    for position, (idx, text) in enumerate(items, start=1):
        try:
            value = _avg_tokens_per_clause(model(text))
            if value is not None:
                tracks.loc[idx, 'avg_token_per_clause'] = value
        except Exception as exc:
            failures.append((idx, exc))
            continue
        if position % PROGRESS_EVERY == 0:
            print(f"  Processate {position}/{len(items)} canzoni {label}...")
    return failures


if not nlp_available or nlp_it is None or nlp_en is None:
    print("ERRORE: Modelli SpaCy non disponibili. Salta il calcolo di avg_token_per_clause")
else:
    # When the column does not exist yet, every row counts as missing
    missing_before = tracks['avg_token_per_clause'].isna().sum() if 'avg_token_per_clause' in tracks.columns else len(tracks)
    print(f"Valori mancanti prima: {missing_before} ({(missing_before/len(tracks)*100):.2f}%)")

    if 'avg_token_per_clause' not in tracks.columns:
        tracks['avg_token_per_clause'] = pd.NA
        print("Colonna 'avg_token_per_clause' creata")

    mask_missing = tracks['avg_token_per_clause'].isna()
    indices_to_process = tracks[mask_missing].index.tolist()
    total_to_process = len(indices_to_process)

    print(f"Elaborazione di {total_to_process} righe con SpaCy...")

    # Split the songs by pipeline; get_nlp_model detects the language on the first
    # 500 characters and falls back to Italian when detection fails
    italian_data, english_data = [], []
    for idx in indices_to_process:
        lyrics = tracks.loc[idx, 'lyrics']
        if pd.isna(lyrics) or lyrics == '':
            continue
        text = str(lyrics)
        bucket = italian_data if get_nlp_model(text) is nlp_it else english_data
        bucket.append((idx, text[:MAX_LYRICS_CHARS]))

    # Every song not detected as Italian goes through the English pipeline
    failures = []
    for label, model, items in (('italiane', nlp_it, italian_data), ('inglesi', nlp_en, english_data)):
        print(f"Elaborazione canzoni {label}...")
        failures.extend(_fill_avg_token_per_clause(items, model, label))

    # Surface the failures that used to be swallowed silently
    if failures:
        print(f"\nRighe non elaborate per errore: {len(failures)}")
        for idx, exc in failures[:5]:
            print(f"  {idx}: {type(exc).__name__}: {exc}")

    missing_after = tracks['avg_token_per_clause'].isna().sum()
    fixed = missing_before - missing_after
    print(f"\nValori fixati: {fixed}")
    print(f"Valori mancanti dopo: {missing_after} ({(missing_after/len(tracks)*100):.2f}%)")



--- FIX n_sentences ---
Valori mancanti prima: 0 (0.00%)
Valori fixati: 0
Valori mancanti dopo: 0 (0.00%)

--- FIX n_tokens ---
Valori mancanti prima: 0 (0.00%)
Valori fixati: 0
Valori mancanti dopo: 0 (0.00%)

--- FIX tokens_per_sent ---
Valori mancanti prima: 0 (0.00%)
Valori fixati: 0
Valori mancanti dopo: 0 (0.00%)

--- FIX char_per_tok ---
Valori mancanti prima: 0 (0.00%)
Valori fixati: 0
Valori mancanti dopo: 0 (0.00%)

--- FIX lexical_density ---
Valori mancanti prima: 0 (0.00%)
Valori fixati: 0
Valori mancanti dopo: 0 (0.00%)

--- FIX avg_token_per_clause ---
Valori mancanti prima: 73 (0.65%)
Elaborazione di 73 righe con SpaCy...
Elaborazione canzoni italiane...
Elaborazione canzoni inglesi...

Valori fixati: 73
Valori mancanti dopo: 0 (0.00%)


## SAVE CHANGES ON CSV


### ARTISTS


In [None]:
# Save artists dataset (semicolon-separated, like the input file).
# NOTE: the path is built the same way as the one read at load time, so the
# source artists.csv is overwritten with the cleaned table.
output_path = f'{data}artists.csv'
artists.to_csv(output_path, sep=';')
# The message used to say "tracks" (copy-paste from the tracks cell)
print(f'Dataset artists salvato in: {output_path}')


### TRACKS


In [None]:
# Write the cleaned tracks table back to the data folder as comma-separated CSV
output_path = data + 'tracks.csv'
tracks.to_csv(output_path, sep=',')
print(f'Dataset tracks salvato in: {output_path}')


Dataset tracks salvato in: ./data/tracks.csv
