# Importing Libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA


# Data Understanding and Preparation

## Reading Data

In [None]:
artists_path = 'data\\artists.csv'
tracks_path = 'data\\tracks.csv'

This code automatically detects the correct separator for two dataset files (tracks and artists) by checking which character — comma, semicolon, or tab — appears most in the first line. It then loads each file into a pandas DataFrame using the detected separator, prints their shapes, and displays the first few rows.

 The tracks dataset has 11,166 rows and 45 columns, while the artists dataset has 104 rows and 14 columns.

In [None]:
# Funzione helper per capire il separatore corretto
def detect_separator(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        sample = f.readline()
    # Conta quanti separatori compaiono
    seps = {',': sample.count(','), ';': sample.count(';'), '\t': sample.count('\t')}
    best_sep = max(seps, key=seps.get)
    print(f"Detected separator for {filepath}: '{best_sep}'")
    return best_sep

# Rileva automaticamente il separatore
sep_tracks = detect_separator(tracks_path)
sep_artists = detect_separator(artists_path)

print('------------------------------------')

# Carica i dataset in base al separatore rilevato
tracks = pd.read_csv(tracks_path, sep=sep_tracks, encoding='utf-8', engine='python')
artists = pd.read_csv(artists_path, sep=sep_artists, encoding='utf-8', engine='python')

# Mostra alcune info per verifica
print(f"Tracks shape: {tracks.shape}")
print(f"Artists shape: {artists.shape}")
print('------------------------------------')

print('TRACKS')
display(tracks.head(3))

print('------------------------------------')
print('ARTISTS')
display(artists.head(3))


In [None]:
print("Artists Features")
print(artists.columns.tolist())

print("Tracks Features")
print(tracks.columns.tolist())


## Duplicates

### Artists

The following code checks the artists dataset for duplicates in two ways: first, it looks for identical full rows to detect any completely repeated entries; then, it checks for duplicates specifically based on the artist ID and artist name columns.
<B> After performing both checks, it confirms that there are no duplicate artists in the dataset </B>.

In [None]:
# Check for duplicated artists rows
duplicates_artists = artists[artists.duplicated()]

print(f"Number of duplicated Artists rows: {duplicates_artists.shape[0]}")
display(duplicates_artists.head(5))

In [None]:
# Check for duplicated artists based on artist id
duplicates_artists_id = artists[artists.duplicated(subset='id_author')]
print(f"Number of duplicated artist based on ID: {duplicates_artists_id.shape[0]}")
display(duplicates_artists_id.head(5))



In [None]:
# Check for duplicated artists based on artist name	
duplicates_artists_name = artists[artists.duplicated(subset='name')]
print(f"Number of duplicated artist based on Name: {duplicates_artists_name.shape[0]}")
display(duplicates_artists_name.head(5))

### Tracks
Duplicates rows check has been also performed here.
No duplicated rows were detected, indicating that all track entries are unique.

In [None]:
# Check for duplicated tracks rows
duplicates_tracks = tracks[tracks.duplicated()]

print(f"Number of duplicated rows: {duplicates_tracks.shape[0]}")
display(duplicates_tracks.head(5))

#### Duplicated Tracks based on ID

This code checks the tracks dataset for duplicates based specifically on the track ID column. It identifies all rows where the same ID appears more than once, counts them, and displays them.
 It first identifies all rows where the same ID appears more than once, counts how many duplicated tracks exist, and displays them. Then, it counts how many times each track ID occurs in the dataset. 

<B> The result shows that there are 73 duplicated rows based on track IDs. 
Precisely we have 71  distinct IDs that have duplicates. </B>

<B>one track ID is repeated four times, while the others are each repeated twice </B>

In [None]:
# Check for duplicated tracks based on track id
duplicates_tracks_id = tracks[tracks.duplicated(subset='id')]
print(f"Number of duplicated Tracks rows based on ID: {duplicates_tracks_id.shape[0]}")
display(duplicates_tracks_id)


# Count how many times each id_track appears
id_counts = tracks['id'].value_counts()
duplicate_id_counts = id_counts[id_counts > 1]

print('Number of distinct IDs that have duplicates')
print(duplicate_id_counts.size)
print("Number of tracks for each id:")
print(duplicate_id_counts)


The following code lists every full_title associated with each duplicated track ID. The results show 71 duplicated IDs in total. Most of these IDs are linked to two different songs, except for one ID that is associated with four songs (two pairs sharing the same title).

In [None]:
# Find all duplicated track IDs
duplicate_ids = tracks[tracks.duplicated(subset='id', keep=False)]

# Group by 'id' and list all titles
titles_per_id = duplicate_ids.groupby('id')['full_title'].apply(list)

# Display each ID with all titles and the count of unique titles
for track_id, titles in titles_per_id.items():
    unique_count = len(set(titles))  # number of unique titles
    print(f"Track ID: {track_id} Number(of total songs: {len(titles)})(Unique titles: {unique_count})")
    for title in titles:
        print(f"  - {title}")
    print('----------------------------------------------------------')


#### Fixing Duplicated Tracks Id
After reviewing the songs associated with the duplicated IDs, we found that each duplicated ID corresponds to different songs, except for one case that will be treated later. Therefore, the most reasonable solution is to modify the duplicated IDs by appending the row number to each one. This approach ensures that all songs are preserved while maintaining unique identifiers for every track.

In [None]:
# Identify duplicated IDs
duplicate_mask = tracks.duplicated(subset='id', keep=False)

# Assign new unique IDs only to duplicated rows
tracks.loc[duplicate_mask, 'id'] = (
    tracks.loc[duplicate_mask]
    .apply(lambda x: f"{x['id']}_{x.name}", axis=1)
)


print("Example of updated duplicates:")
display(tracks[duplicate_mask][['id', 'full_title']])


#### Duplicated Tracks based on Title
The following code identifies tracks that share the same full_title, meaning duplicate song titles. We found four duplicated tracks, corresponding to two pairs of songs with identical titles.

In [None]:
# Find duplicated full_title
duplicate_titles = tracks[tracks.duplicated(subset='full_title', keep=False)]

# Sort by full_title to see them together
duplicate_titles = duplicate_titles.sort_values('full_title')

print(f"Tracks with duplicate track based on full_title: {duplicate_titles.shape[0]}")
display(duplicate_titles)


#### Fixing Duplicated Tracks full_title

The duplicated titles  — "BUGIE by Madame (Ft. Carl Brave & Rkomi)" and "sentimi by Madame" — actually refer to the same songs released in two different formats: one from the album and one from the single version. 
We decided to keep the duplicated tracks in the dataset but add a clear indication in the full_title to show whether each song comes from a single or an album. This way, all versions are preserved while making it easy to distinguish between different releases of the same song

In [None]:

# Find duplicated full_titles
duplicate_mask = tracks.duplicated(subset='full_title', keep=False)

# Update only the duplicated titles by appending album_type
tracks.loc[duplicate_mask, 'full_title'] = (
    tracks.loc[duplicate_mask, 'full_title'] + 
    " (" + tracks.loc[duplicate_mask, 'album_type'].fillna('unknown').str.capitalize() + ")"
)

# Verify the changes
duplicate_titles = tracks[tracks.duplicated(subset='full_title', keep=False)].sort_values('full_title')
display(duplicate_titles[['full_title', 'album_type', 'id']])


## Merging the Datasets


Merging the tracks and artists datasets into a single DataFrame called df. It matches rows where the <B> id_artist column in tracks</B> corresponds to the <B>id_author column in artists</B>, using a left join so that all tracks are kept even if some artists are missing. After merging, it prints the number of rows and columns in the unified dataset and shows the first three rows for inspection.

In [None]:
df = tracks.merge(artists, left_on='id_artist', right_on='id_author', how='left')

print(f"Unified dataset: {df.shape[0]} rows , {df.shape[1]} columns")
display(df.head(3))



Checking if there is a track without an artist

In [None]:
# Check for tracks without a matching artist
missing_artists = df[df['id_author'].isna()]

print(f"Number of tracks without an artist: {missing_artists.shape[0]}")
display(missing_artists.head(5))

Checking if there is an artist without a track

In [None]:
missing_artists = artists[~artists['id_author'].isin(tracks['id_artist'])]
print("Number of artists without any tracks:", len(missing_artists))

## Correlation Analysis

This section explores the linear relationships between numerical features in the datasets.
The **Pearson correlation coefficient** is used to measure how strongly two variables move together.

The analysis is conducted separately for:
- **tracks.csv** → to understand relationships among audio, linguistic, and popularity features;
- **artists.csv** → to explore dependencies between demographic and career-related attributes;
- and finally on the **merged dataset**, combining both perspectives.

Interpreting the heatmaps:
- **+1** → strong positive correlation (variables increase together)
- **−1** → strong negative correlation (one increases while the other decreases)
- **0** → little or no linear relationship

This step helps identify redundant variables, reveal patterns between audio and artist features,
and guide feature selection and dimensionality reduction before modeling.

In [None]:
def plot_correlation_heatmap(df, title):
    """Plot a Pearson correlation heatmap for numeric features (white–magenta color scheme)."""

    # Select numeric columns
    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.shape[1] < 2:
        print(f"Skipping {title}: not enough numeric columns.")
        return

    # Compute Pearson correlation matrix
    corr_matrix = numeric_df.corr(method='pearson')

    # Define custom color palette (white → light magenta → dark magenta)
    cmap = sns.color_palette(["#F6D6FF", "#E5A4FF", "#D873FF", "#C43EFF", "#9B00CC"])


    # Plot the heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        corr_matrix,
        cmap=cmap,         # Custom magenta palette
        center=0,
        annot=True,
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'}
    )

    # Style adjustments
    plt.title(f"Correlation Heatmap – {title}", fontsize=16, pad=15, color="#333333")
    plt.xticks(rotation=45, ha='right', color="#333333")
    plt.yticks(rotation=0, color="#333333")
    plt.tight_layout()
    plt.show()


In [None]:
# Correlation Heatmap for TRACKS dataset
print("=== Correlation Heatmap: TRACKS dataset ===")
plot_correlation_heatmap(tracks, "Tracks Dataset")

# Correlation Heatmap for ARTISTS dataset
print("=== Correlation Heatmap: ARTISTS dataset ===")
plot_correlation_heatmap(artists, "Artists Dataset")

# Correlation Heatmap for MERGED dataset (df)
print("=== Correlation Heatmap: MERGED dataset ===")
plot_correlation_heatmap(df, "Merged Dataset (Tracks + Artists)")

## Outliers Detection

In [None]:
# Numerical Feature Definition

# List of key numerical columns to analyze
skewed_features = [
    'tokens_per_sent', 'avg_token_per_clause', 'duration_ms', 'stats_pageviews', 'swear_EN', 'char_per_tok', 'swear_IT', 'bpm', 'n_sentences', 'n_tokens', 'rolloff', 'zcr', 'lexical_density', 'flatness'
]

simetric_features =[
    'pitch', 'centroid', 'spectral_complexity', 'loudness', 'flux', 'rms'
]


I removed the following variables from the list, the statistical analysis of outliers (IQR/Z-Score) is semantically wrong for them: disc_number, track_number, Month, Day

In [None]:
n_cols = 3
n_rows = -(-len(simetric_features) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.flatten()

for i, col in enumerate(simetric_features):
    sns.boxplot(x=df[col], ax=axes[i], orient='h', color="skyblue")
    axes[i].set_title(col, fontsize=14)
    axes[i].set_xlabel("")

for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

fig.suptitle("Analisi Box Plot per Rilevazione Outlier", fontsize=20, y=1.03)
plt.tight_layout()
plt.savefig("simetric_box_plots_analysis.png")
plt.show()

All six plots show distributions that appear relatively symmetric. The median line within each box is positioned near the center of the box (the Interquartile Range, or IQR), indicating that the 50th percentile is roughly equidistant from the 25th (Q1) and 75th (Q3) percentiles.

Every feature have the presence of candidate outliers.

Outlier Distribution:

pitch, centroid, flux, and rms all show outliers on both the lower (left) and upper (right) ends of their distributions.

spectral_complexity and loudness appear to have outliers almost exclusively on the high end (right side).

This visual inspection suggests that while the central tendency of these features is symmetrically distributed, a small number of records possess extremely high and/or low values that fall outside the main data cluster.

In [None]:
n_cols = 3
n_rows = -(-len(skewed_features) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.flatten()

for i, col in enumerate(skewed_features):
    sns.boxplot(x=df[col], ax=axes[i], orient='h', color="skyblue")
    axes[i].set_title(col, fontsize=14)
    axes[i].set_xlabel("")

for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

fig.suptitle("Analisi Box Plot per Rilevazione Outlier", fontsize=20, y=1.03)
plt.tight_layout()
plt.savefig("skewed_box_plots_analysis.png")
plt.show()

* **Extreme Skewness:** For many features, particularly `stats_pageviews`, `swear_EN`, `swear_IT`, `tokens_per_sent`, and `avg_token_per_clause`, the main "box" is extremely compressed and pushed to the far left. This visually confirms a strong positive skew (right-tailed distribution).

* **Vast Number of Outliers:** The most prominent characteristic is the dense cloud of outliers extending far to the right for these positively skewed features. This indicates that a large number of records have values significantly higher than the main cluster of data.

* **Specific Cases:**
    * **Positively Skewed:** `stats_pageviews` and `swear_EN` are the most extreme examples, where the box is barely visible, and the plot is dominated by a long stream of high-end outliers.
    * **Negatively Skewed:** `flatness` is the clear exception, showing the opposite pattern. Its box is compressed to the far right, with a long tail of outliers on the low end (left side), confirming its negative skew.
    * **Mixed Outliers:** Features like `duration_ms`, `n_sentences`, and `lexical_density` show outliers on *both* sides, though the high-end outliers are generally more numerous or extreme.

The visual analysis shows that these features are heavily skewed and contain a large quantity of extreme values. Simply removing all these outliers would lead to massive data loss.

In [None]:
# Analisi Statistica: Metodo IQR (per variabili asimmetriche)
outlier_data = []

for col in skewed_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    limite_inferiore = Q1 - 1.5 * IQR
    limite_superiore = Q3 + 1.5 * IQR

    # Identifica outlier
    outliers_bassi = df[df[col] < limite_inferiore]
    outliers_alti = df[df[col] > limite_superiore]

    outlier_data.append({
        'variable': col,
        'Outlier Alti (Sopra)': len(outliers_alti),
        'Outlier Bassi (Sotto)': len(outliers_bassi)
    })

    print(f"Variabile '{col}':")
    print(f"  Limiti IQR: [{limite_inferiore:.2f}, {limite_superiore:.2f}]")
    print(f"  Trovati {len(outliers_bassi)} outlier sotto il limite.")
    print(f"  Trovati {len(outliers_alti)} outlier sopra il limite.")

In [None]:
if outlier_data:
    df_outliers = pd.DataFrame(outlier_data)

    # Calcola il totale per l'ordinamento
    df_outliers['total_outliers'] = df_outliers['Outlier Alti (Sopra)'] + df_outliers['Outlier Bassi (Sotto)']

    # Ordina il DataFrame
    df_outliers = df_outliers.sort_values(by='total_outliers', ascending=False)

    df_melted = df_outliers.melt(
        id_vars=['variable', 'total_outliers'],
        value_vars=['Outlier Alti (Sopra)', 'Outlier Bassi (Sotto)'],
        var_name='Tipo di Outlier',
        value_name='Numero di Outlier'
    )

    # Crea il Grafico a Barre
    plt.figure(figsize=(18, 9))
    sns.barplot(
        data=df_melted,
        x='variable',
        y='Numero di Outlier',
        hue='Tipo di Outlier',
        order=df_outliers['variable'],
        palette={"Outlier Alti (Sopra)": "#d36ba8", "Outlier Bassi (Sotto)": "#8cbcd9"}
    )

    plt.title('Conteggio Outlier (Metodo IQR) per Variabili Asimmetriche', fontsize=20)
    plt.xlabel('Variabile', fontsize=14)
    plt.ylabel('Numero di Outlier Rilevati', fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(title='Tipo di Outlier', fontsize=12, title_fontsize=14)
    plt.tight_layout()

    plt.savefig("outlier_counts_asymmetric_barchart.png")
    print("Grafico del conteggio degli outlier generato e salvato come 'outlier_counts_asymmetric_barchart.png'.")
    plt.show()

In [None]:
# Analisi Statistica: Metodo Z-Score (per variabili simmetriche)
print("\n Metodo Z-Score (per variabili simmetriche)")
outlier_data_z = []
soglia_z = 3

for col in simetric_features:
    mean = df[col].mean()
    std = df[col].std()

    limite_inferiore_z = mean - (soglia_z * std)
    limite_superiore_z = mean + (soglia_z * std)

    # Identifica outlier
    outliers_bassi_z = df[df[col] < limite_inferiore_z]
    outliers_alti_z = df[df[col] > limite_superiore_z]

    outlier_data_z.append({
        'variable': col,
        'Outlier Alti (Sopra)': len(outliers_alti_z),
        'Outlier Bassi (Sotto)': len(outliers_bassi_z)
    })

    print(f"Variabile '{col}':")
    print(f"  Limiti Z-Score (soglia={soglia_z}): [{limite_inferiore_z:.2f}, {limite_superiore_z:.2f}]")
    print(f"  Trovati {len(outliers_bassi_z)} outlier sotto il limite.")
    print(f"  Trovati {len(outliers_alti_z)} outlier sopra il limite.")

The IQR statistical analysis confirms that **all asymmetric features contain a significant number of outliers**, as clearly visualized in the summary bar chart.

* **Massive Outlier Counts:** The most striking case is `swear_EN`, with 2,740 high-end outliers. This is a statistical artifact: its IQR is `[0.00, 0.00]`, meaning any track with even one English swear word is flagged. `swear_IT` (746), `avg_token_per_clause` (605), and `flatness` (507) also show a very high volume of outliers, making simple removal impossible.

* **One-Sided Distributions:**
    * `stats_pageviews`, `swear_EN`, and `swear_IT` only have **high-end outliers**. This perfectly matches their positive skew (right-tail) and identifies "hit songs" or lyrically extreme tracks.
    * `flatness` is the only feature with exclusively **low-end outliers**, confirming its negative skew (left-tail).

* **Two-Sided Distributions:** Most features, including `duration_ms`, `n_sentences`, `lexical_density`, and `char_per_tok`, show a significant number of outliers on **both sides**. This implies the presence of errors or extreme values at both ends (e.g., for `duration_ms`, this likely includes both "skit" tracks and very long songs).

* **Isolated Outliers:** `bpm` is a special case, showing **only one high-end outlier**. This is almost certainly an data-entry error (e.g., `bpm = 900`) and will be simple to inspect and correct.

In [None]:
# Converti la lista in un DataFrame
if outlier_data_z:
    df_outliers_z = pd.DataFrame(outlier_data_z)

    # Calcola il totale per l'ordinamento
    df_outliers_z['total_outliers'] = df_outliers_z['Outlier Alti (Sopra)'] + df_outliers_z['Outlier Bassi (Sotto)']

    # Ordina il DataFrame
    df_outliers_z = df_outliers_z.sort_values(by='total_outliers', ascending=False)

    df_melted_z = df_outliers_z.melt(
        id_vars=['variable', 'total_outliers'],
        value_vars=['Outlier Alti (Sopra)', 'Outlier Bassi (Sotto)'],
        var_name='Tipo di Outlier',
        value_name='Numero di Outlier'
    )

    plt.figure(figsize=(15, 7))
    sns.barplot(
        data=df_melted_z,
        x='variable',
        y='Numero di Outlier',
        hue='Tipo di Outlier',
        order=df_outliers_z['variable'],
        palette={"Outlier Alti (Sopra)": "#d36ba8", "Outlier Bassi (Sotto)": "#8cbcd9"}
    )

    plt.title('Conteggio Outlier (Metodo Z-Score) per Variabili Simmetriche', fontsize=18)
    plt.xlabel('Variabile', fontsize=12)
    plt.ylabel('Numero di Outlier Rilevati', fontsize=12)
    plt.xticks(rotation=0, ha='center', fontsize=11)
    plt.yticks(fontsize=11)
    plt.legend(title='Tipo di Outlier', fontsize=11, title_fontsize=13)
    plt.tight_layout()

    plt.savefig("outlier_counts_symmetric_barchart.png")
    print("\nGrafico del conteggio degli outlier generato e salvato come 'outlier_counts_symmetric_barchart.png'.")
    plt.show()

* **Quantification of Outliers:** The analysis provides precise counts for these extreme values. `pitch` emerges as the feature with the most outliers (68 total), while `spectral_complexity` has the fewest (22). `centroid` (57) and `flux` (49) also show a notable number of outliers.

* **Distribution of Outliers:**
    * For most features (`pitch`, `centroid`, `spectral_complexity`, `flux`, and `rms`), the outliers are **distributed on both the high and low ends**. This indicates that there are tracks with values that are exceptionally high *and* exceptionally low for these audio characteristics.
    * The most significant exception is `loudness`, which has 27 identified outliers, all of which are **exclusively on the high side** (above the 50.17 limit). This statistical result perfectly matches its box plot, which only showed an upper tail.

This analysis confirms that while these features are symmetrically distributed, a small number of extreme values are present. These outliers are likely real but rare data points (e.g., unusually loud or high-pitched songs) that must be handled before clustering to prevent them from skewing the model.

### Multivariate analisys

In [None]:
# DBSCAN to identify multidimensional outliers
from sklearn.cluster import DBSCAN

# Select ALL the numerical features you want to analyze
numerical_features = skewed_features + simetric_features

# Prepare the data: distance-based algorithms REQUIRE standardization
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numerical_features].dropna())

# Apply DBSCAN
dbscan = DBSCAN(eps=3.0, min_samples=10)
clusters = dbscan.fit_predict(df_scaled)

# Extract the outliers
# Points labeled as -1 are the outliers (noise)
outlier_indices = np.where(clusters == -1)[0]
n_outliers = len(outlier_indices)

print(f"\n--- Algorithmic Analysis (DBSCAN) ---")
print(f"Features analyzed: {len(numerical_features)}")
print(f"Records analyzed: {len(df_scaled)}")
print(f"Parameters: eps=3.0, min_samples=10")
print(f"Found {n_outliers} multidimensional outliers (label -1).")

# 5. (Optional) Print some of the found outliers
print("\nFirst 10 multidimensional outliers found:")
display(df.iloc[outlier_indices].head(10))

In [None]:
# Ensure there are no NaNs (should be clean already)
df_analysis = df[numerical_features].dropna()
print(f"Data ready for analysis: {df_analysis.shape}")

scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_analysis)

# Applying Isolation Forest
iso_forest = IsolationForest(contamination=0.02, random_state=42)

# We train and get the predictions
# The algorithm assigns:
#  1 for Inliers (normal points)
# -1 for Outliers (anomalous points)
predictions = iso_forest.fit_predict(data_scaled)

df_analysis['is_outlier_multi'] = predictions

df['is_outlier_multi'] = df_analysis['is_outlier_multi'].reindex(df.index)

outliers_multi = df[df['is_outlier_multi'] == -1]
print(f"\nAnalysis completed.")
print(f"Number of multivariate outliers identified: {len(outliers_multi)}")

# Show some of the records identified as anomalous
print("\nExamples of Multivariate Outliers:")

# Show the original columns and our clean columns
display(outliers_multi[['full_title', 'primary_artist'] + numerical_features].head())

It identified the 93 (2%) songs that are the most stylistically unique when combining all 21 features.
The examples show two clear patterns:

Lyrical Anomalies (Rosa Chemical): This artist is flagged repeatedly. The data shows his songs have a very rare combination: they are lyrically complex (avg_token_per_clause_log, lexical_density) AND have high profanity in both Italian and English (swear_IT_log, swear_EN_log). This makes them stand out from all other artists.

Audio Anomalies (thasup): The "thasup" track is a perfect example of an audio outlier. It has a very slow bpm (82) but is at the maximum loudness (45) and maximum pitch (3191). This combination of "slow, loud, and high-pitched" is a very unusual audio profile.

Conclusion: These 93 songs are "stylistic outliers" (like experimental tracks). They are not errors, but you should remove them before clustering to get clearer, more representative clusters of the main "rap schools".

In [None]:
# We use PCA to reduce the dimension and be able to plot the outliers

# Reduce the scaled data to 2 dimensions
pca = PCA(n_components=2, random_state=42)
data_scaled_2d = pca.fit_transform(data_scaled)

# Create a DataFrame for plotting
plot_df = pd.DataFrame(data_scaled_2d, columns=['PC1', 'PC2'])
plot_df['is_outlier'] = predictions

# Create the Scatter Plot
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=plot_df,
    x='PC1',
    y='PC2',
    hue='is_outlier',
    palette={1: 'blue', -1: 'red'}, # Outliers in red
    alpha=0.6
)
plt.title('Isolation Forest Results (visualized with PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Type', labels=['Outlier (-1)', 'Inlier (1)'])
plt.show()

### Correlation Analysis of Outliers

In [None]:
# Select the clean data (only the inliers)
if 'df_clustering' not in locals():
    df_clustering = df[df['is_outlier_multi'] != -1]

df_corr = df_clustering[numerical_features].copy()
print(f"Data ready for correlation analysis: {df_corr.shape}")

# Calculate the correlation matrix (Pearson Method)
corr_matrix = df_corr.corr(method='pearson')

# Visualize the Heatmap
plt.figure(figsize=(20, 16))

# Create a "mask" to hide the upper part
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    linewidths=0.5,
    cbar_kws={"shrink": .8}
)
plt.title('Feature Correlation Matrix', fontsize=20, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

It clearly shows strong multicollinearity (redundancy) among your features. The data isn't noisy; it just confirms that some features measure the same underlying concept.
- Song Length: duration_ms_log, n_tokens_log, and n_sentences_log are highly correlated (0.8 to 0.9), as they all measure "song length".

- Loudness: rms_log and loudness are almost identical (0.9), measuring "volume".

- Spectral Brightness: centroid, rolloff_log, and flatness are very strongly correlated (0.9 and -0.8), measuring the "brightness" or "timbre" of the sound.

In [None]:
# 2. Select the clean data (BUT INCLUDING the multivariate outliers)
df_corr_completo = df[numerical_features].copy()

df_corr_completo = df_corr_completo.dropna()
print(f"Data ready for correlation analysis: {df_corr_completo.shape}")

# 3. Calculate the correlation matrix (Pearson Method)
corr_matrix_completo = df_corr_completo.corr(method='pearson')

# 4. Visualize the Heatmap
plt.figure(figsize=(20, 16))

mask = np.triu(np.ones_like(corr_matrix_completo, dtype=bool))

sns.heatmap(
    corr_matrix_completo,
    mask=mask,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    linewidths=0.5,
    cbar_kws={"shrink": .8}
)
plt.title('Correlation Matrix (Including the 93 outliers from Isolation Forest)', fontsize=20, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Missing Values
This code analyzes missing values in the DataFrame by counting how many entries are NaN for each column and calculating the corresponding percentage. It creates a summary table showing only columns with missing data, sorted by the highest percentage.

In [None]:
# Calcolo missing values e percentuali
missing_count = df.isna().sum()
missing_percent = (missing_count / len(df)) * 100

missing_df = (
    pd.DataFrame({'missing_count': missing_count, 'missing_percent': missing_percent})
    .sort_values('missing_percent', ascending=False)
    .query('missing_percent > 0')
)

# Mostra tabella riepilogativa (gradiente rosso-magenta)
display(
    missing_df
    .style.background_gradient(subset=['missing_percent'], cmap='RdPu')  
    .format({'missing_percent': '{:.2f}%'})
)


The following heatmap visualizes missing values in the dataset, with each row representing a record and each column a feature. Colored cells indicate missing entries, providing a clear overview of where data is incomplete.

In [None]:
plt.figure(figsize=(14, 6))
sns.heatmap(df.isna(), cbar=False, cmap="viridis", yticklabels=False)
plt.title("Missing Values Matrix (Overview)", fontsize=20, pad=12, color="#000000")
plt.show()



The following bar plot shows the percentage of missing values per feature, with the top 20 features that have the most missing data

In [None]:
plt.figure(figsize=(10, 8))
sns.barplot(
    data=missing_df.head(20),
    x='missing_percent',
    y=missing_df.head(20).index,
    hue=missing_df.head(20).index,  
    palette='RdPu_r'  
)
plt.title("Percentage of Missing Values by Feature", fontsize=20, pad=15, color="#000000")
plt.xlabel("Missing values (%)", fontsize=12)
plt.ylabel("Feature name", fontsize=12)

# Etichette percentuali
for index, value in enumerate(missing_df.head(20)['missing_percent']):
    plt.text(value + 0.5, index, f"{value:.1f}%", va='center', fontsize=9, color='#b30059')

plt.xlim(0, 100)
sns.despine()
plt.tight_layout()
plt.show()

#### Missing Values Propagation After Merge

In [None]:
artists_missing = artists.isna().mean().sort_values(ascending=False) * 100
print(artists_missing)

The visualization highlights that missing values in attributes such as active_start, region, and birth_place have increased after merging due to the replication of incomplete artist metadata across multiple tracks.
This confirms that the merge process did not introduce new nulls, but propagated pre-existing ones.

In [None]:
# Colonne provenienti dal dataset artists 
artist_cols =list(artists.columns)

# Conta i NaN prima e dopo il merge
missing_before = artists[artist_cols].isna().sum()
missing_after = df[artist_cols].isna().sum()

# Differenza assoluta e percentuale
missing_diff = missing_after - missing_before
increase_percent = (missing_diff / missing_before.replace(0, pd.NA)) * 100

# Tabella riepilogativa
missing_summary = (
    pd.DataFrame({
        "missing_before": missing_before,
        "missing_after": missing_after,
        "difference": missing_diff,
        "increase_%": increase_percent
    })
    .sort_values("difference", ascending=False)
)

plot_df = missing_summary[missing_summary['difference'] > 0].copy()

plt.figure(figsize=(10, 6))
sns.barplot(
    data=plot_df,
    x='difference',
    y=plot_df.index,
    hue=plot_df.index,
    palette='RdPu_r'
)
plt.title("Increase in Missing Values After Merge", fontsize=15, pad=12, color="#000000")
plt.xlabel("Increase in number of missing values", fontsize=12)
plt.ylabel("Feature", fontsize=12)

# Etichette numeriche a fianco delle barre
for index, value in enumerate(plot_df['difference']):
    plt.text(value + 50, index, f"{int(value):,}", va='center', fontsize=9, color="#000000")

sns.despine()
plt.tight_layout()
plt.show()

After analyzing the percentage of missing values in each column, We need to better understand the overall data quality before applying any filling strategies. Cleaning and validating the data first ensures that missing values are handled correctly and that no incorrect or misleading information is introduced during imputation.

## Data Quality

In [None]:
df.info()

### Initial Data Cleaning

Based on the initial exploration of the dataset, we:

- **Removed empty column (`active_end`)** since it contained no useful information.  
- **Converted `popularity` and `year`** to numeric types to ensure consistency and enable statistical analysis.  
- **Transformed date-related columns** (`album_release_date`, `birth_date`, `active_start`, ) into proper datetime format for easier time-based operations.

Before directly converting year and popularity from objects to numeric and album_release_date, birth_date, and active_start from objects to datetime, we need to inspect the data to check if all values can be converted correctly and handle those that cannot be converted


In [None]:
# 1. Remove empty column
df.drop(columns=['active_end'], inplace=True)  # drop the 'active_end' column because it's empty
df.info()

#### Objects to Numeric
Inspecting the values in popularity and year columns to see the values that cannot be converted to numbers directly

In [None]:
numeric_cols = ['popularity','year']


# --- Check numeric columns ---
for col in numeric_cols:
    original = df[col].copy()
    converted = pd.to_numeric(original, errors='coerce')
    non_convertible = original[original.notna() & converted.isna()]
    
    print(f"\nColumn '{col}'  entries that cannot be converted to numeric:")
    if not non_convertible.empty:
        for idx, val in non_convertible.items():
            print(f"Row {idx}: {val}")
    else:
        print("All non-missing entries can be converted to numeric.")
    print('----------------------------------------------------------------')



Looking at the values of the  `popularity` column, we noticed that some entries contained **non-numeric characters**, percent signs (`%`), abbreviations like `K` (thousands) or `M` (millions), and words such as `"views"` appended to the numbers.  

Instead of converting the column directly to numeric using pd.to_numeric(errors='coerce'), which would have turned all invalid entries into NaN, we applied a cleaning function to preserve and correctly interpret useful numeric information before conversion. The function:

- Removed non-numeric characters and words like `"views"` and `%`.
- Converted abbreviations (`K → 1,000`, `M → 1,000,000`) to numeric values.
- Extracted the first numeric part if extra text was present.
- Converted the cleaned values to floats, marking any remaining invalid entries as `NaN`.


In [None]:
def clean_popularity(value):
    if pd.isna(value):
        return None
    value_str = str(value).strip().lower()  # normalize
    
    # Remove common words like 'views'
    value_str = value_str.replace('views','').replace('%','').strip()
    value_str = value_str.lower()  
    # Handle K and M
    multiplier = 1
    if value_str.endswith('k'):
        multiplier = 1_000
        value_str = value_str[:-1]
    elif value_str.endswith('m'):
        multiplier = 1_000_000
        value_str = value_str[:-1]
    
    # Take only first token if words remain
    value_str = value_str.split()[0]
    
    # Try converting to float
    try:
        return (float(value_str) * multiplier)
    except:
        return None  # invalid entries become None/NaN
    
df['popularity'].apply(clean_popularity)




Inspecting the values in the year column, we observed that while most entries were numerical, some contained unexpected or non-numeric characters. To handle this, we converted the column directly to a numeric type using pd.to_numeric() with the errors='coerce' parameter, which automatically transforms any invalid or non-numeric values into NaN.

In [None]:
df['year'] = pd.to_numeric(df['year'], errors='coerce') 
df.info()

#### Objects to DateTime

Inspecting the values in 'album_release_date', 'birth_date', 'active_start' columns to see the values that cannot be converted to DateTime directly

In [None]:
date_cols = ['album_release_date', 'birth_date', 'active_start']
# --- Check date columns ---
for col in date_cols:
    original = df[col].copy()
    converted = pd.to_datetime(original, errors='coerce')
    non_convertible = original[original.notna() & converted.isna()]
    
    print(f"\nColumn '{col}'  entries that cannot be converted to datetime:")
    if not non_convertible.empty:
        for idx, val in non_convertible.items():
            print(f"Row {idx}: {val}")
    else:
        print("All non-missing entries can be converted to datetime.")
    print('----------------------------------------------------------------')

Looking at the values in the album_release_date column that could not be converted to datetime, we noticed that many of them were just years (e.g., "2004"). If we used pd.to_datetime(errors='coerce') directly, these entries would have been turned into NaT. However, we wanted to keep this information by assigning a default month and day — the first day of the year.

- Instead of converting the column directly, we applied a cleaning function that:

- Detected values that were only a year (e.g., "2004") and changed them to a full date ("2004-01-01").

- Kept valid full dates (e.g., "2021-04-09") unchanged.

- Left missing values as they are.

- Finally, converted everything into proper datetime format for consistency.

In [None]:
def fix_year_only_dates(val):
    """
    If the value looks like a 4-digit year, convert it to 'YYYY-01-01'.
    Otherwise, return the original value.
    """
    if pd.isna(val):
        return val
    val_str = str(val).strip()
    if re.fullmatch(r'\d{4}', val_str):
        return f"{val_str}-01-01"
    return val_str

# Apply to album_release_date
df['album_release_date'] = df['album_release_date'].apply(fix_year_only_dates)

# Convert to datetime
df['album_release_date'] = pd.to_datetime(df['album_release_date'], errors='coerce')

df.info()

Based on the values that could not be converted to datetime, we found that the birth_date column contained several invalid entries, such as URLs (e.g., "http://www.wikidata.org/.well-known/genid/...") instead of actual dates. Since these values do not represent meaningful or recoverable information, there is nothing worth preserving. Therefore, we are going to apply the pd.to_datetime(errors='coerce') function directly, allowing all invalid entries to be converted to NaT.

For the active_start column, all non-missing entries are  already in a valid date format, so they are going to be  successfully converted to datetime without any issues.

In [None]:
date_cols = ['birth_date', 'active_start', ]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')  # convert to datetime, invalid dates become NaT

df.info()


### Data Distribution
The following table and histogram show the numerical data distribution in the dataset:

- **Most features** (`n_sentences`, `n_tokens`, `tokens_per_sent`, `char_per_tok`, `lexical_density`, `avg_token_per_clause`, `centroid`, `rolloff`, `rms`, `zcr`, `flatness`, `flux`, `spectral_complexity`, `pitch`, `loudness`) show **bell-shaped or near-normal distributions**.

- **Highly skewed features** (`stats_pageviews`, `bpm`, `tokens_per_sent`, `duration_ms`, `popularity`) have a **long right tail**, indicating a few extreme values or outliers (common in popularity or count-based features).

- **Temporal features** (`year`, `month`, `day`) display **non-uniform distributions**; e.g., `year` is concentrated around recent decades, showing most songs are modern.

- **Geographical features** (`latitude`, `longitude`) have **peaks corresponding to specific locations**, likely representing where artists or tracks are clustered.


In [None]:

# Select numeric columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# --- Summary statistics table ---
display(df[num_cols].describe().T.style.background_gradient(cmap='RdPu'))

# --- Histograms for each numeric column ---
n_cols = 4
n_rows = -(-len(num_cols) // n_cols)  # ceil division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4))
axes = axes.flatten()

for i, col in enumerate(num_cols):
    sns.histplot(df[col].dropna(), bins=30, kde=True, color="#d36ba8", ax=axes[i])
    axes[i].set_title(col, fontsize=18, color="#b30059")   # larger title font
    axes[i].set_xlabel("")
    axes[i].set_ylabel("")
    axes[i].tick_params(axis='both', labelsize=12)          # larger tick labels

# Remove unused axes
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

fig.suptitle("Distribution of Numerical Features", fontsize=24, color="#000000", y=1.02)  # larger main title
plt.tight_layout()
plt.show()



The data distribution and the statistics presented above reveal some anomalies and irregularities in the dataset. These issues will be examined and addressed in the following section.


In [None]:
# Calculate basic statistics (mean and median)
statistics = df[num_cols].describe().T

# Calculate skewness
skews = df[num_cols].skew()

skew_analysis = pd.DataFrame({
    'mean': statistics['mean'],
    'median': statistics['50%'],
    'skewness_value': skews
})

# Define a function to classify skewness
# These are standard thresholds used in statistics:
# > +0.5 = Positive Skew (Right-tailed)
# < -0.5 = Negative Skew (Left-tailed)
# Between -0.5 and +0.5 = Substantially Symmetric
def classify_skew(skew_value):
    if skew_value > 0.5:
        return "Positive (Right Skew)"
    elif skew_value < -0.5:
        return "Negative (Left Skew)"
    else:
        return "Symmetric"

skew_analysis['skew_type'] = skew_analysis['skewness_value'].apply(classify_skew)

print("Skewness Analysis of Numerical Features")

display(skew_analysis.sort_values(by='skewness_value', ascending=False))

###  Features Inspection Anomalies Detection

#### Artists Names
This code groups the dataset by artist name to count how many songs each artist has, then displays the total number of unique artists and sorts them by song count. The analysis shows that there are 104 unique artists in the dataset. Among them, Mondo Marcio, Guè Pequeno, and Gemitaiz are the most prolific, each with over 300 songs. Other highly represented artists include Bassi Maestro, Fabri Fibra, and Vacca, each contributing more than 250 songs. On the other hand, a few artists such as O Zulù, Joey Funboy, and Hindaco have only a handful of tracks. Overall, the distribution highlights a few artists dominating the dataset while many others have significantly fewer songs.

In [None]:
print(df.shape)
# The result is a Pandas Series where the index is the artist name and the values are the counts.
artist_song_counts = df.groupby('name').size().sort_values(ascending=True)

# 2. Convert the Series to a DataFrame for cleaner display
artist_counts_df = artist_song_counts.reset_index(name='song_count')
# To see the total number of unique artists:
print(f"\nTotal number of unique artists: {len(artist_counts_df)}")

# 3. Print the results
print("Unique Artists and Their Song Count:")
print(artist_counts_df.sort_values(by='song_count',
    ascending=False))



# Sort by song count (descending for better view)
artist_counts_df = artist_counts_df.sort_values(by='song_count', ascending=False)

plt.figure(figsize=(10, 16))
bars = plt.barh(artist_counts_df['name'], artist_counts_df['song_count'], color='skyblue')
plt.xlabel('Number of Songs')
plt.ylabel('Artist')
plt.title('Number of Songs per Artist')

# Add numbers on top of each bar
for bar in bars:
    width = bar.get_width()
    plt.text(width + 0.3,               # x position (a bit after the end of the bar)
             bar.get_y() + bar.get_height()/2,  # y position (middle of bar)
             str(int(width)),           # text (integer value)
             va='center', fontsize=9)

plt.tight_layout()
plt.show()



#### Artists Description
The folowing code counts how many times each unique description appears in the dataset. This helps identify which artist descriptions are the most common or repeated, showing patterns such as groups of artists sharing the same description or potential duplicates.

The results show how the most frequent artist descriptions in the dataset. Most entries describe Italian rappers, producers, or singer-songwriters, reflecting that the dataset mainly focuses on Italian music artists.

For instance, “gruppo musicale italiano” (Italian music group) appears 620 times, making it the most common description.

Interestingly, there are also some non-musical or unrelated entries, like “dio indiano della distruzione e della trasformazione” (Indian god of destruction and transformation) or “tipo di barca a vela usata nel XVIII e XIX secolo” (type of sailing ship used in the 18th–19th century). These seem to be data errors.

We also noticed an entry labeled “gruppo musicale canadese” (Canadian music group). Upon checking, this description is incorrectly assigned to the Italian rapper Priestess. Further research revealed a mix-up with a Canadian band that shares the same name. This confusion becomes evident when comparing the active_start year in the dataset, which matches that of the Canadian group rather than the Italian artist.

In [None]:
df['description'].value_counts()

##### Correcting "Priestess" Entry

In [None]:
print('Before')
display( df[df['description'].str.contains('gruppo musicale canadese', case=False, na=False)]
    .drop_duplicates(subset=['name'])
    .sort_values(by='name'))

print('After')
# Fix Priestess' incorrect description and active_start date
df.loc[df['name'].str.lower() == 'priestess', ['description', 'active_start']] = [
    'cantante e rapper italiana',
    '2027-01-01'
]

# Verify the update
print(df[df['name'].str.lower() == 'priestess'][['name', 'description', 'active_start']])


##### Identifying groups in the dataset

This filter identifies all artists whose description includes the word "gruppo", which typically refers to musical groups or bands. The resulting list contains 7 well-known Italian music groups, such as 99 Posse, Articolo 31, Club Dogo, Colle Der Fomento, Cor Veleno, Dark Polo Gang, and Sottotono.

In [None]:
# Filter rows where 'description' contains 'gruppo'
artists_with_gruppo = (
    df[df['description'].str.contains('gruppo', case=False, na=False)]
[['name','description','birth_date','active_start']]
    .drop_duplicates(subset=['name'])
    .sort_values(by='name')
)

print("Artists with 'grupoo' in their description:",artists_with_gruppo.shape )
display(artists_with_gruppo)


#### Artist's BirthDate


##### Distribution of artist birth_date

This code extracts each artist’s birth year and groups them by decade (e.g., 1960s, 1970s, 1980s, etc.) to analyze how artists are distributed over time. It calculates the percentage of artists born in each decade and visualizes it with a bar chart. The results show that most artists were born between the 1980s and 1990s, indicating that the majority belong to the Millennial generation, while fewer artists were born in the 1960s or after 2000.

It also shows the histogram of the birth year

In [None]:
# Count missing values in 'birthdate' column
missing_birthdates = df['birth_date'].isna().sum()

print(f"Number of missing values in 'birthdate' column: {missing_birthdates}")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_birth_decades(df ,title1, title2):
    """
    Plots:
      1. Histogram of birth years
      2. Percentage of unique artists by decade of birth

    Parameters:
        df (pd.DataFrame): DataFrame containing at least 'name' and 'birth_date' columns.
    """
    # --- 1. Keep only unique artists ---
    unique_artists = df.drop_duplicates(subset=['name'])

    # --- 2. Extract birth year ---
    birth_year = unique_artists['birth_date'].dt.year


    # Drop missing values
    birth_year = birth_year.dropna()

    # --- 4. Create decade bins ---
    start = (int(birth_year.min()) // 10) * 10
    end = (int(birth_year.max()) // 10 + 1) * 10
    bins = list(range(start, end + 1, 10))
    labels = [f"{b}s" for b in bins[:-1]]

    # --- 5. Assign decades ---
    decades = pd.cut(birth_year, bins=bins, labels=labels, right=False)

    # --- 6. Calculate percentage per decade ---
    group_percent = decades.value_counts(normalize=True).sort_index() * 100
    group_df = pd.DataFrame({'decade': group_percent.index, 'percent': group_percent.values})

    # --- 7. Plot both charts side by side ---
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Histogram of birth years
    sns.histplot(birth_year, bins=20, kde=True, ax=axes[0], color='skyblue')
    axes[0].set_title(title1, fontsize=16, pad=15)
    axes[0].set_xlabel("Birth Year", fontsize=12)
    axes[0].set_ylabel("Count", fontsize=12)

    # Percentage per decade
    sns.barplot(data=group_df, x='decade', y='percent', hue='decade', palette='coolwarm', legend=False, ax=axes[1])
    for i, val in enumerate(group_df['percent']):
        axes[1].text(i, val + 0.5, f"{val:.1f}%", ha='center', fontsize=10)
    axes[1].set_title(title2, fontsize=16, pad=15)
    axes[1].set_xlabel("Decade", fontsize=12)
    axes[1].set_ylabel("Percentage (%)", fontsize=12)

    sns.despine()
    plt.tight_layout()
    plt.show()


plot_birth_decades(df, "Distribution of Artists' Birth Years  before Cleaning",'Percentage of Unique Artists by Decade of Birth before cleaning')

# Confirm df is unchanged
print(df.columns)


##### Distribution of artist ages
The results show the distribution of unique artists’ ages in the dataset, ranging from 22 to 58 years old. Most artists fall between their late 20s and mid-40s, with small peaks around ages 32, 36, and 46, each having between 6 and 7 artists. Younger artists under 25 and older ones above 50 are less represented. Overall, the majority of artists are in their thirties and early forties, reflecting the typical active and productive age range in the music industry.

In [None]:

def plot_artist_ages(df,title):
    """
    Calculates and plots the number of unique artists by age.

    Parameters:
        df (pd.DataFrame): DataFrame containing at least 'name' and 'birth_date' columns.
    """
    # --- 1. Keep only unique artists ---
    unique_artists = df.drop_duplicates(subset=['name'])

    # --- 2. Calculate current age ---
    today = pd.Timestamp.today()
    artist_age = today.year - unique_artists['birth_date'].dt.year

    # --- 3. Drop missing or invalid ages ---
    artist_age = artist_age.dropna().astype(int)

    # --- 4. Count how many unique artists have each exact age ---
    age_counts = artist_age.value_counts().sort_index()
    print("Number of unique artists per age:")
    print(age_counts)

    # --- 5. Plot the distribution ---
    plt.figure(figsize=(8, 5))
    sns.barplot(x=age_counts.index, y=age_counts.values, color="#6A5ACD")

    plt.title(title, fontsize=16, color="#333")
    plt.xlabel("Age (years)", fontsize=12)
    plt.ylabel("Number of Artists", fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
plot_artist_ages(df,"Number of Unique Artists by Age (Before Cleaning)")


##### Artists with no birthdate
This code shows the number of the artist that doesn't have a birthdate. They are 32.

In [None]:

missing_birth_artists = df[df['birth_date'].isna()]['name'].drop_duplicates()
print(f"Number of artists with missing birth date: {missing_birth_artists.shape[0]}")

print("Artists with missing birth date:")
print(missing_birth_artists.tolist())

##### Filling Birthdates

This code manually fills missing birth dates for specific artists in the dataset. It first defines a dictionary mapping artist names to their known or estimated birth dates.

9 entries couldn’t be filled because their birth dates are intentionally left blank in the dictionary. For Miss Keta, the birth date is unknown, so no accurate value can be provided. The others — Bushwaka, Sottotono, Dark Polo Gang, Cor Veleno, Colle Der Fomento, Club Dogo, Articolo 31, and 99 Posse — are all music groups or duos, not individual artists, meaning they don’t have a single birth date associated with them.

In [None]:

# --- 1. Define the Missing Dates as a Dictionary ---
# Source of truth for the manual fill
birth_dates_to_fill = {
    'alfa': '2000-08-22',
    'anna pepe': '2003-08-15',
    'beba': '1994-10-10',
    'bigmama': '2000-03-10',
    'brusco': '1974-01-04',
    'caneda': '1976-09-30',
    'dargen d_amico': '1980-11-29',
    'guè pequeno': '1980-12-25',
    'johnny marsiglia': '1986-08-05',
    'nerone': '1991-05-23',
    'priestess': '1996-08-20',
    'samuel heron': '1991-01-01',
    'shiva': '1999-08-27',
    'yeиdry': '1993-07-27',
    'o zulù': '1970-11-15',
    'skioffi':'1992-06-05',
    'eva rea':'1993-01-01',
    'hindaco':'1996-01-01',
    'joey funboy':'1995-01-01',
    'mistico':'1982-01-01',
    'mike24':'1985-08-02',
    'doll kill':'1996-01-01',
    'miss simpatia':'1986-03-23',
    'miss keta':'',#unknown
    'bushwaka':'',#duo
    'sottotono':'',#group
    'dark polo gang':'',#group
    'cor veleno':'',#group
    'colle der fomento':'',#group
    'club dogo':'',#group
    'articolo 31':'',#group
    '99 posse':''#gruppo



    }

# --- 2. Fill the Missing Data (Imputation) ---

# Convert the dictionary to a Pandas Series for easy lookup and indexing
birth_date_series = pd.Series(birth_dates_to_fill)

# Iterate through the artists in your fill list and update the DataFrame
for artist, bday in birth_date_series.items():
    # Use .loc to find rows where 'artist_name' matches and update 'birth_date'
    # The second part of the condition (artist_df['birth_date'].isna()) ensures
    # we only overwrite if the date was previously missing (NaN).
    df.loc[
        (df['name'] == artist) & (df['birth_date'].isna()),
        'birth_date'
    ] = bday

# --- 3. Final Conversion and Verification ---

# Convert the 'birth_date' column to the proper datetime format again
# (This is crucial for accurate age calculation)
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

# Optional: Print out the affected rows to verify the fix
print("--- Verification of Filled Birth Dates  ---")
# Filter the DataFrame to show only the artists we just updated
filled_artists = df[df['name'].isin(birth_dates_to_fill.keys())]

# Show the unique artist names and their newly filled birth dates
print(filled_artists[['name', 'birth_date']].drop_duplicates().to_string(index=False))

##### Artists Names and their ages
This code generates a table showing each unique artist and their corresponding age, calculated from their birth date. Artists are listed from oldest to youngest, highlighting ages from 22 to 58 in this dataset.

In [None]:

# --- Calculate age without adding column to df ---
today = pd.Timestamp.today()
artist_age = (today - df['birth_date']).dt.days // 365

# --- Create a temporary DataFrame with unique artists and their age ---
artist_age_df = pd.DataFrame({
    'name': df['name'],
    'age': artist_age
}).drop_duplicates().sort_values(by='age', ascending=False)

# --- Display the table ---
print("Unique Artists and Their Age:")
display(artist_age_df.style.background_gradient(cmap='coolwarm'))

# --- Total number of unique artists ---
print(f"\nTotal number of unique artists: {artist_age_df['name'].nunique()}")



##### Checking distribution after filling Bithdate

In [None]:
plot_birth_decades(df, "Distribution of Artists' Birth Years  After Cleaning",'Percentage of Unique Artists by Decade of Birth After cleaning')
plot_artist_ages(df,'Number of Unique Artists by Age (After Filling NaN)')

#### Active start

##### Percentage of Artists by Active Start Decade
The dataset contains 4,601 missing values in the active_start column, meaning a significant number of artists have no recorded career start date. Considering only unique artists, the distribution across decades shows that the 1990s (32%), 2000s (30%), and 2010s (32%) were the most common periods for artists to begin their careers, indicating a fairly even spread among these decades. Earlier decades like the 1980s (4%) and the 2020s (2%) have much fewer entries, likely reflecting fewer documented artists or incomplete data for those periods.

In [None]:


def plot_active_start_decades(df, title, active_col='active_start', name_col='name'):
    """
    Plot percentage of unique artists by active start decade.
    """

    # --- Keep only unique artists based on name ---
    unique_df = df.drop_duplicates(subset=[name_col])

    # --- Extract year values, drop missing ---
    years = unique_df[active_col].dropna().dt.year

    # --- Define decade bins (e.g., 1960, 1970, ..., 2020) ---
    start = int(years.min() // 10 * 10)
    end = int(years.max() // 10 * 10 + 10)
    bins = list(range(start, end + 10, 10))
    labels = [f"{b}s" for b in bins[:-1]]

    # --- Bin into decades ---
    decade_groups = pd.cut(years, bins=bins, labels=labels, right=False)

    # --- Calculate percentages per decade ---
    group_percent = decade_groups.value_counts(normalize=True).sort_index() * 100
    group_df = pd.DataFrame({'decade': group_percent.index, 'percent': group_percent.values})

    # --- Print results ---
    print(title)
    print(group_df)

    # --- Plot ---
    plt.figure(figsize=(10, 6))
    sns.barplot(data=group_df, x='decade', y='percent', hue='decade', palette='coolwarm', legend=False)

    # --- Add percentage labels ---
    for i, val in enumerate(group_df['percent']):
        plt.text(i, val + 0.5, f"{val:.1f}%", ha='center', fontsize=10)

    plt.title(title, fontsize=18, pad=15)
    plt.xlabel("Decade", fontsize=12)
    plt.ylabel("Percentage (%)", fontsize=12)
    sns.despine()
    plt.tight_layout()
    plt.show()



print(f"Number of missing values in 'active_start': {df['active_start'].isna().sum()}")
plot_active_start_decades(df,'Percentage of Unique Artists by Active Start Decade before Cleaning')


##### Ages of artists when they started their career
The distribution of ages of unique artists when they started their careers shows that most began between 17 and 22 years old, which is reasonable. However, there are outliers, such as one artist listed as starting at age 1 and another at age 10, which clearly do not make sense. These anomalous values indicate potential data errors, and we need to investigate these specific cases to determine the best way to clean or correct the dataset.

In [None]:


def plot_age_at_career_start(df, title, birth_col='birth_date', active_col='active_start', name_col='name'):
    """
    Plot the distribution of unique artists' ages when they started their career.
    Does not modify the dataset or filter any values.
    """

    # --- Keep only unique artists based on name ---
    unique_df = df.drop_duplicates(subset=[name_col])

    # --- Compute age at career start ---
    age_at_start = (unique_df[active_col].dt.year - unique_df[birth_col].dt.year).dropna().astype(int)

    # --- Count occurrences ---
    age_counts = age_at_start.value_counts().sort_index()
    print(title)
    print(age_counts)

    # --- Plot ---
    plt.figure(figsize=(10, 6))
    bars = plt.bar(age_counts.index, age_counts.values, color="#6A5ACD")

    # --- Add counts on top of each bar ---
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.5, f'{int(height)}',
                 ha='center', va='bottom', fontsize=9)

    plt.title(title, fontsize=16, color="#333")
    plt.xlabel("Age at Career Start (years)", fontsize=12)
    plt.ylabel("Number of Artists", fontsize=12)
    plt.grid(alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()


plot_age_at_career_start(df,'Age of unique Artists When They Started Their Career Before Cleaning')


##### Ckecking artist whose age was 1, 10, 13, 27, or 31 when they started their career

Among the unique artists, several had unusual ages at career start. Bigmama (age 1) and Nesli (age 10) had incorrect active_start dates, while Salmo (age 13) and Mudimbi (age 27) were correct. However, Priestess (age 31) had an obviously wrong start year. We will correct the errors by updating Bigmama’s start year to 2016, Nesli’s to 1999, and Priestess’ to a 2017, leaving Salmo and Mudimbi unchanged.

In [None]:
# Calculate age at career start without adding a column
ages = df['active_start'].dt.year - df['birth_date'].dt.year

# Filter rows where age is 1, 10, or 13
outliers = df[ages.isin([1, 10, 13,27,31])].copy()

# Keep only unique artists based on name
unique_outliers = outliers.drop_duplicates(subset=['name'])

# Print relevant information including age
print("Unique artists with age 1, 10, 13, 27, or 31 at career start:")
print(unique_outliers[['name', 'birth_date', 'active_start']].assign(age_at_start=ages))


##### Correcting wrong active start

In [None]:
# Correcting wrong active_start dates
df.loc[df['name'] == 'bigmama', 'active_start'] = pd.to_datetime('2016-01-01')
df.loc[df['name'] == 'nesli', 'active_start'] = pd.to_datetime('1999-01-01')
df.loc[df['name'] == 'priestess', 'active_start'] = pd.to_datetime('2017-01-01')

# Verify the changes
outliers_corrected = df[df['name'].isin(['bigmama', 'nesli', 'salmo','priestess'])]
print(outliers_corrected[['name', 'birth_date', 'active_start']])


##### Artists with no active start date
This code shows the number of the artist that doesn't have a active start date. They are 54 out 104.

In [None]:
# --- Filter rows where 'active_start' is missing ---
missing_active_start = df[df['active_start'].isna()]

# --- Compute earliest full release date (from year, month, day) per artist ---
release_dates = pd.to_datetime(df[['year', 'month', 'day']], errors='coerce')
album_dates = pd.to_datetime(df['album_release_date'], errors='coerce')

# Group by artist and get earliest song date and earliest album release date
earliest_dates = (
    df.assign(_release_date=release_dates, _album_date=album_dates)
      .groupby('name', as_index=False)
      .agg({'_release_date': 'min', '_album_date': 'min'})
      .rename(columns={'_release_date': 'earliest_song_date', '_album_date': 'earliest_album_date'})
)

# --- Merge with artists missing 'active_start' ---
artists_missing_active = (
    missing_active_start[['name', 'active_start', 'birth_date']]
    .drop_duplicates()
    .merge(earliest_dates, on='name', how='left')
    .sort_values(by='name')
)

# --- Print the result ---
print("Artists without 'active_start' information (with birth dates, earliest song date, and earliest album release date):")
print(artists_missing_active.to_string(index=False))

# --- Optional count ---
print(f"\nTotal number of unique artists missing 'active_start': {artists_missing_active['name'].nunique()}")


In [None]:


# --- 1. Define the Missing Dates to be Filled ---
# --- Define the researched Active Start Dates ---
# Use the earliest known year/month of official activity.


# Total number of unique artists missing 'active_start': 54

active_starts_consolidated = {
    'alfa': '01-01-2017',
    'anna pepe': '01-01-2018',
    'babaman': '01-01-1996',
    'beba': '11-01-2015',
    'brusco': '01-01-1991',
    'capo plaza': '01-01-2013',
    'chadia rodriguez': '01-01-2017',
    'clementino': '01-01-2004',
    'dargen d_amico': '01-01-1999',
    'don joe': '01-01-1999',
    'fred de palma': '01-01-2008',
    'geolier': '01-01-2018',
    'guè pequeno': '01-01-1997',
    'miss keta': '01-01-2013',
    'shiva': '01-01-2014',
    'tedua': '01-01-2013',
    'tony effe': '01-01-2014',
    'sottotono': '01-01-1994',
    'bushwaka': '01-01-2007',
    'mike24': '01-01-2013',
    'mistico': '01-01-2008',
    'skioffi': '01-01-2014',
    "caneda": "01-01-1993",
    "club dogo": "01-01-2002",
    "colle der fomento": "01-01-1994",
    "dani faiv": "01-01-2014",
    "doll kill": "01-01-2012",
    "drefgold": "01-01-2012",
    "entics": "01-01-2004",
    "eva rea": "12-18-2014",
    "hell raton": "01-01-2010",
    "hindaco": "02-21-2020",
    "jack the smoker": "01-01-2000",
    "joey funboy": "01-01-2016",
    "johnny marsiglia": "01-01-2013",
    "la pina": "01-01-1994",
    "luchè": "01-01-1997",
    "mambolosco": "02-10-2017",
    "massimo pericolo": "01-01-2016",
    "miss simpatia": "03-31-2023",
    "mistaman": "01-01-1994",
    "mondo marcio": "01-01-2003",
    "nerone": "01-01-2008",
    "niky savage": "01-01-2021",
    "o zulù": "01-01-1991",
    "papa v": "01-01-2020",
    "rondodasosa": "01-01-2020",
    "samuel heron": "01-01-2012",
    "shablo": "01-01-1999",
    "slait": "01-01-2010",
    "tony boy": "01-01-2018",
    "tormento": "01-01-1991",
    "yeиdry": "01-01-2012",
    "yung snapp": "01-01-2012",
}
print(active_starts_imputed.length())

# Convert the dictionary to a Pandas Series for efficient filling
start_date_series = pd.Series(active_starts_to_fill)

# Iterate and fill the missing data in the 'active_start' column
for artist, start_date in start_date_series.items():
    # Use .loc to find rows where 'artist_name' matches and update 'active_start'
    df.loc[
        df['artist_name'] == artist,
        'active_start'
    ] = start_date

# Ensure the 'active_start' column is a proper datetime object
df['active_start'] = pd.to_datetime(df['active_start'], errors='coerce')

print("Active start dates have been filled in the 'active_start' column.")

#### Track Year and Album release date
Looking at the distribution of values in the track year in the previous section, we notice some entries before 1950 and after 2025, which don’t make much sense. Similarly, there are album release dates after 2025 that seem unrealistic. Therefore, we will investigate these cases further to understand the cause and decide how to correct them.

The following code groups songs and albums into 20-year intervals based on their release years and visualizes the percentage distribution in each range. It first cleans and converts the year fields, then calculates how many songs or albums fall into each 20-year period.

Result:
From the plots, we can see that more than half of the songs and albums were released between 2000 and 2020, indicating that most of the data comes from the recent two decades

In [None]:
# --- Convert 'year' to numeric ---
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# --- Drop missing years and make a copy ---
df_years = df.dropna(subset=['year']).copy()

# --- Create 20-year bins ensuring last bin includes the max year ---
start = int(df_years['year'].min())
end = int(df_years['year'].max())
bins = list(range(start, end, 20)) + [end]  # ensure last bin ends exactly at max
labels = [f"{b}-{min(b+19, end)}" for b in bins[:-1]]

df_years['year_group'] = pd.cut(df_years['year'], bins=bins, labels=labels, right=False)

# --- Calculate percentage per group ---
group_percent = df_years['year_group'].value_counts(normalize=True).sort_index() * 100
group_df = pd.DataFrame({'year_group': group_percent.index, 'percent': group_percent.values})

# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(data=group_df, x='year_group', y='percent', hue='year_group', palette='viridis', legend=False)

# --- Add percentage labels ---
for i, val in enumerate(group_df['percent']):
    plt.text(i, val + 0.5, f"{val:.1f}%", ha='center', fontsize=10)

plt.title("Percentage of Songs by 20-Year Intervals", fontsize=18, pad=15)
plt.xlabel("Year Range", fontsize=12)
plt.ylabel("Percentage (%)", fontsize=12)
sns.despine()
plt.tight_layout()
plt.show()


# --- Convert 'album_release_date' to datetime and extract year ---
df['album_release_date'] = pd.to_datetime(df['album_release_date'], errors='coerce')
df['album_year'] = df['album_release_date'].dt.year

# --- Drop missing album years and make a copy ---
df_album_years = df.dropna(subset=['album_year']).copy()

# --- Create 20-year bins ensuring last bin includes the max year ---
start = int(df_album_years['album_year'].min())
end = int(df_album_years['album_year'].max())
bins = list(range(start, end, 20)) + [end]
labels = [f"{b}-{min(b+19, end)}" for b in bins[:-1]]

df_album_years['album_year_group'] = pd.cut(df_album_years['album_year'], bins=bins, labels=labels, right=False)

# --- Calculate percentage per group ---
group_percent = df_album_years['album_year_group'].value_counts(normalize=True).sort_index() * 100
group_df = pd.DataFrame({'album_year_group': group_percent.index, 'percent': group_percent.values})

# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(data=group_df, x='album_year_group', y='percent', hue='album_year_group', palette='mako', legend=False)

for i, val in enumerate(group_df['percent']):
    plt.text(i, val + 0.5, f"{val:.1f}%", ha='center', fontsize=10)

plt.title("Percentage of Songs by Album Release Year (20-Year Intervals)", fontsize=18, pad=15)
plt.xlabel("Album Release Year Range", fontsize=12)
plt.ylabel("Percentage (%)", fontsize=12)
sns.despine()
plt.tight_layout()
plt.show()

df.info()

Descriptive Statistics
The summary statistics show that the song release years range from 1900 to 2100, with an average around 2013, indicating some unrealistic future values.
For album release years, the range is 1962 to 2025, with an average around 2017, which is more reasonable and reflects that most albums were released in the last decade.

In [None]:
# For the 'year' column
print(df['year'].describe())  

# For 'album_release_date' (datetime type)
df['album_release_year'] = df['album_release_date'].dt.year
print(df['album_release_year'].describe())

Number of Songs before 1950 and after 2025 

In [None]:
tracks['year'] = pd.to_numeric(tracks['year'], errors='coerce')

# Count songs released before 1950 and after 2025
songs_before_1950 = tracks[tracks['year'] < 1950].shape[0]
songs_after_2025 = tracks[tracks['year'] > 2025].shape[0]

print(f"Number of songs before 1950: {songs_before_1950}")
# Filter songs with year > 2025
future_songs = tracks[tracks['year'] <1950 ][['full_title', 'album_release_date', 'year']]
# Display the results
display(future_songs)

print(f"Number of songs after 2025: {songs_after_2025}")
# Filter songs with year > 2025
future_songs = tracks[tracks['year'] > 2025][['full_title', 'album_release_date', 'year']]
# Display the results
display(future_songs)


In [None]:
tracks['album_release_date'] = pd.to_datetime(tracks['album_release_date'], errors='coerce')

cutoff_after = pd.to_datetime("2025-01-01")

album_release_date_after_2025 = tracks[tracks['album_release_date'] > cutoff_after].shape[0]

print(f"Number of album_release_date after 2025: {album_release_date_after_2025}")


#### Inconsistency with years

##### Checking if the active_start date is earlier than the artist’s birth_date

In [None]:
# Ensure both columns are datetime
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
df['active_start'] = pd.to_datetime(df['active_start'], errors='coerce')

# Find rows where the artist started before their birth date
invalid_dates = df[df['active_start'] < df['birth_date']]

print(f"Found {len(invalid_dates)} artists with 'active_start' earlier than 'birth_date'.")
display(invalid_dates[['id_artist', 'name_artist', 'birth_date', 'active_start']])


##### Ckecking if the career started at an unrealistically young age

In [None]:
# Ensure the columns are datetime
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
df['active_start'] = pd.to_datetime(df['active_start'], errors='coerce')

# Calculate the age at career start
df['career_start_age'] = df['active_start'].dt.year - df['birth_date'].dt.year

# Find anomalies: artists who started younger than 12
young_start_anomalies = df[df['career_start_age'] < 12]

# Display them
print("Artists with unrealistically young career start:", young_start_anomalies.shape)
print(young_start_anomalies[['birth_date', 'active_start', 'career_start_age','name_artist']])


##### Checking for tracks released (year) before the artist’s career started (active start)

In [None]:

# Find inconsistencies: track released before career start
inconsistency_release_before_career = df[df['year'] < df['active_start'].dt.year]

# Display the inconsistent rows
print('Number of records where a song was released before the artist started',inconsistency_release_before_career.shape)
display(inconsistency_release_before_career[['name_artist','full_title','year','active_start','album_release_year']])


##### Checking for tracks released before the artist’s birth

In [None]:

# Find tracks released before artist's birth
tracks_before_birth = df[df['year'] < df['birth_date'].dt.year]

# Display the anomalies
print(f"Number of tracks released before artist's birth: {len(tracks_before_birth)}")
display(tracks_before_birth[['full_title', 'year', 'birth_date','album_release_date', 'name_artist']])


##### Checking for album released before career start

In [None]:
album_before_career = df[df['album_release_date'] < df['active_start']]
print(f"Albums released before artist's career start: {len(album_before_career)}")
display(album_before_career[['full_title', 'album_release_date', 'active_start', 'name_artist']])

##### Checking for Albums released before artist's birth

In [None]:
album_before_birth = df[df['album_release_date'] < df['birth_date']]
print(f"Albums released before artist's birth: {len(album_before_birth)}")
display(album_before_birth[['full_title', 'album_release_date', 'birth_date', 'name_artist']])

##### Checking for Tracks released before album release excluding singles

In [None]:


# Tracks released before album (excluding singles)
tracks_before_album = df[
    (df['year'] < df['album_release_date'].dt.year) &
    (df['album_type'] != 'single')
]

print(f"Tracks released before the album (excluding singles): {len(tracks_before_album)}")
display(tracks_before_album[['full_title', 'year', 'album_release_date', 'album_type', 'name_artist']])


#### Popularity

In [None]:
# --- Count popularity values ---
pop_counts = (df['popularity'].astype(str)).value_counts().sort_index()  # sort index for ascending y-axis

# --- Horizontal bar plot ---
plt.figure(figsize=(10, 20))
sns.barplot(x=pop_counts.values, y=pop_counts.index,hue=pop_counts.index, palette='viridis')
plt.xlabel("Number of Songs", fontsize=12)
plt.ylabel("Popularity", fontsize=12)
plt.title("Distribution of Song Popularity", fontsize=16, pad=15)

# --- Add count labels ---
for i, val in enumerate(pop_counts.values):
    plt.text(val + 0.5, i, f"{val}", va='center', fontsize=9)

plt.tight_layout()
plt.show()


####  Artists Location Statistics


##### Checking if all the coordinates of the artists are inside italy's coordinates

In [None]:
geo_outliers = df[(df['latitude'] < 35.5) | (df['latitude'] > 47.1) |
                  (df['longitude'] < 6.6) | (df['longitude'] > 18.5)]
print(f"Number of Geographic coordinates outside Italy range: {len(geo_outliers)} records")
display(geo_outliers[['name_artist', 'latitude', 'longitude', 'birth_place']].head(10))

##### Artists' Country Values

All the countries have the value of Italia

In [None]:
# Count the occurrences of each country
country_counts = df['country'].value_counts()

# Plot
plt.figure(figsize=(10, 6))
country_counts.plot(kind='bar', color='skyblue', edgecolor='black')

plt.title('Distribution of Artists by Country', fontsize=14, pad=12)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Number of Artists', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


##### Checking if there is an artist his/her country not Italy but his/her coordinates are in Italy

In [None]:
# Filter rows where country is not Italy and coordinates are present
non_italy_with_coords = df[
    (df['country'].notna()) & 
    (df['country'] != "Italia") & 
    (df['latitude'].notna()) & 
    (df['longitude'].notna())
]

# Count the number of such records
num_records = len(non_italy_with_coords)
print(f"Number of non-Italy records with coordinates: {num_records}")

# Show the records
print(non_italy_with_coords[['country', 'latitude', 'longitude']])


##### Artists Nationality Distribution

Almost all artists are Italian (99.5%), with a small minority from Argentina (0.5%).

In [None]:

print(df['nationality'].value_counts())
# Count and calculate percentages
nat_counts = df['nationality'].value_counts()
nat_percent = (nat_counts / nat_counts.sum()) * 100
nat_df = nat_percent.reset_index()
nat_df.columns = ['nationality', 'percent']

# Plot
plt.figure(figsize=(10, 8))
sns.barplot(
    data=nat_df.head(20),  # show top 20 nationalities
    x='percent',
    y='nationality',
    hue='nationality',
    palette='crest',
    dodge=False
)

plt.title("Percentage of Artists by Nationality", fontsize=18, pad=15)
plt.xlabel("Percentage (%)", fontsize=12)
plt.ylabel("Nationality", fontsize=12)

# Add percentage labels
for index, value in enumerate(nat_df.head(20)['percent']):
    plt.text(value + 0.5, index, f"{value:.1f}%", va='center', fontsize=9, color='#000000')

plt.xlim(0, nat_df['percent'].max() + 5)
sns.despine()
plt.tight_layout()
plt.show()


##### Checking if there are artists with Non-Italian Nationality and Italian Coordinates (doubt)

There are 40 artists with a nationality other than Italian (all Argentinian) but also have italian geographic coordinates. All these 40 artists share the same coordinates (After searching for this coordinates refers to the province of Parma).

In [None]:
# Filter rows where country is not Italy and coordinates are present
non_italy_with_coords = df[
    (df['nationality'].notna()) & 
    (df['nationality'] != "Italia") & 
    (df['latitude'].notna()) & 
    (df['longitude'].notna())
]

# Count the number of such records
num_records = len(non_italy_with_coords)
print(f"Number of non-Italy Nationality records with coordinates: {num_records}")

# Show the records
print(non_italy_with_coords[['nationality','latitude', 'longitude']])


##### Nationality and Country Coherence Check  (doubt)

This check ensures that each artist’s nationality matches the country. For example, artists from Italy should have nationality Italia, and those from Argentine should have Argentina.
The results show no mismatches, meaning all records have consistent country–nationality relationships.

In [None]:

# Example mapping of country → expected nationality
country_to_nationality = {
    "Italy": "Italia",
    "Argentine": "Argentina",
}


# Function to check nationality vs country
def check_nationality_country(row):
    if pd.notna(row['country']) and pd.notna(row['nationality']):
        expected_nationality = country_to_nationality.get(row['country'])
        if expected_nationality and expected_nationality != row['nationality']:
            return True  # incoherent
    return False  # coherent or missing data

# Apply the function
df['nationality_country_mismatch'] = df.apply(check_nationality_country, axis=1)

# Count mismatches
num_mismatches = df['nationality_country_mismatch'].sum()
print(f"Number of nationality-country mismatches: {num_mismatches}")

# Show records with mismatch
mismatched_records = df[df['nationality_country_mismatch']]
print(mismatched_records[['country', 'nationality']])


##### Distribution of Artist's Birth Places

The majority of artists were born in major Italian cities, with Milano (1,843) and Roma (1,048) being the most frequent birthplaces, indicating a strong concentration of artists from these cultural and economic centers.

Smaller Italian towns such as Senigallia (443), Torino (397), and Avellino (329) also show notable representation, suggesting a widespread national distribution beyond just the biggest cities.

Only a few artists were born outside Italy — such as Buenos Aires (40) and Almería (26) — representing less than 1% of the total, which confirms that the dataset is predominantly composed of Italian-born artists.

In [None]:
# Count occurrences and calculate percentages
birth_place_counts = df['birth_place'].value_counts()
print(birth_place_counts)
birth_place_percent = (birth_place_counts / len(df)) * 100

# Plot
plt.figure(figsize=(14, 6))
bars = plt.bar(birth_place_counts.index, birth_place_counts.values, color='skyblue')

# Labels and title
plt.title('Distribution of Birth Places', fontsize=14)
plt.xlabel('Birth Place')
plt.ylabel('Count')

# Add both count and percentage labels above bars
for i, (count, percent) in enumerate(zip(birth_place_counts.values, birth_place_percent.values)):
    plt.text(i, count + 10, f"{count:,} \n({percent:.1f}%)", 
             ha='center', va='bottom', fontsize=6, color='black')

plt.xticks(rotation=45, ha='right')  # Rotate labels for readability
plt.tight_layout()
plt.show()


##### Checking Birth Place–Country Consistency (doubt)

This section verifies whether each artist’s birth place matches their country. It defines a list of known Italian cities and maps a few foreign cities to their respective countries. T The result shows the number of mismatches and lists the inconsistent records. The results show 26 mismatches, all involving artists born in Almería (Spain) but recorded with the country Italia

In [None]:
# Numerical Feature Definition

# List of key numerical columns to analyze
skewed_features = [
    'tokens_per_sent', 'avg_token_per_clause', 'duration_ms', 'stats_pageviews', 'swear_EN', 'char_per_tok', 'swear_IT', 'bpm', 'n_sentences', 'n_tokens', 'rolloff', 'zcr', 'lexical_density', 'flatness'
]

simetric_features =[
    'pitch', 'centroid', 'spectral_complexity', 'loudness', 'flux', 'rms'
]


In [None]:

# List of Italian cities from the data
italian_cities = [
    "Milano", "Roma", "Senigallia", "Torino", "Avellino", "Cagliari", "Salerno",
    "Olbia", "Napoli", "Vimercate", "Vicenza", "Verona", "Scampia", "Nicosia",
    "Sternatia", "Padova", "Grottaglie", "La Spezia", "Scafati", "Nocera Inferiore",
    "Sesto San Giovanni", "Genova", "Alpignano", "Fiumicino", "Treviso", "Bologna",
    "San Siro", "Rho", "Brescia", "Grugliasco", "Reggio Calabria", "Gallarate",
    "Desenzano del Garda", "Pieve Emanuele", "San Benedetto del Tronto", "Firenze",
    "Lodi"
]

# Map known foreign cities to their countries
foreign_cities_to_country = {
    "Singapore": "Singapore",
    "Buenos Aires": "Argentina",
    "Almería": "Spagna",
}


# Function to check birth_place vs country
def check_birth_place_country(row):
    if pd.notna(row['birth_place']) and pd.notna(row['country']):
        if row['birth_place'] in italian_cities and row['country'] != "Italia":
            return True  # mismatch
        elif row['birth_place'] in foreign_cities_to_country:
            if row['country'] != foreign_cities_to_country[row['birth_place']]:
                return True  # mismatch
    return False  # coherent or missing data

# Apply the function
df['birth_place_country_mismatch'] = df.apply(check_birth_place_country, axis=1)

# Count mismatches
num_mismatches = df['birth_place_country_mismatch'].sum()
print(f"Number of birth_place-country mismatches: {num_mismatches}")

# Show records with mismatch
mismatched_records = df[df['birth_place_country_mismatch']]
display(mismatched_records[['birth_place', 'country',]])


##### Birth Place vs Nationality Consistency Check

This step verifies that each artist’s birth place aligns with their nationality. A list of Italian cities and a mapping of known foreign cities (like Almería, Buenos Aires, and Singapore) were used for comparison.

The results show 107 mismatches, mainly involving artists born in Almería or Singapore but labeled with the nationality Italia, indicating possible errors or inconsistencies in the dataset.

In [None]:
# List of Italian cities
italian_cities = [
    "Milano", "Roma", "Senigallia", "Torino", "Avellino", "Cagliari", "Salerno",
    "Olbia", "Napoli", "Vimercate", "Vicenza", "Verona", "Scampia", "Nicosia",
    "Sternatia", "Padova", "Grottaglie", "La Spezia", "Scafati", "Nocera Inferiore",
    "Sesto San Giovanni", "Genova", "Alpignano", "Fiumicino", "Treviso", "Bologna",
    "San Siro", "Rho", "Brescia", "Grugliasco", "Reggio Calabria", "Gallarate",
    "Desenzano del Garda", "Pieve Emanuele", "San Benedetto del Tronto", "Firenze",
    "Lodi"
]


# Map special foreign cities to nationality
foreign_cities_to_nationality = {
    "Singapore": "Singapore",
    "Buenos Aires": "Argentina",
    "Almería": "Spagna",
}

# Function to check birth_place vs nationality
def check_birth_place_nationality(row):
    if pd.notna(row['birth_place']) and pd.notna(row['nationality']):
        if row['birth_place'] in italian_cities and row['nationality'] != "Italia":
            return True  # mismatch
        elif row['birth_place'] in foreign_cities_to_nationality:
            if row['nationality'] != foreign_cities_to_nationality[row['birth_place']]:
                return True  # mismatch
    return False  # coherent or missing data

# Apply the function
df['birth_place_nationality_mismatch'] = df.apply(check_birth_place_nationality, axis=1)

# Count mismatches
num_mismatches = df['birth_place_nationality_mismatch'].sum()
print(f"Number of birth_place-nationality mismatches: {num_mismatches}")

# Show records with mismatch
mismatched_records = df[df['birth_place_nationality_mismatch']]
print(mismatched_records[['name','birth_place', 'nationality','country']])


##### Distribution of Songs by Province and Region

This code calculates and visualizes the percentage distribution of songs by province and region. It counts occurrences, converts them to percentages, and displays bar charts with labeled values to show which areas have the highest song representation

In [None]:

# Count occurrences and convert to percentages
province_counts = df['province'].value_counts()
province_percent = (province_counts / province_counts.sum()) * 100
print('Provinces')
print(province_counts)

# Create a DataFrame for plotting
province_df = province_percent.reset_index()
province_df.columns = ['province', 'percent']


plt.figure(figsize=(10, 8))
sns.barplot(
    data=province_df.head(20),  # top 20 provinces if you want
    x='percent',
    y='province',
    hue='province',
    palette='viridis',
    dodge=False
)

plt.title("Percentage of Songs by Province", fontsize=20, pad=15, color="#000000")
plt.xlabel("Percentage (%)", fontsize=12)
plt.ylabel("Province", fontsize=12)

# Add percentage labels
for index, value in enumerate(province_df.head(20)['percent']):
    plt.text(value + 0.5, index, f"{value:.1f}%", va='center', fontsize=9, color='#000000')

plt.xlim(0, province_df['percent'].max() + 5)
sns.despine()
plt.tight_layout()
plt.show()



region_counts = df['region'].value_counts()
print('Regions')
print(region_counts)
region_percent = (region_counts / region_counts.sum()) * 100
region_df = region_percent.reset_index()
region_df.columns = ['region', 'percent']

plt.figure(figsize=(10, 8))
sns.barplot(
    data=region_df,
    x='percent',
    y='region',
    hue='region',
    palette='coolwarm',
    dodge=False
)

plt.title("Percentage of Songs by Region", fontsize=20, pad=15, color="#000000")
plt.xlabel("Percentage (%)", fontsize=12)
plt.ylabel("Region", fontsize=12)

# Add percentage labels
for index, value in enumerate(region_df['percent']):
    plt.text(value + 0.5, index, f"{value:.1f}%", va='center', fontsize=9, color='#000000')

plt.xlim(0, region_df['percent'].max() + 5)
sns.despine()
plt.tight_layout()
plt.show()


##### Province/Region – Country Consistency Check

This code verifies that Italian provinces and regions are correctly associated with the country "Italia"

In [None]:
# Example mapping of Italian regions to their provinces (from your data)
region_provinces = {
    "Lombardia": ["Milano", "Monza e della Brianza", "Brescia", "Varese", "Lodi"],
    "Campania": ["Salerno", "Napoli", "Avellino"],
    "Lazio": ["Roma"],
    "Veneto": ["Vicenza", "Verona", "Padova", "Treviso"],
    "Piemonte": ["Torino"],
    "Sardegna": ["Cagliari", "Gallura"],
    "Puglia": ["Lecce", "Taranto"],
    "Liguria": ["Genova", "La Spezia"],
    "Sicilia": ["Enna"],
    "Emilia-Romagna": ["Bologna"],
    "Calabria": ["Reggio Calabria"],
    "Marche": ["Ancona", "Ascoli Piceno"],
    "Toscana": ["Firenze"]
}

# Flatten all Italian provinces for quick lookup
all_italian_provinces = [prov for provs in region_provinces.values() for prov in provs]

# Function to check province/region ↔ country
def check_province_region_country(row):
    if pd.notna(row['country']):
        if pd.notna(row['province']) and row['province'] in all_italian_provinces:
            if row['country'] != "Italia":
                return True  # mismatch
        elif pd.notna(row['region']) and row['region'] in region_provinces.keys():
            if row['country'] != "Italia":
                return True  # mismatch
    return False  # coherent or missing data

# Apply the function
df['province_region_country_mismatch'] = df.apply(check_province_region_country, axis=1)

# Count mismatches
num_mismatches = df['province_region_country_mismatch'].sum()
print(f"Number of province/region-country mismatches: {num_mismatches}")

# Show records with mismatch
mismatched_records = df[df['province_region_country_mismatch']]
print(mismatched_records[['province', 'region', 'country']])


##### Province/Region vs Birth Place – Consistency Check (doubt)

This check compares each artist’s birth_place with the corresponding province and region. Mismatches occur when the province or region does not align with the birth_place. There are 2,901 mismatches between birth_place and province/region.

In [None]:
# Updated mapping of Italian regions to provinces including all birth_places in your data
region_provinces = {
    "Lombardia": ["Milano", "Vimercate", "Sesto San Giovanni", "Alpignano", "Fiumicino",
                  "Brescia", "Grugliasco", "Rho", "Gallarate", "Desenzano del Garda", "Lodi", "San Siro"],
    "Lazio": ["Roma"],
    "Piemonte": ["Torino"],
    "Campania": ["Salerno", "Napoli", "Avellino", "Scafati", "Nocera Inferiore"],
    "Veneto": ["Vicenza", "Verona", "Padova", "Treviso"],
    "Sardegna": ["Cagliari", "Olbia", "Gallura"],
    "Puglia": ["Lecce", "Taranto", "Grottaglie", "Sternatia", "San Benedetto del Tronto"],
    "Liguria": ["Genova", "La Spezia"],
    "Sicilia": ["Enna", "Nicosia"],
    "Emilia-Romagna": ["Bologna"],
    "Calabria": ["Reggio Calabria"],
    "Marche": ["Ancona", "Senigallia", "Ascoli Piceno"],
    "Toscana": ["Firenze", "Scampia", "Padova"]
}

# Flatten province → region mapping
province_to_region = {prov: reg for reg, provs in region_provinces.items() for prov in provs}

# Function to check birth_place ↔ province/region
def check_birth_place_province_region(row):
    if pd.notna(row['birth_place']):
        # Only check Italian birth_places
        if row['birth_place'] in province_to_region:
            expected_region = province_to_region[row['birth_place']]
            # Compare province and region if available
            if (pd.notna(row['province']) and row['province'] != row['birth_place']) or \
               (pd.notna(row['region']) and row['region'] != expected_region):
                return True  # mismatch
    return False  # coherent or missing data

# Apply the function
df['birth_place_province_region_mismatch'] = df.apply(check_birth_place_province_region, axis=1)

# Count mismatches
num_mismatches = df['birth_place_province_region_mismatch'].sum()
print(f"Number of birth_place-province/region mismatches: {num_mismatches}")

# Show mismatched records
mismatched_records = df[df['birth_place_province_region_mismatch']]
print(mismatched_records[['birth_place', 'province', 'region']])


##### Geographic Distribution of Artists by Province and Region

This analysis aggregates the number of artists by their latitude, longitude, province, and region. The resulting table shows the locations with the highest concentration of artists at the top. For example, Milano (Lombardia) has the most artists with 1,843, followed by Roma (Lazio) with 1,048, and Torino (Piemonte) with 397. The code also generates a map where the size and color of the points reflect the number of artists per location, providing a clear visual of artist density across Italy.

In [None]:
# Aggregate by latitude and longitude to count number of artists
location_counts = df.groupby(['latitude', 'longitude', 'region', 'province']).size().reset_index(name='num_artists')

# Sort by number of artists descending
location_counts = location_counts.sort_values(by='num_artists', ascending=False)

# Print the sorted table
print(location_counts)

# Define a color scale
color_scale = [(0, 'orange'), (1,'red')]

# Create the scatter map
fig = px.scatter_mapbox(
    location_counts,
    lat="latitude",
    lon="longitude",
    hover_data=["region", "province", "num_artists"],  # show count on hover
    size="num_artists",  # size of marker represents number of artists
    color="num_artists",  # color also shows density
    color_continuous_scale=color_scale,
    zoom=5,
    height=800,
    width=800
)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


#### Lyrics

In [None]:
import dtale

d = dtale.show(tracks, notebook=True)


##### Investigating Missing Lyrics (NaNs vs. Empty Strings)

In [None]:
# Find rows with 'lyrics' = NaN
nan_lyrics_rows = df[df['lyrics'].isna()]
print(f"Number of rows with 'lyrics' = NaN: {len(nan_lyrics_rows)}")
if not nan_lyrics_rows.empty:
    print("Examples of rows with 'lyrics' = NaN:")
    display(nan_lyrics_rows[['id', 'name_artist', 'full_title']])

# Find rows with 'lyrics' = Empty String ("")
empty_string_lyrics_rows = df[df['lyrics'].str.strip() == '']
print(f"\nNumber of rows with 'lyrics' = empty string: {len(empty_string_lyrics_rows)}")

In [None]:
df.dropna(subset=['lyrics'], inplace=True)

##### Found patterns in lyrics and delete it

In [None]:
# Find songs with suspiciously short lyrics
word_threshold = 20
short_lyrics = df[df['n_tokens'] < word_threshold]

print(f"\nTotal rows with suspiciously short lyrics (less than {word_threshold} words)")
print(f"Total number: {len(short_lyrics)}")

# Inspecting the first 20 to see if we find patterns
if not short_lyrics.empty:
    print("\nInspecting the short lyrics:")
    display(short_lyrics[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(20))

In [None]:
# Specific search for the found patterns
pattern = 'Contributors|Contributor|Lyrics'

pattern_matches = df[df['lyrics'].str.contains(pattern, case=False, na=False)]

print(f"\nRows containing 'junk' words")
print(f"Total number: {len(pattern_matches)}")
if not pattern_matches.empty:
    display(pattern_matches[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(23))

In [None]:
for index, row in pattern_matches.iterrows():

    print(f"\n==============================================")
    print(f"INDEX (original): {index}")

    # Print the title for context, if present
    if 'full_title' in row:
        print(f"TITLE: {row['full_title']}")

    if 'n_tokens' in row:
        print(f"Tokens: {row['n_tokens']}")

    print(f"----------------------------------------------")

    full_text = row['lyrics']
    print(full_text)

print(f"\n==============================================")
print(f"End. Displayed {len(pattern_matches)} lyrics.")

In [None]:
# maximum tokens for junk lyrics: 36
# minimum tokens for actual lyrics: 108

token_threshold = 60  # Safe threshold based on your analysis

# Find the indices of rows that meet BOTH conditions
indices_to_drop = df[
    (df['lyrics'].str.contains(pattern, case=False, na=False)) &
    (df['n_tokens'] < token_threshold)
    ].index

print(f"DataFrame shape BEFORE dropping: {df.shape}")
print(f"Number of 'junk AND short' rows (< {token_threshold} tokens) to drop: {len(indices_to_drop)}")

# Drop the rows
if len(indices_to_drop) > 0:
    df.drop(indices_to_drop, inplace=True)
    print(f"DataFrame shape AFTER dropping: {df.shape}")
else:
    print("No rows matched the criteria, no deletion performed.")

In [None]:
# Specific search for "junk" words
contributor_pattern = 'Contributors|Contributor'
contributor_matches = df[df['lyrics'].str.contains(contributor_pattern, case=False, na=False)]

print(f"\nRows containing Contributors words")
print(f"Total number: {len(contributor_matches)}")
if not contributor_matches.empty:
    display(contributor_matches[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(23))

In [None]:
# Remove the found pattern from the beginning of the lyrics

indices_to_clean = df[df['lyrics'].str.contains(contributor_pattern, case=False, na=False)].index

print(f"Number of rows to clean: {len(indices_to_clean)}")

# Define the regex for the header: Number + "Contributor" (or "Contributors") + "Lyrics"
header_regex = r"^\s*\d+\s+Contributor(s)?.*?\s+Lyrics\s*"

# replaces the found pattern with ""
df.loc[indices_to_clean, 'lyrics'] = df.loc[indices_to_clean, 'lyrics'].str.replace(
    header_regex, '', regex=True, flags=re.IGNORECASE
).str.strip()

print("\nStart headers removed from the identified rows.")

print("\nVerifying the cleaning (first 5 modified lyrics):")
for index in indices_to_clean[:5]:
    if index in df.index:
        print("==============================================")
        print(f"INDEX: {index}")
        print(f"CLEANED TEXT (preview):\n'{str(df.loc[index, 'lyrics'])[:200]}...'")

In [None]:
# Specific search for the word "Lyrics"
lyrics_pattern = 'Lyrics'
lyrics_matches = df[df['lyrics'].str.contains(lyrics_pattern, case=False, na=False)]

print(f"\nRows containing lyrics words")
print(f"Total number: {len(lyrics_matches)}")
if not lyrics_matches.empty:
    display(lyrics_matches[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(23))

In [None]:
# Specific search for the word "Coming soon"
coming_soon_pattern = 'COMING SOON'
coming_soon_matches = df[df['lyrics'].str.contains(coming_soon_pattern, case=False, na=False)]

print(f"\nRows containing coming soon words")
print(f"Numero totale: {len(coming_soon_matches)}")
if not coming_soon_matches.empty:
    display(coming_soon_matches[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(20))

In [None]:
for index, riga in coming_soon_matches.iterrows():

    print(f"\n==============================================")
    print(f"INDEX (original): {index}")

    # Print the title for context, if present
    if 'full_title' in row:
        print(f"TITLE: {row['full_title']}")

    if 'n_tokens' in row:
        print(f"Tokens: {row['n_tokens']}")

    print(f"----------------------------------------------")

    full_text = row['lyrics']
    print(full_text)

print(f"\n==============================================")
print(f"End. Displayed {len(pattern_matches)}  lyrics.")

##### Duplicate lyrics (different versions)

In [None]:
# Find rows with duplicate lyrics
duplicate_lyrics = df[df.duplicated(subset=['lyrics'], keep=False)]

# Sort the duplicate rows by lyrics for easier comparison
duplicate_lyrics_sorted = duplicate_lyrics.sort_values(by='lyrics')

print(f"--- Rows with Duplicate Lyrics ('lyrics') ---")
print(f"Total number of rows involved: {len(duplicate_lyrics_sorted)}")

if not duplicate_lyrics_sorted.empty:
    print("\nExamples of duplicate lyrics:")

    display(duplicate_lyrics_sorted[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']].head(20))

##### Intro songs

In [None]:
# Specific search for the word "Intro"
intro_pattern = 'Intro'
intro_songs = df[df['full_title'].str.contains(intro_pattern, case=False, na=False)]

print(f"\nRows containing intro words")
print(f"Total number: {len(intro_songs)}")
if not intro_songs.empty:
    display(intro_songs[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'duration_ms', 'album_type']].head(20))

In [None]:
# Sort the very short songs by n_tokens (from lowest to highest)
short_songs = intro_songs.sort_values(by='n_tokens', ascending=True)

print("Inspecting the short songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in intro_songs.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"Tokens: {row['n_tokens']}")
    print(f"Duration (ms): {row['duration_ms']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}...'")

In [None]:
# Analyze the 'n_tokens' statistics for this group
print("\n'n_tokens' statistics for 'Intro' songs:")
print(intro_songs['n_tokens'].describe())

plt.figure(figsize=(12, 5))
sns.histplot(intro_songs['n_tokens'], bins=50, kde=True)
plt.title("Distribution of 'n_tokens' ONLY for 'Intro' songs")
plt.xlabel("Number of Tokens (words)")
plt.show()

We could remove the intros that are very short. The long ones are the lyrics for the entire song.

#### n_token

In [None]:
# Find rows where 'n_tokens' is NaN
nan_token_rows = df[df['n_tokens'].isna()]

print(f"Number of rows with 'n_tokens' = NaN: {len(nan_token_rows)}")
if not nan_token_rows.empty:
    print("Examples of rows with 'n_token' = NaN:")
    display(nan_token_rows[['id', 'name_artist', 'full_title', 'lyrics']])

# Find rows where 'n_tokens' is 0
testi_zero_token = df[df['n_tokens'] <= 0]

print(f"Righe con n_tokens <= 0 ")
print(f"Numero totale: {len(testi_zero_token)}")


In [None]:
print(df['n_tokens'].describe())

print("\nDisplaying Box Plot to identify outliers...")
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['n_tokens'])
plt.title('Box Plot of n_tokens (to find high and low outliers)')
plt.xlabel('Number of Tokens')
plt.show()

In [None]:
# Statistical Analysis: IQR Method ('n_tokens' is skewed)
col = 'n_tokens'

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
low_outliers = df[df[col] < lower_bound]
high_outliers = df[df[col] > upper_bound]

print(f"Variable '{col}':")
print(f"  IQR Limits: [{lower_bound:.2f}, {upper_bound:.2f}]")
print(f"  Found {len(low_outliers)} outliers below the limit.")
print(f"  Found {len(high_outliers)} outliers above the limit.")

In [None]:
# Find songs with short lyrics (this limit is taken from the lower IQR bound)
word_threshold = 17
short_lyrics = df[df['n_tokens'] < word_threshold]

print(f"\nTotal rows with short lyrics (less than {word_threshold} words)")
print(f"Total number: {len(short_lyrics)}")

if not short_lyrics.empty:
    print("\nInspecting the short lyrics:")
    display(short_lyrics[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'duration_ms']].head(20))

In [None]:
# Sort the very short songs by n_tokens (from lowest to highest)
short_songs = short_lyrics.sort_values(by='n_tokens', ascending=True)

print("Inspecting the short songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in short_songs.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"Tokens: {row['n_tokens']}")
    print(f"Duration (ms): {row['duration_ms']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}...'")

They could be removed since they are very short and the duration in ms is too high.

In [None]:
# long songs by n_tokens (from lowest to highest)
high_token_threshold = 977

very_long_songs = df[(df['n_tokens'] > high_token_threshold) & (df['n_tokens'].notna())]

print(f"Songs with n_tokens > {high_token_threshold} (High Outliers)")
print(f"Total number: {len(very_long_songs)}")

if not very_long_songs.empty:
    print("\nExamples of very long songs:")

    display(very_long_songs[['id', 'full_title', 'lyrics', 'n_tokens', 'duration_ms']].head(20))

In [None]:
# Filter the 159 very long songs and look for those with a very short duration
low_duration_threshold = 60000 # 60 seconds
duration_anomalies = very_long_songs[very_long_songs['duration_ms'] < low_duration_threshold]

print(f"Songs with n_tokens > 977 BUT duration_ms < {low_duration_threshold/1000} seconds ")
print(f"Number found: {len(duration_anomalies)}")
if not duration_anomalies.empty:
    display(duration_anomalies[['id', 'name_artist', 'full_title', 'n_tokens', 'duration_ms']])

In [None]:
# Filter the 159 very long songs
# Look for those with very low lexical density
low_density_threshold = 0.1
density_anomalies = very_long_songs[very_long_songs['lexical_density'] < low_density_threshold]

print(f"\nSongs with n_tokens > 977 BUT lexical_density < {low_density_threshold}")
print(f"Number found: {len(density_anomalies)}")
if not density_anomalies.empty:
    display(density_anomalies[['id', 'name_artist', 'full_title', 'n_tokens', 'lexical_density', 'lyrics']].head())

In [None]:
print("\nRandom inspection of long songs (near the threshold)")
display(very_long_songs.sort_values('n_tokens').head(5)[['id', 'name_artist', 'full_title', 'n_tokens', 'lyrics']])

#### char_per_tok

In [None]:
# Find rows where 'char_per_tok' is NaN
nan_char_per_tok_rows = df[df['char_per_tok'].isna()]

print(f"Number of rows with 'char_per_tokens' = NaN: {len(nan_char_per_tok_rows)}")
if not nan_char_per_tok_rows.empty:
    print("Examples of rows with 'char_per_token' = NaN:")
    display(nan_char_per_tok_rows[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens']])

In [None]:
print("Plotting distribution for 'char_per_tok'...")

plt.figure(figsize=(14, 5))

# 1. Histogram
plt.subplot(1, 2, 1) # 1 row, 2 columns, 1st plot
sns.histplot(df['char_per_tok'].dropna(), kde=True, bins=30)
plt.title('Histogram of char_per_tok')
plt.xlabel('Average Characters per Token')
plt.ylabel('Frequency')

# 2. Box Plot
plt.subplot(1, 2, 2) # 1 row, 2 columns, 2nd plot
sns.boxplot(x=df['char_per_tok'].dropna())
plt.title('Box Plot of char_per_tok')
plt.xlabel('Average Characters per Token')

plt.tight_layout()
plt.show()

print("\n--- Descriptive Statistics for char_per_tok ---")
print(df['char_per_tok'].describe())

The descriptive statistics show mean=4.02, std=0.28, min=2.0, max=12.0. The mean and median (50%=4.01) are almost identical.

Meaning: The distribution is highly concentrated around an average word length of 4 characters, with low variability (low standard deviation). The histogram confirms a shape that is very close to a normal distribution (bell curve).

In [None]:
# Statistical Analysis: IQR Method (''char_per_tok' is skewed)
col = 'char_per_tok'

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
low_outliers = df[df[col] < lower_bound]
high_outliers = df[df[col] > upper_bound]

print(f"Variable '{col}':")
print(f"  IQR Limits: [{lower_bound:.2f}, {upper_bound:.2f}]")
print(f"  Found {len(low_outliers)} outliers below the limit.")
print(f"  Found {len(high_outliers)} outliers above the limit.")

In [None]:
# Set the threshold for average characters per token
char_tok_threshold = 3
low_char_tok_rows = df[df['char_per_tok'] < char_tok_threshold]

print(f"\nTotal rows with few characters per token (less than {char_tok_threshold} characters)")
print(f"Total number: {len(low_char_tok_rows)}")

if not low_char_tok_rows.empty:
    print("\nInspecting lyrics with low 'char_per_tok':")
    display(low_char_tok_rows[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'char_per_tok']].head(20))

In [None]:
# Sort the very short songs by n_tokens (from lowest to highest)
short_songs = low_char_tok_rows.sort_values(by='char_per_tok', ascending=True)

print("Inspecting the short songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in low_char_tok_rows.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"TOKENS: {row['n_tokens']}")
    print(f"CHARACTER PER TOKEN: {row['char_per_tok']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}'")

In [None]:
# Set the threshold for average characters per token
char_tok_threshold = 4.61
high_char_tok_rows = df[df['char_per_tok'] > char_tok_threshold]

print(f"\nTotal rows with characters per token (more than {char_tok_threshold} characters)")
print(f"Total number: {len(high_char_tok_rows)}")

if not high_char_tok_rows.empty:
    print("\nInspecting lyrics with low 'char_per_tok':")
    display(high_char_tok_rows[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'duration_ms']].head(20))

In [None]:
# Sort the very short songs by n_tokens (from lowest to highest)
short_songs = high_char_tok_rows.sort_values(by='char_per_tok', ascending=True)

print("Inspecting the short songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in low_char_tok_rows.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"TOKENS: {row['n_tokens']}")
    print(f"CHARACTER PER TOKEN: {row['char_per_tok']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}'")

Limits: The IQR method identified outliers below 3.42 and above 4.61.

High Outliers (> 4.61):

- Result: You found 221 high outliers. The maximum value is 12.0.

- Meaning: These represent songs using words that are, on average, longer than usual (more than 4.6 characters). The max value of 12.0 is high but plausible (think of technical terms, complex vocabulary, or languages with naturally longer words). These 221 outliers likely represent valid data reflecting specific lyrical styles.

Low Outliers (< 3.42):

- Result: You found 121 low outliers. The minimum value is 2.0. You specifically inspected 11 rows with char_per_tok < 3.

- Meaning: These represent songs with, on average, very short words. Your inspection of values < 3 confirms this:
Some have very few tokens (Indices 3473, 3586), making a low average plausible.
Some contain mainly interjections or simple words (Index 3999: "Ah ah"). Plausible.
Some are very repetitive (Index 10316: "Mangio al Mc"). Plausible.
Others seem like normal songs. A low char_per_tok might indicate a very colloquial, simple style, or many monosyllabic words. Seems plausible.

#### n_sentences

In [None]:
# Find rows where 'n_sentences' is NaN
nan_n_sentences_rows = df[df['n_sentences'].isna()]

print(f"Number of rows with 'n_sentences' = NaN: {len(nan_n_sentences_rows)}")
if not nan_n_sentences_rows.empty:
    print("Examples of rows with 'n_sentences' = NaN:")
    display(nan_n_sentences_rows[['id', 'name_artist', 'full_title', 'lyrics']])

In [None]:
print("Plotting distribution for 'n_sentences'")

plt.figure(figsize=(14, 5))

# 1. Histogram
plt.subplot(1, 2, 1) # 1 row, 2 columns, 1st plot
sns.histplot(df['n_sentences'].dropna(), kde=True, bins=30)
plt.title('Histogram of n_sentences')
plt.xlabel('Average Characters per Token')
plt.ylabel('Frequency')

# 2. Box Plot
plt.subplot(1, 2, 2) # 1 row, 2 columns, 2nd plot
sns.boxplot(x=df['n_sentences'].dropna())
plt.title('Box Plot of n_sentences')
plt.xlabel('Average Characters per Token')

plt.tight_layout()
plt.show()

print("\n--- Descriptive Statistics for n_sentences ---")
print(df['n_sentences'].describe())

In [None]:
# Statistical Analysis: IQR Method
col = 'n_sentences'

Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
low_outliers = df[df[col] < lower_bound]
high_outliers = df[df[col] > upper_bound]

print(f"Variable '{col}':")
print(f"  IQR Limits: [{lower_bound:.2f}, {upper_bound:.2f}]")
print(f"  Found {len(low_outliers)} outliers below the limit.")
print(f"  Found {len(high_outliers)} outliers above the limit.")

In [None]:
# Set the threshold for the number of sentence
n_sentences_threshold = 5
low_n_sentences_rows = df[df['n_sentences'] <= n_sentences_threshold]

print(f"\nTotal rows with few number of sentence (less than {n_sentences_threshold} sentence)")
print(f"Total number: {len(low_n_sentences_rows)}")

if not low_n_sentences_rows.empty:
    print("\nInspecting lyrics with low 'n_sentences':")
    display(low_n_sentences_rows[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'n_sentences']].head(20))

In [None]:
# Sort the very short songs by n_tokens (from lowest to highest)
short_songs = low_n_sentences_rows.sort_values(by='n_sentences', ascending=True)

print("Inspecting the short songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in low_n_sentences_rows.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"TOKENS: {row['n_tokens']}")
    print(f"CHARACTER PER TOKEN: {row['char_per_tok']}")
    print(f"NUMBER PER SENTENCE: {row['n_sentences']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}'")

In [None]:
# Set the threshold for average characters per token
n_sentences_threshold = 113
high_n_sentences_rows = df[df['n_sentences'] > n_sentences_threshold]

print(f"\nTotal rows with characters per token (more than {n_sentences_threshold} characters)")
print(f"Total number: {len(high_n_sentences_rows)}")

if not high_n_sentences_rows.empty:
    print("\nInspecting lyrics with low 'n_sentences':")
    display(high_n_sentences_rows[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'duration_ms']].head(20))

In [None]:
short_songs = high_n_sentences_rows.sort_values(by='n_sentences', ascending=True)

print("Inspecting the long songs")

# Print the full text of the first 5 (Note: .iterrows() will iterate all, not just 5)
for index, row in high_n_sentences_rows.iterrows():
    print("\n==============================================")
    print(f"INDEX (original): {index}")
    print(f"ARTIST: {row['name_artist']}")
    print(f"TITLE: {row['full_title']}")
    print(f"TOKENS: {row['n_tokens']}")
    print(f"CHARACTER PER TOKEN: {row['char_per_tok']}")
    print(f"NUMBER PER SENTENCE: {row['n_sentences']}")
    print("----------------------------------------------")
    print(f"LYRICS:\n'{str(row['lyrics'])}'")

1.  **Missing Values (NaN):**
    * **Result:** You have confirmed there are **73 NaN**
    * **Meaning:** This is **not** actual missing data where lyrics are absent. It's a **processing error** where the sentence calculation failed or was skipped for these specific tracks, even though the `lyrics` text exists.
    * **Action (Data Cleaning):** These 73 NaN values will need to be **recalculated** during the data cleaning phase by analyzing the corresponding `lyrics` content.

2.  **Distribution and Statistics:**
    * **Result:** The descriptive statistics show `mean=60.0`, `median (50%)=59.0`, `min=1.0`, `max=437.0`. The mean and median are very close, suggesting a relatively centered distribution for the bulk of the data.
    * **Distribution Shape:** Your skewness analysis (`skewness = 1.46`, Positive) indicates a slight positive skew (right tail). The histogram you provided visually confirms this: the distribution is largely unimodal and somewhat bell-shaped but stretches out towards higher sentence counts.
    * **Meaning:** Most songs contain between 46 (`25%`) and 73 (`75%`) sentences. The slight right skew is expected, as some songs are naturally much longer or structurally more fragmented (e.g., storytelling, cyphers) than the average track.

3.  **Outliers (IQR Method):**
    * **Limits:** The IQR method defined outliers as values below `5.50` and above `113.50`.
    * **High Outliers (> 113.50):**
        * **Result:** You found **205 high outliers**. The maximum value is 437.
        * **Meaning:** These represent songs with a very large number of detected sentences. Considering that you also found valid songs with very high `n_tokens` (up to ~3000), having a high sentence count (up to 437) is **plausible** for these longer tracks. High sentence counts might also arise from texts with very short sentences or many line breaks interpreted as sentence endings. These outliers likely represent **valid data** reflecting longer or structurally distinct songs (like spoken word intros, storytelling tracks).
    * **Low Outliers (< 5.50):**
        * **Result:** You found **56 low outliers**. The minimum value is 1.0.
        * **Meaning:** These represent songs identified as having very few sentences (1 to 5). This strongly correlates with the findings for `n_tokens` (where 18 outliers had `< 17` tokens). These low counts are **highly plausible** for very short texts such as skits, brief intros/outros, or tracks that are primarily instrumental with minimal lyrics. Your inspection of lyrics for `n_tokens < 17` likely included many of these cases (e.g., indices 758, 2671, 3172 from your previous inspection had few sentences). These low outliers appear to be **valid data** representing short lyrical content.

* **Data Quality Interaction:** Extremely low `n_sentences` values (like 1) combined with high `n_tokens` are **indicators of poor punctuation or formatting in the source `lyrics` data**, which also caused the unrealistic high outliers in `tokens_per_sent`. These specific low `n_sentences` values, while artifacts, correctly reflect how the text was likely processed.


#### tokens_per_sent

In [None]:
# Find rows where 'n_sentences' is NaN
nan_tokens_per_sent_rows = df[df['tokens_per_sent'].isna()]

print(f"Number of rows with 'tokens_per_sent' = NaN: {len(nan_tokens_per_sent_rows)}")
if not nan_tokens_per_sent_rows.empty:
    print("Examples of rows with 'tokens_per_sent' = NaN:")
    display(nan_tokens_per_sent_rows[['id', 'name_artist', 'full_title', 'lyrics']])

#### avg_token_per_clause

#### lexical_density

In [None]:
print("Verifying if NaN locations match across lyrical features...")

# 1. Get indices where n_tokens is NaN
nan_indices_ntokens = df[df['n_tokens'].isna()].index

# 2. Define the other columns to check
other_lyrical_cols = ['tokens_per_sent', 'char_per_tok', 'lexical_density', 'avg_token_per_clause']

# 3. Check each column against n_tokens
all_match = True
for col in other_lyrical_cols:
    nan_indices_col = df[df[col].isna()].index

    # Compare the index sets (convert to sets for easy comparison)
    if set(nan_indices_ntokens) != set(nan_indices_col):
        all_match = False
        print(f"\nMismatch found for column: '{col}'")
        # Optional: Find which indices differ
        diff1 = set(nan_indices_ntokens) - set(nan_indices_col)
        diff2 = set(nan_indices_col) - set(nan_indices_ntokens)
        if diff1:
            print(f"  Indices NaN in 'n_tokens' but not in '{col}': {list(diff1)}")
        if diff2:
            print(f"  Indices NaN in '{col}' but not in 'n_tokens': {list(diff2)}")
    else:
        print(f"  NaN indices match for 'n_tokens' and '{col}'. Count: {len(nan_indices_col)}")


# 4. Final confirmation
if all_match:
    print("\nConfirmation: The NaN values occur in exactly the same rows for 'n_tokens' and the other derived lyrical features.")
    print(f"Total number of rows with NaNs in these columns: {len(nan_indices_ntokens)}")
else:
    print("\nWarning: There is a mismatch in the location of NaN values between 'n_tokens' and at least one other derived lyrical feature.")

#### swear_IT and swear_IT_words

#### swear_EN and swear_EN_words

####  Feature lyriche

In [None]:
# --- 1. Investiga tokens_per_sent > 12.58 (specifically the highest values) ---
soglia_alta_tps = 12.58
outlier_tps_alti = df[df['tokens_per_sent'] > soglia_alta_tps].sort_values('tokens_per_sent', ascending=False)

print(f"--- Righe con tokens_per_sent > {soglia_alta_tps} (Valori più alti) ---")
print(f"Numero totale outlier alti: {len(outlier_tps_alti)}")
if not outlier_tps_alti.empty:
    print("\nEsempi con tokens_per_sent più alto:")
    display(outlier_tps_alti[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'tokens_per_sent']].head(20))

In [None]:
for index, riga in outlier_tps_alti.iterrows():

    # Aggiungiamo dei separatori per leggibilità
    print(f"\n==============================================")
    print(f"INDICE (originale): {index}")

    # Stampiamo anche il titolo per contesto, se presente
    if 'full_title' in riga:
        print(f"TITOLO: {riga['full_title']}")

    if 'n_tokens' in riga:
        print(f"Token: {riga['n_tokens']}")

    print(f"----------------------------------------------")

    testo_completo = riga['lyrics']
    print(testo_completo)

print(f"\n==============================================")
print(f"Fine. Visualizzati {len(outlier_tps_alti)} outlier_tps_alti'.")

In [None]:
# --- 2. Investiga avg_token_per_clause > 11.44 (specifically the highest values) ---
soglia_alta_atpc = 11.44
outlier_atpc_alti = df[df['avg_token_per_clause'] > soglia_alta_atpc].sort_values('avg_token_per_clause', ascending=False)

print(f"\n--- Righe con avg_token_per_clause > {soglia_alta_atpc} (Valori più alti) ---")
print(f"Numero totale outlier alti: {len(outlier_atpc_alti)}")
if not outlier_atpc_alti.empty:
    print("\nTop 5 esempi con avg_token_per_clause più alto:")
    display(outlier_atpc_alti[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'avg_token_per_clause']].head())

In [None]:
# --- 3. Investiga avg_token_per_clause == 0 ---
zero_atpc = df[df['avg_token_per_clause'] == 0]

print(f"\n--- Righe con avg_token_per_clause == 0 ---")
print(f"Numero totale: {len(zero_atpc)}")
if not zero_atpc.empty:
    print("\nEsempi di righe con avg_token_per_clause == 0:")
    display(zero_atpc[['id', 'name_artist', 'full_title', 'lyrics', 'n_tokens', 'avg_token_per_clause']].head())

