In [None]:
%matplotlib inline

import pandas as pd
# Notebook-wide pandas display settings, so inspected frames are never truncated.

# No limit on the number of rows printed
pd.options.display.max_rows = None

# No limit on the number of columns printed
pd.options.display.max_columns = None

# Print the full content of every cell instead of an ellipsis
pd.options.display.max_colwidth = None

# Keep wide frames on a single line instead of wrapping them
pd.options.display.expand_frame_repr = False

import numpy as np
import altair as alt
import re
import os
import matplotlib.pyplot as plt
from langdetect import detect, DetectorFactory
from itertools import cycle, islice
import dtale

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import project_fuctions as functions
DetectorFactory.seed = 42

alt.data_transformers.disable_max_rows()


## Reading data

In [None]:
# Locations of the two input CSV files, relative to the notebook.
# Built with os.path.join so the separator matches the running platform: a
# hard-coded backslash is only a path separator on Windows, so the files would
# not be found on Linux/macOS.
artists_path = os.path.join('data', 'artists.csv')
tracks_path = os.path.join('data', 'tracks.csv')

This code automatically detects the correct separator for the two dataset files (tracks and artists) by checking which character — comma, semicolon, or tab — appears most often in the first line. It then loads each file into a pandas DataFrame using the detected separator, prints their shapes, and displays the first few rows.

 The tracks dataset has 11,166 rows and 45 columns, while the artists dataset has 104 rows and 14 columns.

In [None]:
# Helper that guesses which delimiter a CSV file uses
def detect_separator(filepath):
    """Return the candidate delimiter that occurs most often in the first line.

    The candidates are comma, semicolon and tab, checked in that order; on a
    tie the earliest candidate wins. Only the first line of ``filepath``
    (opened as UTF-8 text) is read. The chosen delimiter is also printed.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        header = handle.readline()
    # Count the occurrences of each candidate on the header line
    counts = {candidate: header.count(candidate) for candidate in (',', ';', '\t')}
    winner = max(counts, key=counts.get)
    print(f"Detected separator for {filepath}: '{winner}'")
    return winner

# Detect the delimiter of each file before parsing it
sep_tracks = detect_separator(tracks_path)
sep_artists = detect_separator(artists_path)

print('------------------------------------')

# Load both datasets with the delimiter that was detected for each of them
tracks = pd.read_csv(tracks_path, engine='python', encoding='utf-8', sep=sep_tracks)
artists = pd.read_csv(artists_path, engine='python', encoding='utf-8', sep=sep_artists)

# Left join: every track is kept and receives the columns of its artist.
# Artist columns whose name clashes with a track column get an "_artist" suffix.
df = pd.merge(
    tracks,
    artists,
    how="left",
    left_on="id_artist",
    right_on="id_author",
    suffixes=("", "_artist"),
)

# Print the shapes as a quick sanity check
print(f"Tracks shape: {tracks.shape}")
print(f"Artists shape: {artists.shape}")
print("Shape df (merged):", df.shape)


def _preview(label, frame, rows=3):
    """Print a separator line and a label, then display the first rows of ``frame``."""
    print('------------------------------------')
    print(label)
    display(frame.head(rows))


_preview('TRACKS', tracks)
_preview('ARTISTS', artists)
_preview('MERGERD', df)

## DATA CLEANING

### Fixing duplicates

#### Fixing Duplicated Tracks Id
After reviewing the songs associated with the duplicated IDs, we found that each duplicated ID corresponds to different songs, except for one case that will be treated later. Therefore, the most reasonable solution is to modify the duplicated IDs by appending the row number to each one. This approach ensures that all songs are preserved while maintaining unique identifiers for every track.

In [None]:
# Flag every row whose track ID is shared with at least one other row
duplicate_mask = tracks.duplicated(subset='id', keep=False)

# Make the shared IDs unique by appending the row label: "<id>_<row label>".
# Built with vectorised string operations so the cell also works when there
# are no duplicates (a row-wise apply on an empty selection returns an empty
# DataFrame, which cannot be assigned to a single column).
duplicated_ids = tracks.loc[duplicate_mask, 'id'].astype(str)
row_labels = pd.Series(duplicated_ids.index.astype(str), index=duplicated_ids.index)
tracks.loc[duplicate_mask, 'id'] = duplicated_ids + '_' + row_labels

# `df` was merged from `tracks` BEFORE this fix, so it still holds the
# duplicated IDs. A left merge keeps the left rows in their original order, so
# when the row counts match the rows correspond one-to-one by position and the
# corrected IDs can be copied across.
if len(df) == len(tracks):
    df['id'] = tracks['id'].to_numpy()
else:
    print("WARNING: 'df' and 'tracks' have different lengths; "
          "the corrected IDs were NOT propagated to 'df'.")

print("Example of updated duplicates:")
display(tracks.loc[duplicate_mask, ['id', 'full_title']])


##### Duplicated Tracks based on title

#### Fixing duplicate columns

In this section, we remove all columns that store the same information or redundant representations of it.

##### Primary Artists

In [None]:
# Drop the redundant `primary_artist` column
df = df.drop(columns='primary_artist')


##### Full title
We keep only `title` and drop `full_title`.

In [None]:
# `title` is kept; the redundant `full_title` column is removed
df = df.drop(columns='full_title')

##### Name of the artists
`name_artist` and `name` both represent the same entity, but `name` is formatted more accurately. We therefore retain the `name` column and drop `name_artist` for clarity and consistency.

In [None]:
# `name` is kept; the redundant `name_artist` column is removed
df = df.drop(columns='name_artist')

##### Album
Since the `album` column is more reliable than `album_name`, we can remove both of the redundant columns (`album_name` and `id_album`). We also remove `album_image` because it is not useful for the analysis.

In [None]:
# Remove the album columns that are not needed downstream, in a single call
df = df.drop(columns=['album_name', 'id_album', 'album_image'])

#### Fixing duplicate tracks based on lyrics
As previously identified, there are 9 duplicate rows based on identical lyrics. To maintain data integrity, we will remove these duplicates from the dataset. This step is crucial to prevent skewing any analysis or derived features that rely on the lyrics content.

In [None]:
# Remember the starting size so the number of removed rows can be reported
original_row_count = df.shape[0]
print(f"Original DataFrame shape: {df.shape}")

# Split the rows by whether they have lyrics; only rows with lyrics are compared
non_nan_mask = ~df['lyrics'].isna()
df_nan_lyrics = df.loc[~non_nan_mask]
df_non_nan_lyrics = df.loc[non_nan_mask]

print(f"  > Found {len(df_nan_lyrics)} rows with NaN lyrics (keeping all for now).")
print(f"  > Found {len(df_non_nan_lyrics)} rows with non-NaN lyrics (checking for duplicates).")

# A row is a duplicate when an EARLIER row has the same lyrics and album_type
duplicate_mask = df_non_nan_lyrics.duplicated(subset=['lyrics', 'album_type'], keep='first')
indices_to_drop = df_non_nan_lyrics.index[duplicate_mask.to_numpy()]

print(f"  > Found {len(indices_to_drop)} true logical duplicates (same lyrics AND album_type) to remove.")

df.drop(index=indices_to_drop, inplace=True)

cleaned_row_count = df.shape[0]
rows_removed_total = original_row_count - cleaned_row_count

print(f"\nDataFrame shape after dropping logical duplicates: {df.shape}")
print(f"Total rows removed: {rows_removed_total}")

# Rows that share lyrics but differ in album_type are deliberately kept, so
# fewer rows are removed here than a lyrics-only comparison would remove.
print(f"SUCCESS: Removed {rows_removed_total} rows. Duplicates with different 'album_type' were kept.")



### Fixing DataTypes

In [None]:
# Parse the artist date columns; values that cannot be parsed become NaT
for col in ('birth_date', 'active_start'):
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Before converting album_release_date, list the values that are present
# but cannot be parsed as dates
date_cols = ['album_release_date']
for col in date_cols:
    original = df[col].copy()
    converted = pd.to_datetime(original, errors='coerce')
    lost_in_conversion = original.notna() & converted.isna()
    non_convertible = original[lost_in_conversion]

    print(f"\nColumn '{col}'  entries that cannot be converted to datetime:")
    if non_convertible.empty:
        print("All non-missing entries can be converted to datetime.")
    else:
        for idx, val in non_convertible.items():
            print(f"Row {idx}: {val}")
    print('----------------------------------------------------------------')
    
# Converting to DateTime
def fix_year_only_dates(val):
    """Pad a partial release date so it can be parsed as a full date.

    - missing values (NaN/None/NaT) are returned unchanged;
    - a 4-digit year ``YYYY`` becomes ``'YYYY-01-01'``;
    - a year-month ``YYYY-MM`` becomes ``'YYYY-MM-01'``;
    - anything else is returned as a stripped string.

    Padding year-month values matters because ``pd.to_datetime`` may infer a
    single format from the first value of a column; with ``errors='coerce'``
    a shorter ``YYYY-MM`` string can then silently turn into NaT.
    """
    if pd.isna(val):
        return val
    val_str = str(val).strip()
    if re.fullmatch(r'\d{4}', val_str):
        return f"{val_str}-01-01"
    if re.fullmatch(r'\d{4}-\d{2}', val_str):
        return f"{val_str}-01"
    return val_str

# Normalise the release dates with fix_year_only_dates, then parse them;
# anything that still cannot be parsed becomes NaT
df['album_release_date'] = pd.to_datetime(
    df['album_release_date'].apply(fix_year_only_dates),
    errors='coerce',
)

# Store the year as a number (non-numeric entries become NaN)
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df.info()



### Filling Missing Value

#### Active End

It is a completely empty column, so we can delete it.

In [None]:
# Remove the `active_end` column
df = df.drop(columns='active_end')

#### Stats Pageviews

In [None]:
# Remove the `stats_pageviews` column
df = df.drop(columns='stats_pageviews')

#### Features Artists

In [None]:
# Remove the `featured_artists` column
df = df.drop(columns='featured_artists')

#### Popularity


In [None]:
# Show up to 50 tracks whose popularity is missing, then report the total
popularity_is_null = df['popularity'].isna()
display(df.loc[popularity_is_null, ['popularity', 'title']].head(50))
print(f"We have {popularity_is_null.sum()} null value in popularity")

Given that the popularity score is a continuous metric with skewed distribution and that dropping rows would remove valuable tracks, we opted for median-based imputation.

In [None]:
# Force popularity to numeric; entries that cannot be parsed become NaN
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')

# Record which rows are imputed (1 = value was missing), then replace the
# missing values with the median of the observed ones
popularity_was_missing = df['popularity'].isna()
df['popularity_missing_flag'] = popularity_was_missing.astype(int)
median_pop = df['popularity'].median()
df['popularity'] = df['popularity'].where(~popularity_was_missing, median_pop)


#### Filling Birth Dates
This code manually fills missing birth dates for specific artists in the dataset. It first defines a dictionary mapping artist names to their known or estimated birth dates.

9 entries couldn’t be filled, so their birth dates are intentionally left blank in the dictionary. For Miss Keta, the birth date is unknown, so no accurate value can be provided. The others — Bushwaka, Sottotono, Dark Polo Gang, Cor Veleno, Colle Der Fomento, Club Dogo, Articolo 31, and 99 Posse — are all music groups or duos, not individual artists, meaning they don’t have a single birth date associated with them.

In [None]:
# --- 1. Define the Missing Dates as a Dictionary ---
# Source of truth for the manual fill
birth_dates_to_fill = {
    'alfa': '2000-08-22',
    'anna pepe': '2003-08-15',
    'beba': '1994-10-10',
    'bigmama': '2000-03-10',
    'brusco': '1974-01-04',
    'caneda': '1976-09-30',
    'dargen d_amico': '1980-11-29',
    'gu√® pequeno': '1980-12-25',
    'johnny marsiglia': '1986-08-05',
    'nerone': '1991-05-23',
    'priestess': '1996-08-20',
    'samuel heron': '1991-01-01',
    'shiva': '1999-08-27',
    'ye–∏dry': '1993-07-27',
    'o zul√π': '1970-11-15',
    'skioffi':'1992-06-05',
    'eva rea':'1993-01-01',
    'hindaco':'1996-01-01',
    'joey funboy':'1995-01-01',
    'mistico':'1982-01-01',
    'mike24':'1985-08-02',
    'doll kill':'1996-01-01',
    'miss simpatia':'1986-03-23',
    'miss keta':'',#unknown
    'bushwaka':'',#duo
    'sottotono':'',#group
    'dark polo gang':'',#group
    'cor veleno':'',#group
    'colle der fomento':'',#group
    'club dogo':'',#group
    'articolo 31':'',#group
    '99 posse':''#gruppo
    }

# --- 2. Fill the Missing Data (Imputation) ---

# Convert the dictionary to a Pandas Series for easy lookup and indexing
birth_date_series = pd.Series(birth_dates_to_fill)

# Iterate through the artists in your fill list and update the DataFrame
for artist, bday in birth_date_series.items():
    # Use .loc to find rows where 'artist_name' matches and update 'birth_date'
    # The second part of the condition (artist_df['birth_date'].isna()) ensures
    # we only overwrite if the date was previously missing (NaN).
    df.loc[
        (df['name'] == artist) & (df['birth_date'].isna()),
        'birth_date'
    ] = bday

# --- 3. Final Conversion and Verification ---

# Convert the 'birth_date' column to the proper datetime format again
# (This is crucial for accurate age calculation)
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

# Optional: Print out the affected rows to verify the fix
print("--- Verification of Filled Birth Dates  ---")
# Filter the DataFrame to show only the artists we just updated
filled_artists = df[df['name'].isin(birth_dates_to_fill.keys())]

# Show the unique artist names and their newly filled birth dates
print(filled_artists[['name', 'birth_date']].drop_duplicates().to_string(index=False))

##### Checking the distribution after filling birth dates


In [None]:
# Call the project's plotting helpers on the frame with the filled birth dates
functions.plot_birth_decades(
    df,
    "Distribution of Artists' Birth Years  After Filling Nan",
    'Percentage of Unique Artists by Decade of Birth After Filling Nan',
)
functions.plot_artist_ages(
    df,
    'Number of Unique Artists by Age (After Filling NaN)',
)

#### Filling Active Start Date

In [None]:
# Total number of unique artists missing 'active_start': 54
active_starts_consolidated = {
    'alfa': '01-15-2017',
    'anna pepe': '01-01-2018',
    'babaman': '01-01-1989',
    'beba': '11-01-2015',
    'brusco': '01-01-1991',
    'capo plaza': '01-01-2013',
    'chadia rodriguez': '01-01-2017',
    'clementino': '04-29-2006',
    'dargen d_amico': '01-01-1999',
    'don joe': '01-01-1999',
    'fred de palma': '01-01-2008',
    'geolier': '01-01-2018',
    'gu√® pequeno': '01-01-1997',
    'miss keta': '01-01-2013',
    'shiva': '01-01-2014',
    'tedua': '01-01-2013',
    'tony effe': '01-01-2014',
    'sottotono': '01-01-1994',
    'bushwaka': '01-01-2007',
    'mike24': '01-01-2009',
    'mistico': '01-01-2008',
    'skioffi': '01-01-2014',
    "caneda": "01-01-1993",
    "club dogo": "01-01-2002",
    "colle der fomento": "01-01-1994",
    "dani faiv": "01-01-2014",
    "doll kill": "01-01-2012",
    "drefgold": "01-01-2012",
    "entics": "01-01-2004",
    "eva rea": "12-18-2014",
    "hell raton": "01-01-2010",
    "hindaco": "02-21-2020",
    "jack the smoker": "01-01-2000",
    "joey funboy": "01-01-2016",
    "johnny marsiglia": "01-01-2007",
    "la pina": "01-01-1994",
    "luch√®": "01-01-1997",
    "mambolosco": "02-10-2017",
    "massimo pericolo": "01-01-2016",
    "miss simpatia": "01-01-2007",
    "mistaman": "01-01-1994",
    "mondo marcio": "01-01-2003",
    "nerone": "01-01-2008",
    "niky savage": "01-01-2021",
    "o zul√π": "01-01-1991",
    "papa v": "01-01-2020",
    "rondodasosa": "01-01-2020",
    "samuel heron": "01-01-2012",
    "shablo": "01-01-1999",
    "slait": "01-01-2010",
    "tony boy": "01-01-2018",
    "tormento": "01-01-1991",
    "ye–∏dry": "01-01-2012",
    "yung snapp": "01-01-2012",
}


# Convert the dictionary to a Pandas Series for efficient filling
start_date_series = pd.Series(active_starts_consolidated)

# Iterate and fill the missing data in the 'active_start' column
for artist, start_date in start_date_series.items():
    # Use .loc to find rows where 'artist_name' matches and update 'active_start'
    df.loc[
        df['name'] == artist,
        'active_start'
    ] = start_date

# Ensure the 'active_start' column is a proper datetime object
df['active_start'] = pd.to_datetime(df['active_start'], errors='coerce')

print("Active start dates have been filled in the 'active_start' column.")

functions.plot_active_start_decades(df,'Percentage of Unique Artists by Active Start Decade After Filling Missing Values')
functions.plot_age_at_career_start(df,'Age of unique Artists When They Started Their Career After Filling Missing Values')

#### Adding album_release_date for albums that don't have a date (TODO)

In [None]:
# TODO: add album_release_date for the albums that have no date

#### Filling missing albums for tracks (REDooo)
We have 78 tracks without albums. We managed to find the albums for 16 tracks.

In [None]:
# # --- 1. Define mapping for known tracks ---
# # Keys = track title (partial or exact match)
# # Values = album name, release date, and type
# album_updates = {

#     "PTS (PoiTiSpiego/PostTraumaticStress)": {
#         "album_name": "Gilmar / Embrionale",
#         "album_release_date": "2012-01-01",
#         "album_type": "album"
#     },
#     "Quelli che benpensano": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Autodafè": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Fight da faida": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Giù le mani da Caino": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Pedala": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Accendimi": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Fili": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Rap Lamento": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Cali di tensione": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Cubetti tricolori": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Faccio la mia cosa": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Libri Di Sangue": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Potere Alla Parola": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Nuvole": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
#     "Il beat come anestetico": {
#         "album_name": "La morte dei miracoli",
#         "album_release_date": "1997-01-01",
#         "album_type": "album"
#     },
# }

# # --- 2. Define a helper function to safely update missing info ---
# def update_album_info(row):
#     for track, info in album_updates.items():
#         if track.lower() in str(row['title']).lower():  # partial match, case-insensitive
#             if pd.isna(row['album_name']):
#                 row['album_name'] = info['album_name']
#             if pd.isna(row['album_release_date']):
#                 row['album_release_date'] = pd.to_datetime(info['album_release_date'])
#             if pd.isna(row['album_type']):
#                 row['album_type'] = info['album_type']
#             break
#     return row

# # --- 3. Apply to the dataset ---
# df = df.apply(update_album_info, axis=1)

# # --- 4. Verify the updates ---
# updated = df[df['title'].str.contains('|'.join(album_updates.keys()), case=False, na=False)]
# print(f" Updated {len(updated)} tracks with album information.")
# display(updated[['title', 'album_name', 'album_release_date', 'album_type']])

#### Artist Location Info

In [None]:
def impute_all_artist_data(df):
    """
    Performs comprehensive imputation of all missing biographical fields,
    including Latitude and Longitude coordinates, for the Italian artist dataset.

    The function relies on a manually curated knowledge base (IMPUTATION_MAP).
    """
    df = df.copy()

    # 1. Define the Master Imputation Map with Coordinates
    # Value: (Birth Place, Nationality, Country, Province, Region, Latitude, Longitude)
    IMPUTATION_MAP = {
        'articolo 31': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'bushwaka': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'club dogo': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'dargen d_amico': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'doll kill': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'gu√® pequeno': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'miss keta': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'nerone': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'shiva': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),
        'niky savage': ('Milano', 'Italia', 'Italia', 'Milano', 'Lombardia', 45.4642, 9.1900),

        # --- ROMA (Lazio) ---
        'brusco': ('Roma', 'Italia', 'Italia', 'Roma', 'Lazio', 41.8933, 12.4829),
        'colle der fomento': ('Roma', 'Italia', 'Italia', 'Roma', 'Lazio', 41.8933, 12.4829),
        'cor veleno': ('Roma', 'Italia', 'Italia', 'Roma', 'Lazio', 41.8933, 12.4829),
        'mistico': ('Roma', 'Italia', 'Italia', 'Roma', 'Lazio', 41.8933, 12.4829),
        'dark polo gang':('Roma', 'Italia', 'Italia', 'Roma', 'Lazio', 41.8933, 12.4829),

        # --- NAPOLI (Campania) ---
        '99 posse': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'o zul√π': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'bigmama': ('Avellino', 'Italia', 'Italia', 'Avellino', 'Campania', 40.9167, 14.7833),
        'eva rea': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'joey funboy': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'mike24': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'miss simpatia': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),
        'samuel heron': ('Napoli', 'Italia', 'Italia', 'Napoli', 'Campania', 40.8518, 14.2681),

        # --- Other Regions/Cities ---
        'beba': ('Torino', 'Italia', 'Italia', 'Torino', 'Piemonte', 45.0703, 7.6869),
        'alfa': ('Genova', 'Italia', 'Italia', 'Genova', 'Liguria', 44.4073, 8.9463),
        'anna pepe': ('La Spezia', 'Italia', 'Italia', 'La Spezia', 'Liguria', 44.1027, 9.8252),
        'caneda': ('Varese', 'Italia', 'Italia', 'Varese', 'Lombardia', 45.8197, 8.8256),
        'fabri fibra': ('Senigallia', 'Italia', 'Italia', 'Ancona', 'Marche', 43.7126, 13.2201),
        'nesli': ('Senigallia', 'Italia', 'Italia', 'Ancona', 'Marche', 43.7126, 13.2201),
        'hindaco': ('Padova', 'Italia', 'Italia', 'Padova', 'Veneto', 45.4064, 11.8767),
        'johnny marsiglia': ('Palermo', 'Italia', 'Italia', 'Palermo', 'Sicilia', 38.1157, 13.3615),
        'priestess': ('Palermo', 'Italia', 'Italia', 'Palermo', 'Sicilia', 38.1157, 13.3615),
        'skioffi': ('Taranto', 'Italia', 'Italia', 'Taranto', 'Puglia', 40.4667, 17.2403),
        'sottotono': ('Varese', 'Italia', 'Italia', 'Varese', 'Lombardia', 45.8197, 8.8256),

        # --- International ---
        'baby k': ('Singapore', 'Italia', 'Singapore', np.nan, np.nan, 1.3521, 103.8198),
        'shablo': ('Buenos Aires', 'Argentina', 'Argentina', np.nan, np.nan, -34.6037, -58.3816),
        'ye–∏dry': ('Santo Domingo', 'Dominicana', 'Dominican Republic', np.nan, np.nan, 18.4861, -69.9312),

    }

    # 2. Add Latitude and Longitude columns if they don't exist
    for col in ['latitude', 'longitude']:
        if col not in df.columns:
            df[col] = np.nan

    # 3. Iterate and Apply Imputation/Fill Missing Values
    for index, row in df.iterrows():
        artist = row['name']

        if artist in IMPUTATION_MAP:
            (birth_place, nationality, country, province, region, lat, lon) = IMPUTATION_MAP[artist]

            # Use fillna() logic: only fill if the current cell is NaN

            # Biographical Imputation
            df.loc[index, 'birth_place'] = row['birth_place'] if pd.notna(row['birth_place']) else birth_place
            df.loc[index, 'nationality'] = row['nationality'] if pd.notna(row['nationality']) else nationality
            df.loc[index, 'country'] = row['country'] if pd.notna(row['country']) else country
            df.loc[index, 'province'] = row['province'] if pd.notna(row['province']) else province
            df.loc[index, 'region'] = row['region'] if pd.notna(row['region']) else region

            # Geospatial Imputation (Always fill the Lat/Lng to ensure consistency with the Birth Place)
            df.loc[index, 'latitude'] = lat
            df.loc[index, 'longitude'] = lon


    return df



# Run the imputation on the working frame
df = impute_all_artist_data(df)


print ('checking the left null values after filling')
# Biographical / geographic columns to inspect for remaining gaps
cols_to_check = [
    'birth_place', 'nationality', 'province', 'region',
    'country', 'latitude', 'longitude',
]

# Rows that still miss at least one of those columns
still_incomplete = df[cols_to_check].isna().any(axis=1)
missing_rows = df.loc[still_incomplete]

# Artist name plus the inspected columns
columns_to_show = ['name', *cols_to_check]
missing_subset = missing_rows.loc[:, columns_to_show]

# One row per artist (first occurrence)
unique_missing_subset = missing_subset.drop_duplicates(subset='name')

# Report how many artists are affected and show them
print(unique_missing_subset.shape)
display(unique_missing_subset)

#### Lyrics

In [None]:
print(f"Shape before fixing NaNs: {df.shape}")
original_row_count = df.shape[0]

# Action 1: remove every row that has no lyrics
df = df.dropna(subset=['lyrics'])

rows_removed_lyrics = original_row_count - df.shape[0]
print(f"\nShape after dropping 'lyrics' NaNs: {df.shape}")
print(f"Rows removed: {rows_removed_lyrics}")

# Exactly 3 rows are expected to be removed; report any other count as a warning
removal_report = (
    "SUCCESS: Correctly removed the 3 'NaN' lyric rows."
    if rows_removed_lyrics == 3
    else f"WARNING: Expected to remove 3 'NaN' rows, but removed {rows_removed_lyrics}."
)
print(removal_report)

### Fixing out of range and errors

####  Correcting "Priestess" Entry

Based on the inspection of the artists' descriptions above, we noticed an entry labeled “gruppo musicale canadese” (Canadian music group). Upon checking, this description is incorrectly assigned to the Italian rapper Priestess. Further research revealed a mix-up with a Canadian band that shares the same name. This confusion becomes evident when comparing the active_start year in the dataset, which matches that of the Canadian group rather than the Italian artist.

We are going to correct its data

In [None]:
print('Before')
# One row per artist whose description mentions the Canadian band
mentions_canadian_band = df['description'].str.contains(
    'gruppo musicale canadese', case=False, na=False
)
display(
    df.loc[mentions_canadian_band]
    .drop_duplicates(subset=['name'])
    .sort_values(by='name')
)

print('After')
# Overwrite the description and the career start on every "priestess" row
is_priestess = df['name'].str.lower() == 'priestess'
df.loc[is_priestess, ['description', 'active_start']] = [
    'cantante e rapper italiana',
    '2017-01-01',
]

# Print the corrected values to verify the update
print(df.loc[is_priestess, ['name', 'description', 'active_start']])


#### Popularity

In [None]:
# This cell reports rows through a `modified_popularity` flag, but no visible
# earlier cell creates that column, so the cell fails on a fresh run.
# When the column is absent it is derived here, BEFORE clipping:
# 1 where popularity lies outside [0, 100] (and is therefore changed by the
# clip below), 0 otherwise.
# NOTE(review): this meaning of the flag is an assumption - confirm it against
# the original intent.
if 'modified_popularity' not in df.columns:
    df['modified_popularity'] = (~df['popularity'].between(0, 100)).astype(int)

# Fix out-of-range popularity values
df['popularity'] = df['popularity'].clip(lower=0, upper=100)

# Show (up to 50) flagged rows
display(df.loc[df['modified_popularity'] == 1, ['popularity', 'modified_popularity', 'title']].head(50))

We no longer need `modified_popularity`, so we can drop it.

In [None]:
# The helper flag is no longer needed once the clipped rows have been reviewed
df = df.drop(columns=['modified_popularity'])
# The message previously contained a mis-encoded dash; it now prints a real em dash
print("OK — column 'modified_popularity' has been removed.")

#### Lyrics

##### Remove Junk Lyrics

In [None]:
# Bounds noted during analysis:
#   junk lyrics have at most 36 tokens,
#   actual lyrics have at least 108 tokens.
pattern = 'Contributors|Contributor|Lyrics|COMING SOON|instrumental'

token_threshold = 60  # lies between the two bounds above

# A row is junk when its lyrics contain a boilerplate keyword AND it is short
has_junk_keyword = df['lyrics'].str.contains(pattern, case=False, na=False)
is_short = df['n_tokens'] < token_threshold
conditions_to_eliminate = has_junk_keyword & is_short

# Index labels of the rows to remove
indices_to_drop = df.index[conditions_to_eliminate.to_numpy()]
num_to_eliminate = len(indices_to_drop)

print(f"Found {num_to_eliminate} 'junk AND short' rows (< {token_threshold} tokens) to ELIMINATE.")

# Drop the junk rows (nothing happens when none were found)
if num_to_eliminate > 0:
    df.drop(index=indices_to_drop, inplace=True)

    print(f"DataFrame shape AFTER elimination: {df.shape}")
    print(f"Successfully removed {num_to_eliminate} rows.")


##### Cleaning "Contributors" Headers

In [None]:
# Rows whose lyrics still mention "Contributor(s)" (case-insensitive)
contributor_pattern = 'Contributors|Contributor'
mentions_contributor = df['lyrics'].str.contains(contributor_pattern, case=False, na=False)
contributor_matches = df.loc[mentions_contributor]

print("\nRows containing Contributors words")
print(f"Total number: {len(contributor_matches)}")
if len(contributor_matches) > 0:
    display(contributor_matches[['id', 'name', 'title', 'lyrics', 'n_tokens']].head(23))

In [None]:
# Remove the "<N> Contributor(s) ... Lyrics" header from the start of the lyrics
print(f"DataFrame shape BEFORE cleaning: {df.shape}")

indices_to_clean = df[df['lyrics'].str.contains(contributor_pattern, case=False, na=False)].index

print(f"Number of rows to clean: {len(indices_to_clean)}")

if len(indices_to_clean) > 0:
    # Header regex: optional leading whitespace, a number, "Contributor(s)",
    # then the shortest run of text up to the word "Lyrics" (same line only,
    # because "." does not match a newline)
    header_regex = r"^\s*\d+\s+Contributors?.*?\s+Lyrics\s*"

    original_lyrics = df.loc[indices_to_clean, 'lyrics']

    cleaned_lyrics_series = original_lyrics.str.replace(
        header_regex, '', regex=True, flags=re.IGNORECASE
    ).str.strip()

    # After strip(), whitespace-only text is already '', so no separate
    # isspace() check is needed
    empty_mask = cleaned_lyrics_series.isna() | (cleaned_lyrics_series == '')
    indices_to_drop = cleaned_lyrics_series[empty_mask].index

    indices_to_update = cleaned_lyrics_series[~empty_mask].index
    updates_to_apply = cleaned_lyrics_series[indices_to_update]

    print(f"  > Identified {len(indices_to_drop)} rows to DROP (lyrics were only the header).")
    print(f"  > Identified {len(indices_to_update)} rows to UPDATE (lyrics were contaminated).")

    # Delete the rows that are now empty
    if not indices_to_drop.empty:
        df.drop(indices_to_drop, inplace=True)
        print(f"  > Dropped {len(indices_to_drop)} 'junk' rows.")

    # Update the rows with cleaned lyrics
    if not indices_to_update.empty:
        df.loc[indices_to_update, 'lyrics'] = updates_to_apply
        print(f"  > Cleaned and updated {len(indices_to_update)} 'contaminated' rows.")

    print(f"\nDataFrame shape AFTER cleaning: {df.shape}")

# Preview some of the cleaned rows. The message is built from the same limit
# as the loop (it used to announce 5 rows while the loop previewed 20).
preview_limit = 20
print(f"\nVerifying the cleaning (up to {preview_limit} modified lyrics):")
for index in indices_to_clean[:preview_limit]:
    # Rows dropped above no longer exist in df
    if index in df.index:
        print("==============================================")
        print(f"INDEX: {index}")
        print(f"CLEANED TEXT (preview):\n'{str(df.loc[index, 'lyrics'])[:200]}...'")

##### Cleaning editorial sentences

In [None]:
def remove_editorial_sentences(text):
    """Remove known editorial sentences from a lyrics text.

    Three sentence openers are recognised, case-insensitively:
    "Il brano vanta ...", "La produzione è curata ..." and
    "La produzione è opera di ...". Each match runs up to and including the
    next full stop, or to the end of the text if there is none.

    Parameters
    ----------
    text : str or any
        The lyrics. Non-string values (e.g. NaN/None) are returned unchanged.

    Returns
    -------
    str or any
        The text without the editorial sentences, stripped of leading and
        trailing whitespace.
    """
    if not isinstance(text, str):
        return text

    # Every alternative must be separated by "|". Without the separator the
    # last two groups are concatenated into one sequence and neither sentence
    # is removed unless both appear back to back.
    pattern = (
        r"(Il brano vanta[^\.]*\.?)|"            # sentences like "Il brano vanta..."
        r"(La produzione è curata[^\.]*\.?)|"    # sentences like "La produzione è curata..."
        r"(La produzione è opera di [^\.]*\.?)"  # sentences like "La produzione è opera di ..."
    )

    cleaned = re.sub(pattern, "", text, flags=re.IGNORECASE)
    return cleaned.strip()

# Run the editorial-sentence filter over every lyric (non-string values pass through unchanged)
df["lyrics"] = df["lyrics"].map(remove_editorial_sentences)

##### Recalculate _auto features

In [None]:
# Re-derive every "_auto" statistic from the current 'lyrics' column.
# Assignment goes through .loc[:, col] so the values are written onto df itself.
lyrics_counters = {
    'n_tokens_auto': functions.count_tokens,
    'n_sentences_auto': functions.count_sentences,
    'n_unique_words_auto': functions.count_unique_tokens,
    'total_chars_auto': functions.count_total_token_chars,
}
for auto_col, counter in lyrics_counters.items():
    print(f"Recalculating {auto_col}...")
    df.loc[:, auto_col] = df['lyrics'].apply(counter)

# Ratios as (target column, numerator, denominator); a zero denominator gives 0.0
ratio_specs = [
    ('lexical_density_auto', 'n_unique_words_auto', 'n_tokens_auto'),
    ('tokens_per_sent_auto', 'n_tokens_auto', 'n_sentences_auto'),
    ('char_per_tok_auto', 'total_chars_auto', 'n_tokens_auto'),
]
for ratio_col, numerator, denominator in ratio_specs:
    print(f"Recalculating {ratio_col}...")
    df.loc[:, ratio_col] = np.where(
        df[denominator] == 0, 0.0,
        df[numerator] / df[denominator]
    )

print("\n--- RECALCULATION COMPLETE ---")

# Final verification: the token count must not contain missing values
nan_check = df['n_tokens_auto'].isna().sum()
print(f"NaNs in 'n_tokens_auto' (should now be 0): {nan_check}")
if nan_check != 0:
    print("WARNING: 'NaN' values still found. Please review the 'lyrics' column for errors.")
else:
    print("SUCCESS: All 'broken' rows have been fixed.")

print("\nAll derived features are now clean and synchronized with the 'lyrics' column.")

In [None]:
print(f"Shape before column cleanup: {df.shape}")
print(f"Columns before cleanup: {df.columns.tolist()}")

# Original "dirty" features that the recalculated "_auto" columns replace
original_dirty_features = [
    'n_tokens', 'n_sentences', 'tokens_per_sent',
    'char_per_tok', 'lexical_density', 'avg_token_per_clause',
]

# Intermediate calculation/helper columns created along the way
helper_and_match_cols = [
    'n_sentences_match', 'n_tokens_match', 'tokens_per_sent_match',
    'char_per_tok_match', 'lexical_density_match',
    'total_chars_auto', 'n_unique_words_auto',
]

# Only columns that are actually present are dropped, so the cell does not
# fail when some of them are already gone.
cols_to_drop = [*original_dirty_features, *helper_and_match_cols]
present_columns = set(df.columns)
existing_cols_to_drop = [col for col in cols_to_drop if col in present_columns]

print(f"\nDropping {len(existing_cols_to_drop)} old/helper columns...")

df.drop(columns=existing_cols_to_drop, inplace=True)

print("Old columns dropped.")

# Each "<name>_auto" column takes over the final name "<name>"
final_feature_names = (
    'n_tokens', 'n_sentences', 'tokens_per_sent', 'char_per_tok', 'lexical_density'
)
rename_map = {f"{final_name}_auto": final_name for final_name in final_feature_names}

# Restrict the mapping to the "_auto" columns that exist after the drop
existing_rename_map = {
    auto_name: final_name
    for auto_name, final_name in rename_map.items()
    if auto_name in df.columns
}

print(f"\nRenaming {len(existing_rename_map)} '_auto' columns to their final names...")

df.rename(columns=existing_rename_map, inplace=True)

print("Columns successfully renamed.")

print(f"Final DataFrame shape: {df.shape}")
print(f"Final columns: {df.columns.tolist()}")


The avg_token_per_clause feature was removed during the data cleaning process for two primary reasons:

Unreliable Source Data: The calculation of this feature depends on the original n_tokens column. Our feature inspection proved that the original n_tokens column is inconsistent and unreliable (with a >99% mismatch compared to our clean, recalculated n_tokens_auto). This makes any metric derived from it inherently untrustworthy.

Unverifiable Calculation: Unlike simpler metrics like n_tokens_auto or n_sentences_auto (which we could recalculate using regex), accurately identifying grammatical "clauses" (proposizioni) requires complex syntactic NLP parsing. This makes the feature impossible for us to verify or reliably recalculate.

In [None]:
# Inspect the tracks whose lyrics have fewer than 10 tokens
short_lyrics_mask = df['n_tokens'] < 10
display(df.loc[short_lyrics_mask, ['lyrics', 'n_tokens']])


We decide to remove it.

In [None]:
# Count the tracks with fewer than 10 tokens.
# NOTE(review): this cell only counts; no row is dropped from df here even
# though the label says "Deleted rows" - confirm whether the removal is intended.
to_remove = int((df['n_tokens'] < 10).sum())
print("Deleted rows:", to_remove)

#### Languages

Since the language of a track is inherently encoded in its lyrics, automatic language identification allows us to objectively validate the declared label and detect annotation inconsistencies, making it a reliable and justifiable approach for language quality assessment.

In [None]:
def detect_lang(text, idx):
    """Detect the language of one text, logging progress to stdout.

    Parameters
    ----------
    text : any
        Text to analyse. Non-strings and strings shorter than 20 characters
        after stripping are skipped.
    idx : int
        Counter used in the log messages; a "Detected" line is printed only
        when it is a multiple of 100.

    Returns
    -------
    The value returned by `detect(text)`, or None when the text is skipped or
    detection raises.
    """
    # Guard: invalid or too-short input never reaches the detector
    if not isinstance(text, str) or len(text.strip()) < 20:
        print(f"[{idx}] Skipped (too short or invalid)")
        return None

    try:
        lang = detect(text)
        # Print only every 100th detection to keep the output readable
        if idx % 100 == 0:
            print(f"[{idx}] Detected: {lang}")
    except Exception as e:
        print(f"[{idx}] Error: {e}")
        return None
    return lang

# Predict the language of every lyric; the running position is passed only for logging
df['predicted_language'] = [
    detect_lang(lyric_text, position)
    for position, lyric_text in enumerate(df['lyrics'])
]
# Compare the prediction with the declared language
df['language_match'] = df['language'].eq(df['predicted_language'])

# Summary statistics, printed in the next cell
total = df.shape[0]
matches = df['language_match'].sum()
mismatches = total - matches


In [None]:
print("\n--- RESULT SUMMARY ---")
print(f"Total tracks: {total}")
# Matches and mismatches are reported as a count and as a share of all tracks
for label, count in (("Language matches", matches), ("Mismatches", mismatches)):
    print(f"{label}: {count} ({count/total:.2%})")

print("\n--- SAMPLE PREDICTIONS ---")
sample_columns = ['lyrics', 'language', 'predicted_language', 'language_match']
display(df[sample_columns].head(50))

In [None]:
# Tracks for which no language was predicted (skipped or failed detection)
undetected_mask = df['predicted_language'].isna()
display(df.loc[undetected_mask, ['title', 'lyrics', 'language', 'predicted_language']].head(50))


In [None]:
# Manually assign Italian to two rows, addressed by index label
manual_italian_rows = [8854, 9181]
df.loc[manual_italian_rows, "predicted_language"] = "it"

Now we can fix the columns.

In [None]:
# The predicted language becomes the track language.
# 'language_match' was computed before this overwrite, so it still reflects
# the comparison with the originally declared language.
df['language'] = df['predicted_language']
language_columns = ['language', 'predicted_language', 'language_match']
display(df.head(50)[language_columns])

In [None]:
# Both helper columns are removed in a single call
df.drop(columns=['predicted_language', 'language_match'], inplace=True)

In [None]:
# --- 1) Count tracks per language, keeping missing values as their own category
lang_counts = (
    df['language']
    .value_counts(dropna=False)
    .rename_axis('language')
    .reset_index(name='count')
)

# Show missing values under the label "NaN" without touching df
lang_counts['language'] = lang_counts['language'].astype(object).mask(
    lang_counts['language'].isna(), 'NaN'
)

# Sort by descending count
lang_counts = lang_counts.sort_values('count', ascending=False).reset_index(drop=True)

# --- 2) Palette: five base colours, repeated when there are more categories
base_palette = ["#f3e5f5", "#e1bee7", "#ce93d8", "#ba68c8", "#9c27b0"]
palette = list(islice(cycle(base_palette), len(lang_counts)))

# --- 3) Horizontal bar chart in sorted order
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(
    y=lang_counts['language'],
    width=lang_counts['count'],
    color=palette,
    edgecolor='black',
    linewidth=0.5
)

# Invert the Y axis so the most frequent language is at the top
ax.invert_yaxis()

ax.set_xlabel('Number of Tracks')
ax.set_ylabel('Predicted Language')
ax.set_title('Number of Tracks per Predicted Language (Including Skipped = NaN)')

# --- 4) Write each count just past the end of its bar
label_offset = lang_counts['count'].max() * 0.01
for bar, val in zip(bars, lang_counts['count']):
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(bar.get_width() + label_offset, bar_middle, str(val), va='center')

fig.tight_layout()
plt.show()

In [None]:
# Inspect the tracks currently labelled as English
is_english = df['language'] == "en"
display(df.loc[is_english, ['title', 'lyrics', 'language']].head(50))

In [None]:
# Rows (by index label) that are labelled "en" but are set to Italian here -
# presumably picked by inspecting the "en" tracks displayed above.
# The correction must target 'language': 'predicted_language' was dropped in
# an earlier cell, so writing to it would leave 'language' unchanged and
# re-create a stray, mostly-NaN column.
italian_mislabelled_as_en = [
    41, 291, 376, 730, 866, 880, 885, 952, 963, 1006, 1035, 1049, 1061, 1065,
    1194, 1264, 1285, 1483, 1722, 1735, 1842, 1945, 2500, 2557, 2572, 2602,
    2904, 2963, 3007, 3017, 3034, 3045, 3069, 3096, 3266, 3404, 3503, 3924,
    4301, 4367, 4886, 5003, 5005, 5007, 5008, 5034, 5050, 5069, 5076, 5081,
    5234, 6777, 7474, 7720, 8967, 8981, 9214, 9229, 9727, 9920, 9985, 10432,
]
df.loc[italian_mislabelled_as_en, "language"] = "it"

# Re-check what is still labelled as English after the correction
display(df.loc[df['language'] == "en", ['name', 'lyrics', 'language']].head(50))

In [None]:
# Set row 3601 (index label) to Italian. The write targets 'language' because
# 'predicted_language' was dropped in an earlier cell; writing to it would
# leave 'language' unchanged and re-create a stray column.
df.loc[[3601], "language"] = "it"

# Inspect the tracks labelled as Spanish
display(df.loc[df['language'] == "es", ['name', 'lyrics', 'language']].head(50))

#### Correcting wrong active start
We inspected the age of the artists when they started their career (code above).

Among the unique artists, several had unusual ages at career start. Nesli (age 10) had an incorrect active start date, while Salmo (age 13), thasup (age 14), nitro (age 14), ghemon (age 14) and Mudimbi (age 27) were correct. These values come from the original data.

After filling in the birth date and the active start date, we realized that bigmama appeared to have started at the age of one, which is obviously wrong.

The age of 7 belonged to the singer Priestess, but we already corrected her data above in the section (Correcting "Priestess" Entry).

We will correct the errors by updating Nesli’s active start to 1999 and bigmama’s to 2016, leaving Salmo and Mudimbi unchanged.

In [None]:
# Age at career start (difference of calendar years), kept as a separate Series
ages = df['active_start'].dt.year - df['birth_date'].dt.year

# Rows whose start age is one of the values under inspection
SUSPICIOUS_START_AGES = [1, 7, 10, 13, 14, 27]
outliers = df.loc[ages.isin(SUSPICIOUS_START_AGES)].copy()

# One row per artist name
unique_outliers = outliers.drop_duplicates(subset=['name'])

# assign() aligns `ages` on the index, so each artist gets the matching age
outlier_report = unique_outliers[['name', 'birth_date', 'active_start']].assign(age_at_start=ages)
print(outlier_report)

In [None]:
# Corrected career start dates, keyed by artist name
corrected_starts = {
    'bigmama': '2016-01-01',
    'nesli': '1999-01-01',
}
for artist_name, start_date in corrected_starts.items():
    df.loc[df['name'] == artist_name, 'active_start'] = pd.to_datetime(start_date)

# Verify the changes (salmo is listed as well; its date is left untouched)
artists_to_check = ['bigmama', 'nesli', 'salmo']
outliers_corrected = df[df['name'].isin(artists_to_check)]
print(outliers_corrected[['name', 'birth_date', 'active_start']])
functions.plot_age_at_career_start(
    df,
    'Age of unique Artists When They Started Their Career After Filling Missing Values and Correcting Errors',
)

#### Correcting album release dates for albums that have multiple release dates

In [None]:

# --- 2. Most recent release date per album ---
latest_dates = (
    df.groupby('album')['album_release_date']
    .max()
    .reset_index()
)

# --- 3. Replace the per-row dates with the per-album date ---
# NOTE(review): groupby skips missing album names by default, so rows without
# an album presumably end up with no release date after the merge - confirm.
df_without_dates = df.drop(columns=['album_release_date'])
df = df_without_dates.merge(latest_dates, on='album', how='left')

# --- 4. Confirm result ---
print("Replaced albums with multiple release dates by their recent  date.")
album_dates = df[['album', 'album_release_date']].drop_duplicates()
print(album_dates.shape)
display(album_dates)


#### Handling albums from 50s till 80s

As we inspected before, we found that all the albums from the 1950s to 1980s were incorrectly assigned — their tracks and artists do not actually belong to those albums. Because this data is unreliable, we decided to fill the columns album_name, album_release_year, and album_type with None (null values) for these records.

We do this to remove incorrect associations and avoid misleading results in future analysis. By replacing these wrong values with None, we clearly mark them as invalid or unknown, ensuring that only verified album–artist–track relationships remain in the dataset.

In [None]:

import importlib
import project_fuctions
# Reload the helper module so the definitions currently in project_fuctions
# are the ones used below.
importlib.reload(project_fuctions)
# `functions` is the alias under which this notebook imported project_fuctions.
# Plot the album release distribution before the dates are corrected below.
functions.plot_unique_album_release_distribution(df)
def fix_corrupted_release_dates(df):
    """Return a copy of `df` with corrected album release dates.

    Rows whose 'album' appears in the mapping below get the mapped date
    written to 'album_release_date'; the whole column is then converted to
    datetime, with unparseable values becoming NaT.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'album' and 'album_release_date'.

    Returns
    -------
    pandas.DataFrame
        A corrected copy. The input frame is not modified, so the caller must
        use the return value.
    """
    df = df.copy()

    # 1. Master mapping: {album name: correct date string (YYYY-MM-DD)}.
    #    Keys are compared with the 'album' values verbatim, so accented
    #    characters must be spelled with the real character.
    CORRECT_DATE_MAP = {
        'K1 Mixtape': '2014-12-29',
        'S.O.S. EP': '2008-10-16',
        'Entics Television': '2014-01-01',    # Using YYYY-01-01 for unknown M/D
        'Cracovia': '2016-01-01',             # Using YYYY-01-01 for unknown M/D
        'Council Estate Vol.1': '2012-01-01',   # Using YYYY-01-01 for unknown M/D
        'Happy EP!': '2011-05-03',
        'Equilibrio - EP': '2010-06-07',
        'Jolly Mixtape': '2017-11-07',
        'Quello Che Vi Consiglio Vol. 4': '2013-10-18',
        'Rimo Da Quando': '2010-11-15',
        'Nobiltà di Strada': '2007-02-16',
        'Vivere aiuta a non morire': '2013-04-30',
        'Quattro San Simoni e un funerale EP': '2015-04-28',
        'VERA BADDIE': '2024-06-28',
        'Haterproof': '2011-10-15',
        'Tutto il Contrario Remixtape': '2011-01-01', # Using YYYY-01-01 for unknown M/D
        'Radiografia - EP': '2006-01-01',    # Using YYYY-01-01 for unknown M/D
    }

    # 2. Overwrite the dates of the listed albums
    is_corrupted_row = df['album'].isin(CORRECT_DATE_MAP.keys())
    df.loc[is_corrupted_row, 'album_release_date'] = df.loc[is_corrupted_row, 'album'].map(CORRECT_DATE_MAP)

    # 3. Cast the column to datetime objects
    df['album_release_date'] = pd.to_datetime(df['album_release_date'], errors='coerce')

    print("### ✅ Full Release Date Imputation Complete")
    print(f"Corrected {is_corrupted_row.sum()} records with precise dates.")

    return df


# fix_corrupted_release_dates works on a copy and returns it; the result must
# be assigned back, otherwise df (and the plot below) keeps the old dates.
df = fix_corrupted_release_dates(df)
functions.plot_unique_album_release_distribution(
    df,
    title1="Percentage of Unique Albums by Release Decade After Cleaning",
    title2="Distribution of Unique Album Release Years After Cleaning",
)

#### Cleaning Albums Released Before the Artist’s Birth

In [None]:
# Rows whose album release date precedes the artist's birth date
mask = df['album_release_date'].lt(df['birth_date'])

# Album fields that are blanked out for those rows
album_fields_to_null = ['album_name', 'album_release_date', 'disc_number', 'track_number']
df.loc[mask, album_fields_to_null] = np.nan

# Report how many records were affected, then show them
print(f"Albums released before artist's birth: {mask.sum()} records cleaned.")
review_columns = [
    'name', 'album_name', 'album_release_date', 'birth_date',
    'album_type', 'disc_number', 'track_number',
]
df.loc[mask, review_columns]


#### Renumbering Duplicate Track Numbers

In [None]:
# Trim surrounding whitespace from the text columns used below
for text_col in ('album', 'title', 'name'):
    df[text_col] = df[text_col].str.strip()

# Sort so that tracks follow album / disc / track order, then renumber the index
track_order = ['album', 'disc_number', 'track_number']
df = df.sort_values(track_order).reset_index(drop=True)

# Renumber duplicate track numbers within each album/disc group
def renumber_duplicates(group):
    """Give every row of one album/disc group a unique track number.

    If any non-missing 'track_number' occurs more than once in `group`, all
    rows are renumbered 1..len(group) in their current order; otherwise the
    group is returned as it is.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain a 'track_number' column.

    Returns
    -------
    pandas.DataFrame
        The original object when nothing had to change, otherwise a
        renumbered copy. The object passed in is never modified.
    """
    has_duplicates = group['track_number'].dropna().duplicated().any()

    if has_duplicates:
        # Work on a copy: mutating the object handed over by groupby.apply is
        # discouraged by pandas and can raise chained-assignment warnings.
        group = group.copy()
        group['track_number'] = range(1, len(group) + 1)
    return group

# groupby() skips rows whose key is missing, so grouping the whole frame would
# silently delete every track without an album or disc number (e.g. the rows
# nullified in the "released before birth" step). Only rows with complete
# keys are renumbered; the others are carried over untouched.
has_group_keys = df[['album', 'disc_number']].notna().all(axis=1)
renumbered = (
    df[has_group_keys]
    .groupby(['album', 'disc_number'], group_keys=False)
    .apply(renumber_duplicates)
)
# sort_index restores the sorted order established before this step
df = pd.concat([renumbered, df[~has_group_keys]]).sort_index()

# Remaining duplicates are checked among rows with complete keys only,
# because rows with missing keys would otherwise all count as duplicates.
track_key = ['album', 'disc_number', 'track_number']
complete_rows = df.dropna(subset=track_key)
duplicates = complete_rows[complete_rows.duplicated(subset=track_key, keep=False)]
print(f"🎵 Found {len(duplicates)} duplicate track entries.")
dtale.show(duplicates[['album', 'disc_number', 'track_number', 'title', 'name']])


#### Correcting Coordinates
We will correct coordinates to make them match the birth_place

In [None]:
# Correct (latitude, longitude) for each birth place.
# Keys are compared with 'birth_place' verbatim, so accented characters must
# be spelled with the real character to match the decoded data.
coord_map = {
    "Almería": (36.8340, -2.4637),
    "Buenos Aires": (-34.6037, -58.3816),
    "Singapore": (1.290270, 103.851959)
}

# Update each row whose birth place equals one of the keys exactly
for place, (lat, lon) in coord_map.items():
    mask = df['birth_place'] == place
    df.loc[mask, 'latitude'] = lat
    df.loc[mask, 'longitude'] = lon

# Keep only the location columns
cols = ['birth_place', 'latitude', 'longitude','province','region']

# One row per birth place (the first occurrence is kept)
unique_places = df[cols].drop_duplicates(subset=['birth_place'])

# Sort alphabetically by birth_place
unique_places = unique_places.sort_values(by='birth_place').reset_index(drop=True)

# Display results
display(unique_places)



#### Fixing Explicit

In [None]:
print("\nApplying new rule: 'explicit' = (swear_IT > 0) | (swear_EN > 0)")
# A track is explicit when at least one of the two swear counters is positive
new_explicit_mask = df[['swear_IT', 'swear_EN']].gt(0).any(axis=1)

# Overwrite the original 'explicit' values with the rule-based flag
df['explicit'] = new_explicit_mask

print("\n'explicit' column successfully overwritten.")
print("\nNew 'explicit' values (After standardization):")
print(df['explicit'].value_counts())

# Verification: no row may be explicit while both swear counters are zero
print("\n--- Verification ---")
print("Checking for rows where 'explicit = True' but swear count is 0 (should be 0):")

no_swears = df['swear_IT'].eq(0) & df['swear_EN'].eq(0)
errors = df[df['explicit'].eq(True) & no_swears]
num_errors = len(errors)

print(f"Errors found: {num_errors}")

if num_errors != 0:
    print("ERROR: Discrepancies still found. Please review the logic.")
else:
    print("SUCCESS: The 'explicit' column is now 100% consistent with swear counts.")

# NOTE(review): the two counts below are hard-coded in the messages, not computed here.
print("\n(Note: The 1,202 'explicit-for-other-reasons' cases are now set to False.)")
print("(Note: The 3,125 'not-explicit-with-swears' cases are now set to True.)")

#### Fixing Duration

In [None]:
# Accepted duration range, in milliseconds (both bounds inclusive)
MIN_DUR = 30_000      # 30 seconds
MAX_DUR = 600_000    # 10 minutes

# Durations outside the range are replaced with NaN
duration_is_valid = df['duration_ms'].between(MIN_DUR, MAX_DUR)
df['duration_ms'] = df['duration_ms'].where(duration_is_valid)

# Songs that now have no duration (out of range, or already missing)
invalid_songs = df.loc[df['duration_ms'].isna(), ['title', 'duration_ms']]
invalid_songs


### Fixing Outliers

### Save new cleaned dataset

In [None]:
# Destination of the cleaned dataset
output_folder = "data"
output_file = "merge_dataset_cleaned.csv"
output_filename = os.path.join(output_folder, output_file)

print("--- Saving Cleaned DataFrame ---")
print(f"Saving {len(df)} rows and {len(df.columns)} columns to '{output_filename}'...")

# A failed write is reported instead of interrupting the notebook
try:
    df.to_csv(output_filename, index=False)
except Exception as e:
    print("\nERROR: An error occurred while saving the file.")
    print(f"Error details: {e}")
else:
    print(f"\nSUCCESS: DataFrame saved successfully to '{output_filename}'.")