# INITIALIZATION

In [45]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict
from scipy.stats import pearsonr

from pandas_summary import DataFrameSummary


data_folder = './DATA/'
raw_data_folder = data_folder+'dataset/'
artists_raw_data = raw_data_folder+'artists.csv'
tracks_raw_data = raw_data_folder+'tracks.csv'


# CSV VISUALIZATION

In [15]:
artists = pd.read_csv(artists_raw_data, sep=';', index_col=0)
tracks = pd.read_csv(tracks_raw_data, sep=',', index_col=0)

artists
# tracks

Unnamed: 0_level_0,name,gender,birth_date,birth_place,nationality,description,active_start,active_end,province,region,country,latitude,longitude
id_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ART82291002,99 posse,M,,,,gruppo musicale italiano,1991-01-01,,,,,,
ART53496045,achille lauro,M,1990-07-11,Verona,Italia,cantautore e rapper italiano (1990-),2012-01-01,,Verona,Veneto,Italia,45.442498,10.985738
ART18853907,alfa,M,,,,,,,,,,,
ART64265460,anna pepe,F,,,,,,,,,,,
ART75741740,articolo 31,M,,,,gruppo musicale hip hop italiano,1990-01-01,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ART98118784,tormento,M,1975-09-06,Reggio Calabria,Italia,"rapper, cantautore e beatmaker italiano (1975-)",,,Reggio Calabria,Calabria,Italia,38.103539,15.639756
ART15560128,vacca,M,1979-10-21,Cagliari,Italia,rapper italiano,2001-01-01,,Cagliari,Sardegna,Italia,39.217199,9.113311
ART57587384,willie peyote,M,1985-08-28,Torino,Italia,rapper e cantautore italiano (1985-),2004-01-01,,Torino,Piemonte,Italia,45.067755,7.682489
ART71515715,yeиdry,F,,,,,,,,,,,


## CSV INFO

In [12]:
artists.info()
print()
artists.shape

<class 'pandas.core.frame.DataFrame'>
Index: 104 entries, ART82291002 to ART83631935
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          104 non-null    object 
 1   gender        104 non-null    object 
 2   birth_date    73 non-null     object 
 3   birth_place   72 non-null     object 
 4   nationality   71 non-null     object 
 5   description   86 non-null     object 
 6   active_start  50 non-null     object 
 7   active_end    0 non-null      float64
 8   province      70 non-null     object 
 9   region        68 non-null     object 
 10  country       70 non-null     object 
 11  latitude      72 non-null     float64
 12  longitude     72 non-null     float64
dtypes: float64(3), object(10)
memory usage: 11.4+ KB



(104, 13)

In [13]:
tracks.info()
print()
tracks.shape

<class 'pandas.core.frame.DataFrame'>
Index: 11166 entries, TR934808 to TR552777
Data columns (total 44 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_artist             11166 non-null  object 
 1   name_artist           11166 non-null  object 
 2   full_title            11166 non-null  object 
 3   title                 11166 non-null  object 
 4   featured_artists      3517 non-null   object 
 5   primary_artist        11166 non-null  object 
 6   language              11061 non-null  object 
 7   album                 9652 non-null   object 
 8   stats_pageviews       4642 non-null   float64
 9   swear_IT              11166 non-null  int64  
 10  swear_EN              11166 non-null  int64  
 11  swear_IT_words        11166 non-null  object 
 12  swear_EN_words        11166 non-null  object 
 13  year                  10766 non-null  object 
 14  month                 9969 non-null   float64
 15  day           

(11166, 44)

## ARTISTS INFO

### Summary

In [54]:
summary = pd.DataFrame({
    'dtype': artists.dtypes,
    'unique_values': artists.nunique(),
    'missing_values': artists.isna().sum(),
    'missing_percent': artists.isna().mean() * 100,
    'duplicates': [artists.duplicated().sum()] * len(artists.columns)
})
print(summary)

# artists.describe()


                dtype  unique_values  missing_values  missing_percent  \
name           object            104               0         0.000000   
gender         object              2               0         0.000000   
birth_date     object             73              31        29.807692   
birth_place    object             40              32        30.769231   
nationality    object              2              33        31.730769   
description    object             69              18        17.307692   
active_start   object             24              54        51.923077   
active_end    float64              0             104       100.000000   
province       object             26              34        32.692308   
region         object             13              36        34.615385   
country        object              1              34        32.692308   
latitude      float64             40              32        30.769231   
longitude     float64             40              3

In [48]:
# profile = ProfileReport(artists, title="Artists Dataset Report", explorative=True)
# profile.to_notebook_iframe()   # For Jupyter
# # or
# # profile.to_file("artists_report.html")

# dfs = DataFrameSummary(artists)
# dfs.summary()

# from skimpy import skim

# skim(artists)


### NATIONALITIES

In [21]:
print("Artists nationalities:")
print(artists["nationality"].unique())

nationality_counts = artists["nationality"].value_counts(dropna=False)

print(nationality_counts)


Artists nationalities:
[nan 'Italia' 'Argentina']
nationality
Italia       70
NaN          33
Argentina     1
Name: count, dtype: int64


In [None]:
print("Artists with missing nationalities:")
print(artists[artists["nationality"].isna()]["name"])

print()
print("Reasonable to assume that missing nationalities are 'Italia' since of the known nationalities all but one are 'Italia'")

Artists with missing nationalities:
id_author
ART82291002             99 posse
ART18853907                 alfa
ART64265460            anna pepe
ART75741740          articolo 31
ART19605256                 beba
ART02666525              bigmama
ART03111237               brusco
ART95365016             bushwaka
ART28846313               caneda
ART81071062            club dogo
ART85821920    colle der fomento
ART59609037           cor veleno
ART63985757       dargen d_amico
ART96068455       dark polo gang
ART52349448            doll kill
ART14073567              eva rea
ART04141409          guè pequeno
ART59593021              hindaco
ART88199433          joey funboy
ART07469279     johnny marsiglia
ART37807199               mike24
ART43601431            miss keta
ART51628788        miss simpatia
ART66452136              mistico
ART62385172               nerone
ART19060721          niky savage
ART42220690               o zulù
ART87389753            priestess
ART56967402         samuel her

### Gender

In [None]:
print(artists["gender"].unique())
gender_count = artists["gender"].value_counts(dropna=False)
print(gender_count)
for gender,count in gender_count.items():
    print(f"{gender} : {(count / artists.shape[0]) * 100:.2f}%")
print("no missing values for gender")


['M' 'F']
gender
M    87
F    17
Name: count, dtype: int64
M : 83.65%
F : 16.35%


### BIRTH PLACE

In [40]:
artists["birth_place"].unique()

print(artists["birth_place"].unique())

birth_place_count = artists["birth_place"].value_counts(dropna=False)

print(birth_place_count)


[nan 'Verona' 'Rho' 'Singapore' 'Milano' 'Sternatia' 'Salerno' 'Almería'
 'Avellino' 'Nocera Inferiore' 'La Spezia' 'Bologna' 'Vimercate'
 'Alpignano' 'Senigallia' 'Brescia' 'Torino' 'Roma' 'Napoli' 'Olbia'
 'Firenze' 'Scampia' 'Vicenza' 'Grottaglie' 'Nicosia' 'Gallarate'
 'Treviso' 'Desenzano del Garda' 'San Benedetto del Tronto' 'Scafati'
 'Pieve Emanuele' 'San Siro' 'Grugliasco' 'Lodi' 'Sesto San Giovanni'
 'Buenos Aires' 'Genova' 'Fiumicino' 'Padova' 'Reggio Calabria' 'Cagliari']
birth_place
NaN                         32
Milano                      15
Roma                         7
Torino                       4
Napoli                       3
Olbia                        3
Vicenza                      3
Avellino                     2
Salerno                      2
Senigallia                   2
Almería                      1
Rho                          1
Singapore                    1
Verona                       1
Sternatia                    1
Nocera Inferiore             1
La 

In [41]:
for place,count in birth_place_count.items():
    print(f"{place} : {(count / artists.shape[0]) * 100:.2f}%")

nan : 30.77%
Milano : 14.42%
Roma : 6.73%
Torino : 3.85%
Napoli : 2.88%
Olbia : 2.88%
Vicenza : 2.88%
Avellino : 1.92%
Salerno : 1.92%
Senigallia : 1.92%
Almería : 0.96%
Rho : 0.96%
Singapore : 0.96%
Verona : 0.96%
Sternatia : 0.96%
Nocera Inferiore : 0.96%
La Spezia : 0.96%
Brescia : 0.96%
Alpignano : 0.96%
Bologna : 0.96%
Vimercate : 0.96%
Firenze : 0.96%
Scampia : 0.96%
Grottaglie : 0.96%
Nicosia : 0.96%
Gallarate : 0.96%
Treviso : 0.96%
Desenzano del Garda : 0.96%
San Benedetto del Tronto : 0.96%
Scafati : 0.96%
Pieve Emanuele : 0.96%
San Siro : 0.96%
Grugliasco : 0.96%
Lodi : 0.96%
Sesto San Giovanni : 0.96%
Buenos Aires : 0.96%
Genova : 0.96%
Fiumicino : 0.96%
Padova : 0.96%
Reggio Calabria : 0.96%
Cagliari : 0.96%


### BIRTH DATE

In [None]:
artists["birth_date"].unique()

print(artists["birth_date"].unique())

birth_date_count = artists["birth_date"].value_counts(dropna=False)

print(birth_date_count)

for place,count in birth_date_count.items():
    print(f"{place} : {(count / artists.shape[0]) * 100:.2f}%")

# shiva birth date 1999-08-27

[nan '1990-07-11' '1975-08-08' '1983-02-05' '1973-08-03' '1973-10-09'
 '1998-04-20' '1998-11-07' '1982-12-21' '1983-07-11' '1993-11-05'
 '1975-05-15' '1997-05-16' '1989-11-14' '1985-12-13' '1985-03-16'
 '1993-11-29' '1976-10-17' '1989-10-15' '1989-12-27' '1969-08-18'
 '1989-11-03' '1988-11-04' '2000-03-23' '1993-05-21' '1982-04-01'
 '1979-03-30' '1990-05-14' '1997-01-01' '1979-10-02' '1972-08-05'
 '1982-12-20' '1979-02-25' '1970-06-20' '1994-08-22' '1981-01-07'
 '2002-01-16' '1988-07-25' '1992-09-12' '1991-08-16' '1979-05-22'
 '1993-11-30' '1976-09-11' '1986-12-01' '1991-11-19' '1986-10-17'
 '1967-10-07' '1980-12-29' '1997-07-22' '1993-02-11' '1979-12-15'
 '2001-02-19' '1973-04-26' '1989-07-19' '1994-04-19' '1994-11-21'
 '2002-04-29' '1998-01-30' '1989-07-20' '1995-09-18' '1984-06-29'
 '1992-12-07' '1980-11-17' '1987-12-10'
 'http://www.wikidata.org/.well-known/genid/4111f32c49a23235b2e902dc8621d27c'
 '1987-01-24' '1994-02-21' '2001-03-17' '1999-09-26' '1991-05-17'
 '1975-09-06' '1979-

### PROVINCE AND REGION

In [51]:
artists["province"].unique()

print(artists["province"].unique())

province_count = artists["province"].value_counts(dropna=False)

print(province_count)

for place,count in birth_date_count.items():
    print(f"{place} : {(count / artists.shape[0]) * 100:.2f}%")

[nan 'Verona' 'Milano' 'Lecce' 'Salerno' 'Genova' 'Avellino' 'La Spezia'
 'Bologna' 'Monza e della Brianza' 'Torino' 'Ancona' 'Brescia' 'Roma'
 'Napoli' 'Gallura' 'Firenze' 'Vicenza' 'Taranto' 'Enna' 'Varese'
 'Treviso' 'Ascoli Piceno' 'Lodi' 'Padova' 'Reggio Calabria' 'Cagliari']
province
NaN                      34
Milano                   19
Roma                      8
Torino                    6
Salerno                   4
Napoli                    4
Vicenza                   3
Gallura                   3
Avellino                  2
Genova                    2
Ancona                    2
Brescia                   2
Monza e della Brianza     1
La Spezia                 1
Lecce                     1
Verona                    1
Bologna                   1
Firenze                   1
Taranto                   1
Enna                      1
Varese                    1
Treviso                   1
Ascoli Piceno             1
Lodi                      1
Padova                    1
Reggio Ca

In [52]:
artists["region"].unique()

print(artists["region"].unique())

region_count = artists["region"].value_counts(dropna=False)

print(region_count)

for place,count in birth_date_count.items():
    print(f"{place} : {(count / artists.shape[0]) * 100:.2f}%")

[nan 'Veneto' 'Lombardia' 'Puglia' 'Campania' 'Liguria' 'Emilia-Romagna'
 'Piemonte' 'Lazio' 'Sardegna' 'Toscana' 'Sicilia' 'Marche' 'Calabria']
region
NaN               36
Lombardia         24
Campania          10
Lazio              8
Piemonte           6
Veneto             6
Sardegna           4
Liguria            3
Puglia             2
Emilia-Romagna     1
Toscana            1
Sicilia            1
Marche             1
Calabria           1
Name: count, dtype: int64
nan : 29.81%
1990-07-11 : 0.96%
1975-08-08 : 0.96%
1983-02-05 : 0.96%
1973-08-03 : 0.96%
1973-10-09 : 0.96%
1998-04-20 : 0.96%
1998-11-07 : 0.96%
1982-12-21 : 0.96%
1983-07-11 : 0.96%
1993-11-05 : 0.96%
1975-05-15 : 0.96%
1997-05-16 : 0.96%
1989-11-14 : 0.96%
1985-12-13 : 0.96%
1985-03-16 : 0.96%
1993-11-29 : 0.96%
1976-10-17 : 0.96%
1989-10-15 : 0.96%
1989-12-27 : 0.96%
1969-08-18 : 0.96%
1989-11-03 : 0.96%
1988-11-04 : 0.96%
2000-03-23 : 0.96%
1993-05-21 : 0.96%
1982-04-01 : 0.96%
1979-03-30 : 0.96%
1990-05-14 : 0.96%
19

### CAREER START AND END

### LATITUDE AND LONGITUDE
These features don't bring any meaningfull information so they are dropped

## TRACKS INFO

In [43]:
summary = pd.DataFrame({
    'dtype': tracks.dtypes,
    'unique_values': tracks.nunique(),
    'missing_values': tracks.isna().sum(),
    'missing_percent': tracks.isna().mean() * 100,
    'duplicates': [tracks.duplicated().sum()] * len(tracks.columns)
})
print(summary)


                        dtype  unique_values  missing_values  missing_percent  \
id_artist              object            104               0         0.000000   
name_artist            object            104               0         0.000000   
full_title             object          11164               0         0.000000   
title                  object          10521               0         0.000000   
featured_artists       object           1740            7649        68.502597   
primary_artist         object            104               0         0.000000   
language               object             35             105         0.940355   
album                  object            890            1514        13.559018   
stats_pageviews       float64           4422            6524        58.427369   
swear_IT                int64             44               0         0.000000   
swear_EN                int64             40               0         0.000000   
swear_IT_words         objec