In [8]:
import os 
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import sqldf
import numpy as np
import nltk
import re
from wordcloud import WordCloud

This dataset contains artist data for ~20k artists whose songs made it to the Spotify weekly charts and ~136k additional artists who had at least one feature with at least one of the chart artists.

Further, information on the occurrence of features between all of these artists is included, allowing to generate a network with 135k+ musicians as nodes and 300k+ collaboration edges between them

spotify_id: A unique identifier for an artist or entity on Spotify.
name: The name of the artist or entity.
followers: The number of followers the artist has on Spotify.
popularity: A numeric value representing the artist's popularity on Spotify, likely based on metrics such as streams, saves, and other user interactions(it'S in percentile)
genres: A list of genres associated with the artist, enclosed in square brackets (we have 2585 genres in total)
chart_hits: A list of countries and chart positions the artist has achieved, formatted as ['country_code (rank)'].


In [10]:
nodes = pd.read_csv('/home/manoj/tuhh/data_science_3rd_sem/deep_learning_for_social_analytics/project/deep-learning-for-social-analytics-project/artist_and_music_data/nodes.csv')
edges = pd.read_csv('/home/manoj/tuhh/data_science_3rd_sem/deep_learning_for_social_analytics/project/deep-learning-for-social-analytics-project/artist_and_music_data/edges.csv')

In [11]:
nodes.head()

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
0,48WvrUGoijadXXCsGocwM4,Byklubben,1738.0,24,"['nordic house', 'russelater']",['no (3)']
1,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ..."
2,652XIvIBNGg3C0KIGEJWit,Maxim,34596.0,36,[],['de (1)']
3,3dXC1YPbnQPsfHPVkm1ipj,Christopher Martin,249233.0,52,"['dancehall', 'lovers rock', 'modern reggae', ...","['at (1)', 'de (1)']"
4,74terC9ol9zMo8rfzhSOiG,Jakob Hellman,21193.0,39,"['classic swedish pop', 'norrbotten indie', 's...",['se (6)']


In [12]:
edges.head()

Unnamed: 0,id_0,id_1
0,76M2Ekj8bG8W7X2nbx2CpF,7sfl4Xt5KmfyDs2T3SVSMK
1,0hk4xVujcyOr6USD95wcWb,7Do8se3ZoaVqUt3woqqSrD
2,38jpuy3yt3QIxQ8Fn1HTeJ,4csQIMQm6vI2A2SCVDuM2z
3,6PvcxssrQ0QaJVaBWHD07l,6UCQYrcJ6wab6gnQ89OJFh
4,2R1QrQqWuw3IjoP5dXRFjt,4mk1ScvOUkuQzzCZpT6bc0


In [22]:
import pandas as pd
import ast

# Dictionary mapping ISO 3166-1 alpha-2 country codes to country names
country_code_to_name = {
    'AF': 'Afghanistan', 'AL': 'Albania', 'DZ': 'Algeria', 'AD': 'Andorra', 'AO': 'Angola',
    'AG': 'Antigua and Barbuda', 'AR': 'Argentina', 'AM': 'Armenia', 'AU': 'Australia',
    'AT': 'Austria', 'AZ': 'Azerbaijan', 'BS': 'Bahamas', 'BH': 'Bahrain', 'BD': 'Bangladesh',
    'BB': 'Barbados', 'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin',
    'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana',
    'BR': 'Brazil', 'BN': 'Brunei', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi',
    'CV': 'Cabo Verde', 'KH': 'Cambodia', 'CM': 'Cameroon', 'CA': 'Canada',
    'CF': 'Central African Republic', 'TD': 'Chad', 'CL': 'Chile', 'CN': 'China',
    'CO': 'Colombia', 'KM': 'Comoros', 'CG': 'Congo (Congo-Brazzaville)',
    'CR': 'Costa Rica', 'HR': 'Croatia', 'CU': 'Cuba', 'CY': 'Cyprus',
    'CZ': 'Czechia (Czech Republic)', 'CD': 'Democratic Republic of the Congo',
    'DK': 'Denmark', 'DJ': 'Djibouti', 'DM': 'Dominica', 'DO': 'Dominican Republic',
    'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador', 'GQ': 'Equatorial Guinea',
    'ER': 'Eritrea', 'EE': 'Estonia', 'SZ': 'Eswatini (fmr. "Swaziland")', 'ET': 'Ethiopia',
    'FJ': 'Fiji', 'FI': 'Finland', 'FR': 'France', 'GA': 'Gabon', 'GM': 'Gambia',
    'GE': 'Georgia', 'DE': 'Germany', 'GH': 'Ghana', 'GR': 'Greece', 'GD': 'Grenada',
    'GT': 'Guatemala', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana',
    'HT': 'Haiti', 'VA': 'Holy See', 'HN': 'Honduras', 'HU': 'Hungary', 'IS': 'Iceland',
    'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran', 'IQ': 'Iraq', 'IE': 'Ireland',
    'IL': 'Israel', 'IT': 'Italy', 'JM': 'Jamaica', 'JP': 'Japan', 'JO': 'Jordan',
    'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KI': 'Kiribati', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan',
    'LA': 'Laos', 'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia',
    'LY': 'Libya', 'LI': 'Liechtenstein', 'LT': 'Lithuania', 'LU': 'Luxembourg',
    'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali',
    'MT': 'Malta', 'MH': 'Marshall Islands', 'MR': 'Mauritania', 'MU': 'Mauritius',
    'MX': 'Mexico', 'FM': 'Micronesia', 'MD': 'Moldova', 'MC': 'Monaco', 'MN': 'Mongolia',
    'ME': 'Montenegro', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar (formerly Burma)',
    'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands', 'NZ': 'New Zealand',
    'NI': 'Nicaragua', 'NE': 'Niger', 'NG': 'Nigeria', 'KP': 'North Korea',
    'MK': 'North Macedonia', 'NO': 'Norway', 'OM': 'Oman', 'PK': 'Pakistan', 'PW': 'Palau',
    'PS': 'Palestine State', 'PA': 'Panama', 'PG': 'Papua New Guinea', 'PY': 'Paraguay',
    'PE': 'Peru', 'PH': 'Philippines', 'PL': 'Poland', 'PT': 'Portugal', 'QA': 'Qatar',
    'RO': 'Romania', 'RU': 'Russia', 'RW': 'Rwanda', 'KN': 'Saint Kitts and Nevis',
    'LC': 'Saint Lucia', 'VC': 'Saint Vincent and the Grenadines', 'WS': 'Samoa',
    'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia', 'SN': 'Senegal',
    'RS': 'Serbia', 'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore',
    'SK': 'Slovakia', 'SI': 'Slovenia', 'SB': 'Solomon Islands', 'SO': 'Somalia',
    'ZA': 'South Africa', 'KR': 'South Korea', 'SS': 'South Sudan', 'ES': 'Spain',
    'LK': 'Sri Lanka', 'SD': 'Sudan', 'SR': 'Suriname', 'SE': 'Sweden', 'CH': 'Switzerland',
    'SY': 'Syria', 'TJ': 'Tajikistan', 'TZ': 'Tanzania', 'TH': 'Thailand',
    'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago',
    'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan', 'TV': 'Tuvalu', 'UG': 'Uganda',
    'UA': 'Ukraine', 'AE': 'United Arab Emirates', 'GB': 'United Kingdom',
    'US': 'United States of America', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
    'VE': 'Venezuela', 'VN': 'Vietnam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
}


# Convert lowercase codes to uppercase to match the dictionary keys
country_code_to_name = {k.lower(): v for k, v in country_code_to_name.items()}

# Function to parse the 'chart_hits' column
def parse_chart_hits(chart_hits):
    if pd.isna(chart_hits):
        return []
    # Safely evaluate the string to a list
    entries = ast.literal_eval(chart_hits)
    # Extract country and rank as tuples
    parsed = [(entry.split(' ')[0], int(entry.split('(')[1][:-1])) for entry in entries]
    return parsed

# Parse the 'chart_hits' column
nodes['parsed_chart_hits'] = nodes['chart_hits'].apply(parse_chart_hits)

# Expand the parsed chart hits into separate rows
expanded_data = nodes.explode('parsed_chart_hits').dropna(subset=['parsed_chart_hits'])

# Separate the country and rank into individual columns
expanded_data[['country', 'rank']] = pd.DataFrame(expanded_data['parsed_chart_hits'].tolist(), index=expanded_data.index)

# Map the country codes to full country names
expanded_data['country_full'] = expanded_data['country'].map(country_code_to_name)

# Drop intermediate column
expanded_data = expanded_data.drop(columns=['parsed_chart_hits'])

# Save the final result
expanded_data.to_csv('expanded_nodes.csv', index=False)

# Print the first few rows of the resulting DataFrame
print(expanded_data.head())
expanded_data.to_csv('node_with_countries.csv', index=False)


               spotify_id       name  followers  popularity  \
0  48WvrUGoijadXXCsGocwM4  Byklubben     1738.0          24   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   

                                   genres  \
0          ['nordic house', 'russelater']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   

                                          chart_hits  \
0                                         ['no (3)']   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   

             

In [24]:
df = pd.read_csv('node_with_countries.csv')

In [25]:
df.head()

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits,countries,country,rank,country_full
0,48WvrUGoijadXXCsGocwM4,Byklubben,1738.0,24,"['nordic house', 'russelater']",['no (3)'],['Norway'],no,3,Norway
1,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...","['Austria', 'Germany', 'Luxembourg', 'Switzerl...",at,44,Austria
2,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...","['Austria', 'Germany', 'Luxembourg', 'Switzerl...",de,111,Germany
3,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...","['Austria', 'Germany', 'Luxembourg', 'Switzerl...",lu,22,Luxembourg
4,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...","['Austria', 'Germany', 'Luxembourg', 'Switzerl...",ch,31,Switzerland


In [26]:
lowest_value = df['popularity'].min()
highest_value = df['popularity'].max()

print(f'Lowest value: {lowest_value}')
print(f'Highest value: {highest_value}')

Lowest value: 0
Highest value: 100


In [28]:
df.isnull().sum()

spotify_id         0
name               1
followers          0
popularity         0
genres             0
chart_hits         0
countries          0
country            0
rank               0
country_full    2966
dtype: int64

In [36]:
df['genre_list'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Transform genre lists to strings, handling empty lists
df['genre_string'] = df['genre_list'].apply(lambda x: ', '.join(x) if x else 'No Genre Available')

# Gather all unique genres
unique_genres = set(genre for sublist in df['genre_list'] for genre in sublist)

# Create a DataFrame for all one-hot encoded genre columns
one_hot_encoded_df = pd.DataFrame(
    {genre: df['genre_list'].apply(lambda x: int(genre in x)) for genre in unique_genres}
)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1)

# Display the updated DataFrame
print("DataFrame with Genre Strings and One-Hot Encoding:\n", df.head())

# Calculate and display the frequency of each genre
all_genres = [genre for sublist in df['genre_list'] for genre in sublist]
genre_counts = pd.Series(all_genres).value_counts()

print("\nGenre Counts:\n", genre_counts)

DataFrame with Genre Strings and One-Hot Encoding:
                spotify_id       name  followers  popularity  \
0  48WvrUGoijadXXCsGocwM4  Byklubben     1738.0          24   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
2  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
3  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
4  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   

                                   genres  \
0          ['nordic house', 'russelater']   
1  ['christlicher rap', 'german hip hop']   
2  ['christlicher rap', 'german hip hop']   
3  ['christlicher rap', 'german hip hop']   
4  ['christlicher rap', 'german hip hop']   

                                          chart_hits  \
0                                         ['no (3)']   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
2  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
3  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
4  ['at (44)', 'de

In [39]:

import pandas as pd
import ast

# Assuming 'df' is your DataFrame and 'genres' is the column with genre data as strings

# Convert genre strings to lists
df['genre_list'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Collect all unique genres
unique_genres = set(genre for sublist in df['genre_list'] for genre in sublist)

# Print the unique genres
print("Unique genres:")
for genre in unique_genres:
    print(genre)

Unique genres:
sean-nos singing
greek drill
trap boricua
danish pop
canadian pop
final fantasy
irish country
gotlandsk musik
icelandic singer-songwriter
psychedelic trance
tecnobanda
industrial
polish trap
new wave
classic israeli pop
maltese hip hop
electronica chilena
nordic folk
hokkien pop
deep soft rock
lund indie
underground amapiano
jersey club
jamaican hip hop
finnish new wave
turkce slow sarkilar
dream pop
swedish indie rock
alternative americana
denton tx indie
neoclassical darkwave
romanian metal
finnish metal
finnish power metal
pinoy praise
nz folk
dangdut
nordic post-rock
rap criolo
abstract hip hop
turkish deep house
luk thung
slovak indie
canadian contemporary r&b
atlantic canada hip hop
idol rock
classic finnish rock
mestissatge
art pop
bmore
chicago punk
puerto rican rock
uzbek pop
korean indie folk
polish alternative
k-pop boy group
salsa venezolana
vegan straight edge
indie triste
neo-manele
japanese soul
pinoy edm
r&b italiano
cumbia ranchera
canadian old school hi