In [80]:
import os 
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import sqldf
import numpy as np
import nltk
import re
from wordcloud import WordCloud

This dataset contains artist data for ~20k artists whose songs made it to the Spotify weekly charts and ~136k additional artists who had at least one feature with at least one of the chart artists.

Further, information on the occurrence of features between all of these artists is included, allowing to generate a network with 135k+ musicians as nodes and 300k+ collaboration edges between them

spotify_id: A unique identifier for an artist or entity on Spotify.
name: The name of the artist or entity.
followers: The number of followers the artist has on Spotify.
popularity: A numeric value representing the artist's popularity on Spotify, likely based on metrics such as streams, saves, and other user interactions(it'S in percentile)
genres: A list of genres associated with the artist, enclosed in square brackets (we have 2585 genres in total)
chart_hits: A list of countries and chart positions the artist has achieved, formatted as ['country_code (rank)'].


In [81]:
nodes = pd.read_csv('/home/manoj/tuhh/data_science_3rd_sem/deep_learning_for_social_analytics/project/deep-learning-for-social-analytics-project/artist_and_music_data/nodes.csv')
edges = pd.read_csv('/home/manoj/tuhh/data_science_3rd_sem/deep_learning_for_social_analytics/project/deep-learning-for-social-analytics-project/artist_and_music_data/edges.csv')

In [82]:
nodes.shape

(156422, 6)

In [83]:
nodes.isnull().sum()

spotify_id         0
name               4
followers          4
popularity         0
genres             0
chart_hits    136781
dtype: int64

In [84]:
print(nodes[nodes['name'].isnull()])


                    spotify_id name  followers  popularity genres  chart_hits
12785   4oPYazJJ1o4rWBrTw9lm40  NaN    12853.0          42     []  ['no (1)']
118831  0TssKMrMlrXSxIwNrghB1V  NaN       14.0          25     []         NaN
122392  6JWuxVtbFw3VM4dxG8WIyg  NaN      632.0          38     []         NaN
147330  4EE7hLCmir49XJjVXPk07e  NaN    10840.0          17     []         NaN


In [85]:
nodes.dropna(subset=['name'], inplace=True)


In [86]:
print(nodes[nodes['followers'].isnull()])


                    spotify_id              name  followers  popularity  \
26079   4Jgl9FmNQF6ontIRyY19Ig             MC JL        NaN          18   
105067  3cCFieWefBXyyDRsjNuArE  Christian Wagner        NaN           4   
127808  1lLHQcDQFM03FcxZ5mQimA     Pekin Ibrahim        NaN           9   
136354  7estJE1m5cJnQs3Rc4iar0           Toni102        NaN           2   

                          genres chart_hits  
26079   ['deep funk ostentacao']        NaN  
105067                        []        NaN  
127808                        []        NaN  
136354                        []        NaN  


In [87]:
nodes.dropna(subset=['followers'], inplace=True)


In [88]:
nodes.isnull().sum()

spotify_id         0
name               0
followers          0
popularity         0
genres             0
chart_hits    136774
dtype: int64

In [89]:
nodes[nodes['chart_hits'].isnull()]

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
124,55tJwpPIz9BMrSLM45iEXX,9MUSES,58.0,38,"['k-pop', 'k-pop girl group']",
349,0fM4WhKpjJZc3a0eX947Tc,Sirius,1.0,25,[],
769,6RDpAm5XMUZqVZlBJ1UYQw,Starian,2.0,45,[],
892,0fAxJ75WJT3STJb3xWtpbc,MOLY,25.0,43,"['dominican pop', 'modern salsa', 'pop peruano...",
3700,0xHANYVyAfbp6GmjPKcOad,Harris Baba,1.0,21,[],
...,...,...,...,...,...,...
156417,2ces6d2YsQP1RpGMYpdFy8,David Urwitz,5470.0,29,['classic swedish pop'],
156418,6AeznZajNbXUulT7W4tK5l,Darmiko,2022.0,23,[],
156419,3GEijIjrgb4lPe9WtURBzz,Katriell,268.0,0,[],
156420,0ldQL0icSoMz9OOZcWG8Zt,Yung Fresh,181.0,19,[],


In [90]:


# Filter rows where 'chart_hits' is NaN
chart_hits_nan_rows = nodes[nodes['chart_hits'].isnull()]

# Save the filtered DataFrame to a new CSV file
chart_hits_nan_rows.to_csv('song_hit_exp_in_country.csv', index=False)


In [91]:
df_null_country = pd.read_csv('song_hit_exp_in_country.csv')
df_null_country.shape

(136774, 6)

In [92]:
df_null_country.head(10)

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
0,55tJwpPIz9BMrSLM45iEXX,9MUSES,58.0,38,"['k-pop', 'k-pop girl group']",
1,0fM4WhKpjJZc3a0eX947Tc,Sirius,1.0,25,[],
2,6RDpAm5XMUZqVZlBJ1UYQw,Starian,2.0,45,[],
3,0fAxJ75WJT3STJb3xWtpbc,MOLY,25.0,43,"['dominican pop', 'modern salsa', 'pop peruano...",
4,0xHANYVyAfbp6GmjPKcOad,Harris Baba,1.0,21,[],
5,2UoLlLFDkqewHH7EzaZcl7,Luisito Ayala Y La Puerto Rican Power,156.0,55,"['salsa', 'salsa puertorriquena']",
6,2JkJfNbvLY88Fvzspynu6G,Singhsta,30.0,41,[],
7,76Pz9hE2Q0eHm8mfVf5fET,Gobi Beast,0.0,3,[],
8,4R3mugkUqCALXgkwSptTbg,Đạt G,68.0,39,['vietnamese hip hop'],
9,6QR0aIEAemEigDCKjOVxe0,HYPNOSISMIC -D.R.B- (Division All Stars),20.0,41,"['anime', 'hypnosis mic', 'j-division']",


In [93]:

# Remove rows where 'chart_hits' is NaN, updating the DataFrame in place
nodes.dropna(subset=['chart_hits'], inplace=True)

nodes.shape

(19640, 6)

In [94]:
nodes.head(5)

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
0,48WvrUGoijadXXCsGocwM4,Byklubben,1738.0,24,"['nordic house', 'russelater']",['no (3)']
1,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ..."
2,652XIvIBNGg3C0KIGEJWit,Maxim,34596.0,36,[],['de (1)']
3,3dXC1YPbnQPsfHPVkm1ipj,Christopher Martin,249233.0,52,"['dancehall', 'lovers rock', 'modern reggae', ...","['at (1)', 'de (1)']"
4,74terC9ol9zMo8rfzhSOiG,Jakob Hellman,21193.0,39,"['classic swedish pop', 'norrbotten indie', 's...",['se (6)']


In [95]:

# Find common names between 'df['name']' and 'nodes['name']'
common_names_mask = df_null_country['name'].isin(nodes['name'])

# Extract rows from 'df' where the name is in 'nodes'
common_name_rows = df_null_country[common_names_mask]

# Display these rows
print(common_name_rows)

                    spotify_id       name  followers  popularity genres  \
1       0fM4WhKpjJZc3a0eX947Tc     Sirius        1.0          25     []   
44      04421rZTe44qdyNd2TLv5U  Southside        4.0          27     []   
142     0sXCaHp8TTF9fGncW7q0Ju         2K     1456.0          23     []   
168     0O8RNAxmdNK4loD8WUXEjk       Lira      438.0          16     []   
195     7bhf2PavNjzbUf02U2UVZp       Mr T       63.0           4     []   
...                        ...        ...        ...         ...    ...   
136563  6BmOzg47JGZBbnFl76Arbt   Papillon        1.0           0     []   
136608  0i6Wjb9BelaqViIBGBRx77       Koki       96.0          20     []   
136669  68m8Vm4YiCFp3NMwnmUMeP       Vice     3477.0          22     []   
136742  02JnAIyUx6CaBN2EZawwuv    Sentino       11.0           0     []   
136761  66RCMcPCPiqbG1N3UfokUQ     Cecile     1115.0          32     []   

        chart_hits  
1              NaN  
44             NaN  
142            NaN  
168            

In [96]:
import pandas as pd
import ast

# Dictionary mapping ISO 3166-1 alpha-2 country codes to country names
country_code_to_name = {
    'AF': 'Afghanistan', 'AL': 'Albania', 'DZ': 'Algeria', 'AD': 'Andorra', 'AO': 'Angola',
    'AG': 'Antigua and Barbuda', 'AR': 'Argentina', 'AM': 'Armenia', 'AU': 'Australia',
    'AT': 'Austria', 'AZ': 'Azerbaijan', 'BS': 'Bahamas', 'BH': 'Bahrain', 'BD': 'Bangladesh',
    'BB': 'Barbados', 'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin',
    'BT': 'Bhutan', 'BO': 'Bolivia', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana',
    'BR': 'Brazil', 'BN': 'Brunei', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi',
    'CV': 'Cabo Verde', 'KH': 'Cambodia', 'CM': 'Cameroon', 'CA': 'Canada',
    'CF': 'Central African Republic', 'TD': 'Chad', 'CL': 'Chile', 'CN': 'China',
    'CO': 'Colombia', 'KM': 'Comoros', 'CG': 'Congo (Congo-Brazzaville)',
    'CR': 'Costa Rica', 'HR': 'Croatia', 'CU': 'Cuba', 'CY': 'Cyprus',
    'CZ': 'Czechia (Czech Republic)', 'CD': 'Democratic Republic of the Congo',
    'DK': 'Denmark', 'DJ': 'Djibouti', 'DM': 'Dominica', 'DO': 'Dominican Republic',
    'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador', 'GQ': 'Equatorial Guinea',
    'ER': 'Eritrea', 'EE': 'Estonia', 'SZ': 'Eswatini (fmr. "Swaziland")', 'ET': 'Ethiopia',
    'FJ': 'Fiji', 'FI': 'Finland', 'FR': 'France', 'GA': 'Gabon', 'GM': 'Gambia',
    'GE': 'Georgia', 'DE': 'Germany', 'GH': 'Ghana', 'GR': 'Greece', 'GD': 'Grenada',
    'GT': 'Guatemala', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana',
    'HT': 'Haiti', 'VA': 'Holy See', 'HN': 'Honduras', 'HU': 'Hungary', 'IS': 'Iceland', 'HK': 'Hong kong',
    'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran', 'IQ': 'Iraq', 'IE': 'Ireland',
    'IL': 'Israel', 'IT': 'Italy', 'JM': 'Jamaica', 'JP': 'Japan', 'JO': 'Jordan',
    'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KI': 'Kiribati', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan',
    'LA': 'Laos', 'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia',
    'LY': 'Libya', 'LI': 'Liechtenstein', 'LT': 'Lithuania', 'LU': 'Luxembourg',
    'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali',
    'MT': 'Malta', 'MH': 'Marshall Islands', 'MR': 'Mauritania', 'MU': 'Mauritius',
    'MX': 'Mexico', 'FM': 'Micronesia', 'MD': 'Moldova', 'MC': 'Monaco', 'MN': 'Mongolia',
    'ME': 'Montenegro', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar (formerly Burma)',
    'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands', 'NZ': 'New Zealand',
    'NI': 'Nicaragua', 'NE': 'Niger', 'NG': 'Nigeria', 'KP': 'North Korea',
    'MK': 'North Macedonia', 'NO': 'Norway', 'OM': 'Oman', 'PK': 'Pakistan', 'PW': 'Palau',
    'PS': 'Palestine State', 'PA': 'Panama', 'PG': 'Papua New Guinea', 'PY': 'Paraguay',
    'PE': 'Peru', 'PH': 'Philippines', 'PL': 'Poland', 'PT': 'Portugal', 'QA': 'Qatar',
    'RO': 'Romania', 'RU': 'Russia', 'RW': 'Rwanda', 'KN': 'Saint Kitts and Nevis',
    'LC': 'Saint Lucia', 'VC': 'Saint Vincent and the Grenadines', 'WS': 'Samoa',
    'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia', 'SN': 'Senegal',
    'RS': 'Serbia', 'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore',
    'SK': 'Slovakia', 'SI': 'Slovenia', 'SB': 'Solomon Islands', 'SO': 'Somalia',
    'ZA': 'South Africa', 'KR': 'South Korea', 'SS': 'South Sudan', 'ES': 'Spain',
    'LK': 'Sri Lanka', 'SD': 'Sudan', 'SR': 'Suriname', 'SE': 'Sweden', 'CH': 'Switzerland',
    'SY': 'Syria', 'TJ': 'Tajikistan', 'TZ': 'Tanzania', 'TH': 'Thailand',
    'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TW':'Taiwan',
    'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan', 'TV': 'Tuvalu', 'UG': 'Uganda',
    'UA': 'Ukraine', 'AE': 'United Arab Emirates', 'GB': 'United Kingdom',
    'US': 'United States of America', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
    'VE': 'Venezuela', 'VN': 'Vietnam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
}


# Convert lowercase codes to uppercase to match the dictionary keys
country_code_to_name = {k.lower(): v for k, v in country_code_to_name.items()}

# Function to parse the 'chart_hits' column
def parse_chart_hits(chart_hits):
    if pd.isna(chart_hits):
        return []
    # Safely evaluate the string to a list
    entries = ast.literal_eval(chart_hits)
    # Extract country and rank as tuples
    parsed = [(entry.split(' ')[0], int(entry.split('(')[1][:-1])) for entry in entries]
    return parsed

# Parse the 'chart_hits' column
nodes['parsed_chart_hits'] = nodes['chart_hits'].apply(parse_chart_hits)

# Expand the parsed chart hits into separate rows
expanded_data = nodes.explode('parsed_chart_hits').dropna(subset=['parsed_chart_hits'])

# Separate the country and rank into individual columns
expanded_data[['country', 'rank']] = pd.DataFrame(expanded_data['parsed_chart_hits'].tolist(), index=expanded_data.index)

# Map the country codes to full country names
expanded_data['country_full'] = expanded_data['country'].map(country_code_to_name)

# Drop intermediate column
expanded_data = expanded_data.drop(columns=['parsed_chart_hits'])

# Save the final result
expanded_data.to_csv('expanded_nodes.csv', index=False)

# Print the first few rows of the resulting DataFrame
print(expanded_data.head())
expanded_data.to_csv('node_with_countries.csv', index=False)


               spotify_id       name  followers  popularity  \
0  48WvrUGoijadXXCsGocwM4  Byklubben     1738.0          24   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   

                                   genres  \
0          ['nordic house', 'russelater']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   
1  ['christlicher rap', 'german hip hop']   

                                          chart_hits country  rank  \
0                                         ['no (3)']      no     3   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...      at    44   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...      de   111   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...      lu    22   


In [97]:
df = pd.read_csv('node_with_countries.csv')

In [98]:
lowest_value = df['popularity'].min()
highest_value = df['popularity'].max()

print(f'Lowest value: {lowest_value}')
print(f'Highest value: {highest_value}')

Lowest value: 0
Highest value: 100


In [99]:
df.shape

(79395, 9)

In [100]:
df.isnull().sum()

spotify_id      0
name            0
followers       0
popularity      0
genres          0
chart_hits      0
country         0
rank            0
country_full    0
dtype: int64

In [101]:
lowest_value = df['rank'].min()
highest_value = df['rank'].max()

print(f'Lowest value: {lowest_value}')
print(f'Highest value: {highest_value}')

Lowest value: 1
Highest value: 379


In [102]:
print(df[df['name'].isnull()])
# We will remove the row which doesn't have the artist name and any way it's 1 record.

Empty DataFrame
Columns: [spotify_id, name, followers, popularity, genres, chart_hits, country, rank, country_full]
Index: []


In [103]:
df.head(20)

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits,country,rank,country_full
0,48WvrUGoijadXXCsGocwM4,Byklubben,1738.0,24,"['nordic house', 'russelater']",['no (3)'],no,3,Norway
1,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...",at,44,Austria
2,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...",de,111,Germany
3,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...",lu,22,Luxembourg
4,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...",ch,31,Switzerland
5,4lDiJcOJ2GLCK6p9q5BgfK,Kontra K,1999676.0,72,"['christlicher rap', 'german hip hop']","['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...",vn,1,Vietnam
6,652XIvIBNGg3C0KIGEJWit,Maxim,34596.0,36,[],['de (1)'],de,1,Germany
7,3dXC1YPbnQPsfHPVkm1ipj,Christopher Martin,249233.0,52,"['dancehall', 'lovers rock', 'modern reggae', ...","['at (1)', 'de (1)']",at,1,Austria
8,3dXC1YPbnQPsfHPVkm1ipj,Christopher Martin,249233.0,52,"['dancehall', 'lovers rock', 'modern reggae', ...","['at (1)', 'de (1)']",de,1,Germany
9,74terC9ol9zMo8rfzhSOiG,Jakob Hellman,21193.0,39,"['classic swedish pop', 'norrbotten indie', 's...",['se (6)'],se,6,Sweden


In [36]:
df['genre_list'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Transform genre lists to strings, handling empty lists
df['genre_string'] = df['genre_list'].apply(lambda x: ', '.join(x) if x else 'No Genre Available')

# Gather all unique genres
unique_genres = set(genre for sublist in df['genre_list'] for genre in sublist)

# Create a DataFrame for all one-hot encoded genre columns
one_hot_encoded_df = pd.DataFrame(
    {genre: df['genre_list'].apply(lambda x: int(genre in x)) for genre in unique_genres}
)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df = pd.concat([df, one_hot_encoded_df], axis=1)

# Display the updated DataFrame
print("DataFrame with Genre Strings and One-Hot Encoding:\n", df.head())

# Calculate and display the frequency of each genre
all_genres = [genre for sublist in df['genre_list'] for genre in sublist]
genre_counts = pd.Series(all_genres).value_counts()

print("\nGenre Counts:\n", genre_counts)

DataFrame with Genre Strings and One-Hot Encoding:
                spotify_id       name  followers  popularity  \
0  48WvrUGoijadXXCsGocwM4  Byklubben     1738.0          24   
1  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
2  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
3  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   
4  4lDiJcOJ2GLCK6p9q5BgfK   Kontra K  1999676.0          72   

                                   genres  \
0          ['nordic house', 'russelater']   
1  ['christlicher rap', 'german hip hop']   
2  ['christlicher rap', 'german hip hop']   
3  ['christlicher rap', 'german hip hop']   
4  ['christlicher rap', 'german hip hop']   

                                          chart_hits  \
0                                         ['no (3)']   
1  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
2  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
3  ['at (44)', 'de (111)', 'lu (22)', 'ch (31)', ...   
4  ['at (44)', 'de

In [104]:

import pandas as pd
import ast

# Assuming 'df' is your DataFrame and 'genres' is the column with genre data as strings

# Convert genre strings to lists
df['genre_list'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Collect all unique genres
unique_genres = set(genre for sublist in df['genre_list'] for genre in sublist)

# Print the unique genres
print("Unique genres:")
for genre in unique_genres:
    print(genre)

Unique genres:
funk rock
turkish instrumental
slovak trap
italian alternative
rock alternatif francais
russian synthpop
indie rock italiano
graz indie
pakistani folk
baltic post-punk
toronto rap
muzica moldoveneasca
rock uruguayo
argentine punk
quebec indie
melbourne indie
samoan pop
j-reggae
javanese dangdut
neomelodici
cumbia paraguaya
electric blues
indonesian indie
britpop
future house
hands up
bernu dziesmas
funk capixaba
pop chileno
athens indie
final fantasy
banda carnavalera
russian metal
british soundtrack
neurofunk
jawaiian
southern hip hop
british soul
progressive thrash
indie nica
chinese new year
musica mocambicana
chinderlieder
turkce kadin rap
classic portuguese pop
folklore ecuatoriano
dutch singer-songwriter
groove metal
arrochadeira
birmingham hip hop
japanese chill rap
poezja spiewana
deep underground hip hop
coverchill
j-punk
bay area indie
dance pop
crunk
r&b argentino
uk contemporary jazz
new romantic
latin alternative
musica hondurena
indie tico
german hip hop
me

In [112]:

# Define the top 15 genres or genre classifications
top_genres = {
    'Pop': ['pop', 'j-pop', 'k-pop', 'synthpop', 'electropop', 'teen pop'],
    'Rock': ['rock', 'punk', 'garage'],
    'Hip Hop': ['hip hop', 'rap', 'drill'],
    'Electronic': ['edm', 'house', 'techno', 'electronic', 'trance', 'dance'],
    'Jazz': ['jazz'],
    'Folk World': ['folk', 'celtic', 'world', 'americana', 'country'],
    'RandB Soul': ['soul', 'r&b', 'rhythm and blues', 'funk', 'blues'],
    'Metal': ['metal'],
    'Classical Orchestral': ['classical', 'orchestral', 'symphony'],
    'Reggae Dancehall': ['reggae', 'dub', 'riddim', 'ska'],
    'Latin': ['salsa', 'latin', 'bachata', 'cumbia'],
    'Alternative Indie': ['indie', 'alternative', 'emo'],
    'Blues': ['blues'],
    'Punk': ['punk'],
    'Soundtrack': ['soundtrack', 'anime', 'broadway', 'movie']
}

# Initialize the genre columns with 0
for genre in top_genres:
    df[genre] = 0

# Function to encode genres based on the presence of keywords
def encode_genres(genres_list):
    # Initialize dictionary to track present genres
    present_genres = {genre: 0 for genre in top_genres}
    # Iterate over each specific genre
    for genre in genres_list:
        genre_lower = genre.lower()  # Ensure case-insensitive matching
        # Check against each broad genre and its keywords
        for broad_genre, keywords in top_genres.items():
            if any(keyword in genre_lower for keyword in keywords):
                present_genres[broad_genre] = 1
    return present_genres

# Apply the function to each row and update the DataFrame
for index, row in df.iterrows():
    encoded = encode_genres(row['genres'])
    for genre, value in encoded.items():
        df.at[index, genre] = value

# Display the resulting DataFrame
print(df[['country_full', 'genres'] + list(top_genres.keys())])

      country_full                                             genres  Pop  \
0           Norway                     ['nordic house', 'russelater']    0   
1          Austria             ['christlicher rap', 'german hip hop']    0   
2          Germany             ['christlicher rap', 'german hip hop']    0   
3       Luxembourg             ['christlicher rap', 'german hip hop']    0   
4      Switzerland             ['christlicher rap', 'german hip hop']    0   
...            ...                                                ...  ...   
79390      Finland    ['finnish dance pop', 'finnish pop', 'iskelma']    0   
79391      Austria                                     ['german pop']    0   
79392      Germany                                     ['german pop']    0   
79393        Spain                                 ['urbano espanol']    0   
79394        Chile  ['chilean rock', 'rap chileno', 'reggae en esp...    0   

       Rock  Hip Hop  Electronic  Jazz  Folk World  RandB Soul 

In [115]:
top_genres = {
    'Pop': ['pop', 'j-pop', 'k-pop', 'synthpop', 'electropop', 'teen pop'],
    'Rock': ['rock', 'punk', 'garage'],
    'Hip Hop': ['hip hop', 'rap', 'drill'],
    'Electronic': ['edm', 'house', 'techno', 'electronic', 'trance', 'dance'],
    'Jazz': ['jazz'],
    'Folk World': ['folk', 'celtic', 'world', 'americana', 'country'],
    'RandB Soul': ['soul', 'r&b', 'rhythm and blues', 'funk', 'blues'],
    'Metal': ['metal'],
    'Classical Orchestral': ['classical', 'orchestral', 'symphony'],
    'Reggae Dancehall': ['reggae', 'dub', 'riddim', 'ska'],
    'Latin': ['salsa', 'latin', 'bachata', 'cumbia'],
    'Alternative Indie': ['indie', 'alternative', 'emo'],
    'Blues': ['blues'],
    'Punk': ['punk'],
    'Soundtrack': ['soundtrack', 'anime', 'broadway', 'movie']
}
# Initialize the genre columns with 0
for genre in top_genres:
    df[genre] = 0
# Function to perform one-hot encoding based on substring presence
def encode_genres(genres_list):
    present_genres = {genre: 0 for genre in top_genres}
    for genre in genres_list:
        genre_lower = genre.lower()
        for broad_genre, keywords in top_genres.items():
            # Check if any keyword is a substring of the genre element
            if any(keyword in genre_lower for keyword in keywords):
                present_genres[broad_genre] = 1
    return present_genres

# Apply the function to encode the genres
for index, row in df.iterrows():
    encoded = encode_genres(row['genres'])
    for genre, value in encoded.items():
        df.at[index, genre] = value

# Display the resulting DataFrame
print(df.head(5)[['country_full', 'genres'] + list(top_genres.keys())])

  country_full                                  genres  Pop  Rock  Hip Hop  \
0       Norway          ['nordic house', 'russelater']    0     0        0   
1      Austria  ['christlicher rap', 'german hip hop']    0     0        0   
2      Germany  ['christlicher rap', 'german hip hop']    0     0        0   
3   Luxembourg  ['christlicher rap', 'german hip hop']    0     0        0   
4  Switzerland  ['christlicher rap', 'german hip hop']    0     0        0   

   Electronic  Jazz  Folk World  RandB Soul  Metal  Classical Orchestral  \
0           0     0           0           0      0                     0   
1           0     0           0           0      0                     0   
2           0     0           0           0      0                     0   
3           0     0           0           0      0                     0   
4           0     0           0           0      0                     0   

   Reggae Dancehall  Latin  Alternative Indie  Blues  Punk  Soundtrack  
0

In [107]:
df.head(10)

0                       ['nordic house', 'russelater']
1               ['christlicher rap', 'german hip hop']
2               ['christlicher rap', 'german hip hop']
3               ['christlicher rap', 'german hip hop']
4               ['christlicher rap', 'german hip hop']
5               ['christlicher rap', 'german hip hop']
6                                                   []
7    ['dancehall', 'lovers rock', 'modern reggae', ...
8    ['dancehall', 'lovers rock', 'modern reggae', ...
9    ['classic swedish pop', 'norrbotten indie', 's...
Name: genres, dtype: object