In [1]:
import os 
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import sqldf
import numpy as np
import nltk
import re
from wordcloud import WordCloud

In [2]:
# Load the raw artist node table.
# NOTE(review): this is an absolute path inside one user's home directory, so the
# cell only runs on that machine — consider a path relative to a configurable data dir.
nodes = pd.read_csv('/home/manoj/tuhh/data_science_3rd_sem/deep_learning_for_social_analytics/project/deep-learning-for-social-analytics-project/cleaned_datasets/archieve_but_important/nodes.csv')


In [3]:
nodes.dtypes


spotify_id     object
name           object
followers     float64
popularity      int64
genres         object
chart_hits     object
dtype: object

In [4]:
nodes.describe()

Unnamed: 0,followers,popularity
count,156418.0,156422.0
mean,86223.71,21.157497
std,940100.1,18.33829
min,0.0,0.0
25%,24.0,4.0
50%,363.0,18.0
75%,6258.0,34.0
max,102156900.0,100.0


In [5]:
nodes.isnull().sum()

spotify_id         0
name               4
followers          4
popularity         0
genres             0
chart_hits    136781
dtype: int64

In [6]:
# Drop the rows whose `name` is missing; rows without a name are not needed.
nodes.dropna(subset=['name'], inplace=True)


In [7]:
# Drop the rows whose `followers` value is missing.
nodes.dropna(subset=['followers'], inplace=True)

In [None]:

from concurrent.futures import ProcessPoolExecutor
from langdetect import detect, DetectorFactory

# Ensure consistent results from langdetect
DetectorFactory.seed = 0

def is_english(text):
    """Heuristically decide whether ``text`` is English.

    Pure-ASCII strings are accepted immediately without running language
    detection (a speed shortcut, so every ASCII-only string counts as
    English). Any other string is handed to ``langdetect.detect``.

    Args:
        text: The value to classify. Anything that is not a ``str``
            (for example a NaN float) is classified as not English.

    Returns:
        bool: True when the text is considered English, False otherwise,
        including when language detection fails.
    """
    if not isinstance(text, str):
        return False
    # Fast ASCII check to quickly determine if a text is likely English
    if all(ord(char) < 128 for char in text):
        return True
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection can fail on text it cannot analyse; treat that as
        # "not English". Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running filter.
        return False

def filter_english_on_name(df, chunksize=1000):
    """Return the rows of ``df`` whose ``name`` is classified as English.

    Each name is classified by ``is_english`` in a pool of worker processes.
    Only the ``name`` column is inspected; ``spotify_id`` holds ids, not text.

    Args:
        df: DataFrame that has a ``name`` column.
        chunksize: How many names are shipped to a worker per task. The
            executor default of 1 pays inter-process overhead for every
            single name, which dominates the run time on long columns.

    Returns:
        DataFrame: The rows of ``df`` flagged as English, with all columns
        and the original index labels preserved.
    """
    names = df['name'].tolist()
    with ProcessPoolExecutor() as executor:
        flags = list(executor.map(is_english, names, chunksize=chunksize))
    # Use an explicit boolean array: a plain (possibly empty) list passed to
    # df[...] can be interpreted as a column selection instead of a row mask.
    mask = np.array(flags, dtype=bool)
    return df[mask]

# Keep only the rows of `nodes` whose name is classified as English.
nodes_english = filter_english_on_name(nodes)

# Print the filtered DataFrame
print(nodes_english)

In [None]:
nodes_english.shape

In [None]:
# As discussed in the interim presentation, we drop the chart_hits attribute.
nodes_english = nodes_english.drop(columns=['chart_hits'])

In [None]:
nodes_english.shape

In [None]:

# Enumerate every distinct genre label that occurs in the data.
import pandas as pd
import ast


def _parse_genres(raw):
    """Parse the stringified list stored in `genres`; missing values become []."""
    if pd.notna(raw):
        return ast.literal_eval(raw)
    return []


# Materialise the parsed lists as their own column.
nodes_english['genre_list'] = nodes_english['genres'].apply(_parse_genres)

# Fold all per-artist lists into a single set of labels.
unique_genres = set()
for artist_genres in nodes_english['genre_list']:
    unique_genres.update(artist_genres)

# Report one label per line.
print("Unique genres:")
for genre in unique_genres:
    print(genre)

In [None]:
import pandas as pd
import ast
import re


# Convert genre strings to lists
# NOTE(review): these two statements repeat the previous cell's conversion verbatim;
# recomputing them here is redundant but gives the same result.
nodes_english['genre_list'] = nodes_english['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Collect all unique genres
unique_genres = set(genre for sublist in nodes_english['genre_list'] for genre in sublist)

# Define the mapping from sub-genres to prominent genres.
# Keys are the prominent-genre labels; values are the keywords searched for in a raw genre.
# Order matters: assign_genre walks this dict top to bottom and returns the first
# key with a matching keyword (e.g. a raw genre containing both "pop" and "rock" becomes 'pop').
# NOTE(review): the 'blues' and 'punk' keys can never be returned — 'blues' is already
# listed under 'randb_soul' and 'punk' under 'rock', and both of those come first.
top_genres = {
    'pop': ['pop', 'j-pop', 'k-pop', 'synthpop', 'electropop', 'teen pop'],
    'rock': ['rock', 'punk', 'garage'],
    'hip_hop': ['hip hop', 'rap', 'drill'],
    'electronic': ['edm', 'house', 'techno', 'electronic', 'trance', 'dance'],
    'jazz': ['jazz'],
    'folk world': ['folk', 'celtic', 'world', 'americana', 'country'],
    'randb_soul': ['soul', 'r&b', 'rhythm and blues', 'funk', 'blues'],
    'metal': ['metal'],
    'classical_orchestral': ['classical', 'orchestral', 'symphony'],
    'reggae_dancehall': ['reggae', 'dub', 'riddim', 'ska'],
    'latin': ['salsa', 'latin', 'bachata', 'cumbia'],
    'alternative indie': ['indie', 'alternative', 'emo'],
    'blues': ['blues'],
    'punk': ['punk'],
    'soundtrack': ['soundtrack', 'anime', 'broadway', 'movie']
}

def assign_genre(unique_genre, top_genre_map=None):
    """Map one raw genre label to a prominent (top-level) genre.

    The label is lower-cased and stripped, then tested against each
    keyword of each prominent genre in mapping order; the first prominent
    genre with a keyword that occurs as a whole word wins.

    Args:
        unique_genre: Raw genre string, e.g. ``"indie pop"``.
        top_genre_map: Optional ``{prominent_genre: [keyword, ...]}``
            mapping. Defaults to the module-level ``top_genres``.

    Returns:
        str: The matching prominent genre, or ``'unknown'`` when no
        keyword matches.
    """
    mapping = top_genres if top_genre_map is None else top_genre_map

    # Clean up the genre string for matching
    genre_clean = unique_genre.lower().strip()

    # Check against each top genre's sub-genres
    for top_genre, subgenres in mapping.items():
        for subgenre in subgenres:
            # Word boundaries avoid partial matches ("pop" must not hit "popcorn").
            # re.escape keeps keywords literal, so characters such as "." or "+"
            # in a keyword are not interpreted as regex operators.
            if re.search(rf'\b{re.escape(subgenre)}\b', genre_clean):
                return top_genre
    return 'unknown'

# Resolve every distinct raw genre to its prominent genre once, up front.
genre_mapping = {}
for raw_genre in unique_genres:
    genre_mapping[raw_genre] = assign_genre(raw_genre)


def _to_prominent(genres):
    """Translate one artist's raw genre list into prominent-genre labels."""
    return [genre_mapping.get(genre, 'Other') for genre in genres]


# Store the translated list next to the original genres.
nodes_english['prominent_genres'] = nodes_english['genre_list'].apply(_to_prominent)

# Show raw and mapped genres side by side to check the mappings.
print(nodes_english[['genres', 'prominent_genres']])

In [None]:

# Flatten every artist's prominent-genre list into one set of labels.
unique_prominent_genres = set()
for labels in nodes_english['prominent_genres']:
    unique_prominent_genres.update(labels)

# How many distinct prominent genres are in use.
unique_count = len(unique_prominent_genres)

# Report the count followed by the labels themselves.
print(f"Number of unique prominent genres: {unique_count}")
print("Unique prominent genres:", unique_prominent_genres)

In [None]:
# Step 1: Explode the genre lists into separate rows. Each artist's index label is
# repeated once per genre; an empty list becomes a single row with NaN.
df_exploded = nodes_english.explode('prominent_genres')

# Step 2: Drop repeated (artist, prominent genre) pairs so each genre counts once per
# artist. A vectorised duplicate mask replaces a per-artist groupby.apply, which runs
# Python code once per artist and is very slow on a table this size.
artist_genre_pairs = pd.DataFrame({
    'artist': df_exploded.index.to_numpy(),
    'genre': df_exploded['prominent_genres'].to_numpy(),
})
df_exploded = df_exploded[~artist_genre_pairs.duplicated().to_numpy()]

# Step 3: Perform one-hot encoding (NaN rows encode as all zeros)
one_hot_encoded = pd.get_dummies(df_exploded['prominent_genres'])

# Step 4: Sum up genre indicators for each original record by grouping back on the
# original index; after de-duplication every sum is 0 or 1.
final_one_hot_encoded = one_hot_encoded.groupby(level=0).sum()

# Combine back with the original columns; concat aligns the rows on the index.
df_final = pd.concat([nodes_english.drop(columns=['prominent_genres']), final_one_hot_encoded], axis=1)

# Display the one-hot encoded result
print(df_final)

In [None]:
# The raw and parsed genre columns are no longer needed once the one-hot columns exist.
df_final = df_final.drop(columns=['genres','genre_list'])


In [None]:
# Persist the encoded table; index=False keeps the DataFrame index out of the file.
df_final.to_csv('artist_without_country_wise_rank.csv', index=False)

In [None]:
# Reload the table that was just written; the rest of the notebook works on `df`.
df = pd.read_csv('artist_without_country_wise_rank.csv')

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.head(5)

In [None]:
df.describe()

In [None]:
# Correlate the numeric columns only. `df` still carries the string columns
# `spotify_id` and `name`, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Artist Features')
plt.show()

In [None]:
# One-hot genre columns. The names must match the keys of `top_genres` (plus
# 'unknown') exactly, because those keys become the column names during one-hot
# encoding; misspelt or differently-cased names raise KeyError.
genres_columns = ['alternative indie', 'classical_orchestral',
                  'electronic', 'folk world', 'hip_hop', 'jazz', 'latin', 'metal',
                  'pop', 'randb_soul', 'reggae_dancehall', 'rock', 'soundtrack', 'unknown']
correlation_matrix = df[genres_columns].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Artist Features with only genres')
plt.show()

In [None]:
# Genre indicator columns are everything except the identifiers and the two numeric
# features. They are derived from `df` itself so this cell does not depend on a
# variable that is only defined further down the notebook.
non_genre_columns = {'spotify_id', 'name', 'followers', 'popularity'}
genre_columns = [col for col in df.columns if col not in non_genre_columns]

# Sum of occurrences for each genre
genre_counts = df[genre_columns].sum().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=genre_counts.index, y=genre_counts.values)
plt.title('Number of Artists in Each Genre')
plt.xticks(rotation=90)
plt.show()

In [None]:

print(df[['followers', 'popularity']].head(5))

In [None]:
# Box plot of the follower counts, with thousands separators on the axis.
import matplotlib.ticker as ticker


def _with_thousands_sep(value, _pos):
    """Render a tick value as an integer with comma separators."""
    return f'{int(value):,}'


fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df['followers'], color='lightblue', ax=ax)
ax.set_title('Box Plot of Followers')
ax.set_xlabel('Number of Followers')
ax.xaxis.set_major_formatter(ticker.FuncFormatter(_with_thousands_sep))

plt.show()

In [None]:

import seaborn as sns

# Bin in log space. Computing 50 linear bins and only then switching the axis to
# log puts almost every artist into the first bin, which hides the shape of the
# distribution. Zero-follower artists cannot be placed on a log axis, so they
# are left out of this plot.
positive_followers = df.loc[df['followers'] > 0, 'followers']

plt.figure(figsize=(8, 6))
sns.histplot(positive_followers, bins=50, kde=False, color='skyblue', log_scale=True)
plt.title('Histogram of Followers')
plt.xlabel('Number of Followers')
plt.ylabel('Frequency')

plt.show()

In [None]:
# Select the artists whose follower count exceeds 20 million.
has_over_20m = df['followers'] > 20000000
artists_above_20m = df[has_over_20m]

# Report how many artists cleared the threshold.
count_artists_above_20m = artists_above_20m.shape[0]
print(f"Number of artists with more than 20 million followers: {count_artists_above_20m}")

In [None]:

# Visualize the artists with more than 20 million followers
plt.figure(figsize=(10, 6))
sns.barplot(x=artists_above_20m['name'], y=artists_above_20m['followers'], palette='viridis')
plt.title('Artists with More Than 20 Million Followers')
plt.xlabel('Artist')
plt.ylabel('Number of Followers')
# Right-align the rotated labels so each label ends at its own tick, matching the
# 60M and 100M plots; with ha='left' the labels drift to the right of their bars.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Bar chart of follower counts for artists above 100 million followers.
artists_above_100m = df.loc[df['followers'] > 100000000]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=artists_above_100m['name'],
    y=artists_above_100m['followers'],
    palette='viridis',
    ax=ax,
)
ax.set_title('Artists with More Than 100 Million Followers')
ax.set_xlabel('Artist')
ax.set_ylabel('Number of Followers')
# Tilt the artist names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Bar chart of follower counts for artists above 60 million followers.
over_60m_mask = df['followers'] > 60000000
artists_above_60m = df[over_60m_mask]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=artists_above_60m['name'],
    y=artists_above_60m['followers'],
    palette='viridis',
    ax=ax,
)
ax.set(
    title='Artists with More Than 60 Million Followers',
    xlabel='Artist',
    ylabel='Number of Followers',
)
# Tilt the artist names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:

plt.figure(figsize=(8, 6))
# Restrict the 50 bins to the displayed window. Without binrange the bins span the
# full follower range, so only a handful of very wide bars fall inside the
# 0-9 million window and the zoomed view shows almost no detail.
sns.histplot(df['followers'], bins=50, binrange=(0, 9000000), kde=False, color='skyblue')
plt.title('Histogram of Followers')
plt.xlabel('Number of Followers')
plt.ylabel('Frequency')

# Set the x-axis limits to focus on 1 to 9 million followers.
# About 140,000 artists have fewer than 2.5 million followers.
plt.xlim(1, 9000000)

plt.show()

In [None]:
# Single-panel figure: only popularity is plotted here, so no 1x2 subplot grid is
# created (drawing into slot 2 of such a grid leaves the left half of the figure blank).
plt.figure(figsize=(8, 6))

sns.histplot(df['popularity'], bins=50, kde=True, color='green')
plt.title('Distribution of Popularity')

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the two numeric features (zero mean, unit variance) so that followers
# and popularity are on a comparable scale.
# Note: this overwrites the raw values in `df`; plots below show the scaled values.
scaler = StandardScaler()
df[['followers', 'popularity']] = scaler.fit_transform(df[['followers', 'popularity']])

In [None]:
df.head(5)

In [None]:

# Side-by-side distributions of the two numeric features.
fig, (followers_ax, popularity_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: followers.
sns.histplot(df['followers'], bins=50, kde=True, color='blue', ax=followers_ax)
followers_ax.set_title('Distribution of Followers')

# Right panel: popularity.
sns.histplot(df['popularity'], bins=50, kde=True, color='green', ax=popularity_ax)
popularity_ax.set_title('Distribution of Popularity')

plt.tight_layout()
plt.show()

In [None]:
df.to_csv("nodes_cleaned.csv", index= False)

In [None]:
df_1 = pd.read_csv("nodes_cleaned.csv")

In [None]:
df_1.head()

In [None]:
# spotify_id and name are identifiers, not features, so they are left out of X.
# The genre names must match the keys of `top_genres` (plus 'unknown') exactly,
# because those keys become the one-hot column names; misspelt or differently-cased
# names raise KeyError when indexing `df`.
columns_to_include = ['followers', 'popularity', 'alternative indie', 'classical_orchestral',
                      'electronic', 'folk world', 'hip_hop', 'jazz', 'latin', 'metal',
                      'pop', 'randb_soul', 'reggae_dancehall', 'rock', 'soundtrack', 'unknown']

X = df[columns_to_include]