# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
from google.colab import files
import pickle
import os

!pip install scipy
import scipy.stats as stats

!pip install papermill
!pip install nbconvert
!pip install nbformat
!pip install IPython

import papermill as pm
import nbformat
from nbconvert import HTMLExporter
from IPython.display import HTML, display

import json
from google.colab import drive

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# ANSI escape sequences that set the background colour of printed text.
# Codes 40-47 are the standard colours, 100-107 their bright variants.
# Print RESET after a coloured span to return to the default formatting.
BG_BLACK = "\033[40m"
BG_RED = "\033[41m"
BG_GREEN = "\033[42m"
BG_YELLOW = "\033[43m"
BG_BLUE = "\033[44m"
BG_MAGENTA = "\033[45m"
BG_CYAN = "\033[46m"
BG_WHITE = "\033[47m"
BG_DARK_GRAY = "\033[100m"
BG_BRIGHT_RED = "\033[101m"
BG_BRIGHT_GREEN = "\033[102m"
BG_BRIGHT_YELLOW = "\033[103m"
BG_BRIGHT_BLUE = "\033[104m"
BG_BRIGHT_MAGENTA = "\033[105m"
BG_BRIGHT_CYAN = "\033[106m"
# This used to be a second `BG_WHITE` assignment, which silently replaced the
# standard white (47) defined above. The bright variant now has its own name.
BG_BRIGHT_WHITE = "\033[107m"
RESET = "\033[0m" # Reset all formatting

# Mount drive, Load config

In [None]:
# Project root under the /content/drive mount point, and the JSON
# configuration file stored in its src/ sub-folder.
PROJECT_PATH = '/content/drive/MyDrive/Projects/GitHub/Spotify/'
CONFIG_FILE = PROJECT_PATH + 'src/config.json'

In [None]:
# Mount Google Drive under /content/drive so CONFIG_FILE becomes readable.
drive.mount('/content/drive')

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed.
with open(CONFIG_FILE, 'r') as f:
    project_config = json.load(f)

# Remove the annotation-only keys. The None default keeps pop() from raising
# when a key is not present in the file.
project_config.pop('_comment', None)
project_config.pop('_note', None)

In [None]:
# Seed NumPy's global random generator so that code drawing from it produces
# the same numbers on every run.
np.random.seed(seed=31071967)

# Run project notebook N-1

In [None]:
# When notebook chaining is switched on in the config ('1'), execute the
# previous notebook of the project and show its rendered result inline.
if project_config['chain_notebooks'] == '1':

  # Source notebook and the path where its executed copy is written
  input_file = "{}{}{}".format(project_config['project_path'],
                               project_config['notebooks_directory'],
                               project_config['notebook1'])
  output_file = "{}{}{}".format(project_config['project_path'],
                                project_config['output_directory'],
                                project_config['output1'])

  # --- Execute the previous notebook ---
  # log_output=False keeps the executed cells' logs out of this notebook
  pm.execute_notebook(input_path=input_file,
                      output_path=output_file,
                      log_output=False,
                      progress_bar=True)

  # --- Convert the executed notebook to HTML ---
  nb = nbformat.read(output_file, as_version=4)
  html_exporter = HTMLExporter()
  html_exporter.template_name = "lab"  # modern look; alternatives: 'classic', 'basic'
  body, _ = html_exporter.from_notebook_node(nb)

  # --- Display the HTML result inline ---
  display(HTML(body))

# Load pickle

In [None]:
# Full paths of the two pickles, assembled from the loaded project config:
# project root + pickles directory + file name.
pickle_file = ''.join([
    project_config['project_path'],
    project_config['pickles_directory'],
    project_config['pickle1'],
])
test_pickle_file = ''.join([
    project_config['project_path'],
    project_config['pickles_directory'],
    project_config['pickle1_test'],
])

In [None]:
# Read pickle into DataFrame
# NOTE: unpickling can execute arbitrary code, so pickle_file must come from
# a trusted source.
df = pd.read_pickle(pickle_file)

# Show the first and the last row. A single display() call with two arguments
# avoids building a tuple of the two None return values, which the notebook
# would otherwise echo as `(None, None)` under the tables.
display(df.head(1), df.tail(1))

In [None]:
# Column groups used by the analysis cells below.

# "*_mean_popularity" columns derived from the low-cardinality categories
small_cat_cols = [
    'mode_mean_popularity',
    'key_mean_popularity',
    'playlist_genre_grouped_mean_popularity',
    'release_decade_mean_popularity',
    'release_month_mean_popularity',
]

# Category columns themselves (used as grouping columns in the ANOVA loop)
small_cat_symboles = [
    'mode',
    'key',
    'playlist_genre_grouped',
    'release_decade',
    'release_month',
    'release_year',
]

# "*_mean_popularity" columns of the categories with very many levels
large_cat_cols = [
    'track_artist_mean_popularity',
    'track_album_id_mean_popularity',
    'playlist_id_mean_popularity',
]

# Continuous features
cont_cols = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# Target column, followed by every feature column in a fixed order
y_col = 'track_popularity'
x_cols = [*large_cat_cols, *small_cat_cols, *cont_cols]

MERGE_ON_COL = 'track_id'

# Skewness, Boxplots

In [None]:
def highlight_distribution_type(cell_value):
    """Return the CSS that highlights one skewness value.

    Parameters
    ----------
    cell_value : float
        Skewness of a single column.

    Returns
    -------
    str
        A green background for skewness above 1, a pink background for
        skewness below -1, and an empty string (no styling) otherwise.
    """
    if cell_value > 1:
        return 'background-color: mediumspringgreen;'
    if cell_value < -1:
        return 'background-color: hotpink;'
    return ''


# Skewness of the target and of every feature, most right-skewed first
skewness = pd.DataFrame(df[[y_col] + x_cols].skew(), columns=['skewness']).sort_values(by='skewness', ascending=False)

# Count straight from the values rather than as a side effect of styling:
# the styling function runs once per render, so counters incremented inside
# it would grow every time the table is rendered again.
other_distributions = int((skewness['skewness'].abs() > 1).sum())
normal_distributions = len(skewness) - other_distributions

# Styler.map is the newer name of Styler.applymap; use whichever this pandas
# version provides.
styler = skewness.style
apply_elementwise = styler.map if hasattr(styler, 'map') else styler.applymap
display(apply_elementwise(highlight_distribution_type))

print(f'Normal distributions: {normal_distributions}')
print(f'Other distributions: {other_distributions}')

In [None]:
## boxplots
# One horizontal boxplot per column: the target first, then every feature.
cols_to_plot = [y_col, *large_cat_cols, *small_cat_cols, *cont_cols]

plt.figure(figsize=(20,200))
for plot_counter, col in enumerate(cols_to_plot, start=1):
  # Panels fill a 60-row x 3-column grid from left to right, top to bottom
  ax = plt.subplot(60, 3, plot_counter)
  sb.boxplot(data=df, x=col, ax=ax)

# The vertical spacing is the same for all panels, so set it once
plt.subplots_adjust(hspace = 0.7)

plt.show()



# Correlations, Pairplots



In [None]:
# calculating highest and lowest correlations
corr = df[large_cat_cols + cont_cols + [y_col]].corr() #(method='spearman')

# Unstack into (column, column) -> correlation pairs
corr_pairs = corr.unstack()

# Keep every unordered pair exactly once (first column strictly before the
# second one in the matrix). This drops the self-correlations and the mirrored
# duplicates without relying on duplicates being adjacent after sorting.
column_position = {name: pos for pos, name in enumerate(corr.columns)}
is_upper_triangle = np.array(
    [column_position[first] < column_position[second] for first, second in corr_pairs.index],
    dtype=bool,
)
# Undefined correlations (NaN) are sorted last and would otherwise show up
# among the "most negative" pairs, so they are removed.
corr_pairs = corr_pairs.loc[is_upper_triangle].dropna()

# Sort by correlation value
sorted_corr = corr_pairs.sort_values(ascending=False)

print()
print(f"{BG_BRIGHT_RED} Highest positive correlations{RESET}")
print(sorted_corr.head(10))
print()
print(f"{BG_BRIGHT_BLUE} Highest Negative correlations{RESET}")
# Reverse so that the most negative pair is listed first
print(sorted_corr.tail(10)[::-1])

sb.heatmap(corr, cbar = True,  square=True, annot=True, annot_kws={'size': 6}, fmt=".2f", cmap= 'coolwarm')
plt.show()

In [None]:
# Two pairplot figures, each including the target column: one for the
# large-category columns and one for the continuous features.
pair_plots = [
    [*large_cat_cols, y_col],
    [*cont_cols, y_col],
]

for plot in pair_plots:
  pp = sb.pairplot(data=df, vars=plot, height=3, aspect=1.1)
  # Tighten the gaps between the panels of the grid
  pp.fig.subplots_adjust(hspace=0.15, wspace=0.15)
  plt.show()

# Anova

In [None]:
def show_anova(*samples, names=None, groups_title=None):
  """Run a one-way ANOVA over the given samples, then print and plot the result.

  Prints a coloured verdict (significant when the p-value is below 0.05), a
  per-group summary table (mean, variance, size), the F-statistic and the
  p-value, and draws one density histogram per sample with a dashed vertical
  line at its mean.

  Args:
    *samples: One collection of numeric observations per group.
    names: Labels of the groups, in the same order as `samples`.
      Defaults to "Sample 1", "Sample 2", ...
    groups_title: Description of the groups used in the verdict line.
      Defaults to 'groups'.

  Returns:
    None. When fewer than two samples are given, a notice is printed and
    nothing else is computed or plotted.
  """
  if groups_title is None: groups_title = 'groups'

  # A one-way ANOVA compares groups with each other, so it needs at least two
  # of them; f_oneway raises otherwise.
  if len(samples) < 2:
    print(f"Skipping ANOVA for {groups_title}: at least two groups are required (got {len(samples)}).")
    return

  if names is None: names = [f"Sample {i+1}" for i in range(len(samples))]

  # Run ANOVA
  f_stat, p_val = stats.f_oneway(*samples)

  if (p_val < 0.05): print(f"{BG_BRIGHT_RED}There is a significant difference between {groups_title}{RESET}")
  else: print(f"{BG_BRIGHT_GREEN}No significant difference between {groups_title}{RESET}")

  # Summary table
  summary = pd.DataFrame({
      "Group": names,
      "Mean": [pd.Series(s).mean() for s in samples],
      "Variance": [pd.Series(s).var() for s in samples],
      "N": [len(s) for s in samples]
    })

  print("\nSummary Table")
  print("-------------")
  print(summary)
  print()
  print("ANOVA Results")
  print("-------------")
  print(f"F-statistic: {f_stat:.4f}")
  print(f"p-value:     {p_val:.4e}\n")


  plt.figure(figsize=(8, 4))

  # Plot the density of each group; the C0, C1, ... colour cycle keeps a
  # histogram and the line marking its mean in the same colour
  for i, sample in enumerate(samples):
    sb.histplot(sample, kde=True, label=names[i], color=f'C{i}', bins=20, stat="density", alpha=0.5)
    plt.axvline(np.mean(sample), color=f'C{i}', linestyle='--', linewidth=1)

  plt.ylabel('Density', fontsize=12)
  plt.legend(fontsize=10)
  plt.xticks(fontsize=10)
  plt.yticks(fontsize=10)
  plt.grid(False)

  plt.show()

In [None]:
print('\n\n################################ ANOVA on popularity between groups with very large categories #################################\n')

# Compare the large-category mean-popularity columns with the target itself
popularity_groups = [*large_cat_cols, y_col]
sample_size = 1000

# One random sample of `sample_size` values per column
samples = [df[col].sample(n=sample_size) for col in popularity_groups]

show_anova(
    *samples,
    names=popularity_groups,
    groups_title='popularity of groups with very large categories',
)

# Melt into long format for seaborn: one row per (column, value)
df_melt = df.melt(value_vars=popularity_groups, var_name='Popularity Type', value_name='Popularity')

plt.figure(figsize=(8,6))
sb.boxplot(data=df_melt, x='Popularity Type', y='Popularity')
plt.xticks(rotation=75)
plt.tight_layout()
plt.show()


In [None]:
# Number of tracks drawn from every category; smaller categories are left out
sample_size = 1000

for col in small_cat_symboles:

  print(f'\n\n\n########################## ANOVA between popularity of categories in {col} #############################\n')

  # Categories of this column, most frequent first
  groups_names = list(df[col].value_counts().index)

  # Keep only the categories with at least sample_size records and draw one
  # sample from each of them, in the same order as groups_names
  valid_groups = []
  samples = []
  for group_name in groups_names:
    group_values = df[df[col]==group_name]['track_popularity']
    if len(group_values) < sample_size:
      continue
    valid_groups.append(group_name)
    samples.append(group_values.sample(sample_size))

  show_anova(*samples, names=valid_groups, groups_title=f"categories of {col}")

  # Boxplot over the full data, including the categories skipped above
  plt.figure(figsize=(8,6))
  sb.boxplot(data=df, x=col, y='track_popularity')
  plt.xticks(rotation=90)
  plt.show()


In [None]:
# Each continuous feature is cut into n_bins quantile bins, and the popularity
# of sample_size tracks drawn from every bin is compared.
sample_size = 1000
n_bins = 10
bin_labels = [f"bin{i}" for i in range(n_bins)]

for col in cont_cols:
  print(f'\n\n####################### ANOVA between popularity of {n_bins} bins of {col} ##############################\n')

  # Put the bin labels in a throw-away frame instead of adding a column to df
  # and dropping it afterwards: df is never modified, so a failure in the
  # middle of the loop cannot leave a stray "<col>_bins" column behind.
  bin_col = f"{col}_bins"
  binned = df[['track_popularity']].assign(**{bin_col: pd.qcut(df[col], q=n_bins, labels=bin_labels)})

  samples = [binned.loc[binned[bin_col]==label, 'track_popularity'].sample(sample_size) for label in bin_labels]

  show_anova(*samples, names=bin_labels, groups_title=f"{col} bins")

  plt.figure(figsize=(8,6))
  sb.boxplot(x=bin_col, y='track_popularity', data=binned)
  plt.show()