## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
from google.colab import files
import pickle
import os

!pip install scipy
import scipy.stats as stats

!pip install papermill
!pip install nbconvert
!pip install nbformat
!pip install IPython

import papermill as pm
import nbformat
from nbconvert import HTMLExporter
from IPython.display import HTML, display

import json
from google.colab import drive

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
from google.colab import files
import pickle
import os
from scipy import stats
from scipy.stats import zscore

!pip install missingno
import missingno as msno

!pip install fancyimpute
import fancyimpute

warnings.filterwarnings("ignore")
%matplotlib inline

# Mount drive, Load config

In [None]:
# Project root on the mounted Google Drive; the JSON config path is built from it.
PROJECT_PATH = '/content/drive/MyDrive/Projects/GitHub/Spotify/'
CONFIG_FILE = PROJECT_PATH + "src/config.json"

In [None]:
# Mount Google Drive so that the paths under /content/drive become readable.
drive.mount('/content/drive')

# Load the project configuration from CONFIG_FILE. The `with` block closes the
# file on exit, so no explicit close() call is needed.
with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
    project_config = json.load(f)

# Remove the '_comment' and '_note' entries when present
# (presumably annotation-only keys in the JSON file - confirm).
project_config.pop('_comment', None)
project_config.pop('_note', None)

In [None]:
# Seed NumPy's global random generator so that code relying on it is repeatable.
np.random.seed(seed=31071967)

# Run project notebook N-1

In [None]:
# When chaining is enabled in the config, execute the notebook configured as
# 'notebook3' first and show its rendered result inline.
if project_config['chain_notebooks'] == '1':

  input_file = "{}{}{}".format(
      project_config['project_path'],
      project_config['notebooks_directory'],
      project_config['notebook3'],
  )
  output_file = "{}{}{}".format(
      project_config['project_path'],
      project_config['output_directory'],
      project_config['output3'],
  )

  # Execute the previous notebook with papermill; the executed copy is written
  # to output_file. Logs are not printed while running, only a progress bar.
  pm.execute_notebook(
      input_file,
      output_file,
      log_output=False,
      progress_bar=True,
  )

  # Read the executed notebook back and export it to HTML with the "lab"
  # template (alternatives: 'classic', 'basic').
  nb = nbformat.read(output_file, as_version=4)
  html_exporter = HTMLExporter()
  html_exporter.template_name = "lab"
  body, _ = html_exporter.from_notebook_node(nb)

  # Show the exported HTML in this notebook's output.
  display(HTML(body))

# Load pickle

In [None]:
# Full paths of the input pickle ('pickle3') and of its test-split counterpart
# ('pickle3_test'), both located in the configured pickles directory.
pickle_file = "".join((
    project_config['project_path'],
    project_config['pickles_directory'],
    project_config['pickle3'],
))
test_pickle_file = "".join((
    project_config['project_path'],
    project_config['pickles_directory'],
    project_config['pickle3_test'],
))

In [None]:
# Load the main DataFrame from pickle_file and show its first row as a sanity check.
df = pd.read_pickle(pickle_file)
display(df.iloc[:1])

# When the config says the data is split, load the test DataFrame as well
# and show its last row.
if project_config['split_df'] == '1':
  df_test = pd.read_pickle(test_pickle_file)
  display(df_test.iloc[-1:])

In [None]:
# Column groups used throughout this notebook.

# "*_mean_popularity" columns for the small categorical variables.
small_cat_cols = [
    'mode_mean_popularity',
    'key_mean_popularity',
    'playlist_genre_grouped_mean_popularity',
    'release_decade_mean_popularity',
    'release_month_mean_popularity',
]

# The corresponding raw categorical column names.
small_cat_symboles = [
    'mode',
    'key',
    'playlist_genre_grouped',
    'release_decade',
    'release_month',
    'release_year',
]

# "*_mean_popularity" columns for the large categorical variables.
large_cat_cols = [
    'track_artist_mean_popularity',
    'track_album_id_mean_popularity',
    'playlist_id_mean_popularity',
]

# Continuous audio features.
cont_cols = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# Target column, model inputs (in this fixed order) and the merge key.
y_col = 'track_popularity'
X_cols = [*large_cat_cols, *small_cat_cols, *cont_cols]
MERGE_ON_COL = 'track_id'

In [None]:
def _dump_frame_or_col(frame, col, file_name):
  """Pickle `frame` (col == 'all') or its [MERGE_ON_COL, col, y_col] slice to `file_name`.

  The payload is built before the file is opened, so when there is nothing to
  write (the column is not in the frame) an existing file is left untouched
  instead of being truncated. Returns True when a file was written.
  """
  import pickle

  if col == 'all':
    payload = frame
  elif col in frame.columns:
    # Keep the merge key for a later merge, and the target so the pickle can
    # be investigated independently of the main DataFrame.
    payload = frame[[MERGE_ON_COL, col, y_col]]
  else:
    # Column already dropped earlier: nothing to save.
    return False

  with open(file_name, 'wb') as f:
    pickle.dump(payload, f)
  return True


def pickle_col(df, col='all', drop_col=False, include_merge_ID=True, pickle_name=""):
  """Pickle a whole DataFrame, or one of its columns, into the pickles directory.

  Parameters
  ----------
  df : DataFrame to save from. Modified in place when `drop_col` is true.
  col : 'all' to pickle the whole frame; otherwise the name of a single column,
      which is saved together with MERGE_ON_COL and y_col.
  drop_col : when true, `col` is dropped in place from the frame(s) after
      saving; a missing column is ignored.
  include_merge_ID : not used by this function; kept so existing calls keep working.
  pickle_name : base file name without the '.pkl' suffix; defaults to `col`.

  Returns
  -------
  The same `df` object that was passed in.

  Notes
  -----
  When project_config['split_df'] == '1' the same save/drop is applied to the
  module-level `df_test`, written next to the main file with the extra suffix
  ".test.pkl".

  If `col` is not 'all' and is not a column of the frame, no file is written
  and any pickle saved by an earlier call is preserved.
  """
  base_name = col if pickle_name == "" else pickle_name
  file_name = f"{project_config['project_path']}{project_config['pickles_directory']}{base_name}.pkl"

  _dump_frame_or_col(df, col, file_name)
  if drop_col:
    df.drop(col, axis=1, inplace=True, errors='ignore')

  if project_config['split_df'] == '1':
    _dump_frame_or_col(df_test, col, file_name+".test.pkl")
    if drop_col:
      df_test.drop(col, axis=1, inplace=True, errors='ignore')

  return df

# Target Encoding


In [None]:
# artist_id/name --> artist_mean_popularity
# track_album_id --> track_album_mean_popularity
# track_playlist_id --> playlist_id_mean_popularity

# mode --> mode_mean_popularity
# key --> key_mean_popularity
# genre --> genre_mean_popularity

# release_date --> release_month_mean_popularity, release_year_mean_popularity, release_decade_mean_popularity

# Feature selection

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge

In [None]:
# Lasso vote: fit on the candidate features, then flag a feature as selected (1)
# when the absolute value of its coefficient is greater than zero, else 0.
lasso = Lasso(alpha=5)
lasso.fit(df[X_cols], df[y_col])
lasso_selected = np.greater(np.abs(lasso.coef_), 0).astype(int)

In [None]:
# Ridge vote: fit on the candidate features, then flag a feature as selected (1)
# when the absolute value of its coefficient is greater than zero, else 0.
# NOTE(review): an L2 penalty generally shrinks coefficients without making them
# exactly zero, so this flag is likely 1 for every feature - confirm whether a
# magnitude threshold was intended here.
ridge = Ridge(alpha=5)
ridge.fit(df[X_cols], df[y_col])
ridge_selected = np.greater(np.abs(ridge.coef_), 0).astype(int)

In [None]:
# Gradient-boosting vote: a feature is flagged as selected (1) when its fitted
# importance is greater than zero, else 0.
# random_state is passed explicitly (same value as the np.random.seed call
# earlier in this notebook) so that re-running this cell on its own reproduces
# the fit, instead of depending on how much of the global RNG stream earlier
# cells have already consumed.
gb = GradientBoostingRegressor(random_state=31071967).fit(df[X_cols], df[y_col])
gb_selected = (gb.feature_importances_ > 0).astype(int)

In [None]:
# Random-forest vote: a feature is flagged as selected (1) when its fitted
# importance is greater than zero, else 0.
# random_state is passed explicitly (same value as the np.random.seed call
# earlier in this notebook) so that re-running this cell on its own reproduces
# the fit, instead of depending on how much of the global RNG stream earlier
# cells have already consumed.
rf = RandomForestRegressor(random_state=31071967).fit(df[X_cols], df[y_col])
rf_selected = (rf.feature_importances_ > 0).astype(int)

In [None]:
# Collect the 0/1 votes of the four models: one row per candidate feature,
# one column per model.
selection_df = pd.DataFrame(
    dict(
        Feature=df[X_cols].columns,
        Lasso=lasso_selected,
        GradientBoost=gb_selected,
        RandomForest=rf_selected,
        Ridge=ridge_selected,
    )
)

# 'Sum' is the number of models that selected each feature.
selection_df['Sum'] = selection_df.loc[:, ['Lasso', 'GradientBoost', 'RandomForest', 'Ridge']].sum(axis=1)

# Show the vote table.
display(selection_df)

In [None]:
# Every feature whose vote count ('Sum') is below 3 is saved to its own pickle
# and then dropped from the DataFrame via pickle_col.
for feature in selection_df.loc[selection_df['Sum'] < 3, 'Feature']:
  print(f"dropping features with no majority of recomendations: {feature}")
  pickle_col(df=df, col=feature, drop_col=True)

In [None]:
from google.colab import files
import os

# Save the DataFrame under the 'pickle4' name in the pickles directory.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f"{project_config['project_path']}{project_config['pickles_directory']}{project_config['pickle4']}", 'wb') as f:
  pickle.dump(df, f)

# When the config says the data is split, save the test DataFrame as well.
if project_config['split_df'] == '1':
  with open(f"{project_config['project_path']}{project_config['pickles_directory']}{project_config['pickle4_test']}", 'wb') as f:
    pickle.dump(df_test, f)