## M1 popularity

Some first exploratory experiments are performed on fold 1.
Reference for the evaluation (interpreting F-statistics in linear regression): https://vitalflux.com/interpreting-f-statistics-in-linear-regression-formula-examples

In [1]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from scipy import stats 

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
import plotly.express as px

# Apply seaborn's default theme (called without arguments).
sns.set_theme()

In [2]:
class Feats():
    """
    Group feature names by the aspects they are built from.

    Each feature has a first name, optionally a second name, a statistic and a number.
    This class allows grouping features according to these aspects or combinations of these aspects.
    Each public method produces a list of feature names or nested lists of feature names.
    """

    def __init__(self, csv):
        """
        Read the feature table.

        csv: path or file-like object accepted by ``pd.read_csv``. The methods
             of this class use the columns 'name1', 'name2', 'stat' and 'n'.
        """
        # 'n' is read as a string; first() compares it against the literal '01'.
        self.fts = pd.read_csv(csv, dtype={'n': "string"})
        # Missing parts become empty strings so every row can be joined as text.
        self.fts = self.fts.fillna('')

    def format(self, select):
        """
        Turn each row of `select` into one feature name.

        The row's values are joined with '_'; a doubled underscore (left by an
        empty part) is collapsed into a single one. Returns a list of strings,
        empty when `select` has no rows.
        """
        if select.empty:
            # Make the empty case explicit instead of relying on how
            # DataFrame.apply treats an empty frame.
            return []
        return select.apply(lambda x: '_'.join(x).replace('__', '_'), axis=1).tolist()

    def _where(self, column, value):
        """Names of all features whose `column` equals `value`."""
        return self.format(self.fts.loc[self.fts[column] == value])

    def all(self):
        """Names of all features, in table order."""
        return self.format(self.fts)

    def first(self):
        """Names of the features whose number is '01'."""
        return self._where('n', '01')

    def mfcc(self):
        """Names of the features whose first name is 'mfcc'."""
        return self._where('name1', 'mfcc')

    def tonnetz(self):
        """Names of the features whose first name is 'tonnetz'."""
        return self._where('name1', 'tonnetz')

    def min(self):
        """Names of the features whose statistic is 'min'."""
        return self._where('stat', 'min')

    def max(self):
        """Names of the features whose statistic is 'max'."""
        return self._where('stat', 'max')

    def median(self):
        """Names of the features whose statistic is 'median'."""
        return self._where('stat', 'median')

    def mean(self):
        """Names of the features whose statistic is 'mean'."""
        return self._where('stat', 'mean')

    def std(self):
        """Names of the features whose statistic is 'std'."""
        return self._where('stat', 'std')

    def skew(self):
        """Names of the features whose statistic is 'skew'."""
        return self._where('stat', 'skew')

    def kurtosis(self):
        """Names of the features whose statistic is 'kurtosis'."""
        return self._where('stat', 'kurtosis')

    def per_nns(self):
        """
        Nested lists: one entry per (name1, name2) pair, each holding one list
        of feature names per stat.

        Pairs come in sorted order (groupby default); stats keep their order of
        first appearance (sort=False).
        """
        # Scalar grouper 'stat' instead of ['stat']: a one-element list grouper
        # triggers a FutureWarning in pandas 1.5 and yields tuple keys in 2.x.
        return [
            [self.format(by_stat) for _, by_stat in by_name.groupby('stat', sort=False)]
            for _, by_name in self.fts.groupby(by=['name1', 'name2'])
        ]

    def per_sn(self):
        """
        Nested lists: one entry per stat, each holding one list of feature
        names per n.

        Both levels keep their order of first appearance (sort=False).
        """
        return [
            [self.format(by_n) for _, by_n in by_stat.groupby('n', sort=False)]
            for _, by_stat in self.fts.groupby('stat', sort=False)
        ]

# Build the feature-name helper from 'features.csv' (path is relative to the working directory).
fts = Feats('features.csv')

In [3]:
# Read the fold-1 training data. The 'release' column is dropped right away,
# so it is not parsed as a date first (that parsing would be wasted work).
df = pd.read_csv('data/fold/f1_train.csv')
df = df.drop(columns=['release'])

#### Preprocessing

In [4]:
n_orig = df.shape[0]

# Drop rows without a popularity target
n_na = df['popularity'].isna().sum()
df = df.dropna(subset=['popularity'])

# Drop zero values: the Box-Cox transform below needs strictly positive input.
# Note that the filter keeps only popularity > 0, while n_zero counts exact zeros.
n_zero = df.loc[df['popularity']==0].shape[0]
df = df.loc[df['popularity']>0]

print(f'm1: Dropped {n_na} na entries and {n_zero} zero entries of {n_orig}, {df.shape[0]} entries left.')

# Target: Box-Cox transformed popularity. The fitted lambda is kept (instead of
# being discarded) so the transform can be inverted or re-applied with the
# same parameter, e.g. scipy.special.inv_boxcox(y, boxcox_lambda).
y_before = df['popularity']
y, boxcox_lambda = stats.boxcox(y_before)

# Features: every remaining column, standardised with a scaler fitted on this data.
X = df.drop(columns=['popularity'])

scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# NOTE(review): columns are relabelled purely by position with fts.all(); this
# assumes features.csv lists the features in the same order as the data columns
# - TODO confirm. X also gets a fresh RangeIndex here, unlike df / y_before.
X = pd.DataFrame(X, columns=fts.all())

m1: Dropped 0 na entries and 1 zero entries of 68207, 68206 entries left.
