## M1 - popularity: models

### Setup

#### Imports

In [None]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from scipy import stats 

import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.decomposition import PCA
# from sklearn.decomposition import KernelPCA
from sklearn.manifold import Isomap
from sklearn.feature_selection import RFE

from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Apply seaborn's default theme to all subsequent plots
sns.set_theme()
# Show plots
# NOTE(review): `viz` is not read by any of the cells visible here — confirm it is still needed
viz = True

#### Functions

In [None]:
class Feats():
    """
    Group feature names by their components.

    Each feature has a first name, optionally a second name, a statistic and a number.
    This class allows grouping features according to one of these aspects.
    Each selector method returns a list of formatted feature names.
    """

    def __init__(self, csv):
        """
        Load the feature description table.

        csv: path (or buffer) handed to pandas.read_csv. The selectors below
             rely on the columns 'name1', 'stat' and 'n' being present.
        """
        # 'n' is read as string; first() compares it against the literal '01'
        self.fts = pd.read_csv(csv, dtype={'n': "string"})
        # Missing components (e.g. an absent second name) become empty strings
        self.fts = self.fts.fillna('')

    def format(self, select):
        """
        Build one feature name per row of `select`.

        The cells of a row are joined with '_' in column order; a double
        underscore left behind by an empty component is collapsed to one.
        Returns an empty list when `select` has no rows.
        """
        # Plain tuples instead of a row-wise apply, so an empty selection
        # simply produces an empty list
        return [
            '_'.join(parts).replace('__', '_')
            for parts in select.itertuples(index=False, name=None)
        ]

    def _where(self, column, value):
        """Return the formatted names of all features whose `column` equals `value`."""
        return self.format(self.fts.loc[self.fts[column] == value])

    def all(self):
        """Return the names of all features."""
        return self.format(self.fts)

    def first(self):
        """Return the names of the features with number '01'."""
        return self._where('n', '01')

    def mean(self):
        """Return the names of the features with statistic 'mean'."""
        return self._where('stat', 'mean')

    def std(self):
        """Return the names of the features with statistic 'std'."""
        return self._where('stat', 'std')

    def kurtosis(self):
        """Return the names of the features with statistic 'kurtosis'."""
        return self._where('stat', 'kurtosis')

    def chroma(self):
        """Return the names of the features whose first name is 'chroma'."""
        return self._where('name1', 'chroma')

    def mfcc(self):
        """Return the names of the features whose first name is 'mfcc'."""
        return self._where('name1', 'mfcc')

    def tonnetz(self):
        """Return the names of the features whose first name is 'tonnetz'."""
        return self._where('name1', 'tonnetz')

# Feature-name table loaded from 'features.csv'; the cells below use it to label and select columns
fts = Feats('features.csv')

#### Data

In [None]:
# Load fold: read the train and test CSVs of fold 'f2', parsing 'release'
# as dates and then discarding that column
df_train, df_test = (
    pd.read_csv(fold_path, parse_dates=['release']).drop(columns=['release'])
    for fold_path in ('data/fold/f2_train.csv', 'data/fold/f2_test.csv')
)
# Report the row counts of both splits and their sum
print(f'train set: {df_train.shape[0]:,} entries, test set: {df_test.shape[0]:,} entries, total: {df_train.shape[0]+df_test.shape[0]:,} entries')

### Preprocessing

In [None]:
def _drop_invalid_popularity(df, label):
    """
    Drop the rows of `df` whose 'popularity' is NA or not greater than zero.

    df:    DataFrame with a 'popularity' column.
    label: split name used as prefix of the printed summary ('Train' / 'Test').

    Prints how many NA and zero entries were removed and returns the
    filtered DataFrame.
    """
    n_orig = df.shape[0]
    n_na = df['popularity'].isna().sum()
    df = df.dropna(subset=['popularity'])
    n_zero = df.loc[df['popularity'] == 0].shape[0]
    # Only strictly positive targets are kept; Box-Cox below requires positive data
    df = df.loc[df['popularity'] > 0]
    print(f'{label} set: Dropped {n_na} NA entries and {n_zero} zero entries of {n_orig}, {df.shape[0]} entries left.')
    return df


# Drop NA and zero values train
df_train = _drop_invalid_popularity(df_train, 'Train')

# Drop NA and zero values test
df_test = _drop_invalid_popularity(df_test, 'Test')

# Scale X train and test; the scaler is fitted on the training split only
X_train = df_train.drop(columns=['popularity'])
X_test = df_test.drop(columns=['popularity'])
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# NOTE(review): assumes the CSV column order matches fts.all() — confirm.
# The new frames get a fresh RangeIndex, matching the y Series built below.
X_train = pd.DataFrame(X_train, columns=fts.all())
X_test = pd.DataFrame(X_test, columns=fts.all())
print('')
print(f'Scaled train X:\n\tmean: {X_train.mean().tolist()[:5]}...\n\tstd: {X_train.std().tolist()[:5]}...')
print(f'Scaled test X:\n\tmean: {X_test.mean().tolist()[:5]}...\n\tstd: {X_test.std().tolist()[:5]}...')

# Transform y train and test
y_train_before = df_train['popularity']
# Estimate the Box-Cox lambda on the training target only ...
y_train, boxcox_lambda = stats.boxcox(y_train_before)
y_train = pd.Series(y_train)
y_test_before = df_test['popularity']
# ... and apply that same lambda to the test target, so both targets live on
# the same scale and the test metrics are comparable with the fitted models
y_test = stats.boxcox(y_test_before, lmbda=boxcox_lambda)
y_test = pd.Series(y_test)

# Compare the training target before and after the transformation
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
ax1 = sns.histplot(x=y_train_before, kde=True, ax=axs[0])
ax1.set(title="Distribution popularity")
ax2 = sns.histplot(x=y_train, kde=True, ax=axs[1])
ax2.set(title="Distribution boxcox transformed popularity")
fig;

### Models

In [None]:
# Evaluation metrics of every model below, keyed by model name
scores = {}

#### Benchmark

In [None]:
# Reference model: plain linear regression on the complete scaled feature matrix
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R2 on the training and the test split, rounded to six decimals
r2_train_bm, r2_test_bm = (
    round(model.score(X_train, y_train), 6),
    round(metrics.r2_score(y_test, y_pred), 6),
)

print(f'Benchmark: Train R2 for all features: {r2_train_bm}, Test R2 for all features: {r2_test_bm}')

#### Linear regression

In [None]:
name = 'linear_regression'

# Both splits use the complete list of feature columns
X_train_fts, X_test_fts = X_train[fts.all()], X_test[fts.all()]

# Fit on the training split, then predict the test split
model = LinearRegression()
model.fit(X_train_fts, y_train)
y_pred = model.predict(X_test_fts)

# Store the scores (rounded to six decimals) under the model name
scores[name] = dict(
    R2_train=round(model.score(X_train_fts, y_train), 6),
    R2_test=round(metrics.r2_score(y_test, y_pred), 6),
    MSE_test=round(metrics.mean_squared_error(y_test, y_pred), 6),
    MAE_test=round(metrics.mean_absolute_error(y_test, y_pred), 6),
)

#### Elastic net

In [None]:
name = 'elastic_net'

# Select every feature column for the training and the test split
X_train_fts = X_train[fts.all()]
X_test_fts = X_test[fts.all()]

# Elastic net with its default hyperparameters
model = ElasticNet()
model.fit(X_train_fts, y_train)
y_pred = model.predict(X_test_fts)

# Train R2 first, followed by the three test metrics; all rounded to six decimals
scores[name] = {
    'R2_train': round(model.score(X_train_fts, y_train), 6),
    **{
        key: round(metric(y_test, y_pred), 6)
        for key, metric in (
            ('R2_test', metrics.r2_score),
            ('MSE_test', metrics.mean_squared_error),
            ('MAE_test', metrics.mean_absolute_error),
        )
    },
}

#### Polynomial regression

In [None]:
name = 'poly_linear_regression_mean_fts'

# Only the features with statistic 'mean' are expanded
X_train_fts = X_train[fts.mean()]
X_test_fts = X_test[fts.mean()]
# Degree-2 polynomial expansion of the selected features
poly = PolynomialFeatures(2)
X_train_fts = poly.fit_transform(X_train_fts)
# Apply the transformer fitted on the training split; the test split must
# never be used to (re)fit a preprocessing step
X_test_fts = poly.transform(X_test_fts)

# Linear regression on the expanded features
model = LinearRegression().fit(X_train_fts, y_train)
y_pred = model.predict(X_test_fts)

# Store the scores (rounded to six decimals) under the model name
scores[name] = {
    'R2_train': round(model.score(X_train_fts, y_train), 6),
    'R2_test': round(metrics.r2_score(y_test, y_pred), 6),
    'MSE_test': round(metrics.mean_squared_error(y_test, y_pred), 6),
    'MAE_test': round(metrics.mean_absolute_error(y_test, y_pred), 6),
}

#### SVR - support vector regression

In [None]:
name = 'svr_10%_sub'

X_train_fts = X_train[fts.all()]
X_test_fts = X_test[fts.all()]
# Fit on a random 10% of the training rows only.
# NOTE(review): no random_state is set, so the subsample (and the scores)
# differ between runs.
idx = X_train_fts.sample(frac=0.1).index
# `idx` holds index labels, so both X and y are subset by label; this stays
# correct even if the frames do not carry a default RangeIndex
X_train_fts_sub = X_train_fts.loc[idx]
y_train_sub = y_train.loc[idx]

# Train on the subsample, predict the complete test split
model = SVR().fit(X_train_fts_sub, y_train_sub)
y_pred = model.predict(X_test_fts)

# The train R2 refers to the subsample the model was fitted on
scores[name] = {
    'R2_train': round(model.score(X_train_fts_sub, y_train_sub), 6),
    'R2_test': round(metrics.r2_score(y_test, y_pred), 6),
    'MSE_test': round(metrics.mean_squared_error(y_test, y_pred), 6),
    'MAE_test': round(metrics.mean_absolute_error(y_test, y_pred), 6),
}

#### K-neighbors regression

In [None]:
name = 'k-neighbors'

# Complete feature set for both splits
X_train_fts, X_test_fts = X_train[fts.all()], X_test[fts.all()]

# K-neighbors regressor with its default settings
model = KNeighborsRegressor()
model.fit(X_train_fts, y_train)
y_pred = model.predict(X_test_fts)

# Metric name -> value rounded to six decimals, in the same column order
# as the other models
scores[name] = dict([
    ('R2_train', round(model.score(X_train_fts, y_train), 6)),
    ('R2_test', round(metrics.r2_score(y_test, y_pred), 6)),
    ('MSE_test', round(metrics.mean_squared_error(y_test, y_pred), 6)),
    ('MAE_test', round(metrics.mean_absolute_error(y_test, y_pred), 6)),
])

#### Results

In [None]:
# One row per model, one column per metric
df_scores = pd.DataFrame.from_dict(scores, orient='index')

# R2 is better when higher, the error metrics are better when lower; split the
# columns accordingly so that green always marks the best and red the worst model
higher_is_better = [col for col in df_scores.columns if col.startswith('R2')]
lower_is_better = [col for col in df_scores.columns if col not in higher_is_better]

# Kept as the last expression so the notebook renders the styled table
(
    df_scores.style
    .highlight_max(subset=higher_is_better, color='green', axis=0)
    .highlight_min(subset=higher_is_better, color='red', axis=0)
    .highlight_min(subset=lower_is_better, color='green', axis=0)
    .highlight_max(subset=lower_is_better, color='red', axis=0)
)