## 2. Preprocessing

All data is preprocessed according to the following steps.
1. Data is split into a training set (80% of the data) and a test set (20% of the data).
2. Missing feature data is imputed using the mean. Missing target data is inferred from other available metadata.
3. Outliers are removed, data is normalized and centered. Target Y1 is binned per 10 listenings and target Y2 is binned per year.

In addition to feature set f1, which contains all features, two more feature sets are created with PCA dimensionality reduction. For feature set f2, PCA is applied per column-name group; for feature set f3, PCA is applied to all features together.

### Setup

In [21]:
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [14]:
# Load the training split once, then derive one frame per target by removing
# the column that belongs to the other target.
df = pd.read_csv('data/split/train.csv')

df_y1 = df.drop(columns='release')
df_y2 = df.drop(columns='popularity')

In [15]:
class Feature_reader():
    """
    Look up feature names by the parts they are built from.

    Each feature has a first name, optionally a second name, a statistic and a number.
    This class allows grouping features according to these aspects or combinations of them.
    The feature table is read from a CSV file; the methods rely on its
    'name1', 'name2', 'stat' and 'n' columns.
    Each method produces a list of feature names or a nested list of feature names.
    """

    def __init__(self, csv):
        """
        Load the feature table.

        csv: path or buffer handed to pd.read_csv. Column 'n' is read as text
        (it is compared against '01' in first()), and missing cells are
        replaced by empty strings so every cell can be joined into a name.
        """
        self.fts = pd.read_csv(csv, dtype={'n': "string"}).fillna('')

    def format(self, select):
        """
        Turn every row of `select` into one feature name.

        The cells of a row are joined with '_'; the doubled underscore that an
        empty cell leaves behind is collapsed into a single one.
        Returns a list of strings, one per row, in row order.
        """
        # Plain tuples avoid building a Series per row as apply(axis=1) does.
        return [
            '_'.join(row).replace('__', '_')
            for row in select.itertuples(index=False, name=None)
        ]

    def _by_stat(self, stat):
        """Names of all features whose 'stat' column equals `stat`."""
        return self.format(self.fts.loc[self.fts['stat'] == stat])

    def all(self):
        """Names of all features."""
        return self.format(self.fts)

    def first(self):
        """Names of the features whose number 'n' is '01'."""
        return self.format(self.fts.loc[self.fts['n'] == '01'])

    def min(self):
        """Names of the features with statistic 'min'."""
        return self._by_stat('min')

    def max(self):
        """Names of the features with statistic 'max'."""
        return self._by_stat('max')

    def median(self):
        """Names of the features with statistic 'median'."""
        return self._by_stat('median')

    def mean(self):
        """Names of the features with statistic 'mean'."""
        return self._by_stat('mean')

    def std(self):
        """Names of the features with statistic 'std'."""
        return self._by_stat('std')

    def skew(self):
        """Names of the features with statistic 'skew'."""
        return self._by_stat('skew')

    def kurtosis(self):
        """Names of the features with statistic 'kurtosis'."""
        return self._by_stat('kurtosis')

    def per_nns(self):
        """
        Nested list: one entry per (name1, name2) pair, holding one list of
        feature names per stat.

        The (name1, name2) groups come in sorted key order; the stat groups
        inside each pair keep their order of first appearance.
        """
        return [
            [self.format(stat_grp) for _, stat_grp in name_grp.groupby(by='stat', sort=False)]
            for _, name_grp in self.fts.groupby(by=['name1', 'name2'])
        ]

    def per_sn(self):
        """
        Nested list: one entry per stat, holding one list of feature names per n.

        Both levels keep their order of first appearance.
        """
        return [
            [self.format(n_grp) for _, n_grp in stat_grp.groupby(by='n', sort=False)]
            for _, stat_grp in self.fts.groupby(by='stat', sort=False)
        ]

# Feature-name lookup built from 'features.csv'.
# NOTE(review): the name `fts` is rebound to Featset instances in the later
# "X1"/"X2" cells, after which this reader is no longer reachable under `fts`.
fts = Feature_reader('features.csv')

### 2.1 Missing data

#### y1

In [16]:
# Rows without a 'popularity' value cannot serve target y1: count them,
# discard them, and report what is left.
n_orig = len(df_y1)
n_na = n_orig - df_y1['popularity'].count()
df_y1 = df_y1.dropna(subset=['popularity'])
print(f'y1: Dropped {n_na} entries of {n_orig}, {n_orig - n_na} entries left.')

y1: Dropped 0 entries of 85259, 85259 entries left.


#### y2

In [9]:
# Rows without a 'release' value cannot serve target y2: count them,
# discard them, and report what is left.
n_orig = len(df_y2)
n_na = n_orig - df_y2['release'].count()
df_y2 = df_y2.dropna(subset=['release'])
print(f'y2: Dropped {n_na} entries of {n_orig}, {n_orig - n_na} entries left.')

Dropped 28915 entries of 85259, 56344 entries left.


### 2.2 Target transformation

#### y1

In [19]:
# Extract target y1 ('popularity') as a NumPy array and persist it as CSV.
y1 = df_y1['popularity'].to_numpy()
np.savetxt('featsets/y1.csv', y1, delimiter=",")

#### y2

In [20]:
# Extract target y2 ('release') as a NumPy array and persist it as CSV.
y2 = df_y2['release'].to_numpy()
# Write y2 itself: the previous version passed y1 here, so 'featsets/y2.csv'
# ended up holding the popularity target instead of the release target.
np.savetxt('featsets/y2.csv', y2, delimiter=",")

### 2.3 Featsets
Different preprocessing steps and variants are combined to produce feature sets.
https://towardsdatascience.com/feature-selection-and-dimensionality-reduction-f488d1a035de

In [32]:
class Featset:
    """
    A feature matrix together with a record of the preprocessing applied to it.

    Attributes:
        X: the feature matrix, as passed in.
        name: used as the file stem (may contain a sub-directory) when saving.
        timestamp: creation time, formatted as 'YYYY-MM-DD HH:MM:SS'.
        outlier_removal, scaling_ft, dimension_reduction: name of the applied
            step of each kind, or the string 'None' when no step was applied.
    """

    def __init__(self, X, name):
        self.X = X
        self.name = name

        self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        self.outlier_removal = 'None'
        self.scaling_ft = 'None'
        self.dimension_reduction = 'None'

    def remove_stde(self):
        """Standard-deviation based outlier removal. Not implemented yet."""
        pass

    def remove_lof(self):
        """
        Fit a LocalOutlierFactor model (20 neighbours, contamination 0.1) on X.

        NOTE: the fitted labels and scores are not used yet, so self.X is left
        untouched. Dropping rows here would also require dropping the matching
        rows of the separately saved targets.
        """
        clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
        y_pred = clf.fit_predict(self.X)
        X_scores = clf.negative_outlier_factor_

    def remove(self, type):
        """
        Run the outlier-removal step named by `type` and record it.

        type: 'stde' or 'lof'. Any other value prints "Not available" and
        leaves the feature set and its record unchanged.
        """
        if type == "stde":
            # Was self.stde(), which does not exist on this class.
            self.remove_stde()
            self.outlier_removal = 'stde'
        elif type == "lof":
            # Was self.lof(), which does not exist on this class.
            self.remove_lof()
            self.outlier_removal = 'lof'
        else:
            print("Not available")

    def scaling(self, type):
        """Feature scaling. Not implemented yet."""
        pass

    def reduce_pca(self, type):
        """PCA dimensionality reduction. Not implemented yet."""
        pass

    def reduce(self, type):
        """
        Run the dimensionality-reduction step named by `type` and record it.

        type: 'pca'. Any other value prints "Not available" and leaves the
        feature set and its record unchanged.
        """
        if type == "pca":
            # reduce_pca requires its argument; the bare call raised TypeError.
            self.reduce_pca(type)
            # Record the reduction itself rather than overwriting the
            # outlier record with 'stde'.
            self.dimension_reduction = 'pca'
        else:
            print("Not available")

    def save(self):
        """Write X as comma-separated text to 'featsets/<name>.csv'."""
        np.savetxt(f'featsets/{self.name}.csv', self.X, delimiter=",")

#### X1

In [30]:
# Feature matrix for target y1: every column except the target itself.
X1 = df_y1.drop(columns='popularity').to_numpy()

In [33]:
# Wrap the raw y1 feature matrix in a Featset and write it to
# 'featsets/X1/00_first_try.csv'. This rebinds `fts`, which earlier held the
# Feature_reader instance.
name = 'X1/00_first_try'
fts = Featset(X=X1, name=name)
fts.save()

#### X2

In [35]:
# Feature matrix for target y2: every column except the target itself.
X2 = df_y2.drop(columns='release').to_numpy()

In [36]:
# Wrap the raw y2 feature matrix in a Featset and write it to
# 'featsets/X2/00_first_try.csv'. This rebinds `fts` once more.
name = 'X2/00_first_try'
fts = Featset(X=X2, name=name)
fts.save()