# Statistical Test

In [24]:
#Reading Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [25]:
#Reading Data
#The CSV path is relative to the current working directory.
path = "./Data/Spotify_1921_to_2020.csv"
#df_raw holds the file contents as parsed by pandas; later cells scale and filter it.
df_raw = pd.read_csv(path)

In [26]:
#scaling: divide each listed column by its largest absolute value,
#so every scaled column ends up within [-1, 1]
scale_cols = ['duration_ms', 'key', 'loudness', 'popularity', 'tempo']
for col, maxVal in df_raw[scale_cols].abs().max().items():
    df_raw[col] = df_raw[col].div(maxVal)

#disabled feature-weighting experiments; the string literal below is never executed
'''
weight_cols = ['energy', 'danceability', 'speechiness']
w_rate = 3
for col in weight_cols:
    df_raw[col] = w_rate*df_raw[col]

#add weighting features2
weight_cols = ['loudness', 'speechiness']
weight_cols = ['loudness']
w_rate = 1.5
for col in weight_cols:
    df_raw[col] = w_rate*df_raw[col]
'''
#sanity check: largest absolute value of every numeric column after scaling
print(df_raw.select_dtypes(include=np.number).abs().max())

acousticness           0.996
danceability           0.988
duration_ms            1.000
energy                 1.000
explicit               1.000
instrumentalness       1.000
key                    1.000
liveness               1.000
loudness               1.000
mode                   1.000
popularity             1.000
speechiness            0.969
tempo                  1.000
valence                1.000
year                2020.000
dtype: float64


In [27]:
#data preprocessing
def preprocessing(df, topN=20, startYear=1955):
    """Average the most popular rows of each year into one row per year.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain numeric 'year' and 'popularity' columns.
        It is not modified.
    topN : int, default 20
        Number of highest-'popularity' rows kept per year before averaging.
    startYear : int, default 1955
        Rows whose 'year' is below this value are discarded.

    Returns
    -------
    pandas.DataFrame
        One row per year in ascending order, with 'year' as the first column
        followed by the per-year mean of every other numeric column.
    """
    recent = df.loc[df['year'] >= startYear].select_dtypes(include=np.number)

    # Pick the top rows group by group rather than through groupby.apply:
    # whether apply hands the grouping column to the callback depends on the
    # pandas version, which made the former drop('year') + regroup-by-index
    # sequence fragile. Iterating the groups always yields complete rows.
    top_rows = [grp.nlargest(topN, ['popularity']) for _, grp in recent.groupby('year')]
    if top_rows:
        # With no rows left after filtering there is nothing to concatenate;
        # the empty frame then simply aggregates to an empty result.
        recent = pd.concat(top_rows)

    return recent.groupby('year').mean().reset_index()

In [28]:
#analysis settings
topN = 100           #rows kept per year before averaging
startYear = 1990     #earliest year included
targetYear = 2016    #year selected for the test rows

#aggregate the raw rows into per-year means, then preview rows and column names
df = preprocessing(df_raw, topN, startYear)
print(df.head(), df.columns, sep='\n')

   year  acousticness  danceability  duration_ms    energy  explicit  \
0  1990      0.206522       0.54928     0.049620  0.666730      0.05   
1  1991      0.222771       0.55346     0.050100  0.628230      0.06   
2  1992      0.251523       0.59920     0.047129  0.624950      0.15   
3  1993      0.192939       0.60352     0.046679  0.625160      0.14   
4  1994      0.225749       0.56453     0.046071  0.630753      0.16   

   instrumentalness       key  liveness  loudness  mode  popularity  \
0          0.052565  0.445455  0.171580 -0.146606  0.78      0.6378   
1          0.032905  0.538182  0.182937 -0.159454  0.67      0.6632   
2          0.018784  0.453636  0.182672 -0.147418  0.76      0.6549   
3          0.070677  0.512727  0.174003 -0.161816  0.72      0.6638   
4          0.037245  0.526364  0.185944 -0.146506  0.68      0.6708   

   speechiness     tempo   valence  
0     0.058954  0.501952  0.521157  
1     0.050675  0.484472  0.539474  
2     0.072896  0.487823  0.5

In [29]:
#add new feature for time-series regression
def add_n_years(df, startYear):
    """Store 'year' minus startYear in an 'n_years' column of df and return df.

    The column is written onto the frame that was passed in, so the caller's
    object is modified and the very same object is returned.
    """
    df['n_years'] = df['year'].sub(startYear)
    return df

#train-test data preparation
def get_train(df, targetYear = 2020):
    """Return the rows of df whose 'year' is strictly less than targetYear."""
    before_target = df['year'].lt(targetYear)
    return df[before_target]

def get_test(df, targetYear = 2020, cols = []):
    """Return the rows of df for one year, restricted to the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from; must contain a 'year' column.
    targetYear : int, default 2020
        Only rows whose 'year' equals this value are returned.
    cols : list-like of column labels, default []
        Columns to keep, in the given order. The default keeps no columns,
        so callers normally pass the columns they need. The argument is only
        read, never modified.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the selected columns.
    """
    # Build the mask from the frame that was passed in. It used to be built
    # from the notebook-level df_raw, so the function silently depended on
    # that global and only worked when df was df_raw itself.
    year_mask = df['year'] == targetYear
    return df.loc[year_mask, list(cols)]

In [30]:
#add new feature : number of years elapsed since startYear
df = add_n_years(df, startYear)
df_raw = add_n_years(df_raw, startYear)

#train rows come from the yearly aggregates, test rows from the raw data of targetYear
#NOTE(review): get_train runs with its default cutoff (2020), not targetYear -- confirm this is intended
train = get_train(df)
cols = train.columns
test = get_test(df_raw, targetYear = targetYear, cols = cols)

In [31]:
def stat_test_lm(df, cols):
    """Fit and print one OLS regression per column, using 'n_years' as the regressor.

    For every name in cols, the column values are regressed on an intercept
    plus df['n_years'], and the statsmodels summary is printed under a
    numbered header.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'n_years' and every column named in cols.
    cols : iterable of str
        Names of the dependent-variable columns, processed in order.

    Returns
    -------
    None
        Results are only printed.
    """
    # The regressor matrix (intercept + n_years) is the same for every
    # column, so build it once instead of once per loop iteration.
    design = sm.add_constant(df['n_years'].values.reshape(-1, 1))
    for i, col in enumerate(cols, start=1):
        results = sm.OLS(df[col].values, design).fit()
        print(f'<{i}. {col}>')
        print(results.summary(), '\n\n')

In [32]:
#candidate features to test for a linear trend over time
cols = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'explicit', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
    'popularity', 'speechiness', 'tempo', 'valence',
]
#columns left out of the test
drop_cols = ['explicit', 'key', 'mode', 'popularity']
cols = list(filter(lambda name: name not in drop_cols, cols))

#print the regression summary of every remaining feature
stat_test_lm(train, cols)

<1. acousticness>
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.144
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     4.714
Date:                Mon, 30 Jan 2023   Prob (F-statistic):             0.0386
Time:                        17:54:19   Log-Likelihood:                 57.654
No. Observations:                  30   AIC:                            -111.3
Df Residuals:                      28   BIC:                            -108.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1856      0.013  

The statistical tests above show that 'danceability', 'duration_ms', 'instrumentalness', 'loudness', 'speechiness', and 'valence' have some linear relationship with time. Moreover, for 'duration_ms', the R-squared value indicates that 69.7% of its variation is explained by time.