In [1]:
import pandas as pd

final_dataset = pd.read_pickle('final_dataset.pkl')
final_dataset.head()

Unnamed: 0,acousticness,artist_popularity,danceability,duration_ms,energy,followers,instrumentalness,key,liveness,loudness,...,name,peak_pos,speechiness,tempo,time_signature,uri,valence,weeks,hit,popularity
0,0.294,28,0.698,235584.0,0.606,425,3e-06,10,0.151,-7.447,...,Blood,0,0.0262,115.018,4,spotify:track:5qljLQuKnNJf4F4vfxQB0V,0.622,0,False,0.0
1,0.863,36,0.719,656960.0,0.308,2965,0.0,6,0.253,-10.34,...,The Ugly Duckling,0,0.922,115.075,3,spotify:track:3VAX2MJdmdqARLSU5hPMpm,0.589,0,False,0.0
2,0.763,10,0.719,316578.0,0.126,158,0.0,3,0.113,-20.254,...,The Crime At Pickets Mill,0,0.938,112.822,3,spotify:track:6aCe9zzoZmCojX7bbgKKtf,0.533,0,False,0.0
3,0.971,62,0.367,183653.0,0.349,201820,0.296,11,0.633,-7.74,...,Already Gone,0,0.0268,81.85,4,spotify:track:4PrAZpH9Ic7S47E78BN6E4,0.192,0,False,0.0
4,0.824,36,0.688,29240.0,0.304,2965,0.0,10,0.142,-9.96,...,Three Blind Mice,0,0.531,77.056,3,spotify:track:1WJzRtI1ABzV3TPIeJZVvi,0.414,0,False,0.0


In [2]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(final_dataset, test_size=0.2, random_state=42)
train.shape, test.shape

((40152, 22), (10038, 22))

In [3]:
shortened_dataset = train[['energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
                             'instrumentalness', 'time_signature', 'danceability', 'key',
                             'duration_ms', 'loudness', 'valence', 'mode']]
X = shortened_dataset
X.shape

(40152, 13)

In [4]:
shortened_dataset_test = test[['energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
                             'instrumentalness', 'time_signature', 'danceability', 'key',
                             'duration_ms', 'loudness', 'valence', 'mode']]
X_test = shortened_dataset_test
X_test.shape

(10038, 13)

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40152 entries, 23745 to 15795
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   energy            40152 non-null  object
 1   liveness          40152 non-null  object
 2   tempo             40152 non-null  object
 3   speechiness       40152 non-null  object
 4   acousticness      40152 non-null  object
 5   instrumentalness  40152 non-null  object
 6   time_signature    40152 non-null  int16 
 7   danceability      40152 non-null  object
 8   key               40152 non-null  int32 
 9   duration_ms       40152 non-null  object
 10  loudness          40152 non-null  object
 11  valence           40152 non-null  object
 12  mode              40152 non-null  int16 
dtypes: int16(2), int32(1), object(10)
memory usage: 3.7+ MB


In [6]:
X = X.astype(float)
X_test = X_test.astype(float)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40152 entries, 23745 to 15795
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   energy            40152 non-null  float64
 1   liveness          40152 non-null  float64
 2   tempo             40152 non-null  float64
 3   speechiness       40152 non-null  float64
 4   acousticness      40152 non-null  float64
 5   instrumentalness  40152 non-null  float64
 6   time_signature    40152 non-null  float64
 7   danceability      40152 non-null  float64
 8   key               40152 non-null  float64
 9   duration_ms       40152 non-null  float64
 10  loudness          40152 non-null  float64
 11  valence           40152 non-null  float64
 12  mode              40152 non-null  float64
dtypes: float64(13)
memory usage: 4.3 MB


In [7]:
y_pop = train[['peak_pos']]
y_pop_test = test[['peak_pos']]
y_pop.shape

(40152, 1)

In [8]:
from morfist import MixedRandomForest, cross_validation
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_validate
import numpy as np

In [9]:
# Config
n_trees = 11
nfolds = 2

In [10]:
# Data
X, y_pop = X.values, y_pop.values
x_reg, y_reg = X, y_pop.ravel()
x_mix_1, y_mix_1 = x_reg, np.vstack([y_reg, y_reg < y_reg.mean()]).T

# Test
X_test, y_pop_test = X_test.values, y_pop_test.values
x_reg_test, y_reg_test = X_test, y_pop_test.ravel()
x_mix_1_test, y_mix_1_test = x_reg_test, np.vstack([y_reg_test, y_reg_test < y_reg_test.mean()]).T

In [11]:
x_reg.shape, y_reg.shape, x_mix_1.shape, y_mix_1.shape

((40152, 13), (40152,), (40152, 13), (40152, 2))

In [12]:
def test_reg():
    reg_rf = MixedRandomForest(
        n_estimators=n_trees,
        min_samples_leaf=5
    )

    reg_skrf = RandomForestRegressor(n_estimators=n_trees)
    reg_scores = cross_validation(
        reg_rf,
        x_reg,
        y_reg,
        folds=nfolds,
        verbose=1
    )

    scores = cross_validate(
        reg_skrf,
        x_reg,
        y_reg,
        scoring=('neg_mean_squared_error')
    )

    print('Regression: ')
    print(f'\tmorfist (rmse): {reg_scores.mean()}')
    print('\tscikit-learn')
    print(f'\t\t rmse: {np.sqrt(-scores["test_score"].mean())}')
    
    return reg_skrf, reg_rf

In [13]:
print('######### Train ###########')
reg_skrf, reg_rf = test_reg()

######### Train ###########
Running fold 1 of 2 ...
Running fold 2 of 2 ...
Regression: 
	morfist (rmse): 17.134520933768155
	scikit-learn
		 rmse: 17.973554068402876


In [14]:
print('######### Test ############')
reg_scores = cross_validation(
    reg_rf,
    x_reg_test,
    y_reg_test,
    folds=nfolds,
    verbose=1
)

scores = cross_validate(
    reg_skrf,
    x_reg_test,
    y_reg_test,
    scoring=('neg_mean_squared_error')
)

print('Regression: ')
print(f'\tmorfist (rmse): {reg_scores.mean()}')
print('\tscikit-learn')
print(f'\t\t rmse: {np.sqrt(-scores["test_score"].mean())}')

######### Test ############
Running fold 1 of 2 ...
Running fold 2 of 2 ...
Regression: 
	morfist (rmse): 17.074456942546966
	scikit-learn
		 rmse: 17.767470008395758


In [15]:
def test_mix_1():
    mix_rf = MixedRandomForest(
        n_estimators=n_trees,
        min_samples_leaf=5,
        class_targets=[1]
    )

    mix_scores = cross_validation(
        mix_rf,
        x_mix_1,
        y_mix_1,
        folds=nfolds,
        verbose=1,
        class_targets=[1]
    )
    print('Mixed output: ')
    print(f'\ttask 1 (original) (rmse): {mix_scores[0]}')
    print(f'\ttask 2 (additional) (accuracy): {mix_scores[1]}')
    
    return mix_rf

In [16]:
mix_rf = test_mix_1()

Running fold 1 of 2 ...


  pred_avg[:, i], _ = scipy.stats.mode(pred[:, i, :].T)


Running fold 2 of 2 ...
Mixed output: 
	task 1 (original) (rmse): 16.933083609869865
	task 2 (additional) (accuracy): 0.9015242080095637


In [17]:
print('######## Test ############')
mix_rf = MixedRandomForest(
    n_estimators=n_trees,
    min_samples_leaf=5,
    class_targets=[1]
)

mix_scores = cross_validation(
    mix_rf,
    x_mix_1_test,
    y_mix_1_test,
    folds=nfolds,
    verbose=1,
    class_targets=[1]
)
print('Mixed output: ')
print(f'\ttask 1 (original) (rmse): {mix_scores[0]}')
print(f'\ttask 2 (additional) (accuracy): {mix_scores[1]}')

######## Test ############
Running fold 1 of 2 ...
Running fold 2 of 2 ...
Mixed output: 
	task 1 (original) (rmse): 16.835958585251475
	task 2 (additional) (accuracy): 0.9005778043434948
