# Decision Tree Regression

#### Importing libraries

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

#### Loading the dataset

In [2]:
data = pd.read_csv('dataset/spotify_features.csv', sep=',')
data.head()

Unnamed: 0,track_popularity,instrumentalness,duration_ms,energy,acousticness,album_month,album_year,danceability,loudness,liveness,genre_code,release_month_code,valence,artist_code,speechiness,tempo
0,67.0,0.00421,162600.0,0.815,0.0724,12,2019,0.726,-4.969,0.357,2,2,0.693,4535,0.106983,99.972
1,70.0,2.3e-05,176616.0,0.931,0.0794,7,2019,0.675,-3.432,0.19031,2,5,0.613,7724,0.0742,124.008
2,60.0,9e-06,169093.0,0.93,0.0287,7,2019,0.718,-3.778,0.204,2,5,0.509838,6862,0.102,121.956
3,62.0,0.0,187675.0,0.856,0.187,7,2019,0.449,-4.788,0.176,2,5,0.152,3635,0.0623,112.648
4,58.0,5e-06,207894.0,0.923,0.146,6,2019,0.679,-6.5,0.124,2,6,0.752,2056,0.181,121.984


#### Data preprocessing

In [3]:
X = data.drop(columns=['track_popularity'])
y = data['track_popularity']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_val.shape}, {y_val.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (11908, 15), (11908,)
Validation set: (3970, 15), (3970,)
Test set: (3970, 15), (3970,)


#### Standardization

In [4]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

#### Training the model

In [5]:
model = DecisionTreeRegressor(max_depth=6, random_state=42)
model.fit(X_train_scaled, y_train)

#### Model evaluation

In [6]:
train_score = model.score(X_train_scaled, y_train)
val_score = model.score(X_val_scaled, y_val)
test_score = model.score(X_test_scaled, y_test)

print(f"Train R^2: {train_score:.2f}")
print(f"Validation R^2: {val_score:.2f}")
print(f"Test R^2: {test_score:.2f}")

mae = mean_absolute_error(y_test, model.predict(X_test_scaled))
print(f"Mean Absolute Error: {mae:.2f}")
mse = mean_squared_error(y_test, model.predict(X_test_scaled))
print(f"Mean Squared Error: {mse:.2f}")

# slechte resultaat want te veel features

Train R^2: 0.18
Validation R^2: 0.11
Test R^2: 0.12
Mean Absolute Error: 18.49
Mean Squared Error: 504.32


#### cross validate

In [7]:
from sklearn.model_selection import cross_validate

scores = cross_validate(model, X, y, cv=6, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.079175,0.001,-0.012347,0.178864
1,0.09808,0.002009,0.023206,0.178487
2,0.074741,0.002016,-0.09088,0.199748
3,0.112638,0.002519,0.011478,0.155614
4,0.080553,0.000989,-0.031552,0.172029
5,0.072695,0.001013,-0.046688,0.138333
