In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("UltimateClassicRock.csv")
df['Decade'] = (df['Year'] // 10) * 10
df['Decade'] = df['Decade'].astype(str) + 's'
order = ['1960s', '1970s', '1980s', '1990s', '2000s', '2010s', '2020s']
df['Decade'] = pd.Categorical(df['Decade'], categories=order, ordered=True)


def convert_duration(duration):
    minutes, seconds = map(int, duration.split(':'))
    return minutes * 60 + seconds

df['Duration'] = df['Duration'].apply(convert_duration)

In [3]:
df.head()

Unnamed: 0,Track,Artist,Album,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Decade
0,Play A Simple Song,38 Special,38 Special,1977,193,4,0.521,0.367,0,-13.866,1,0.0278,0.692,3e-06,0.108,0.789,83.412,16,1970s
1,Four Wheels,38 Special,38 Special,1977,283,4,0.535,0.71,2,-12.287,1,0.0428,0.01,0.023,0.0495,0.445,160.361,10,1970s
2,Fly Away,38 Special,38 Special,1977,313,4,0.563,0.563,2,-10.781,1,0.0263,0.0357,0.00185,0.14,0.564,106.739,13,1970s
3,Tell Everybody,38 Special,38 Special,1977,249,4,0.638,0.694,11,-10.206,0,0.031,0.161,3.4e-05,0.0908,0.936,124.962,10,1970s
4,Just Wanna Rock & Roll,38 Special,38 Special,1977,357,4,0.388,0.701,2,-9.984,1,0.036,0.013,0.0422,0.115,0.769,126.769,11,1970s


In [4]:
A = df.drop(['Track', 'Artist', 'Album', 'Year', 'Energy'], axis=1)
b = df['Energy']
A.head()

Unnamed: 0,Duration,Time_Signature,Danceability,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Decade
0,193,4,0.521,0,-13.866,1,0.0278,0.692,3e-06,0.108,0.789,83.412,16,1970s
1,283,4,0.535,2,-12.287,1,0.0428,0.01,0.023,0.0495,0.445,160.361,10,1970s
2,313,4,0.563,2,-10.781,1,0.0263,0.0357,0.00185,0.14,0.564,106.739,13,1970s
3,249,4,0.638,11,-10.206,0,0.031,0.161,3.4e-05,0.0908,0.936,124.962,10,1970s
4,357,4,0.388,2,-9.984,1,0.036,0.013,0.0422,0.115,0.769,126.769,11,1970s


# Pipeline, Column Transformer and Preprocessing

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [6]:
#A - attributes dataset
#b - target 
# dedicate 10% of data to testing, and shuffle to randomize
A_train, A_test, b_train, b_test = train_test_split(A, b, test_size=0.2, shuffle=True)

In [7]:
num_attributes = [
    "Duration", 
    "Danceability", 
    "Key", 
    "Loudness", 
    "Speechiness", 
    "Acousticness", 
    "Instrumentalness", 
    "Liveness", 
    "Valence",
    "Tempo",
    "Popularity"
]

cat_attributes = ["Key", "Mode", "Decade"]

In [8]:
# pipeline for numerical attr: fills missing values with the mean and scales the data
num_pipeline = Pipeline(
    [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ],
    verbose = True
)

In [9]:
# pipeline for categorical features: fills missing values with the most frequent category and applies one-hot encoding
# one-hot encoding - each unique category gets its own binary column, 1 if value matches the cat, 0 if not
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")), #any new category during prediction will be encoded as all zeros 
])

In [10]:
# preprocessing pipeline that applies numerical and categorical transformations to the respective columns
preprocessing = ColumnTransformer([
    ("numerical", num_pipeline, num_attributes),   # apply num_pipeline to numerical attr
    ("categorical", cat_pipeline, cat_attributes),  # apply cat_pipeline to categorical attr
])


In [11]:
A_train_preprocess = preprocessing.fit_transform(A_train)
A_test_preprocess = preprocessing.fit_transform(A_test)

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s


# Linear Regression, Decision Tree, SVM

## 1) Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
linear_regression = LinearRegression()
linear_regression.fit(A_train_preprocess, b_train)
print("Coefficients:", linear_regression.coef_) # how much it changes +-
print("Intercept:", linear_regression.intercept_) # starting point

Coefficients: [ 0.00466596 -0.01921006 -0.00075556  0.13180469  0.03457237 -0.07170815
  0.01832139  0.01789255  0.05275569  0.00471747 -0.00147816 -0.00641664
  0.00359331  0.00252035  0.00376827  0.00243458 -0.00620946 -0.00556291
 -0.0035634   0.01257453  0.00886551 -0.00871168 -0.00329247  0.00314359
 -0.00314359 -0.01961479 -0.00703024  0.03352505  0.01171523 -0.00925497
  0.00156746 -0.01090774]
Intercept: 0.6524551298082071


## 2) Decision Tree

In [14]:
from sklearn.tree import DecisionTreeRegressor

In [15]:
#DecisionTreeRegressor with a maximum depth of 2
tree_regressor = DecisionTreeRegressor(max_depth=2)
tree_regressor.fit(A_train_preprocess, b_train)

In [16]:
print("Tree structure:")
print(tree_regressor)

from sklearn.tree import export_text
feature_names = [f'Feature_{i}' for i in range(A_train_preprocess.shape[1])]
tree_rules = export_text(tree_regressor, feature_names=feature_names)
print("\nTree structure in text form:")
print(tree_rules)

Tree structure:
DecisionTreeRegressor(max_depth=2)

Tree structure in text form:
|--- Feature_3 <= -0.21
|   |--- Feature_5 <= 1.07
|   |   |--- value: [0.55]
|   |--- Feature_5 >  1.07
|   |   |--- value: [0.31]
|--- Feature_3 >  -0.21
|   |--- Feature_4 <= -0.28
|   |   |--- value: [0.68]
|   |--- Feature_4 >  -0.28
|   |   |--- value: [0.84]



## 3) SVM

In [17]:
from sklearn.svm import LinearSVR

In [18]:
svm_regressor = LinearSVR(epsilon=0.5)
svm_regressor.fit(A_train_preprocess, b_train)

# Evaluation


In [19]:
from sklearn.metrics import mean_squared_error, r2_score

## 1) Linear regression

In [20]:
b_pred_lin = linear_regression.predict(A_test_preprocess)

mse_lin = mean_squared_error(b_test, b_pred_lin)  # Mean Squared Error
r2_lin = r2_score(b_test, b_pred_lin)  # R2 score

print("\nLinear Regression:")
print(f"Mean Squared Error: {mse_lin}")
print(f"R^2 score: {r2_lin}")


Linear Regression:
Mean Squared Error: 0.012504309254516607
R^2 score: 0.7637746757415895


## 2) Decision Tree

In [21]:
b_pred_tree = tree_regressor.predict(A_test_preprocess)

mse_tree = mean_squared_error(b_test, b_pred_tree)  # Mean Squared Error
r2_tree = r2_score(b_test, b_pred_tree)  # R2 score

print("\nDecision Tree:")
print(f"Mean Squared Error: {mse_tree}")
print(f"R^2 score: {r2_tree}")


Decision Tree:
Mean Squared Error: 0.023581231843490577
R^2 score: 0.5545148456217815


## 3) SVM

In [22]:
b_pred_svm = svm_regressor.predict(A_test_preprocess)

mse_svm = mean_squared_error(b_test, b_pred_svm)  # Mean Squared Error
r2_svm = r2_score(b_test, b_pred_svm)  # R2 score

print("\nSVR:")
print(f"Mean Squared Error: {mse_svm}")
print(f"R^2 score: {r2_svm}")


SVR:
Mean Squared Error: 0.07317581458237761
R^2 score: -0.3824018724866154


# Implementing Linear Regression using NumPy

### Closed-form Solution for Linear Regression

The closed-form solution for linear regression is given by the following equation:

$$
\theta_{\text{best}} = (X^T X)^{-1} X^T y
$$



In [23]:
theta_best = np.linalg.inv(A_train_preprocess.T @ A_train_preprocess) @ A_train_preprocess.T @ b_train
theta_best

array([ 7.65100331e-01, -2.29522759e+00,  5.38596341e+13,  5.62877649e-01,
        9.02253979e-01,  8.03755489e-01, -2.59797322e-01, -5.13932585e-01,
        1.30961295e+00, -6.14094140e-01,  1.49496439e-01,  7.94920835e+13,
        6.40748699e+13,  4.86576564e+13,  3.32404428e+13,  1.78232292e+13,
        2.40601564e+12, -1.30111979e+13, -2.84284115e+13, -4.38456251e+13,
       -5.92628386e+13, -7.46800522e+13, -9.00972658e+13, -6.92157343e+00,
       -6.30173933e+00, -2.18433774e+00, -4.57449102e+00, -2.84730947e+00,
       -1.78405885e+00, -1.62308442e+00, -1.84271430e+00, -5.80818022e-01])

In [24]:
b_predict = A_test_preprocess @ theta_best
b_predict

array([-1.34155033e+11, -1.61320975e+12, -1.61320975e+12, ...,
       -7.68035627e+11,  7.71384981e+10,  2.88432030e+11], shape=(2884,))

+ Inverse of matrix has a high cost n^3
+ Not scalable – Uses too much memory with big datasets.

In [25]:
mse_manual = mean_squared_error(b_test, b_predict)
r2_manual = r2_score(b_test, b_predict)

print("Manual Linear Regression (Closed-form solution):")
print(f"  Mean Squared Error: {mse_manual}")
print(f"  R^2 score: {r2_manual}")

print("scikit-learn LinearRegression:")
print(f"  Mean Squared Error: {mse_lin}")
print(f"  R^2 score: {r2_lin}")


Manual Linear Regression (Closed-form solution):
  Mean Squared Error: 1.2196829382979775e+24
  R^2 score: -2.304165641839196e+25
scikit-learn LinearRegression:
  Mean Squared Error: 0.012504309254516607
  R^2 score: 0.7637746757415895


## 2) Implementing Linear Regression using gradient descent

In [32]:
from linear_regression import gradient_descent, add_intercept

In [45]:

A_train_with_intercept = add_intercept(A_train_preprocess)  
theta = np.zeros(A_train_with_intercept.shape[1])  
learning_rate = 0.1  
iterations = 10  
batch_size = 2  


theta, cost_history = gradient_descent(A_train_with_intercept, b_train, theta, learning_rate, iterations, batch_size)


print("Optimal theta:", theta)
print("MSE:", cost_history[-1])


Optimal theta: [ 0.36358292  0.05102964 -0.04154603  0.00976622  0.148902    0.00299694
 -0.11768056  0.00155227  0.01163257  0.02177965 -0.00313994  0.04592658
  0.01476225  0.01444581  0.07305982  0.06015735 -0.02387031  0.07511632
  0.04901329 -0.00155317  0.01312727  0.02477182  0.03361357  0.03093889
  0.18495531  0.17862761  0.04244954  0.13299854  0.0896151   0.00138585
  0.04874974  0.00674426  0.0416399 ]
MSE: 0.01325392907230254


In [46]:
print("scikit-learn LinearRegression:")
print(f"  Mean Squared Error: {mse_lin}")

scikit-learn LinearRegression:
  Mean Squared Error: 0.012504309254516607
