In [None]:
### IMPORT LIBRARY
import numpy as np
import pandas as pd
from Regression import RidgeRegression

# LOAD DATA
# Display options: show every column, cap printed rows at 8.
pd.set_option('display.max_rows', 8)
pd.set_option('display.max_columns', None)
dataset = pd.read_csv('dataset.csv')

# MISSING VALUES: per-column NaN counts (largest first), then the rows that contain any NaN
nan_mask = dataset.isna()
print (nan_mask.sum().sort_values(ascending = False))
dataset.loc[nan_mask.any(axis=1)]



In [None]:
## CLEANING DATA: integer flags, then shifting + scaling
# Replace each flag column by an int copy under a new name.
# The new column is appended before the old one is dropped, so the resulting
# column order is: ..., 'Explicit', 'Major'.
for old_name, new_name in (('explicit', 'Explicit'), ('mode', 'Major')):
    dataset[new_name] = dataset[old_name].astype(int)
    dataset = dataset.drop(columns=old_name)

# LOUDNESS: shift so the minimum becomes 0, then divide by the shifted maximum
loudness_shifted = dataset['loudness'] - dataset['loudness'].min()
dataset['loudness'] = loudness_shifted / loudness_shifted.max()

# DURATION + TEMPO: divide by the column maximum (no shift).
# The original author notes these columns contain outliers.
for scaled_column in ('duration_ms', 'tempo'):
    dataset[scaled_column] = dataset[scaled_column] / dataset[scaled_column].max()




In [None]:
## ONE-HOT ENCODER - GENRE
from sklearn.preprocessing import OneHotEncoder

# Dense output, first category of each feature dropped (reference level).
# The `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2 and
# later removed, so try the new spelling first and fall back for old installs.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# NOTE: every dummy frame below reuses `dataset.index`, so pd.concat(axis=1)
# aligns row-for-row by construction instead of relying on the 'Unnamed: 0'
# column happening to equal the row index.
encoded_genre = encoder.fit_transform(dataset[['track_genre']])
encoded_genre_df = pd.DataFrame(
    encoded_genre,
    columns=encoder.get_feature_names_out(['track_genre']),
    index=dataset.index,
)

dataset_encoded = pd.concat([dataset.drop('track_genre', axis=1), encoded_genre_df], axis=1)

#### OHE - KEY+TIME
encoded_tsign = encoder.fit_transform(dataset[['time_signature']])
encoded_tsign_df = pd.DataFrame(
    encoded_tsign,
    columns=encoder.get_feature_names_out(['time_signature']),
    index=dataset.index,
)

encoded_key = encoder.fit_transform(dataset[['key']])
encoded_key_df = pd.DataFrame(
    encoded_key,
    columns=encoder.get_feature_names_out(['key']),
    index=dataset.index,
)

# time_signature AND key replaced by their dummies (time-signature dummies first)
dataset_encoded2 = pd.concat(
    [dataset.drop(['time_signature', 'key'], axis=1), encoded_tsign_df, encoded_key_df],
    axis=1,
)

# only key replaced by its dummies
dataset_encoded3 = pd.concat([dataset.drop('key', axis=1), encoded_key_df], axis=1)


## FEATURE FRAMES (inputs of the train/test split)

# NUMERIC - no aggr
numeric_df = dataset.select_dtypes(include=['int', 'float']).drop(['time_signature', 'key', 'Unnamed: 0'], axis = 1)
# NUMERIC - aggr: one row per track_id (first occurrence), original row order restored
df_aggr_no_genre = dataset.groupby('track_id').agg('first').sort_values(by='Unnamed: 0')
numeric_df_aggr = df_aggr_no_genre.select_dtypes(include=['int', 'float']).drop(['time_signature', 'key', 'Unnamed: 0'], axis = 1)

## OHE GENRE - NO AGGREGATION
genre_df = dataset_encoded.select_dtypes(include=['int', 'float','uint8']).drop(['time_signature', 'key', 'Unnamed: 0'], axis = 1)

#### OHE GENRE - AGGREGATION with genre 0-1
# The genre dummies are identified by NAME. OneHotEncoder emits float64 by
# default, so the former dtype test (uint8) matched nothing and every genre
# column silently fell into the 'first' branch.
genre_columns = encoded_genre_df.columns.to_list()
_genre_column_set = set(genre_columns)
other_columns = [
    col
    for col in dataset_encoded.select_dtypes(include=['int', 'float','object','bool']).columns
    if col not in _genre_column_set
]

## AGGREGATION FUNCTIONS
# 'first' for ordinary columns; 'max' for genre dummies so that a track listed
# under several genres keeps a 1 for each of them.
aggregation_functions_g = {col: 'first' for col in other_columns}
aggregation_functions_g.update({col: 'max' for col in genre_columns})

genre_df_aggr = dataset_encoded.groupby('track_id').agg(aggregation_functions_g).sort_values(by='Unnamed: 0')
genre_df_aggr = genre_df_aggr.select_dtypes(include=['int', 'float','uint8']).drop(['time_signature', 'key', 'Unnamed: 0'], axis = 1)

## OHE key + time_signature - NO AGGREGATION
KT_df = dataset_encoded2.select_dtypes(include=['int', 'float','uint8']).drop(['Unnamed: 0'], axis = 1)

## OHE key + time_signature - AGGREGATION
# Aggregate the ENCODED frame; the un-encoded aggregate (df_aggr_no_genre)
# carries no dummy columns.
KT_df_aggr = (
    dataset_encoded2.groupby('track_id').agg('first').sort_values(by='Unnamed: 0')
    .select_dtypes(include=['int', 'float','uint8'])
    .drop(['Unnamed: 0'], axis = 1)
)

## OHE key - NO AGGREGATION
K_df = dataset_encoded3.select_dtypes(include=['int', 'float','uint8']).drop(['time_signature', 'Unnamed: 0'], axis = 1)

## OHE key - AGGREGATION (again built from the encoded frame)
K_df_aggr = (
    dataset_encoded3.groupby('track_id').agg('first').sort_values(by='Unnamed: 0')
    .select_dtypes(include=['int', 'float','uint8'])
    .drop(['time_signature', 'Unnamed: 0'], axis = 1)
)




In [None]:
from sklearn.model_selection import train_test_split


def _features_and_target(frame, target='popularity'):
    """Return (X, y) as numpy arrays: every column except `target`, and `target` itself."""
    return frame.drop(target, axis = 1).values, frame[target].values


def _holdout_split(X_all, y_all):
    """70% train / 30% test split with a fixed seed; returns X_train, X_test, y_train, y_test."""
    return train_test_split(X_all, y_all, test_size=0.30, random_state=42)


def _fit_and_predict(X_fit, y_fit, X_eval, reg_param=1.5):
    """Fit a fresh RidgeRegression on (X_fit, y_fit) and predict on X_eval.

    Returns (estimator, betas, predictions), where `betas` is whatever
    `RidgeRegression.fit` returns.
    """
    estimator = RidgeRegression(reg_param=reg_param)
    betas = estimator.fit(X_fit, y_fit)
    return estimator, betas, estimator.predict(X_eval)


## 1st dataset - num not aggregated
X, y = _features_and_target(numeric_df)
# 2nd dataset - num aggregated
X_2, y_2 = _features_and_target(numeric_df_aggr)
## 3rd - ohe genre not aggregated
X_3, y_3 = _features_and_target(genre_df)
## 4th dataset - ohe genre aggregated
X_4, y_4 = _features_and_target(genre_df_aggr)
## 5th dataset - ohe key+time_sign not aggregated
X_5, y_5 = _features_and_target(KT_df)
## 6th dataset - ohe key+time_sign aggregated
X_6, y_6 = _features_and_target(KT_df_aggr)
## 7th dataset - ohe key not aggregated
X_7, y_7 = _features_and_target(K_df)
## 8th dataset - ohe key aggregated
X_8, y_8 = _features_and_target(K_df_aggr)

# SPLIT each dataset into train and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = _holdout_split(X, y)
X_2_train, X_2_test, y_2_train, y_2_test = _holdout_split(X_2, y_2)
X_3_train, X_3_test, y_3_train, y_3_test = _holdout_split(X_3, y_3)
X_4_train, X_4_test, y_4_train, y_4_test = _holdout_split(X_4, y_4)
X_5_train, X_5_test, y_5_train, y_5_test = _holdout_split(X_5, y_5)
X_6_train, X_6_test, y_6_train, y_6_test = _holdout_split(X_6, y_6)
X_7_train, X_7_test, y_7_train, y_7_test = _holdout_split(X_7, y_7)
X_8_train, X_8_test, y_8_train, y_8_test = _holdout_split(X_8, y_8)


# REGRESSIONS: one fresh model per dataset, all with reg_param=1.5.
# `model` is rebound each time, so after this cell it is the 8th fitted model.

## 1st REGRESSION
model, betas_1, y_pred = _fit_and_predict(X_train, y_train, X_test)
print("Predictions 1 [numeric features - not aggregation]:", y_pred)

## 2nd REGRESSION
model, betas_2, y_pred_2 = _fit_and_predict(X_2_train, y_2_train, X_2_test)
print("Predictions 2 [numeric features - aggregation]:", y_pred_2)

## 3rd REGRESSION
model, betas_3, y_pred_3 = _fit_and_predict(X_3_train, y_3_train, X_3_test)
print("Predictions 3 [numeric features + genre features - not aggregation]:", y_pred_3)

## 4th REGRESSION
model, betas_4, y_pred_4 = _fit_and_predict(X_4_train, y_4_train, X_4_test)
print("Predictions 4 [numeric features + genre - aggregation]:", y_pred_4)

## 5th REGRESSION
model, betas_5, y_pred_5 = _fit_and_predict(X_5_train, y_5_train, X_5_test)
print("Predictions 5 [numeric features + key and time signature - no aggregation]:", y_pred_5)

## 6th REGRESSION
model, betas_6, y_pred_6 = _fit_and_predict(X_6_train, y_6_train, X_6_test)
print("Predictions 6: [numeric features + key and time signature - aggregation]", y_pred_6)

## 7th REGRESSION
model, betas_7, y_pred_7 = _fit_and_predict(X_7_train, y_7_train, X_7_test)
print("Predictions 7 [numeric features + key - no aggregation]:", y_pred_7)

## 8th REGRESSION (aggregated dataset; the label previously said "no aggregation")
model, betas_8, y_pred_8 = _fit_and_predict(X_8_train, y_8_train, X_8_test)
print("Predictions 8 [numeric features + key - aggregation]:", y_pred_8)







In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error of each model on its own 30% hold-out split
mse, mse_2, mse_3, mse_4 = (
    mean_squared_error(y_test, y_pred),
    mean_squared_error(y_2_test, y_pred_2),
    mean_squared_error(y_3_test, y_pred_3),
    mean_squared_error(y_4_test, y_pred_4),
)
mse_5, mse_6, mse_7, mse_8 = (
    mean_squared_error(y_5_test, y_pred_5),
    mean_squared_error(y_6_test, y_pred_6),
    mean_squared_error(y_7_test, y_pred_7),
    mean_squared_error(y_8_test, y_pred_8),
)

# Coefficient of determination on the same splits
r_squared, r_squared_2, r_squared_3, r_squared_4 = (
    r2_score(y_test, y_pred),
    r2_score(y_2_test, y_pred_2),
    r2_score(y_3_test, y_pred_3),
    r2_score(y_4_test, y_pred_4),
)
r_squared_5, r_squared_6, r_squared_7, r_squared_8 = (
    r2_score(y_5_test, y_pred_5),
    r2_score(y_6_test, y_pred_6),
    r2_score(y_7_test, y_pred_7),
    r2_score(y_8_test, y_pred_8),
)

# One summary line per model; the label strings carry their own padding.
for mse_label, mse_value, r2_label, r2_value in (
    ('MSE 1: ', mse, '| R^2 1: ', r_squared),
    ('MSE 2: ', mse_2, '  | R^2 2: ', r_squared_2),
    ('MSE 3: ', mse_3, '| R^2 3: ', r_squared_3),
    ('MSE 4: ', mse_4, '| R^2 4: ', r_squared_4),
    ('MSE 5: ', mse_5, ' | R^2 5: ', r_squared_5),
    ('MSE 6: ', mse_6, ' | R^2 6: ', r_squared_6),
    ('MSE 7: ', mse_7, '| R^2 7: ', r_squared_7),
    ('MSE 8: ', mse_8, ' | R^2 8: ', r_squared_8),
):
    print(mse_label, mse_value, r2_label, r2_value)


In [None]:
from sklearn.model_selection import cross_validate

model = RidgeRegression(reg_param=1.5)
scores = ['r2' , 'neg_mean_squared_error']


def _print_cv_report(tag, cv_result, holdout_r2, holdout_mse):
    """Print best / worst / mean fold scores next to the single hold-out scores.

    `cv_result` is the dict returned by `cross_validate`. The MSE scorer is
    negated, so the largest 'neg' value is the smallest MSE (paired with the
    best R^2) and the smallest 'neg' value is the largest MSE (paired with the
    worst R^2).
    """
    fold_r2 = cv_result['test_r2']
    fold_neg_mse = cv_result['test_neg_mean_squared_error']
    print(f'[{tag}] r2 Max:   ',fold_r2.max(), '\t mse Min:  ',-fold_neg_mse.max())
    # This line reports the LARGEST fold MSE; it used to be labelled "mse Min".
    print(f'[{tag}] r2 Min:  ',fold_r2.min(), '\t mse Max:  ',-fold_neg_mse.min())
    print(f'[{tag}] r2 Mean: ',fold_r2.mean(),'\t mse Mean: ',-fold_neg_mse.mean())
    print(f'R^2 {tag}:\t      ',holdout_r2,'\t    MSE  : ',holdout_mse)
    print('--------------------------------------------------------------------')


# 5-fold cross-validation on the four aggregated datasets (2, 4, 6, 8).
# `cv_acc` is rebound each time, so after this cell it holds the result for dataset 8.
cv_acc = cross_validate(model, X_2, y_2, cv=5, scoring=scores)
r_squared_2 = r2_score(y_2_test, y_pred_2)
_print_cv_report('2', cv_acc, r_squared_2, mse_2)

cv_acc = cross_validate(model, X_4, y_4, cv=5, scoring=scores)
r_squared_4 = r2_score(y_4_test, y_pred_4)
_print_cv_report('4', cv_acc, r_squared_4, mse_4)

cv_acc = cross_validate(model, X_6, y_6, cv=5, scoring=scores)
r_squared_6 = r2_score(y_6_test, y_pred_6)
_print_cv_report('6', cv_acc, r_squared_6, mse_6)

cv_acc = cross_validate(model, X_8, y_8, cv=5, scoring=scores)
r_squared_8 = r2_score(y_8_test, y_pred_8)
_print_cv_report('8', cv_acc, r_squared_8, mse_8)


In [None]:
# SPLIT the held-out 30% of each dataset into two halves: final test / validation.
# The halves are cut from X_*_test (not from the full X_*), so no validation row
# was seen during training. Splitting the full data with the same seed would put
# rows of X_*_train into X_*_Val and bias the validation MSE downward.
X_Test, X_Val, y_Test, y_Val = train_test_split(X_test, y_test, test_size=0.50, random_state=42)
X_2_Test, X_2_Val, y_2_Test, y_2_Val = train_test_split(X_2_test, y_2_test, test_size=0.50, random_state=42)
X_3_Test, X_3_Val, y_3_Test, y_3_Val = train_test_split(X_3_test, y_3_test, test_size=0.50, random_state=42)
X_4_Test, X_4_Val, y_4_Test, y_4_Val = train_test_split(X_4_test, y_4_test, test_size=0.50, random_state=42)
X_5_Test, X_5_Val, y_5_Test, y_5_Val = train_test_split(X_5_test, y_5_test, test_size=0.50, random_state=42)
X_6_Test, X_6_Val, y_6_Test, y_6_Val = train_test_split(X_6_test, y_6_test, test_size=0.50, random_state=42)
X_7_Test, X_7_Val, y_7_Test, y_7_Val = train_test_split(X_7_test, y_7_test, test_size=0.50, random_state=42)
X_8_Test, X_8_Val, y_8_Test, y_8_Val = train_test_split(X_8_test, y_8_test, test_size=0.50, random_state=42)


In [None]:
import matplotlib.pyplot as plt

# Log-spaced grid of 50 regularization strengths between 0.001 and 10.
# (alpha = 0 is not part of the sweep; it could not be drawn on the log x-axis.)
vec = np.exp(np.linspace(np.log(0.001), np.log(10), 50))
alphas = vec

mse_val_2 = []
mse_val_4 = []
mse_val_6 = []
mse_val_8 = []

# (case label, training features, training target, validation features,
#  validation target, list collecting the validation MSE) for each aggregated dataset
sweep_cases = [
    ('2nd', X_2_train, y_2_train, X_2_Val, y_2_Val, mse_val_2),
    ('4th', X_4_train, y_4_train, X_4_Val, y_4_Val, mse_val_4),
    ('6th', X_6_train, y_6_train, X_6_Val, y_6_Val, mse_val_6),
    ('8th', X_8_train, y_8_train, X_8_Val, y_8_Val, mse_val_8),
]

# For every alpha: create one model, then refit it on each training split in
# turn and record the MSE on the matching validation split.
# After the loop `model` is the instance built with the last alpha, last fitted
# on the 8th training split.
for alpha in alphas:
    model = RidgeRegression(reg_param=alpha)
    for _, fit_features, fit_target, val_features, val_target, val_curve in sweep_cases:
        model.fit(fit_features, fit_target)
        val_curve.append(mean_squared_error(val_target, model.predict(val_features)))


# 2x2 grid, one panel per case (row-major order: 2nd, 4th, 6th, 8th)
fig, axs = plt.subplots(2, 2, figsize=(20,10))

for ax, (case_label, *_unused, val_curve) in zip(axs.flat, sweep_cases):
    ax.plot(alphas, val_curve, label='Validation MSE', color='green')
    ax.set_title(f'Mean Squared Error vs. Alpha for Ridge Regression, {case_label} case')
    ax.set_xscale('log')
    ax.set(xlabel='Regularization parameter', ylabel='Mean Squared Error (MSE)')
    ax.legend()
    ax.grid(True)

    


In [None]:
# DICTIONARIES with features as keys and the corresponding regression coefficient as value


def _coefficients_by_feature(feature_frame, betas):
    """Pair each feature column of `feature_frame` with its coefficient.

    The labels are the columns of `feature_frame` WITHOUT the 'popularity'
    target (i.e. exactly the columns that formed X), followed by 'intercept'.
    NOTE(review): this assumes `RidgeRegression.fit` returns coefficients in
    column order with the intercept last - confirm against the Regression
    module. If no intercept entry is returned, `zip` simply leaves the extra
    label unused.
    """
    labels = feature_frame.drop('popularity', axis = 1).columns.append(pd.Index(['intercept']))
    return dict(zip(labels, betas))


def _sorted_by_coefficient(coefficients):
    """Return a new dict ordered from the largest to the smallest coefficient."""
    return dict(sorted(coefficients.items(), key=lambda item: item[1], reverse=True))


# Each dictionary takes its labels from the frame its model was fitted on.
my_dict_1 = _coefficients_by_feature(numeric_df, betas_1)
my_dict_2 = _coefficients_by_feature(numeric_df_aggr, betas_2)
my_dict_3 = _coefficients_by_feature(genre_df, betas_3)
my_dict_4 = _coefficients_by_feature(genre_df_aggr, betas_4)
my_dict_6 = _coefficients_by_feature(KT_df_aggr, betas_6)
my_dict_8 = _coefficients_by_feature(K_df_aggr, betas_8)

sorted_dict_1 = _sorted_by_coefficient(my_dict_1)
sorted_dict_2 = _sorted_by_coefficient(my_dict_2)
sorted_dict_3 = _sorted_by_coefficient(my_dict_3)
sorted_dict_4 = _sorted_by_coefficient(my_dict_4)
sorted_dict_6 = _sorted_by_coefficient(my_dict_6)
sorted_dict_8 = _sorted_by_coefficient(my_dict_8)


# Cell output: swap in sorted_dict_1 / _3 / _4 / _6 / _8 to inspect another model.
sorted_dict_2



In [None]:
# Recap of the hold-out MSE and R^2 of the eight models, one line each.
# The label strings carry their own padding.
for mse_label, mse_value, r2_label, r2_value in (
    ('MSE 1: ', mse, '| R^2 1: ', r_squared),
    ('MSE 2: ', mse_2, '  | R^2 2: ', r_squared_2),
    ('MSE 3: ', mse_3, '| R^2 3: ', r_squared_3),
    ('MSE 4: ', mse_4, '| R^2 4: ', r_squared_4),
    ('MSE 5: ', mse_5, ' | R^2 5: ', r_squared_5),
    ('MSE 6: ', mse_6, ' | R^2 6: ', r_squared_6),
    ('MSE 7: ', mse_7, '| R^2 7: ', r_squared_7),
    ('MSE 8: ', mse_8, ' | R^2 8: ', r_squared_8),
):
    print(mse_label, mse_value, r2_label, r2_value)


In [None]:
# Refit whichever `model` is currently bound on the 6th training split
# (numeric features + key/time_signature, aggregated by track_id).
# NOTE(review): with a top-to-bottom run, `model` is the instance left over from
# the alpha sweep (built with the last alpha of the grid), not the reg_param=1.5
# model - confirm this is intended.
# The return value of `fit` (stored as betas_* elsewhere in this notebook) is the cell output.
model.fit(X_6_train, y_6_train)

In [None]:
# Refit whichever `model` is currently bound on the 8th training split
# (numeric features + key, aggregated by track_id).
# NOTE(review): with a top-to-bottom run, `model` is the instance left over from
# the alpha sweep (built with the last alpha of the grid), not the reg_param=1.5
# model - confirm this is intended.
# The return value of `fit` (stored as betas_* elsewhere in this notebook) is the cell output.
model.fit(X_8_train, y_8_train)