# Linear Regression

#### Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

#### Loading the dataset

In [2]:
data = pd.read_csv('dataset/spotify_features.csv', sep=',')
data.head()

Unnamed: 0,track_popularity,instrumentalness,duration_ms,energy,acousticness,album_month,album_year,danceability,loudness,liveness,genre_code,release_month_code,valence,artist_code,speechiness,tempo
0,67.0,0.00421,162600.0,0.815,0.0724,12,2019,0.726,-4.969,0.357,2,2,0.693,4535,0.106983,99.972
1,70.0,2.3e-05,176616.0,0.931,0.0794,7,2019,0.675,-3.432,0.19031,2,5,0.613,7724,0.0742,124.008
2,60.0,9e-06,169093.0,0.93,0.0287,7,2019,0.718,-3.778,0.204,2,5,0.509838,6862,0.102,121.956
3,62.0,0.0,187675.0,0.856,0.187,7,2019,0.449,-4.788,0.176,2,5,0.152,3635,0.0623,112.648
4,58.0,5e-06,207894.0,0.923,0.146,6,2019,0.679,-6.5,0.124,2,6,0.752,2056,0.181,121.984


#### Data Preprocessing

In [3]:
X = data.drop(columns=['track_popularity'])
y = data['track_popularity']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_val.shape}, {y_val.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (11908, 15), (11908,)
Validation set: (3970, 15), (3970,)
Test set: (3970, 15), (3970,)


#### Standardization

In [4]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

#### Linear Regression Model Training

In [5]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

#### Model Evaluation

In [6]:
train_score = model.score(X_train_scaled, y_train)
val_score = model.score(X_val_scaled, y_val)
test_score = model.score(X_test_scaled, y_test)

print(f"Train R^2: {train_score:.2f}")
print(f"Validation R^2: {val_score:.2f}")
print(f"Test R^2: {test_score:.2f}")

Train R^2: 0.07
Validation R^2: 0.07
Test R^2: 0.07


#### cross-validation

In [7]:
from sklearn.model_selection import cross_validate

scores = cross_validate(model, X, y, cv=6, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.007536,0.002987,-0.017765,0.07563
1,0.006526,0.001007,0.004878,0.079373
2,0.007157,0.001,-0.240633,0.102407
3,0.00545,0.001004,0.012162,0.063163
4,0.009543,0.00231,-0.013317,0.080448
5,0.00556,0.000981,-0.112667,0.056029


#### hyperparameter tuning

In [8]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Pipeline with standard scaling and ridge or lasso regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge())  # Start with Ridge, can change to Lasso for comparison
])

# Hyperparameters for Ridge and Lasso
ridge_params = {
    'regressor': [Ridge()],
    'regressor__alpha': [0.1, 1.0, 10, 50, 100]
}

lasso_params = {
    'regressor': [Lasso()],
    'regressor__alpha': [0.01, 0.1, 1.0, 10, 50]
}

# Combine both parameter sets for GridSearch
param_grid = [ridge_params, lasso_params]

# GridSearchCV setup with 5-fold cross-validation
gridsearch = GridSearchCV(pipeline, param_grid, cv=8, scoring='r2', n_jobs=-1) # n_jobs is how much cores can it use
gridsearch.fit(X_train, y_train)

# Best estimator and its scores
print(f"Best parameters: {gridsearch.best_params_}")
best_model = gridsearch.best_estimator_

# Evaluate best model on training, validation, and test sets
train_score = best_model.score(X_train, y_train)
val_score = best_model.score(X_val, y_val)
test_score = best_model.score(X_test, y_test)

print(f"Best Model Train R^2: {train_score:.2f}")
print(f"Best Model Validation R^2: {val_score:.2f}")
print(f"Best Model Test R^2: {test_score:.2f}")

# this may suggest that the relationship between features and target is non-linear

Best parameters: {'regressor': Lasso(), 'regressor__alpha': 0.01}
Best Model Train R^2: 0.07
Best Model Validation R^2: 0.07
Best Model Test R^2: 0.07


#### cross-validation

In [9]:
from sklearn.model_selection import cross_validate

scores = cross_validate(model, X, y, cv=6, return_train_score=True)
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.006551,0.001,-0.017765,0.07563
1,0.006518,0.002001,0.004878,0.079373
2,0.008709,0.002118,-0.240633,0.102407
3,0.007008,0.001018,0.012162,0.063163
4,0.005558,0.001013,-0.013317,0.080448
5,0.006617,0.0009,-0.112667,0.056029
