In [220]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import keras_tuner as kt
import ast
import numpy as np

This is the preprocessed dataset from phase 1.

In [221]:
data = pd.read_csv('dataset.csv')
data.head(5)

Unnamed: 0,track_name,album_name,artists,explicit,key,mode,time_signature,danceability,energy,loudness,...,week_of_year,day,day_of_week,day_of_year,duration_mins,duration_ms,artist_count,genre_count,track_genre,popularity
0,Ode To The Mets,The New Abnormal,['The Strokes'],False,1,0,4.0,0.428,0.617,-5.424,...,15,10,Friday,101,5.863117,351787,1,2,"['alt-rock', 'garage']",67
1,Glaciers,Bloom,['Lights & Motion'],False,7,1,4.0,0.0789,0.16,-18.144,...,5,2,Friday,33,2.93955,176373,1,1,['ambient'],49
2,Uber Pussy,Pink Season,['Pink Guy'],True,7,1,4.0,0.87,0.597,-6.32,...,1,4,Wednesday,4,1.956017,117361,1,1,['comedy'],39
3,2002,Speak Your Mind (Deluxe),['Anne-Marie'],False,1,0,4.0,0.697,0.683,-2.881,...,17,27,Friday,117,3.11645,186987,1,3,"['dance', 'house', 'pop']",82
4,Monsters You Made (feat. Chris Martin),Twice As Tall,"['Burna Boy', 'Chris Martin']",True,7,0,4.0,0.671,0.646,-7.513,...,33,13,Thursday,226,3.625683,217541,2,1,['dancehall'],45


In [222]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4724 entries, 0 to 4723
Data columns (total 36 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_name        4724 non-null   object 
 1   album_name        4724 non-null   object 
 2   artists           4724 non-null   object 
 3   explicit          4724 non-null   bool   
 4   key               4724 non-null   int64  
 5   mode              4724 non-null   int64  
 6   time_signature    4724 non-null   float64
 7   danceability      4724 non-null   float64
 8   energy            4724 non-null   float64
 9   loudness          4724 non-null   float64
 10  speechiness       4724 non-null   float64
 11  acousticness      4724 non-null   float64
 12  instrumentalness  4724 non-null   float64
 13  liveness          4724 non-null   float64
 14  valence           4724 non-null   float64
 15  tempo             4724 non-null   float64
 16  track_id          4724 non-null   object 


Remove ID columns because they are not useful for the model.

In [223]:
data = data.drop(['track_id', 'album_id', 'artist_ids'], axis=1)

In [224]:
non_numeric_cols = data.select_dtypes(exclude=['int', 'float', 'bool']).columns
for col in non_numeric_cols:
    print(f'{col}: {data[col].dtype}')

track_name: object
album_name: object
artists: object
release_date: object
day_of_week: object
track_genre: object


As we extracted the date information, we can remove the original date columns.

In [225]:
data = data.drop(['release_date'], axis=1)

In [226]:
non_numeric_cols = data.select_dtypes(exclude=['int', 'float', 'bool']).columns
for col in non_numeric_cols:
    print(f'{col}: {data[col].dtype}')

track_name: object
album_name: object
artists: object
day_of_week: object
track_genre: object


In [227]:
data['track_genre'] = data['track_genre'].apply(lambda x: ast.literal_eval(x))
data['artists'] = data['artists'].apply(lambda x: ast.literal_eval(x))

In [228]:
genres = set()
for genre_list in data['track_genre']:
    for genre in genre_list:
        genres.add(genre)
print(f'Unique genres: {len(genres)}')

Unique genres: 111


In [229]:
# mlb = MultiLabelBinarizer()

# one_hot_genres = mlb.fit_transform(data['track_genre'])
# one_hot_df = pd.DataFrame(one_hot_genres, columns=mlb.classes_)
# data = pd.concat([data, one_hot_df], axis=1)
data = data.drop(columns=['track_genre'])

In [231]:
non_numeric_cols = data.select_dtypes(exclude=['int', 'float', 'bool']).columns
for col in non_numeric_cols:
    print(f'{col}: {data[col].dtype}')

track_name: object
album_name: object
artists: object
day_of_week: object


In [232]:
data = data.drop(columns=['track_name', 'album_name', 'artists', 'day_of_week'])

In [233]:
X = data.drop(columns=['popularity'])
y = data['popularity']

### Part 3: Dimensionality Reduction

**Principal Component Analysis (PCA)**

In [234]:
pca = PCA(n_components=0.9, random_state=31)
X_reduced = pca.fit_transform(X)
pca.explained_variance_ratio_
X_reduced_df = pd.DataFrame(X_reduced, columns=[f'PC{i}' for i in range(1, X_reduced.shape[1] + 1)])

# print(selected_columns)
print("Number of columns after PCA reduction:", X_reduced_df.columns.size)
print("Number of columns in the origianl Dataset:", X.columns.size)
print("Reduction Ratio", X_reduced_df.columns.size / X.columns.size, "\n")

Number of columns after PCA reduction: 1
Number of columns in the origianl Dataset: 26
Reduction Ratio 0.038461538461538464 



In [235]:
cumsum = np.cumsum(pca.explained_variance_ratio_)
cumsum[-1]
# cumsum.sum()

0.9999981989402316

In [236]:
variance_df = pd.DataFrame({
    'Principal Component': [f'PC{i}' for i in range(1, len(pca.explained_variance_) + 1)],
    'Explained Variance': pca.explained_variance_,
    'Explained Variance Ratio': pca.explained_variance_ratio_
})

# Print the DataFrame
print(variance_df)

  Principal Component  Explained Variance  Explained Variance Ratio
0                 PC1        8.237030e+09                  0.999998


In [237]:
print("Correspondnig variances of each feature with the first PCA component:")
selected_columns = X.columns[pca.components_[0].argsort()[::-1]]
for col in selected_columns:
    print(f"    - {col}: {pca.components_[0][X.columns.get_loc(col)]}")

Correspondnig variances of each feature with the first PCA component:
    - duration_ms: 0.9999999996663541
    - duration_mins: 1.6666666661105896e-05
    - tempo: 4.271553709804668e-06
    - week_of_year: 3.2466207363977965e-06
    - instrumentalness: 5.534170259199161e-07
    - time_signature: 1.620077782192154e-07
    - key: 1.1167802772071055e-07
    - liveness: 6.572747963548224e-08
    - disc_number: 3.9183034141126226e-09
    - quarter: -3.6064519225345725e-08
    - genre_count: -3.747643092294849e-08
    - speechiness: -1.3603656953052994e-07
    - acousticness: -1.3713955614105753e-07
    - danceability: -1.590149921040909e-07
    - energy: -1.6694851393590743e-07
    - mode: -2.0589034529113577e-07
    - month: -2.6014652318361355e-07
    - artist_count: -2.6734484001897374e-07
    - explicit: -3.6942340314521675e-07
    - valence: -4.932788648973499e-07
    - day: -2.914150958696777e-06
    - track_number: -5.294210180773837e-06
    - loudness: -8.002252583441901e-06
    - 

In [None]:
categorical_features = data.select_dtypes(exclude=['int', 'float']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

X_processed = preprocessor.fit_transform(X)

### Train and Test Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
non_numeric_cols = data.select_dtypes(exclude=['int', 'float', 'bool']).columns
for col in non_numeric_cols:
    print(f'{col}: {data[col].dtype}')

In [None]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer with 128 neurons
    Dropout(0.2),  # Dropout layer to prevent overfitting
    Dense(64, activation='relu'),  # Second hidden layer with 64 neurons
    Dropout(0.2),  # Dropout layer
    Dense(32, activation='relu'),  # Third hidden layer with 32 neurons
    Dense(1)  # Output layer with 1 neuron for regression
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # MSE is a common loss function for regression
              metrics=['mean_absolute_error'])  # MAE is a useful metric for regression

In [None]:
def fit_and_predict(model, X_train, y_train, X_test, y_test, epochs=20, validation_split=0.2, batch_size=10):
    history = model.fit(X_train, y_train, epochs=epochs, validation_split=validation_split, batch_size=batch_size)
    y_pred = model.predict(X_test)

    return history, y_pred

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    history, y_pred = fit_and_predict(model, X_train, y_train, X_test, y_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')

    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc='upper right')

    plt.subplot(1, 2, 2)
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
    plt.title('True vs Predicted Popularity')
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
evaluate_model(model, X_train, y_train, X_test, y_test)

In [None]:
def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_1', min_value=32, max_value=256, step=32), activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(units=hp.Int('units_2', min_value=32, max_value=256, step=32), activation='relu'))
    model.add(Dense(units=hp.Int('units_3', min_value=32, max_value=256, step=32), activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer=tf.keras.optimizers.Adam(hp.Choice('learning_rate', [1e-4, 1e-3, 1e-2])), loss='mse', metrics=['mse'])
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=3,
    directory='my_dir',
    project_name='tune_keras')

tuner.search(X_train, y_train, epochs=15, validation_split=0.2, batch_size=32)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first layer is {best_hps.get('units_1')},
the second layer is {best_hps.get('units_2')}, the third layer is {best_hps.get('units_3')}, and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

model = tuner.hypermodel.build(best_hps)

In [None]:
evaluate_model(model, X_train, y_train, X_test, y_test)