# Testing Out Neural Network Regression

## Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.optimizers import Adam
from keras.layers import Dense, Dropout
from keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

## Loading in the Dataset

In [None]:
# Location of the Spotify/YouTube CSV (the two literals are joined into one URL).
csv_url = (
    'https://gist.github.com/NickyDCFP/d675b176350f6a1d54455ffc35e350f9/raw/'
    '220fdbc2aec82b8b3e33681e4465813c02b5fccf/Spotify_Youtube.csv'
)
# Read the CSV and give the unnamed leading column the name 'ID'.
csv_df = pd.read_csv(csv_url).rename(columns={'Unnamed: 0': 'ID'})
csv_df.head()

## EDA and Preprocessing ##

In [None]:
# Columns left out of the model input. For now, all non-numeric columns are dropped.
irrelevant_columns = [
    "Url_spotify", "Uri", "Url_youtube", "ID", "Artist", "Track", "Album",
    "Album_type", "Title", "Channel", "Description", 'Licensed', 'official_video',
]
df = csv_df.drop(columns=irrelevant_columns)
df.head(20)

In [None]:
# Engagement columns; these are the candidate targets rather than features.
y_cols = ['Views', 'Likes', 'Comments']
# Remove incomplete rows, then rows with zero views.
df = df.dropna()
df = df.loc[df['Views'] != 0]
# Use an explicit 64-bit integer dtype. Plain `int` can be 32 bits wide
# depending on platform / NumPy version, which would overflow for counts
# above ~2.1 billion.
num_likes = np.array(df['Likes']).astype(np.int64)
num_comments = np.array(df['Comments']).astype(np.int64)
num_views = np.array(df['Views']).astype(np.int64)

In [None]:
# Feature matrix: everything except the engagement columns.
X_df = df.drop(y_cols, axis=1)
X = np.array(X_df).astype('float32')
# Target: likes per view.
y = (num_likes / num_views).astype('float32') # Can also use comment proportion as target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the outlier detector on the training features only. It is seeded like the
# split above so the set of retained rows is the same on every run.
clf = IsolationForest(random_state=42)
clf.fit(X_train)

# Keep only the rows the detector labels 1 in each split.
y_p = clf.predict(X_train)
train_inliers = y_p == 1
X_train = X_train[train_inliers, :]
y_train = y_train[train_inliers]

y_p_test = clf.predict(X_test)
test_inliers = y_p_test == 1
X_test = X_test[test_inliers, :]
y_test = y_test[test_inliers]
print(X_train.shape)

In [None]:
# Learn the scaling statistics from the training split, then apply the same
# fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def r2(y_true, y_pred):
    """Metric function that evaluates scikit-learn's ``r2_score`` via ``tf.py_function``.

    Args:
        y_true: Ground-truth targets passed in by Keras.
        y_pred: Model predictions passed in by Keras.

    Returns:
        A ``tf.float64`` tensor holding the R^2 score of the given values.

    NOTE(review): Keras presumably calls this per batch and averages the
    results, so the reported value may differ from R^2 over the full
    dataset. Confirm before comparing against ``r2_score`` directly.
    """
    return tf.py_function(func=r2_score, inp=[y_true, y_pred], Tout=tf.float64)

## Find optimal 2 layer structure ##

In [None]:
def test_2_layer():
    """Grid-search the widths of a network with two sigmoid hidden layers.

    Each (layer1, layer2) combination is trained on ``X_train_scaled`` /
    ``y_train`` and evaluated on ``X_test_scaled`` / ``y_test``.

    Returns:
        DataFrame with one row per combination and the columns
        'Layer 1', 'Layer 2', 'R^2' and 'Loss' (test-set values).
    """
    # Rows are collected in a list. The previous version concatenated onto a
    # `res` variable that was never initialised, which raised UnboundLocalError.
    rows = []
    for layer1 in [20, 17, 15, 13, 10]:
        for layer2 in [15, 12, 8, 5]:
            model = keras.Sequential([
                Dense(layer1, activation='sigmoid', input_shape=(X_train_scaled.shape[1], )),
                Dense(layer2, activation='sigmoid'),
                Dense(1)
            ])
            optimizer = Adam(learning_rate=0.001)
            model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
            model.fit(X_train_scaled, y_train, epochs=150, batch_size=150, verbose=0)
            loss, r_2 = model.evaluate(X_test_scaled, y_test, verbose=0)
            rows.append({
                'Layer 1': layer1,
                'Layer 2': layer2,
                'R^2': r_2,
                'Loss': loss
            })
    return pd.DataFrame(rows, columns=['Layer 1', 'Layer 2', 'R^2', 'Loss'])
results = test_2_layer()
            

## Testing Optimal 3 Layer NN Structure ##

In [None]:
# Create a function to fit and predict the model with given layers
import matplotlib.pyplot as plt
def fit_and_predict(activation='sigmoid', l1=0, l2=0, l3=0):
    """Sweep the width of each hidden layer of a three-hidden-layer network in turn.

    Starting from the widths (20, 20, 5), one layer at a time is varied while
    the other two stay fixed. Every configuration is trained three times and
    the test loss / R^2 are averaged. For each swept layer a line plot of
    R^2 against width is shown and the first rows of that sweep are printed.

    Args:
        activation: Activation used by all three hidden layers.
        l1, l2, l3: Currently unused; kept so existing calls keep working.

    Returns:
        DataFrame with columns 'Layer 1', 'Layer 2', 'Layer 3', 'R^2', 'Loss'
        covering all sweeps, sorted by R^2 in descending order.
    """
    columns = ['Layer 1', 'Layer 2', 'Layer 3', 'R^2', 'Loss']
    layer1, layer2, layer3 = 20, 20, 5
    n_trials = 3
    sweeps = []
    for i in range(3):
        # Layers 1 and 2 shrink by one unit per step (20 down to 2). Layer 3 is
        # sized as 10 + diff // 2, so it needs a step of -2 to visit each width
        # 10..1 once. The old step expression `-1 + (-1 * i==2)` parsed as
        # `-1 + ((-1 * i) == 2)`, which is always -1, so every layer-3 width
        # was trained and plotted twice.
        step = -2 if i == 2 else -1
        rows = []
        for diff in range(0, -19, step):
            sizes = (
                layer1 + (i == 0) * diff,
                layer2 + (i == 1) * diff,
                (10 + diff // 2) if i == 2 else layer3,
            )
            total_loss, total_r2 = 0.0, 0.0
            for _ in range(n_trials):
                model = keras.Sequential([
                    Dense(sizes[0], activation=activation, input_shape=(X_train_scaled.shape[1], )),
                    Dense(sizes[1], activation=activation),
                    Dense(sizes[2], activation=activation),
                    Dense(1)
                ])
                optimizer = Adam(learning_rate=0.001)
                model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
                model.fit(X_train_scaled, y_train, epochs=150, batch_size=150, verbose=0)
                loss, r_2 = model.evaluate(X_test_scaled, y_test, verbose=0)
                total_loss += loss
                total_r2 += r_2
            # Average over the repeated trainings of this configuration.
            rows.append({
                'Layer 1': sizes[0],
                'Layer 2': sizes[1],
                'Layer 3': sizes[2],
                'R^2': total_r2 / n_trials,
                'Loss': total_loss / n_trials
            })
        res = pd.DataFrame(rows, columns=columns)
        res = res.sort_values(by=f'Layer {i+1}', ascending=False)
        # create a line plot with the swept layer on the x-axis and R^2 on the y-axis
        plt.plot(res[f'Layer {i+1}'], res['R^2'])
        # set the axis labels and title
        plt.xlabel(f'Layer {i+1}')
        plt.ylabel('R^2')
        plt.title(f'R^2 vs Layer {i+1}')
        # display the plot
        plt.show()
        print(res.head())
        sweeps.append(res)
    # Combine all sweeps and sort the dataframe by R^2 value
    results = pd.concat(sweeps, axis=0)
    results = results.sort_values(by='R^2', ascending=False)
    return results

results = fit_and_predict()

In [None]:
# Show the full results frame produced by the previous cell using IPython's
# display helper (later cells in this notebook reuse `display` as well).
from IPython.display import display
display(results)


## Test Regularization for each layer ##

In [None]:
def test_regularization():
    """Test L2 kernel regularization on one hidden layer at a time.

    For each of the three hidden layers of a 20-20-5 sigmoid network, the L2
    strength of that layer is varied over powers of ten from 0.001 to 1000
    while the other layers use a strength of 0. Each model is trained on the
    scaled training data and evaluated on the scaled test data. Per layer,
    R^2 is plotted against the L2 strength and the best rows are displayed.

    Returns:
        DataFrame with columns 'Layer', 'L2', 'R^2', 'Loss' for all runs,
        sorted by R^2 in descending order.
    """
    columns = ['Layer', 'L2', 'R^2', 'Loss']
    # Explicit grid, so the loop bound does not depend on accumulated
    # floating-point multiplication.
    l2_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
    per_layer = []
    for layer in range(1, 4):
        rows = []
        for l2 in l2_grid:
            # Only the layer under test gets a non-zero penalty.
            penalties = [l2 if layer == k else 0 for k in (1, 2, 3)]
            model = keras.Sequential([
                Dense(20, activation='sigmoid', kernel_regularizer=regularizers.l2(penalties[0]), input_shape=(X_train_scaled.shape[1], )),
                Dense(20, activation='sigmoid', kernel_regularizer=regularizers.l2(penalties[1])),
                Dense(5, activation='sigmoid', kernel_regularizer=regularizers.l2(penalties[2])),
                Dense(1)
            ])
            optimizer = Adam(learning_rate=0.0001) # Tried out 0.01 learning rate, not much different. 0.001 is the default. Lowering learning rate gives steady descent, nice
            model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
            model.fit(X_train_scaled, y_train,
                      epochs=150,
                      batch_size=150,
                      verbose=0)
            loss, r_2 = model.evaluate(X_test_scaled, y_test)
            rows.append({
                'Layer': layer,
                'L2': l2,
                'R^2': r_2,
                'Loss': loss
            })
            print(f'{loss}, {r_2}')
        res = pd.DataFrame(rows, columns=columns)
        plt.plot(res['L2'], res['R^2'])
        # The grid spans six orders of magnitude; a log axis keeps the small
        # values from collapsing onto the origin.
        plt.xscale('log')
        # set the axis labels and title
        plt.xlabel(f'L2 Layer {layer}')
        plt.ylabel('R^2')
        plt.title(f'R^2 vs L2 Layer {layer}')
        # display the plot
        plt.show()
        res = res.sort_values(by='R^2', ascending=False)
        display(res.head())
        per_layer.append(res)
    results = pd.concat(per_layer, axis=0)
    results = results.sort_values(by='R^2', ascending=False)
    return results
results = test_regularization()

In [None]:
# Show every per-layer L2 run returned by test_regularization(), best R^2 first.
display(results)

## Testing L2 regularization for each activation function and structure ##

In [None]:

def test_structure(struct, activation, results):
    """Train one network structure over a grid of L2 strengths and record the metrics.

    Args:
        struct: Sequence with the widths of the three hidden layers.
        activation: Activation used by all three hidden layers.
        results: DataFrame of earlier results; the new rows are appended to it.

    Returns:
        ``results`` with one additional row per L2 strength. Train values are
        the last-epoch entries of the Keras training history; test values
        come from ``model.evaluate`` on the scaled test data.

    NOTE(review): the 'Train MSE' / 'Test MSE' values are the Keras loss,
    which presumably includes the L2 penalty term. Confirm before reading
    them as pure mean squared error.
    """
    # `columns=` is required here: passing the names positionally made them
    # data rows of a single unnamed column, adding seven junk rows per call.
    columns = ['Activation', 'Structure', 'Lambda', 'Train R^2', 'Train MSE', 'Test R^2', 'Test MSE']
    lambdas = [0, 1e-7, 1e-6, 1e-3, 0.01, 0.1, 1, 2, 5, 10, 100, 1000]
    struct_label = " ".join(str(x) for x in struct)
    rows = []
    for l2 in lambdas:
        model = keras.Sequential([
            Dense(struct[0], activation=activation, kernel_regularizer=regularizers.l2(l2), input_shape=(X_train_scaled.shape[1], )),
            Dense(struct[1], activation=activation, kernel_regularizer=regularizers.l2(l2)),
            Dense(struct[2], activation=activation, kernel_regularizer=regularizers.l2(l2)),
            Dense(1)])
        optimizer = Adam(learning_rate=0.0001) # Tried out 0.01 learning rate, not much different. 0.001 is the default. Lowering learning rate gives steady descent, nice
        model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
        fit = model.fit(X_train_scaled, y_train,
                        epochs=150,
                        batch_size=150,
                        verbose=0)
        train_loss, train_r_2 = fit.history['loss'][-1], fit.history['r2'][-1]
        test_loss, test_r_2 = model.evaluate(X_test_scaled, y_test)
        rows.append({
            'Activation': activation,
            'Structure': struct_label,
            'Lambda': l2,
            'Train R^2': train_r_2,
            'Train MSE': train_loss,
            'Test R^2': test_r_2,
            'Test MSE': test_loss,
        })
    res = pd.DataFrame(rows, columns=columns)

    fig, ax = plt.subplots()

    ax.plot(res['Lambda'], res['Train R^2'], label='Train', color='green')
    ax.plot(res['Lambda'], res['Test R^2'], label='Test', color='red')
    ax.legend()
    ax.set_xlabel('Lambda')
    ax.set_ylabel('R^2')
    ax.set_title(f'R^2 vs {activation} Activation with {struct_label} Structure')
    plt.show()
    results = pd.concat([results, res], axis=0)
    return results

def test_models(activations=('sigmoid', 'relu', 'tanh'), structures=([20, 6, 5], [20, 20, 5], [20, 11, 5])):
    """Run the L2 sweep of ``test_structure`` for every structure/activation pair.

    Args:
        activations: Activation names to try (an immutable tuple by default,
            to avoid a shared mutable default argument).
        structures: Hidden-layer width triples to try.

    Returns:
        DataFrame accumulating the rows returned by ``test_structure``.
    """
    # Start from an empty frame with proper headers. Passing the names
    # positionally would create seven data rows instead of seven columns.
    results = pd.DataFrame(columns=['Activation', 'Structure', 'Lambda', 'Train R^2', 'Train MSE', 'Test R^2', 'Test MSE'])
    for struct in structures:
        for activation in activations:
            results = test_structure(struct, activation, results)
            # Progress check: the most recent rows.
            display(results.tail(5))
    display(results)
    return results

results = test_models()

## Testing Dropout Regularization ##

In [None]:

def test_structure(struct, activation, results):
    """Train one network structure over several dropout rates and record the metrics.

    A ``Dropout`` layer with the rate under test follows each of the three
    hidden layers.

    Args:
        struct: Sequence with the widths of the three hidden layers.
        activation: Activation used by all three hidden layers.
        results: DataFrame of earlier results; the new rows are appended to it.

    Returns:
        ``results`` with one additional row per dropout rate. Train values
        are the last-epoch entries of the Keras training history; test
        values come from ``model.evaluate`` on the scaled test data.

    NOTE(review): the training-history metrics are presumably computed with
    dropout active, so train and test values may not be directly comparable.
    Confirm.
    """
    # `columns=` is required here: passing the names positionally made them
    # data rows of a single unnamed column, adding seven junk rows per call.
    columns = ['Activation', 'Structure', 'Dropout', 'Train R^2', 'Train Loss', 'Test R^2', 'Test Loss']
    amts = [0.15, 0.25, 0.35, 0.5]
    struct_label = " ".join(str(x) for x in struct)
    rows = []
    for amt in amts:
        model = keras.Sequential([
            Dense(struct[0], activation=activation, input_shape=(X_train_scaled.shape[1], )),
            Dropout(amt),
            Dense(struct[1], activation=activation),
            Dropout(amt),
            Dense(struct[2], activation=activation),
            Dropout(amt),
            Dense(1)])
        optimizer = Adam(learning_rate=0.0001) # Tried out 0.01 learning rate, not much different. 0.001 is the default. Lowering learning rate gives steady descent, nice
        model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
        fit = model.fit(X_train_scaled, y_train,
                        epochs=150,
                        batch_size=150,
                        verbose=0)
        train_loss, train_r_2 = fit.history['loss'][-1], fit.history['r2'][-1]
        test_loss, test_r_2 = model.evaluate(X_test_scaled, y_test)
        rows.append({
            'Activation': activation,
            'Structure': struct_label,
            'Dropout': amt,
            'Train R^2': train_r_2,
            'Train Loss': train_loss,
            'Test R^2': test_r_2,
            'Test Loss': test_loss,
        })
    res = pd.DataFrame(rows, columns=columns)

    fig, ax = plt.subplots()

    ax.plot(res['Dropout'], res['Train R^2'], label='Train', color='green')
    ax.plot(res['Dropout'], res['Test R^2'], label='Test', color='red')
    ax.legend()
    ax.set_xlabel('Dropout')
    ax.set_ylabel('R^2')
    ax.set_title(f'R^2 vs {activation} Activation with {struct_label} Structure')
    plt.show()
    results = pd.concat([results, res], axis=0)
    return results

def test_models(activations=('sigmoid', 'relu', 'tanh'), structures=([20, 6, 5], [20, 20, 5], [20, 11, 5])):
    """Run the dropout sweep of ``test_structure`` for every structure/activation pair.

    Args:
        activations: Activation names to try (an immutable tuple by default,
            to avoid a shared mutable default argument).
        structures: Hidden-layer width triples to try.

    Returns:
        DataFrame accumulating the rows returned by ``test_structure``.
    """
    # Start from an empty frame with proper headers. Passing the names
    # positionally would create seven data rows instead of seven columns.
    results = pd.DataFrame(columns=['Activation', 'Structure', 'Dropout', 'Train R^2', 'Train Loss', 'Test R^2', 'Test Loss'])
    for struct in structures:
        for activation in activations:
            results = test_structure(struct, activation, results)
            # Progress check: the most recent rows.
            display(results.tail(5))
    display(results)
    return results

results = test_models()


In [None]:
# Rank the runs by test-set R^2, best first.
results = results.sort_values(by='Test R^2', ascending=False)
# Lift the row limit so the whole table is rendered.
pd.options.display.max_rows = None
display(results)

## Test Polynomial Feature Transformations ##

In [None]:
# Build degree-2 and degree-3 polynomial expansions of the scaled features.
transforms = []
for i in range(2, 4):
    poly = PolynomialFeatures(degree=i)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)
    transforms.append((X_train_poly, X_test_poly))

# `columns=` is required: passing the names positionally would turn them into
# seven data rows of a single unnamed column.
result_columns = ['Structure', 'Lambda', 'Train R^2', 'Train Loss', 'Test R^2', 'Test Loss', 'Degree']
# (hidden-layer widths, L2 lambda) of every network trained per expansion.
# A lambda of 0 means the layers are built without a kernel regularizer.
configs = [
    ((20, 6, 5), 1e-7),
    ((20, 6, 5), 0),
    ((20, 20, 5), 0),
    ((20, 11, 5), 0),
]
rows = []
res = pd.DataFrame(columns=result_columns)
# NOTE: `degree` is the enumerate() position (0 for degree 2, 1 for degree 3);
# the following cell adds 2 to turn it into the real polynomial degree.
for degree, transform in enumerate(transforms):
    for widths, lam in configs:
        hidden = []
        for pos, width in enumerate(widths):
            layer_kwargs = {'activation': 'sigmoid'}
            if lam:
                layer_kwargs['kernel_regularizer'] = regularizers.l2(lam)
            if pos == 0:
                # Only the first layer declares the (expanded) input width.
                layer_kwargs['input_shape'] = (transform[0].shape[1], )
            hidden.append(Dense(width, **layer_kwargs))
        model = keras.Sequential(hidden + [Dense(1)])
        optimizer = Adam(learning_rate=0.0001) # Tried out 0.01 learning rate, not much different. 0.001 is the default. Lowering learning rate gives steady descent, nice
        model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[r2])
        fit = model.fit(transform[0], y_train,
                        epochs=150,
                        batch_size=150,
                        verbose=0)
        train_loss, train_r_2 = fit.history['loss'][-1], fit.history['r2'][-1]
        test_loss, test_r_2 = model.evaluate(transform[1], y_test)
        rows.append({
            'Structure': " ".join(str(w) for w in widths),
            'Lambda': lam,
            'Train R^2': train_r_2,
            'Train Loss': train_loss,
            'Test R^2': test_r_2,
            'Test Loss': test_loss,
            'Degree': degree
        })

    # Rebuild the table after each expansion and show its latest rows.
    res = pd.DataFrame(rows, columns=result_columns)
    display(res.tail())
print(res)

In [None]:
# Keep only rows that carry a Degree value. The previous `!= 'NaN'` test
# compared against a string and therefore never removed missing values.
results = res.dropna(subset=['Degree']).sort_values(by='Test R^2', ascending=False)
# Degree was stored as the enumerate() position (0, 1); shift it to the
# actual polynomial degree (2, 3).
results['Degree'] += 2
display(results)

## See relationship between test size and R^2 ##

In [None]:
def try_test_size(X, y, test_size):
    """Train and evaluate a small regularized network for one train/test split ratio.

    The data is split with a fixed seed and standardized with a scaler fitted
    on the training part. The network has two sigmoid hidden layers of 5
    units (L1 penalty on the first, L2 on the second), followed by dropout
    of 0.5 and a single output unit.

    Args:
        X: Feature matrix.
        y: Target vector.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(train, test)`` of dicts. ``train`` holds the last-epoch
        'loss' and 'r2' from the training history plus 'test_size';
        ``test`` holds 'test_size' and the 'loss' and 'r2' from
        ``model.evaluate``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=42)
    standardizer = StandardScaler()
    X_tr_scaled = standardizer.fit_transform(X_tr)
    X_te_scaled = standardizer.transform(X_te)

    l1_lambda = 0.1
    l2_lambda = 0.1
    layers = [
        Dense(5, activation='sigmoid', kernel_regularizer=regularizers.l1(l1_lambda), input_shape=(X_tr_scaled.shape[1], )),
        Dense(5, activation='sigmoid', kernel_regularizer=regularizers.l2(l2_lambda)),
        Dropout(0.5),
        Dense(1),
    ]
    model = keras.Sequential(layers)
    # Tried out 0.01 learning rate, not much different. 0.001 is the default.
    # Lowering learning rate gives steady descent, nice
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_squared_error', metrics=[r2])
    fit = model.fit(X_tr_scaled, y_tr, epochs=150, batch_size=150, verbose=0)
    loss, r_2 = model.evaluate(X_te_scaled, y_te)

    # Reduce the per-epoch training history to its final values.
    train_summary = {
        'loss': fit.history['loss'][-1],
        'r2': fit.history['r2'][-1],
        'test_size': test_size,
    }
    test_summary = {'test_size': test_size, 'loss': loss, 'r2': r_2}
    return train_summary, test_summary

In [None]:
cols = ['test_size', 'loss', 'r2']

# Collect one metrics dict per split ratio and build each frame once. The
# second return value is no longer bound to `eval`, which shadowed the
# builtin for the rest of the notebook session.
train_rows, test_rows = [], []
for i in range(1, 50):
    test_size = i / 100  # test fractions 0.01 .. 0.49
    train_metrics, test_metrics = try_test_size(X, y, test_size)
    train_rows.append(train_metrics)
    test_rows.append(test_metrics)

training_error = pd.DataFrame(train_rows, columns=cols)
test_error = pd.DataFrame(test_rows, columns=cols)

display(training_error.head())
display(test_error.head())

In [None]:
# Training vs. test loss as a function of the test fraction.
fig, ax = plt.subplots()
ax.plot(training_error['test_size'], training_error['loss'], label='Training MSE')
ax.plot(test_error['test_size'], test_error['loss'], label='Test MSE')
ax.set_title('Neural Network MSE Across Different Test Sizes')
ax.set_xlabel('Test Size')
ax.set_ylabel('MSE')
ax.legend()
plt.show()

In [None]:
# Training vs. test R^2 as a function of the test fraction.
fig, ax = plt.subplots()
ax.plot(training_error['test_size'], training_error['r2'], label='Training R2')
ax.plot(test_error['test_size'], test_error['r2'], label='Test R2')
ax.set_title('Neural Network R2 Across Different Test Sizes')
ax.set_xlabel('Test Size')
ax.set_ylabel('R2')
ax.legend()
plt.show()