In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import linear_model
import torch
from scipy.special import softmax 
from sklearn.preprocessing import StandardScaler

import pyro
import pyro.distributions as dist
from pyro.contrib.autoguide import AutoDiagonalNormal, AutoMultivariateNormal
from pyro.infer import MCMC, NUTS, HMC, SVI, Trace_ELBO
from pyro.optim import Adam, ClippedAdam
from pyro.infer import Predictive

  from .autonotebook import tqdm as notebook_tqdm


### Data Processing

In [2]:
scaler = StandardScaler()

In [3]:
#Use your own local path please!
path = "./Data/"
file_name = ["train.csv", "test.csv", "submission.csv"]
df_train = pd.read_csv(path + file_name[0])
df_test = pd.read_csv(path + file_name[1])
df_submission = pd.read_csv(path + file_name[2])

In [4]:
df_train['key'] = df_train['key'].fillna(df_train['key'].mean())

In [5]:
df_train['instrumentalness'] = df_train['instrumentalness'].fillna(df_train['instrumentalness'].mean())

In [6]:
df_train['Popularity'] = df_train['Popularity'].fillna(df_train['Popularity'].mean())

In [7]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [8]:
df_features = df_train[['Popularity', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_in min/ms', 'time_signature']].copy()
df_features[df_features.columns] = scaler.fit_transform(df_features)

df_train_target = df_train[['Class']].sample(1000)
df_train_target_index = df_train_target.index
df_train_features = df_features.iloc[df_train_target_index].to_numpy()
df_train_target = df_train_target.to_numpy()

df_features_new = df_features
df_features_new[['Class']] = df_train[['Class']]
# df_features_new

df_rest_features = df_features_new.drop(index = df_train_target_index).reset_index()
# df_rest_target = df_train_target.iloc[~df_train_target_index].reset_index()
df_rest_features = df_rest_features.drop(columns = {df_rest_features.columns[0]})
df_test_target = df_rest_features[['Class']].sample(200)
df_test_target_index = df_test_target.index

df_test_features = df_rest_features.iloc[df_test_target_index]
df_test_features = df_test_features.drop(columns={'Class'}).to_numpy()

df_test_target = df_test_target.to_numpy()


# df_rest_features

In [9]:
df_rest_02 = df_rest_features.drop(index = df_test_target_index).reset_index()
df_rest_02 = df_rest_02.drop(columns = {df_rest_02.columns[0]})
df_val_target = df_rest_02[['Class']].sample(200)
df_val_target_index = df_val_target.index

df_val_features = df_rest_02.iloc[df_val_target_index]
df_val_features = df_val_features.drop(columns={'Class'}).to_numpy()

df_val_target = df_val_target.to_numpy()
df_val_features.shape, df_val_target.shape

((200, 14), (200, 1))

In [10]:
D = df_train_features.shape[1]
N_train = df_train_features.shape[0]
N_test = df_test_features.shape[0]
N_val = df_val_features.shape[0]
n_cat = 11 
degF=5
tau=10

### Ancestral Sampling 

In [11]:
# sample coefficients (beta)
beta = np.random.normal(0,1,size=D)
print("beta:", beta)

beta_array = np.zeros((n_cat,D))

for i in range(n_cat):
    
    beta_array[i,:] = np.random.normal(0,1,size=D)
    
# sample observations (y's)
y = np.zeros((N_train,n_cat))
for n in range(N_train):
    
    probs = np.zeros(n_cat)
    for i in range(n_cat):
        probs[i] = np.array([(np.dot(beta_array[i,:], df_train_features[n,:]))])
        
    p =  softmax(probs)
    y[n,:] = np.random.multinomial(1, p)  #binomial with one trial is equivalent to bernoulli
    #y[n] = int(p > 0.5) # alternative version without observation noise
    #print(" p=, y[n]="  (n, p, y[n,:]))
    #print('n, p and y ', n, p, y[n,:])

beta: [-1.1131725   1.88309417  2.48329262 -1.76659086 -0.00330663 -0.1915277
  0.61044673 -0.36818506  0.95252189  0.67795511 -0.6937733  -0.38363578
 -0.82296093  0.23970234]


In [12]:
label_array = np.array([]) 
for i in y:
    label_array = np.append(label_array,np.argmax(i))


In [13]:
from collections import Counter

counts = Counter(label_array)
counts

Counter({2.0: 87,
         7.0: 227,
         5.0: 104,
         8.0: 123,
         6.0: 47,
         3.0: 74,
         1.0: 103,
         10.0: 88,
         0.0: 73,
         9.0: 47,
         4.0: 27})

In [14]:
print("Accuracy:", 1.0*np.sum(label_array == df_train_target.flatten()) / len(df_train_target))

Accuracy: 0.073


### Model implementation with Pyro

In [16]:
def model(X, n_cat, degF, tau, obs=None):
    
    input_dim = X.shape[1]
    
    
    mu_beta = pyro.sample("mu_beta", dist.StudentT(df=torch.ones(n_cat)*degF, 
                                                   loc=torch.zeros(n_cat), 
                                                   scale=torch.ones(n_cat)).to_event()) # Prior for the bias mean      
    sigma_beta  = pyro.sample("sigma_beta",  dist.HalfCauchy(tau*torch.ones(n_cat)).to_event()) # Prior for the bias standard deviation
    
    beta  = pyro.sample("beta", dist.Normal(mu_beta*torch.ones(n_cat), 
                                            sigma_beta*torch.ones(input_dim, n_cat)).to_event()) # Priors for the regression coefficents

    alpha = pyro.sample("alpha", dist.Normal(torch.zeros(1, n_cat), 
                                             5.*torch.ones(1, n_cat)).to_event())  # Prior for the bias/intercept
    
    
    with pyro.plate("data"):
        y = pyro.sample("y", dist.Categorical(logits=alpha + X.matmul(beta)), obs=obs)
        
    return y

In [17]:
X_train = torch.tensor(df_train_features).float()
y_train = torch.tensor(df_train_target.flatten()).float()

#### Final Model training 

In [18]:
degF=5
tau=1

In [20]:
# Define guide function
guide = AutoMultivariateNormal(model)

# Reset parameter values
pyro.clear_param_store()

# Define the number of optimization steps
n_steps = 10000

learning_rates = [0.0001,0.001,0.01]
# Setup the optimizer
acc_val_lr = []
for lr in learning_rates:

    adam_params = {"lr": lr}
    optimizer = ClippedAdam(adam_params)

    # Setup the inference algorithm
    elbo = Trace_ELBO(num_particles=1)
    svi = SVI(model, guide, optimizer, loss=elbo)

    # Do gradient steps
    for step in range(n_steps):
        elbo = svi.step(X_train, n_cat, degF, tau, y_train)
        if step % 1000 == 0:
            print("[%d] ELBO: %.1f" % (step, elbo))

            #ef model(X, n_cat, degF, tau, obs=None):
    predictive = Predictive(model, guide=guide, num_samples=2000,
                    return_sites=("alpha", "beta"))
    samples = predictive(X_train, n_cat, degF, tau, y_train)
    
    
    samples_alpha = samples["alpha"].detach().squeeze()
    samples_beta = samples["beta"].detach().squeeze()
    
    
    mean_betas = samples_beta.mean(axis=0)
    mean_betas = mean_betas.T
    mean_alpha = samples_alpha.mean(axis=0)
    mean_alpha = mean_alpha.T
    
    
    y_val_pred = np.zeros((N_val,n_cat))
    
    for n in range(N_val):

        probs = np.zeros(n_cat)
        for i in range(n_cat):
            probs[i] = np.array([mean_alpha[i]+(np.dot(mean_betas[i,:], df_val_features[n,:]))])

        p =  softmax(probs)
        y_val_pred[n,:] = np.argmax(p)  #binomial with one trial is equivalent to bernoulli
        
    y_val_pred = y_val_pred[:,0]
            
    acc = 1.0*np.sum(y_val_pred == df_val_target.flatten()) / len(df_val_target)
    print({lr:acc})
    acc_val_lr.append({lr:acc})

[0] ELBO: 4433.5
[1000] ELBO: 3841.0
[2000] ELBO: 3244.7
[3000] ELBO: 2886.3
[4000] ELBO: 2615.0
[5000] ELBO: 2443.0
[6000] ELBO: 2225.8
[7000] ELBO: 2168.9
[8000] ELBO: 2111.7
[9000] ELBO: 1988.7
{0.0001: 0.445}
[0] ELBO: 1931.3
[1000] ELBO: 1730.5
[2000] ELBO: 1682.8
[3000] ELBO: 1661.3
[4000] ELBO: 1656.7
[5000] ELBO: 1644.1
[6000] ELBO: 1638.6
[7000] ELBO: 1640.4
[8000] ELBO: 1634.9
[9000] ELBO: 1634.4
{0.001: 0.475}
[0] ELBO: 1631.6
[1000] ELBO: 1690.9
[2000] ELBO: 1737.2
[3000] ELBO: 1690.1
[4000] ELBO: 1715.4
[5000] ELBO: 1690.9
[6000] ELBO: 1697.8
[7000] ELBO: 1694.6
[8000] ELBO: 1691.5
[9000] ELBO: 1732.4
{0.01: 0.47}


In [36]:
for p in acc_val_lr:

    lr = list(p.keys())[0]
    acc = list(p.values())[0]

    print(f'The accuracy value for learning rate {lr} is {acc}')

The accuracy value for learning rate 0.0001 is 0.445
The accuracy value for learning rate 0.001 is 0.475
The accuracy value for learning rate 0.01 is 0.47
