In [None]:
# data manipulation/visualization
import pandas as pd

import numpy as np
from random import randint
import os.path

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:

# Displays whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

In [None]:
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the raw dataset from the project's GitHub repository.
df_data = pd.read_csv('https://raw.githubusercontent.com/JamesBond0014/NBA_ALLSTAR_Prediction/main/ASG_data.csv')

names_and_teams = df_data[['PLAYER', 'TEAM']]

# The share of the team's games the player played in.
# Because of scheduling/trades a player's individual GP may exceed the current
# team's GP, so the ratio is capped at 1.
df_data['Play Pct.'] = (df_data['GP'] / df_data['Team GP']).clip(upper=1)

# Normalize the counting stats by the league-average pace for that year.
for col in ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '3PM']:
    df_data['Adjusted ' + col] = df_data[col] / df_data['Avg. Pace']

# Seasons listed in `test_years` are held out; all other seasons are training data.
train_data, test_data, test_years, train_years = [], [], [2020], []

# One pass over the rows: route each row to the train/test split and, at the
# same time, bucket it by season.  Both outputs stay lists of row Series so
# later cells can wrap them with pd.DataFrame(...).
data_by_year = {}
for index, row in df_data.iterrows():
    curr_year = row['Year']
    if curr_year in test_years:
        test_data.append(row)
    else:
        train_data.append(row)
    data_by_year.setdefault(curr_year, []).append(row)


In [None]:
 features_full = [
    'Adjusted PTS',
    'Adjusted REB',
    'Adjusted AST',
    'Adjusted STL',
    'Adjusted BLK',
    'Adjusted TOV',
    'Adjusted 3PM',
    'DEFWS',
    'TS%',
    'USG%',
    'PIE',
    'Play Pct.',
    'Team Conference Rank',
    'Prior ASG Appearances',
    'AS Last Year?'
]

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

In [None]:

# --- Training split: features, labels, then oversampling -----------------------
train_df = pd.DataFrame(train_data)
# 1-D label vector (the resampler flattens a column vector anyway, with a warning).
train_targets = np.array(train_df['Selected?']).ravel()
train_df = train_df.drop(columns=['Selected?'])
train_df_filtered = train_df[features_full]

# `fit_resample` is the supported name; the older `fit_sample` alias was removed
# from imbalanced-learn.  The resampled labels come back as a 1-D array.
train_df_filtered, train_targets = BorderlineSMOTE(random_state=0).fit_resample(train_df_filtered, train_targets)
train_targets = torch.from_numpy(train_targets).long()

# --- Held-out split: kept as-is (no resampling) --------------------------------
test_df = pd.DataFrame(test_data)
# Float column vector of shape (n_rows, 1).
test_targets = torch.from_numpy(np.array(test_df[['Selected?']])).float()
test_df = test_df.drop(columns=['Selected?'])
test_df_filtered = test_df[features_full]

# --- Per-season feature frames and label tensors -------------------------------
# NOTE: this overwrites the entries of `data_by_year` in place (row lists become
# feature-only DataFrames), so re-running this cell requires re-running the
# data-loading cell first.
data_by_year_tar = {}
for year in data_by_year:
    year_df = pd.DataFrame(data_by_year[year])
    # Long column vector of shape (n_rows, 1) holding that season's labels.
    data_by_year_tar[year] = torch.from_numpy(np.array(year_df[['Selected?']])).long()
    data_by_year[year] = year_df[features_full]

In [None]:
class Sparsemax(nn.Module): # from https://towardsdatascience.com/implementing-tabnet-in-pytorch-fc977c383279
    """Sparsemax activation along one dimension.

    Each slice along ``dim`` is mapped to non-negative values that sum to 1;
    unlike softmax, entries can be exactly zero.

    Args:
        dim: Dimension to normalize over. ``None`` selects the last dimension.
    """

    def __init__(self, dim=None):
        super(Sparsemax, self).__init__()
        self.dim = -1 if dim is None else dim

    def forward(self, input):
        """Apply sparsemax to ``input`` and return a tensor of the same shape.

        The 2-D working result is also cached on ``self.output`` for
        :meth:`backward`.
        """
        # Move the target axis to the front, flatten the rest, then transpose so
        # the work happens on a 2-D (rows, logits) view.
        input = input.transpose(0, self.dim)
        original_size = input.size()
        input = input.reshape(input.size(0), -1)
        input = input.transpose(0, 1)
        dim = 1

        number_of_logits = input.size(dim)

        # Shift by the row maximum (the result is invariant to this shift).
        input = input - torch.max(input, dim=dim, keepdim=True)[0].expand_as(input)
        zs = torch.sort(input=input, dim=dim, descending=True)[0]
        # 1..K rank of each sorted logit, created on the input's own device so
        # the module does not depend on a notebook-level `device` global.
        ranks = torch.arange(start=1, end=number_of_logits + 1, step=1,
                             device=input.device, dtype=input.dtype).view(1, -1)
        ranks = ranks.expand_as(zs)

        # Support size k: the largest rank whose sorted logit passes the test
        # 1 + rank * z > cumulative sum of the sorted logits.
        bound = 1 + ranks * zs
        cumulative_sum_zs = torch.cumsum(zs, dim)
        is_gt = torch.gt(bound, cumulative_sum_zs).to(input.dtype)
        k = torch.max(is_gt * ranks, dim, keepdim=True)[0]

        # Threshold tau computed from the logits inside the support.
        zs_sparse = is_gt * zs
        taus = (torch.sum(zs_sparse, dim, keepdim=True) - 1) / k
        taus = taus.expand_as(input)
        self.output = torch.max(torch.zeros_like(input), input - taus)

        # Undo the reshaping so the result matches the caller's layout.
        output = self.output
        output = output.transpose(0, 1)
        output = output.reshape(original_size)
        output = output.transpose(0, self.dim)
        return output

    def backward(self, grad_output):
        """Manual gradient of the last forward pass.

        ``grad_output`` must use the same 2-D (rows, logits) layout as
        ``self.output``. Nothing in this class wires this method into
        autograd; it only runs when called explicitly.
        """
        dim = 1
        nonzeros = torch.ne(self.output, 0)
        # Per-row mean of the incoming gradient over the non-zero outputs.
        # keepdim=True keeps the result (rows, 1) so it broadcasts per row.
        support_mean = (torch.sum(grad_output * nonzeros, dim=dim, keepdim=True)
                        / torch.sum(nonzeros, dim=dim, keepdim=True))
        self.grad_input = nonzeros * (grad_output - support_mean.expand_as(grad_output))
        return self.grad_input

In [None]:
class GBN(nn.Module):
    """Batch normalization applied independently to consecutive chunks of a batch.

    The input is split along dimension 0 into chunks and one shared
    ``nn.BatchNorm1d`` normalizes each chunk separately (ghost batch norm).

    Args:
        in_size: Number of features per row.
        batch_size: Target number of rows per chunk.
        momentum: Momentum passed to ``nn.BatchNorm1d``.
    """

    def __init__(self, in_size, batch_size=128, momentum=0.01):
        super().__init__()
        self.batch_norm = nn.BatchNorm1d(in_size, momentum = momentum)
        self.batch_size = batch_size

    def forward(self, x):
        """Normalize ``x`` chunk by chunk and return a tensor of the same shape."""
        # Inputs with fewer rows than batch_size would give 0 chunks, which
        # torch.chunk rejects; treat them as a single chunk instead.
        n_chunks = max(1, x.shape[0] // self.batch_size)
        normalized = [self.batch_norm(part) for part in x.chunk(n_chunks, 0)]
        return torch.cat(normalized, 0)

In [None]:
class AttentionTransformer(nn.Module):
    """Produces a sparse feature mask and the updated prior scales.

    Args:
        in_size: Width of the incoming tensor.
        out_size: Width of the mask (one weight per input feature).
        relaxation: Relaxation factor used when updating the prior; larger
            values let a feature be reused by later steps.
        batch_size: Chunk size forwarded to the ``GBN`` layer.
    """

    def __init__(self, in_size, out_size,relaxation, batch_size=128):
        super().__init__()
        self.linear = nn.Linear(in_size, out_size)
        self.norm = GBN(out_size, batch_size)
        self.activation = Sparsemax() # play with other ones
        self.relaxation = relaxation

    def forward(self, x, prior):
        """Return ``(mask, new_prior)``.

        ``prior`` must be broadcastable against the ``out_size``-wide mask.
        """
        x = self.linear(x)
        x = self.norm(x)
        # Scale the pre-activation by the prior so features already used by
        # earlier steps are discouraged. Without this multiplication the prior
        # is updated and returned but never influences any mask.
        x = self.activation(x * prior)
        prior = prior*(self.relaxation-x)
        return x, prior


In [None]:
class GLU(nn.Module):
    """Gated linear unit: linear -> GBN -> ``first_half * sigmoid(second_half)``.

    Args:
        in_size: Input width (used only when a new linear layer is created).
        out_size: Output width; the linear layer emits ``2 * out_size`` columns.
        linear: Optional pre-built linear layer to reuse (for weight sharing).
            A fresh ``nn.Linear(in_size, 2 * out_size)`` is created when omitted.
        batch_size: Chunk size forwarded to the ``GBN`` layer.
    """

    def __init__(self,in_size,out_size,linear=None,batch_size=128):
        super().__init__()
        # Explicit None check: a supplied layer is always honoured, whatever
        # its truthiness.
        if linear is None:
            self.linear = nn.Linear(in_size, out_size*2) #*2 for "folding"
        else:
            self.linear = linear
        self.norm = GBN(out_size*2, batch_size)

    def forward(self, x):
        """Return the gated activation, ``out_size`` columns wide."""
        x = self.norm(self.linear(x))
        # Split the columns in two: the first half carries values, the second
        # half drives the sigmoid gate.
        values, gate = x.chunk(2, dim=1)
        return values*torch.sigmoid(gate)


In [None]:
class FeatureTransformer(nn.Module):
    """Stack of GLU blocks: shared-weight blocks first, then independent ones.

    Args:
        input_size: Width of the incoming features.
        output_size: Width produced by every GLU block.
        shared_layers: Iterable of pre-built linear layers; one GLU is built
            around each so their weights can be shared with other transformers.
        ind_n: Number of additional GLU blocks with their own linear layers.
        batch_size: Chunk size forwarded to each GLU's ``GBN`` layer.
    """

    def __init__(self,input_size,output_size,shared_layers,ind_n,batch_size=128):
        super().__init__()
        self.shared = nn.ModuleList()
        self.ind= nn.ModuleList()
        for i, layer in enumerate(shared_layers):
            # Only the very first block sees the raw input width.
            in_features = input_size if i == 0 else output_size
            self.shared.append(GLU(in_features, output_size, layer, batch_size = batch_size))

        for i in range(ind_n):
            in_features = input_size if (i == 0 and not shared_layers) else output_size
            self.ind.append(GLU(in_features, output_size, batch_size = batch_size))

        # Registered as a non-persistent buffer so it follows the module through
        # .to()/.cuda()/.double() without depending on a notebook-level `device`
        # global and without adding a key to state_dict.
        self.register_buffer('scale', torch.sqrt(torch.tensor([.5])), persistent=False)

    def forward(self, x):
        """Run ``x`` through every block and scale the result by sqrt(0.5).

        The first block replaces ``x`` (its input and output widths may differ);
        every later block is added back to its input as a residual. With no
        blocks at all the input is only scaled.
        """
        for position, glu in enumerate([*self.shared, *self.ind]):
            if position == 0:
                x = glu(x)
            else:
                x = torch.add(x, glu(x))
        return x*self.scale


In [None]:
class DecisionStep(nn.Module):
    """One decision step: build a feature mask, apply it, transform the result.

    Args:
        input_size: Width of the raw feature tensor.
        attention_feature_size: Width of the tensor fed to the attention block.
        final_feature_size: Added to ``attention_feature_size`` to get the
            feature transformer's output width.
        shared: Linear layers handed to the feature transformer for sharing.
        n: Number of independent GLU blocks in the feature transformer.
        relax: Relaxation factor for the attention block's prior update.
        batch_size: Chunk size forwarded to the normalization layers.
    """

    def __init__(self, input_size, attention_feature_size, final_feature_size,
                 shared, n, relax, batch_size =128):
        super().__init__()
        transformer_width = attention_feature_size + final_feature_size
        self.feature = FeatureTransformer(input_size, transformer_width, shared,
                                          n, batch_size)
        self.attention = AttentionTransformer(attention_feature_size, input_size,
                                              relax, batch_size)

    def forward(self, x, prev, prior):
        """Return ``(transformed_features, mask_entropy, updated_prior)``.

        ``prev`` drives the attention block, ``x`` is the tensor being masked,
        and ``prior`` is passed through to the attention block.
        """
        mask, updated_prior = self.attention(prev, prior)
        # Mean of -m * log(m) over the mask; the 1e-10 keeps log() finite
        # where the mask is exactly zero.
        entropy_terms = (-1)*mask*torch.log(mask+1e-10)
        mask_entropy = entropy_terms.mean()
        transformed = self.feature(x*mask)
        return transformed, mask_entropy, updated_prior


In [None]:
class TabNet(nn.Module):
    """TabNet-style classifier built from shared/independent GLU blocks.

    Args:
        input_size: Number of input features.
        output_size: Number of output classes.
        share_n: Number of linear layers shared by all feature transformers.
        ind_n: Number of independent GLU blocks per feature transformer.
        dec_n: Number of decision steps.
        device: Device the input is moved to at the start of ``forward``.
        relax: Relaxation factor passed to every decision step.
    """

    def __init__(self, input_size, output_size, share_n=2, ind_n=2, dec_n=4,
                 device= 'cpu', relax = 3):
        super().__init__()
        self.attention_feature_size, self.final_feature_size = 64,64
        self.device = device
        attn_width = self.attention_feature_size
        final_width = self.final_feature_size
        hidden_width = attn_width + final_width

        # Linear layers reused by the initial transformer and every decision
        # step; only the first one sees the raw input width.
        self.shared_layers = nn.ModuleList([
            nn.Linear(input_size if idx == 0 else hidden_width, 2*hidden_width)
            for idx in range(share_n)
        ])
        self.feature = FeatureTransformer(input_size, hidden_width,
                                          self.shared_layers, ind_n)
        self.dec_steps = nn.ModuleList([
            DecisionStep(input_size, final_width, attn_width,
                         self.shared_layers, ind_n, relax)
            for _ in range(dec_n)
        ])
        self.linear = nn.Linear(final_width, output_size)
        self.norm = nn.BatchNorm1d(input_size)
        self.activation = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax class probabilities, one row per input row.

        The mask-entropy term from each decision step is summed locally but is
        not part of the return value.
        """
        x = self.norm(x.to(device = self.device))
        # Columns from `final_feature_size` onward seed the first attention step.
        x_prev = self.feature(x)[:,self.final_feature_size:]
        loss = torch.zeros(1, device=x.device)
        out = torch.zeros(x.size(0), self.final_feature_size, device=x.device)
        prior = torch.ones(x.shape, device=x.device)

        for step in self.dec_steps:
            x_out, step_loss, prior = step(x, x_prev, prior)
            # First half feeds the decision output; second half drives the
            # next step's attention.
            decision_part, x_prev = x_out.chunk(2, dim = 1)
            out = out + F.relu(decision_part)
            loss = loss + step_loss

        return self.activation(self.linear(out))




In [None]:
# Float tensors of the train and test feature tables, placed on `device`.
train_tensor, test_tensor = (
    torch.from_numpy(np.array(feature_table)).float().to(device = device)
    for feature_table in (train_df_filtered, test_df_filtered)
)

In [None]:
from tqdm import tqdm


In [None]:
# Display the device selected earlier in the notebook.
device

In [None]:
def loss_f(probs, target):
    """Cross-entropy between predicted class probabilities and integer labels.

    ``TabNet.forward`` already ends in a softmax, so its output is a row of
    probabilities rather than logits. ``nn.CrossEntropyLoss`` would apply a
    second (log-)softmax on top; taking the negative log-likelihood of the
    probabilities directly is the matching loss.

    Args:
        probs: Tensor of shape (batch, classes) holding class probabilities.
        target: Long tensor of shape (batch,) holding class indices.

    Returns:
        Scalar tensor with the mean negative log-likelihood.
    """
    # Clamp before log() so a probability of exactly 0 cannot produce -inf.
    return F.nll_loss(torch.log(probs.clamp_min(1e-10)), target)


In [None]:
def evaluation(model, losses):
    """Print test-set metrics for ``model`` and plot its recorded losses.

    Uses the notebook-level ``test_tensor`` / ``test_targets``. Any predicted
    class other than 0 counts as a positive prediction.

    Args:
        model: Trained network; called as ``model(test_tensor)`` and expected
            to return one row of class scores per test row.
        losses: Non-empty sequence of per-epoch (or per-fold) losses, given as
            Python floats or single-element tensors.
    """
    # Evaluate in eval mode without autograd so normalization layers use their
    # running statistics and the test data cannot update them; the previous
    # mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            res = model(test_tensor)
    finally:
        model.train(was_training)

    class_pred = torch.argmax(res,dim=1).cpu().numpy()
    # Flatten so a (n, 1) column of targets lines up element-wise with class_pred.
    class_target = test_targets.detach().cpu().numpy().ravel()

    predicted_positive = class_pred != 0
    correct = class_pred == class_target
    num_cor = int(correct.sum())
    tp = int((predicted_positive & correct).sum())
    fn = int((~predicted_positive & ~correct).sum())
    p = int(predicted_positive.sum())

    acc = num_cor/len(class_target)
    # Each ratio falls back to 0 when its denominator is empty.
    prec = tp / p if p else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    f1_score = 2*prec*rec / (prec+rec) if (prec + rec) else 0.0

    # Plain floats: graph-attached or GPU tensors cannot be plotted directly.
    loss_values = [float(loss) for loss in losses]
    min_loss = min(loss_values)

    print("Accuracy: {}".format(acc))
    print("Precision: {}".format(prec))
    print("Recall: {}".format(rec))
    print("F1_score: {}".format(f1_score))
    print("Min loss: {}".format(min_loss))
    print("{}/{} coorect".format(num_cor,len(class_target)))
    idx = list(range(0, len(loss_values)))
    plt.plot(idx, loss_values)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [None]:
# 12 shuffled folds; the fixed seed makes the fold assignment reproducible
# (same seed value as the oversampler uses).
kf = KFold(n_splits=12, shuffle = True, random_state=0)

In [None]:
# Fresh model for the K-fold run: one input per entry in features_full, 3 outputs.
tabnet_kfold = TabNet(len(features_full), 3, device=device).to(device)

In [None]:
# One validation loss (Python float) per fold.
all_loss_kfold = []
# NOTE: the same model instance keeps training across folds (it is not
# re-initialized), so rows validated in one fold are trained on in the others.
for train_idx, val_idx in kf.split(np.arange(train_tensor.shape[0])): #kfold
    # A new optimizer (fresh Adam state) for every fold.
    optimizer = optim.Adam(tabnet_kfold.parameters(),lr=0.007,weight_decay=0.00001)

    new_data_train, new_data_val = train_tensor[train_idx], train_tensor[val_idx]
    new_pred_train, new_pred_val = train_targets[train_idx], train_targets[val_idx]
    for i in tqdm(range(100)): # full-batch epochs on this fold's training rows
        optimizer.zero_grad()
        pred = tabnet_kfold(new_data_train.to(device = device))
        loss = loss_f(pred.float(), new_pred_train.to(device = device))
        loss.backward()
        optimizer.step()

    # Validate in eval mode without autograd: the held-out rows must not update
    # the normalization statistics, and no graph should be kept alive.
    tabnet_kfold.eval()
    with torch.no_grad():
        val_pred = tabnet_kfold(new_data_val.to(device = device))
        val_loss = loss_f(val_pred, new_pred_val.to(device = device))
    all_loss_kfold.append(val_loss.item())
    # Back to training mode for the next fold (and for later cells).
    tabnet_kfold.train()

In [None]:
# Test-set metrics for the K-fold model, plus a plot of its per-fold validation losses.
evaluation(tabnet_kfold, all_loss_kfold)


In [None]:
# Fresh model for the full-batch run: one input per entry in features_full, 3 outputs.
tabnet_normal = TabNet(len(features_full), 3, device=device).to(device)

In [None]:
# One training loss (Python float) per epoch.
all_loss_normal = []
# Build the optimizer once: re-creating it inside the loop discards Adam's
# running moment estimates on every step.
optimizer = optim.Adam(tabnet_normal.parameters(),lr=0.007,weight_decay=0.00001)
for i in tqdm(range(60)): # full-batch epochs over the whole training set
    optimizer.zero_grad()
    pred = tabnet_normal(train_tensor)
    loss = loss_f(pred.float(), train_targets.to(device = device))
    # .item() stores a plain float instead of a tensor that keeps its graph alive.
    all_loss_normal.append(loss.item())
    loss.backward()
    optimizer.step()

In [None]:
# Test-set metrics for the full-batch model, plus a plot of its per-epoch training losses.
evaluation(tabnet_normal, all_loss_normal)

In [None]:
# Fresh model for the mini-batch run: one input per entry in features_full, 3 outputs.
tabnet_batch = TabNet(len(features_full), 3, device=device).to(device)

In [None]:
# Mean mini-batch loss (Python float) per epoch.
all_loss_batch = []
batch_size = 7500 #mini batching
# Number of mini-batches per epoch (the name is historical); at least one, so a
# training set smaller than batch_size still trains as a single batch.
chunk_size = max(1, train_tensor.shape[0]//batch_size)
# Split once: the partition is identical in every epoch (no shuffling).
data_chunks = train_tensor.chunk(chunk_size, dim=0)
pred_chunks = train_targets.chunk(chunk_size, dim=0)
optimizer = optim.Adam(tabnet_batch.parameters(),lr=0.007,weight_decay=0.00001)

for i in tqdm(range(125)):
    batch_loss = 0.0
    for chunk, chunk_targets in zip(data_chunks, pred_chunks):
        optimizer.zero_grad()
        pred = tabnet_batch(chunk.to(device))
        loss = loss_f(pred.float(), chunk_targets.to(device = device))
        # Accumulate a float so the per-chunk autograd graphs can be freed.
        batch_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Average over the chunks actually produced by torch.chunk.
    all_loss_batch.append(batch_loss/len(data_chunks))

In [None]:
# Test-set metrics for the mini-batch model, plus a plot of its per-epoch mean losses.
evaluation(tabnet_batch, all_loss_batch)

In [None]:
import random

In [None]:
# Fresh model for the per-season run: one input per entry in features_full, 3 outputs.
tabnet_batch_year = TabNet(len(features_full), 3, device=device).to(device)

In [None]:
# Train on every season except the held-out ones (test_years is [2020]).
years_to_train_batch = [year for year in data_by_year if year not in test_years]
# The optimizer must own the parameters of the model being trained here;
# building it over tabnet_batch's parameters leaves tabnet_batch_year untouched.
optimizer = optim.Adam(tabnet_batch_year.parameters(),lr=0.007,weight_decay=0.00001)

# Convert each season to tensors once instead of once per epoch.
year_batches = []
for year in years_to_train_batch:
    training_data_year = torch.from_numpy(np.array(data_by_year[year])).float()
    # Labels are flattened to the 1-D class-index vector the loss expects.
    training_target_year = torch.as_tensor(data_by_year_tar[year]).long().reshape(-1)
    year_batches.append((training_data_year.to(device), training_target_year.to(device = device)))

# Mean per-season loss (Python float) per epoch.
all_loss_batch_year=[]
for i in tqdm(range(100)):
    batch_loss = 0.0
    # Seasons are visited in the same order every epoch (no shuffling).
    for training_data_year, training_target_year in year_batches:
        optimizer.zero_grad()
        pred = tabnet_batch_year(training_data_year)
        loss = loss_f(pred.float(), training_target_year)
        # Accumulate a float so the per-season autograd graphs can be freed.
        batch_loss += loss.item()
        loss.backward()
        optimizer.step()
    all_loss_batch_year.append(batch_loss/len(years_to_train_batch))

In [None]:
# Test-set metrics for the per-season model, plus a plot of its per-epoch mean losses.
evaluation(tabnet_batch_year, all_loss_batch_year)