In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
f_train = open("../../data/train.json", 'r')
train_data = json.load(f_train)

In [3]:
def get_year_venue_matrix(data):
    n_samples = len(data)

    # vmatrix = np.zeros([n_samples, 466])
    matrix = []

    y = []

    for i in tqdm(range(n_samples), desc="venue"):
        
        for au in data[i]['authors']:
            if au < 100:

                venue = data[i]['venue']
                year = data[i]['year']
                y.append(au)
                
                ### Embedding
                # tmp = [0 for _ in range(486)]
                # if venue:
                #     tmp[venue] += 1
                # else:
                #     tmp[465] += 1
                # year += 466
                # tmp[year] += 1

                ## 
                if not venue:
                    venue = 465
                tmp = [year, venue]
                
                matrix.append(tmp)

    return np.array(matrix), np.array(y)

In [4]:
from sklearn.model_selection import train_test_split

X, y = get_year_venue_matrix(train_data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

print("X_train : ", X_train.shape)
print("y_train : ", y_train.shape)
print("X_test  : ", X_test.shape)
print("y_test  : ", y_test.shape)

venue: 100%|██████████| 25793/25793 [00:00<00:00, 446099.88it/s]

X_train :  (7150, 2)
y_train :  (7150,)
X_test  :  (1788, 2)
y_test  :  (1788,)





In [5]:
len(set(X[:, 1]))

323

In [6]:
from collections import Counter

def train(X, y):

    # prior probability of authors
    prior = {}                               # prior probabilities 
    counts = Counter(y)
    n_features = len(X[0])
    total = len(y)
    for label in counts.keys():
        if label not in prior:
            prior[label] = counts[label] / total
    
    # likelihood probability for year
    likelihood_year = {}                         # likelihood probabilities 
    for label in set(y):

        column = []
    
        for i in range(total):
            if label == y[i]:
                column.append(X[i][0])              
                
        likelihood_year[label] = gaussian_params(column)

    # likelihood probability for venue
    likelihood_venue = {}
    n_venue = 466
    for i in range(n_venue):                        
        likelihood_venue[i] = [0 for _ in range(100)]

    for venue in range(n_venue):
        sum_ = 0
        for i in range(total):
            
            if X[i][1] == venue:
                likelihood_venue[venue][y[i]] += 1
                sum_ +=  1

        if sum_ > 0:
            likelihood_venue[venue] = [i/sum_ for i in likelihood_venue[venue]]
        
        
    return prior, likelihood_year, likelihood_venue

def gaussian_params(array):                 # param with array, return the mean and sigma of gaussian distribution
    mean = np.mean(array)
    sigma = np.std(array)
    return [mean, sigma]

In [8]:
import scipy.stats as st

def my_predict(data, prior, likelihood_year, likelihood_venue):               # predict the class by adding the missing score 
    
    result = []

    for label in range(100):

        # prior_prob = prior[label] 
        prior_prob = 0

        mu = likelihood_year[label][0]
        sigma = likelihood_year[label][1]
        
        likelihood_prob_year = st.norm.pdf(data[0], mu, sigma)

        likelihood_prob_venue = likelihood_venue[data[1]][label]
                
        result.append(prior_prob + likelihood_prob_year + likelihood_prob_venue)

    return result

In [9]:
def my_evaluate(X, prior, likelihood_year, likelihood_venue):                # evaluate the accuracy with the effect by missing values  
    
    predicts = []
        
    for i in tqdm(range(len(X))):                       # the list of predicted pose for each instance
        predicts.append(my_predict(X[i], prior, likelihood_year, likelihood_venue))

        # pred = my_predict(X[i], y, prior, likelihood_year, likelihood_venue)
        # predicts.append(pred.index(max(pred)))

    return np.array(predicts)

In [27]:
# y_pred = my_evaluate(X_test, y_test, prior, likelihood_year, likelihood_venue)
# prior, likelihood_year, likelihood_venue = train(X_train, y_train)

prior, likelihood_year, likelihood_venue = train(X, y)
y_pred_prob_matrix = my_evaluate(X_test, prior, likelihood_year, likelihood_venue)

100%|██████████| 1788/1788 [00:22<00:00, 77.84it/s]


In [25]:
y_pred = []
for instance in y_pred_prob_matrix:
    instance = list(instance)
    y_pred.append(instance.index(max(instance)))

In [26]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# gnb = GaussianNB()

print("accuracy : ", accuracy_score(y_test, y_pred))
print("recall   : ", recall_score(y_test, y_pred, average='weighted'))
print("f1       : ", f1_score(y_test, y_pred, average='weighted'))

accuracy :  0.10514541387024609
recall   :  0.10514541387024609
f1       :  0.11757201740118793


### Kaggle Predictions

In [28]:
f_test = open("../../data/test.json", 'r')
test_data = json.load(f_test)

In [29]:
n_samples = len(test_data)
matrix = []

for i in tqdm(range(n_samples)):

    venue = test_data[i]['venue']
    year = test_data[i]['year']
    
    if not venue:
        venue = 465
    tmp = [year, venue]
    
    matrix.append(tmp)

100%|██████████| 800/800 [00:00<00:00, 471932.94it/s]


In [30]:
predict_dict = {}
for i in tqdm(range(800)):
    tmp = {}
    pred_ = my_predict(matrix[i], prior, likelihood_year, likelihood_venue)

    for j in range(100):
        tmp[j] = pred_[j]

    predict_dict[i] = tmp

100%|██████████| 800/800 [00:10<00:00, 74.04it/s]


In [31]:
with open('outputs/year_venue_naive_bayes.json', 'w') as fp:
    json.dump(predict_dict, fp)

In [33]:
predict_dict

{0: {0: 0.025694689151316938,
  1: 1.0291680092039959,
  2: 0.01986587913268543,
  3: 0.029578283119578292,
  4: 0.020024560958136185,
  5: 0.01503550718119987,
  6: 0.023882172471408825,
  7: 0.004495304434844805,
  8: 0.02106706354020333,
  9: 0.0314561004120304,
  10: 0.045420960885347646,
  11: 0.012750465005301734,
  12: 0.02223214463073519,
  13: 0.033206226874371136,
  14: 0.022786272667570735,
  15: 0.0012276493926395395,
  16: 0.02587387920194599,
  17: 0.019410662178441458,
  18: 0.020584219057140845,
  19: 0.025026606294111763,
  20: 0.04313005773221776,
  21: 0.022692878680218406,
  22: 0.015639734735175805,
  23: 0.047384138725208066,
  24: 0.01591729066898925,
  25: 0.018888396620325577,
  26: 0.023258184382815048,
  27: 0.011984420032173402,
  28: 0.015122692909050394,
  29: 0.03781978307708881,
  30: 0.03722388061895031,
  31: 0.01619877165794127,
  32: 0.01828793634436775,
  33: 0.01026188022862961,
  34: 0.04040096738870033,
  35: 0.013337348378439187,
  36: 0.0392109