In [92]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [93]:
f_train = open("../../data/train.json", 'r')
train_data = json.load(f_train)

In [94]:
def get_year_venue_matrix(data):
    n_samples = len(data)

    # vmatrix = np.zeros([n_samples, 466])
    matrix = []

    y = []

    for i in tqdm(range(n_samples), desc="venue"):
        
        for au in data[i]['authors']:
            if au < 100:

                venue = data[i]['venue']
                year = data[i]['year']
                y.append(au)
                
                ### Embedding
                # tmp = [0 for _ in range(486)]
                # if venue:
                #     tmp[venue] += 1
                # else:
                #     tmp[465] += 1
                # year += 466
                # tmp[year] += 1

                ## 
                if not venue:
                    venue = 465
                tmp = [year, venue]
                
                matrix.append(tmp)

    return np.array(matrix), np.array(y)

In [95]:
from sklearn.model_selection import train_test_split

X, y = get_year_venue_matrix(train_data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

print("X_train : ", X_train.shape)
print("y_train : ", y_train.shape)
print("X_test  : ", X_test.shape)
print("y_test  : ", y_test.shape)

venue: 100%|██████████| 25793/25793 [00:00<00:00, 671014.76it/s]

X_train :  (7150, 2)
y_train :  (7150,)
X_test  :  (1788, 2)
y_test  :  (1788,)





In [96]:
len(set(X[:, 1]))

323

In [97]:
from collections import Counter

def train(X, y):

    # prior probability of authors
    prior = {}                               # prior probabilities 
    counts = Counter(y)
    n_features = len(X[0])
    total = len(y)
    for label in counts.keys():
        if label not in prior:
            prior[label] = counts[label] / total
    
    # likelihood probability for year
    likelihood_year = {}                         # likelihood probabilities 
    for label in set(y):

        column = []
    
        for i in range(total):
            if label == y[i]:
                column.append(X[i][0])              
                
        likelihood_year[label] = gaussian_params(column)

    # likelihood probability for venue
    likelihood_venue = {}
    n_venue = 466
    for i in range(n_venue):                        
        likelihood_venue[i] = [0 for _ in range(100)]

    for venue in range(n_venue):
        sum_ = 0
        for i in range(total):
            
            if X[i][1] == venue:
                likelihood_venue[venue][y[i]] += 1
                sum_ +=  1

        if sum_ > 0:
            likelihood_venue[venue] = [i/sum_ for i in likelihood_venue[venue]]
        
        
    return prior, likelihood_year, likelihood_venue

def gaussian_params(array):                 # param with array, return the mean and sigma of gaussian distribution
    mean = np.mean(array)
    sigma = np.std(array)
    return [mean, sigma]

In [98]:
# prior, likelihood_year, likelihood_venue = train(X_train, y_train)

prior, likelihood_year, likelihood_venue = train(X, y)

In [85]:
import scipy.stats as st

def my_predict(data, prior, likelihood_year, likelihood_venue):               # predict the class by adding the missing score 
    
    result = []

    for label in range(100):

        # prior_prob = prior[label] 
        prior_prob = 0

        mu = likelihood_year[label][0]
        sigma = likelihood_year[label][1]
        
        likelihood_prob_year = st.norm.pdf(data[0], mu, sigma)

        likelihood_prob_venue = likelihood_venue[data[1]][label]
                
        result.append(prior_prob + likelihood_prob_year + likelihood_prob_venue)

    return result

In [86]:
def my_evaluate(X, prior, likelihood_year, likelihood_venue):                # evaluate the accuracy with the effect by missing values  
    
    predicts = []
        
    for i in tqdm(range(len(X))):                       # the list of predicted pose for each instance
        predicts.append(my_predict(X[i], prior, likelihood_year, likelihood_venue))

        # pred = my_predict(X[i], y, prior, likelihood_year, likelihood_venue)
        # predicts.append(pred.index(max(pred)))

    return np.array(predicts)

In [80]:
# y_pred = my_evaluate(X_test, y_test, prior, likelihood_year, likelihood_venue)
y_pred = my_evaluate(X_test, y_test, prior, likelihood_year, likelihood_venue)

100%|██████████| 1788/1788 [00:27<00:00, 64.21it/s]


In [77]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# gnb = GaussianNB()

print("accuracy : ", accuracy_score(y_train, y_pred))
print("recall   : ", recall_score(y_train, y_pred, average='weighted'))
print("f1       : ", f1_score(y_train, y_pred, average='weighted'))

accuracy :  0.1158041958041958
recall   :  0.1158041958041958
f1       :  0.11904598956054079


### Kaggle Predictions

In [99]:
f_test = open("../../data/test.json", 'r')
test_data = json.load(f_test)

In [100]:
n_samples = len(test_data)
matrix = []

for i in tqdm(range(n_samples)):

    venue = test_data[i]['venue']
    year = test_data[i]['year']
    
    if not venue:
        venue = 465
    tmp = [year, venue]
    
    matrix.append(tmp)

100%|██████████| 800/800 [00:00<00:00, 619657.10it/s]


In [101]:
predict_dict = {}
for i in tqdm(range(800)):
    tmp = {}
    pred_ = my_predict(matrix[i], prior, likelihood_year, likelihood_venue)

    for j in range(100):
        tmp[j] = pred_[j]

    predict_dict[i] = tmp

100%|██████████| 800/800 [00:10<00:00, 73.09it/s]


In [102]:
with open('outputs/year_venue_naive_bayes.json', 'w') as fp:
    json.dump(predict_dict, fp)