In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
f_train = open("../../data/train.json", 'r')
train_data = json.load(f_train)

In [84]:
def get_year_venue_matrix(data):
    n_samples = len(data)

    # vmatrix = np.zeros([n_samples, 466])
    matrix = []

    y = []

    for i in tqdm(range(n_samples), desc="venue"):
        
        for au in data[i]['authors']:
            if au < 100:

                venue = data[i]['venue']
                year = data[i]['year']
                y.append(au)
                
                ### Embedding
                # tmp = [0 for _ in range(486)]
                # if venue:
                #     tmp[venue] += 1
                # else:
                #     tmp[465] += 1
                # year += 466
                # tmp[year] += 1

                ## 
                if not venue:
                    venue = -1
                tmp = [year, venue]
                
                matrix.append(tmp)

    return np.array(matrix), np.array(y)

In [85]:
from sklearn.model_selection import train_test_split

X, y = get_year_venue_matrix(train_data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

print(X.shape)

venue: 100%|██████████| 25793/25793 [00:01<00:00, 22018.82it/s]


(8938, 2)


In [65]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# gnb = GaussianNB()
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("accuracy : ", accuracy_score(y_test, y_pred))
print("recall   : ", recall_score(y_test, y_pred, average='weighted'))
print("f1       : ", f1_score(y_test, y_pred, average='weighted'))

accuracy :  0.053131991051454136
recall   :  0.053131991051454136
f1       :  0.04576998656156419


In [86]:
from collections import Counter

def train(X, y):
    prior = {}                               # prior probabilities of each pose
    counts = Counter(y)
    n_features = len(X[0])
    total = len(y)
    for label in counts.keys():
        if label not in prior:
            prior[label] = counts[label] / total
    
    likelihood = {}                         # likelihood probabilities of each pose
    for label in set(y):
        if not label in likelihood:
            likelihood[label] = []

        data = []

        for j in range(n_features):

            column = []
        
            for i in range(total):
                if label == y[i]:
                    column.append(X[i][j])              # and add the elements to a list in order to get parameters of gaussian
            
            data.append(column)
            
        for i in range(len(data)):
            likelihood[label].append(gaussian_params(data[i]))
        
    return prior, likelihood 

def gaussian_params(array):                 # param with array, return the mean and sigma of gaussian distribution
    mean = np.mean(array)
    sigma = np.std(array)
    return [mean, sigma]

In [87]:
prior, likelihood = train(X_train, y_train)

In [100]:
import scipy.stats as st

def my_predict(data, y, prior, likelihood):               # predict the class by adding the missing score 
    
    result = []

    for label in set(y):
        
        prior_prob = prior[label] * 0.1

        sum_ = 0
        
        for i in range(len(data)):    

            mu = likelihood[label][i][0]
            sigma = likelihood[label][i][1]

            if mu == 0 and sigma == 0 :
                likelihood_prob = 0
            
            else:      
                likelihood_prob = st.norm.pdf(data[i], mu, sigma)
            
            sum_ += likelihood_prob     # use np.log() function with likelihood probabilities
                
        result.append(prior_prob + sum_)

    return result

In [89]:
def my_evaluate(X, y, prior, likelihood):                # evaluate the accuracy with the effect by missing values  
    
    predicts = []
        
    for i in tqdm(range(len(X))):                       # the list of predicted pose for each instance
        predicts.append(my_predict(X[i], y, prior, likelihood))
    
    return np.array(predicts)

In [90]:
predicts = my_evaluate(X_test, y_test, prior, likelihood)

100%|██████████| 1788/1788 [00:51<00:00, 34.83it/s]


In [101]:
l = my_predict(X_test[0], y, prior, likelihood)
l.index(max(l))

40

In [103]:
l

[0.03019577802192385,
 0.0072389740535143,
 0.031925904769104556,
 0.0018620134601592202,
 0.04058531475856653,
 0.03609510321061575,
 0.018577355840466853,
 0.013752285635829368,
 0.016275935620782778,
 0.00812956432683716,
 0.008507850548292029,
 0.019146915872570804,
 0.007431815668093956,
 0.005733969000292837,
 0.011050404910773842,
 0.04724343939907517,
 0.02230581595676122,
 0.033162062811790585,
 0.00497744093926355,
 0.010358674372418684,
 0.002580196141381412,
 0.010998298929233574,
 0.01990299389385374,
 0.00591997933062686,
 0.0293841656556474,
 0.0372772307223642,
 0.021740449360902678,
 0.04208065626374888,
 0.03075215770543327,
 0.004400089167551947,
 0.014030946484343481,
 0.0092037447093423,
 0.014470719441639248,
 0.032423668154348564,
 0.016887888151446803,
 0.020813110617435994,
 0.0018735990868219941,
 0.00519461866298445,
 0.00600540421331284,
 0.014271979350571017,
 0.10289431523663482,
 0.03288147284049067,
 0.003281229679365902,
 0.0011853296763104505,
 0.00089