In [9]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
f_train = open("../../data/train.json", 'r')
train_data = json.load(f_train)

In [62]:
def get_year_venue_matrix(data):
    n_samples = len(data)

    # vmatrix = np.zeros([n_samples, 466])
    matrix = []

    y = []

    for i in tqdm(range(n_samples), desc="venue"):
        
        for au in data[i]['authors']:
            if au < 100:

                venue = data[i]['venue']
                year = data[i]['year']
                y.append(au)
                
                tmp = [0 for _ in range(486)]
                if venue:
                    tmp[venue] += 1
                else:
                    tmp[465] += 1
                year += 466
                tmp[year] += 1

                # if not venue:
                #     venue = -1
                # tmp = [year, venue]
                
                matrix.append(tmp)

    return np.array(matrix), np.array(y)

In [63]:
from sklearn.model_selection import train_test_split

X, y = get_year_venue_matrix(train_data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

print(X.shape)

venue: 100%|██████████| 25793/25793 [00:00<00:00, 43242.53it/s]


In [65]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# gnb = GaussianNB()
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("accuracy : ", accuracy_score(y_test, y_pred))
print("recall   : ", recall_score(y_test, y_pred, average='weighted'))
print("f1       : ", f1_score(y_test, y_pred, average='weighted'))

accuracy :  0.053131991051454136
recall   :  0.053131991051454136
f1       :  0.04576998656156419


In [58]:
from collections import Counter

def train(X, y):
    prior = {}                               # prior probabilities of each pose
    counts = Counter(y)
    n_features = len(X[0])
    total = len(y)
    for label in counts.keys():
        if label not in prior:
            prior[label] = counts[label] / total
    
    likelihood = {}                         # likelihood probabilities of each pose
    for laebl in set(y):
        if not label in likelihood:
            likelihood[label] = []

        data = []

        for j in range(n_features):

            column = []
        
            for i in range(total):
                if label == y[i]:
                    column.append(X[i][j])              # and add the elements to a list in order to get parameters of gaussian
            
            data.append(column)
            
        for i in range(len(data)):
            likelihood[label].append(gaussian_params(data[i]))
        
    return prior, likelihood 

def gaussian_params(array):                 # param with array, return the mean and sigma of gaussian distribution
    mean = np.mean(array)
    sigma = np.std(array)
    return [mean, sigma]

In [59]:
prior, likelihood = train(X_train, y_train)

In [47]:
import scipy.stats as st
from collections import Counter
def get_mprob(X, y):           # get the prior probability and likelihood probabilities of each attributes with missing values
    mprior = {}                # prior probabilities of missing value of each attribute
    count = Counter(y)
    attrs = len(X[0])
    total = len(y)
    alpha = 1                  # the coefficient that smooth the probability
    
    for name in count.keys():
        if name not in mprior:
            mprior[name] = count[name] / total
    
    mlikelihood = {}
    for name in set(y):
        if not name in mlikelihood:
            mlikelihood[name] = []
        data = []
        for j in range(attrs):
            missing = 0
            for i in range(total):
                if name == y[i] and X[i][j] == 9999:    
                    missing += 1
            
            mlikelihood[name].append((missing+alpha)/(count[name]+alpha*attrs))   # laplace smoothing
        
    return [mprior, mlikelihood] 
    

def get_mscore(data, pred_class, miss_probs):
    mprior, mlikelihood = miss_probs
    
    sum = 0
    for i in range(len(data)):
        if data[i] == 9999:
            sum += np.log(mlikelihood[pred_class][i])
        else:
            sum += np.log(1-mlikelihood[pred_class][i])
        
    score = np.log(mprior[pred_class]) + sum
    return score

    
    
def my_predict(data, y, miss_probs):               # predict the class by adding the missing score 
    result = {}
    for pose in set(y):
        if not pose in result:
            result[pose] = 0
        prior_prob = prior[pose]
        sum = 0
        for i in range(len(data)):
            
            if data[i] == 9999:                 
                sum += 0                           # add 0 to the sum when meet 9999
            else:
                likelihood_prob = st.norm.pdf(data[i], likelihood[pose][i][0], likelihood[pose][i][1])
                sum += np.log(likelihood_prob)     # use np.log() function with likelihood probabilities
                
        result[pose] = np.log(prior_prob) + sum
        result[pose] += get_mscore(data, pose, miss_prob)     # add the missing score
    
    pred_class = 0
    value = -999999
    for i in result.keys():        # find out the class with highest score, and that's the predicted class
        if result[i] > value:
            value = result[i]
            pred_class = i
    
    
    
    return pred_class


def my_evaluate(X, y, miss_prob):                # evaluate the accuracy with the effect by missing values  
    
    total = len(y)
    correct = 0
    predict_dict = {}
    pred_pose = []
    
    
    for i in range(len(y)):                       # the list of predicted pose for each instance
        pred_pose.append(my_predict(X[i], y, miss_prob))
    
    for pose in set(y):
        predict_dict[pose] = [0, 0, 0, 0]         # truth table, with the order of [tp,fn,fp,tn]
        
        for i in range(len(y)):
            pred_class = pred_pose[i]
            if y[i] == pose:                      # collect the data and fill the table
                if pred_class == pose:
                    predict_dict[pose][0] += 1
                else:
                    predict_dict[pose][1] += 1
            else:
                if pred_class == pose:
                    predict_dict[pose][2] += 1
                else:
                    predict_dict[pose][3] += 1
        correct += predict_dict[pose][0]
    
    accuracy = correct/total                      # accuracy = tp / (tp+fn+fp+tn)
    return accuracy, predict_dict                 # return accuracy and the table