In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import loadmat
from scipy import stats

In [2]:
def multivariateGausssian(X, mu, sigma):
    # X:(m, n)
    m, n = X.shape
    if(np.ndim(sigma)==1):
        sigma = np.diag(sigma)
    left = 1.0 / (np.power(2 * np.pi, n/2) * np.sqrt(np.linalg.det(sigma)))
    right = np.zeros((m, 1))
    for row in range(m):
        x = X[row]
        right[row] = np.exp(-0.5 * (x-mu).T @ np.linalg.inv(sigma) @ (x-mu))
    return left*right

In [3]:
def estimateGaussian(X):
    mu = X.mean(axis=0)
    sigma = X.var(axis=0)
    return mu, sigma

In [4]:
def selectThreshold(pval, yval):
    best_e, best_f1, f1 = 0, 0, 0
    step = (pval.max() - pval.min()) / 1000
    for e in np.arange(pval.min(), pval.max(), step):
        preds = pval < e
        tp = np.sum(np.logical_and(preds==1, yval==1))
        fp = np.sum(np.logical_and(preds==1, yval==0))
        fn = np.sum(np.logical_and(preds==0, yval==1))
        tn = np.sum(np.logical_and(preds==0, yval==0))
        precision = tp / (tp + fp)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)
        if(f1 > best_f1):
            best_f1 = f1
            best_e = e
    return best_e, best_f1

def selectThreshold_by_accuracy(pval, yval):
    best_e, best_accu, f1 = 0, 0, 0
    step = (pval.max() - pval.min()) / 1000
    for e in np.arange(pval.min(), pval.max(), step):
        preds = pval < e
        tp = np.sum(np.logical_and(preds==1, yval==1))
        fp = np.sum(np.logical_and(preds==1, yval==0))
        fn = np.sum(np.logical_and(preds==0, yval==1))
        tn = np.sum(np.logical_and(preds==0, yval==0))
        precision = tp / (tp + fp)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        if(accuracy > best_accu):
            best_accu = accuracy
            best_e = e
    return best_e, best_accu

def selectThreshold_by_precision(pval, yval):
    best_e, best_pre, f1 = 0, 0, 0
    step = (pval.max() - pval.min()) / 1000
    for e in np.arange(pval.min(), pval.max(), step):
        preds = pval < e
        tp = np.sum(np.logical_and(preds==1, yval==1))
        fp = np.sum(np.logical_and(preds==1, yval==0))
        fn = np.sum(np.logical_and(preds==0, yval==1))
        tn = np.sum(np.logical_and(preds==0, yval==0))
        precision = tp / (tp + fp)
        if(precision > best_pre):
            best_pre = precision
            best_e = e
    return best_e, best_pre

In [5]:
df = pd.read_excel("./data/rpad_train.xlsx")
df_test = pd.read_excel("./data/rpad_test.xlsx")

df.head()

Unnamed: 0.1,Unnamed: 0,index,pubpeer_id,title,comments_total,journals,published_time,author_1,author_1_id,link,...,situation,rawcommentstext,comments,my_prediction_NEU,my_prediction_POS,my_prediction_NEG,total_comment,author_reply,author_active,reword
0,0,0,002C71700B7E707DE2FE75C198BAC7,Antileukemic effect of zerumbone-loaded nanost...,2,International Journal of Nanomedicine,2015,Heshu Sulaiman Sulaiman Rahman,29926779.0,/publications/002C71700B7E707DE2FE75C198BAC7#2,...,1,[' Images in Figure 8 seem to show some repeat...,0.988941,0.641259,0.007117,0.351624,3,0,0.0,45.7902
1,1,1,003FFE17AF3EFB7D5486389615AD3E,Homeopathy combat against coronavirus disease ...,12,Zeitschrift fur Gesundheitswissenschaften = Jo...,2020,D. Kalliantas,22556612.0,/publications/003FFE17AF3EFB7D5486389615AD3E#12,...,1,[' This is a totally unscientific piece (I can...,0.998431,0.081617,0.003545,0.914837,12,3,0.25,48.269174
2,2,2,00442961D0994CC6C186D8C1558B09,Long noncoding RNA SNHG22 increases ZEB1 expre...,2,"Cell cycle (Georgetown, Tex.)",2020,Hong Gao,11188776.0,/publications/00442961D0994CC6C186D8C1558B09#2,...,1,"[' Figs 2c and 5b,g. ![file] The turquoise ...",0.994018,0.97479,0.022007,0.003203,5,0,0.0,47.403562
3,3,3,005D80C447F09C2008B6397015BE9F,Litchi Seed Aqueous Extracts play a role in su...,2,Cell Cycle,2020,Yanling Ma,3664166.0,/publications/005D80C447F09C2008B6397015BE9F#2,...,1,[' \\\\ Figure 3. Morphology of MCF-7 cell in ...,0.995968,0.945253,0.052369,0.002378,3,0,0.0,53.28952
4,4,4,005DF8CACBC0BED0BE5FE67C32FC97,Long non-coding RNA MIAT promotes non-small ce...,2,European Review for Medical and Pharmacologica...,2019,D. Lin,25584073.0,/publications/005DF8CACBC0BED0BE5FE67C32FC97#2,...,1,[' This paper is one of a set of 121 papers th...,0.987079,0.972099,0.022557,0.005343,2,0,0.0,63.998941


In [12]:
select_column = ["reword"]
X = df[select_column]

In [13]:
mu, sigma2 = estimateGaussian(X.to_numpy())
X_val = df_test[select_column]

yval = df_test["situation"]

print(X_val.shape, yval.shape)

(800, 1) (800,)


In [14]:
yval = yval.to_numpy()[:, np.newaxis]
yval.shape

(800, 1)

In [15]:
X = X.to_numpy()
X_val = X_val.to_numpy()

In [16]:
pval = multivariateGausssian(X_val, mu, sigma2)
e1, f1 = selectThreshold(pval, yval)
e2, accu = selectThreshold_by_accuracy(pval, yval)
e3, prec = selectThreshold_by_precision(pval, yval)
print(e1, f1)
print(e2, accu)
print(e3, prec)

0.01281881455422474 0.6975397973950795
0.012647689602651353 0.73875
0.0013223291712489114 0.8863636363636364


  precision = tp / (tp + fp)
  precision = tp / (tp + fp)
  precision = tp / (tp + fp)


In [17]:
np.unique(df['discipline'].to_numpy())

array(['0', 'AGRICULTURAL SCIENCES', 'BIOLOGY & BIOCHEMISTRY',
       'CHEMISTRY', 'CLINICAL MEDICINE', 'COMPUTER SCIENCE',
       'ECONOMICS & BUSINESS', 'ENGINEERING', 'ENVIRONMENT/ECOLOGY',
       'GEOSCIENCES', 'IMMUNOLOGY', 'MATERIALS SCIENCE', 'MATHEMATICS',
       'MICROBIOLOGY', 'MOLECULAR BIOLOGY & GENETICS',
       'Multidisciplinary', 'NEUROSCIENCE & BEHAVIOR',
       'PHARMACOLOGY & TOXICOLOGY', 'PHYSICS', 'PLANT & ANIMAL SCIENCE',
       'PSYCHIATRY/PSYCHOLOGY', 'SOCIAL SCIENCES, GENERAL',
       'SPACE SCIENCE'], dtype=object)