In [1]:
import torch
import json
import numpy as np
from evaluation import *
from NN_Models import *
import random
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Labels for the train and test splits, stored as torch-serialised files.
# NOTE(review): torch.load unpickles its input, so only point it at trusted files.
y_train = torch.load('data/y_train.pt')
y_test = torch.load('data/y_test.pt')

In [3]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, rather than staying open for the life of the kernel.
    """
    with open(path, 'r') as f:
        return json.load(f)


# year & venue
year_venue_test = _load_json("outputs/year_venue_test.json")
year_venue_train = _load_json("outputs/year_venue_train.json")

# abstract & title
abstract_title_test = _load_json("outputs/abstract_title_test.json")
abstract_title_train = _load_json("outputs/abstract_title_train.json")

# coauthor
author_test = _load_json("outputs/author_test.json")
author_train = _load_json("outputs/author_train.json")

## Weighted combination

In [4]:
# Raw (un-normalised) weight of each signal.
weight_sentence = 0.31982275074419797    # abstracts - title
weight_author = 0.6143938170885868       # authors
weight_year_venue = 0.09287086258899986  # year - venue

# Equal-weight alternative: set all three raw weights above to 1.

sum_ = weight_author + weight_sentence + weight_year_venue

# Divide each raw weight by the total so the three final weights add up to 1.
YEAR_VENUE_WEIGHT, COAUTHOR_WEIGHT, SENTENCE_WEIGHT = (
    raw / sum_ for raw in (weight_year_venue, weight_author, weight_sentence)
)

In [5]:
# Cut-off value handed to predict().
THRESHOLD = 0.2

# Run the weighted predictor on the test-split features, using the
# normalised weights computed in the previous cell.
y_pred_list = predict(
    author=author_test,
    year_venue=year_venue_test,
    abstracts_title=abstract_title_test,
    COAUTHOR_WEIGHT=COAUTHOR_WEIGHT,
    YEAR_VENUE_WEIGHT=YEAR_VENUE_WEIGHT,
    SENTENCE_WEIGHT=SENTENCE_WEIGHT,
    THRESHOLD=THRESHOLD,
)

In [6]:
# Convert the test labels with to_list() so they can be compared with y_pred_list.
y_test_list = to_list(y_test)

# Print the accuracy / recall / f1 lines for the weighted prediction.
print_scores(y_test_list, y_pred_list)

The accuracy score of prediction is : 0.6130913804277381
The recall   score of prediction is : 0.6130913804277381
The f1       score of prediction is : 0.6177341265196299


  _warn_prf(average, modifier, msg_start, len(result))


### Grid Search

In [7]:
def grid_search(weight1, weight2, threshold):
    """Lazily yield every ``(w1, w2, t)`` combination of the three inputs.

    Combinations come out in nested-loop order: ``weight1`` varies slowest and
    ``threshold`` varies fastest. ``weight2`` and ``threshold`` are iterated
    once per outer value.
    """
    yield from (
        (first, second, cutoff)
        for first in weight1
        for second in weight2
        for cutoff in threshold
    )

In [11]:
# Exhaustive search over (coauthor weight, year/venue weight, threshold); the
# sentence weight is whatever remains so that the three weights sum to 1.
# NOTE(review): the search scores every candidate on the test split
# (author_test / y_test), so the best f1 found here is an optimistic estimate;
# consider tuning on a separate validation split.
max_f1 = 0
max_param = None

weight1 = np.linspace(0, 1, 20)
weight2 = np.linspace(0, 1, 20)
thresholds = [0.02, 0.05, 0.1, 0.2, 0.3, 0.5, 0.6]
total = len(weight1) * len(weight2) * len(thresholds)

# Slack for float round-off: two linspace values that should add up to exactly
# 1 can land a hair above it, which would wrongly drop combinations on the
# w1 + w2 == 1 edge or leave w3 as a tiny negative number.
SIMPLEX_TOL = 1e-9

y_test_list = to_list(y_test)

for w1, w2, thred in tqdm(grid_search(weight1, weight2, thresholds), total=total):

    w3 = 1 - w1 - w2

    # Skip pairs that genuinely exceed the budget of 1.
    if w3 < -SIMPLEX_TOL:
        continue

    # Clamp round-off residue so the sentence weight is never negative.
    w3 = max(w3, 0.0)

    y_pred_list = predict(
            author=author_test,
            COAUTHOR_WEIGHT=w1,
            year_venue=year_venue_test,
            YEAR_VENUE_WEIGHT=w2,
            abstracts_title=abstract_title_test,
            SENTENCE_WEIGHT=w3,
            THRESHOLD=thred
        )

    f1 = f1_score(y_test_list, y_pred_list, average='weighted')

    # Keep the first combination that reaches the highest weighted f1.
    if f1 > max_f1:
        max_f1 = f1
        max_param = (w1, w2, w3, thred)


100%|██████████| 2800/2800 [08:35<00:00,  5.43it/s]


In [17]:
# Report the best grid-search result and adopt its parameters as the
# notebook-level weights / threshold.
print("Max f1 score       : ", round(max_f1, 4))

COAUTHOR_WEIGHT, YEAR_VENUE_WEIGHT, SENTENCE_WEIGHT, THRESHOLD = max_param

for label, value in (
    ("COAUTHOR_WEIGHT    : ", COAUTHOR_WEIGHT),
    ("YEAR_VENUE_WEIGHT  : ", YEAR_VENUE_WEIGHT),
    ("SENTENCE_WEIGHT    : ", SENTENCE_WEIGHT),
):
    print(label, round(value, 10))

print("THRESHOLD          : ", THRESHOLD)

Max f1 score       :  0.6243
COAUTHOR_WEIGHT    :  0.4736842105
YEAR_VENUE_WEIGHT  :  0.0526315789
SENTENCE_WEIGHT    :  0.4736842105
THRESHOLD          :  0.3


## Model

In [4]:
# Fit the logistic-regression predictor on the training-split features and labels.
clf = LogisticRegressionPredictModel()
# `abstract_title_train` is the name bound by the JSON-loading cell; the
# `abstracts_title_train` spelling is not defined anywhere in this notebook.
clf.train(author_train, year_venue_train, abstract_title_train, y_train)

100%|██████████| 6268/6268 [00:03<00:00, 1766.70it/s]


Score :  0.5165195460277427


In [5]:
# Predict on the test-split features with the fitted model.
# `abstract_title_test` is the name bound by the JSON-loading cell; the
# `abstracts_title_test` spelling is not defined anywhere in this notebook.
y_pred_list = clf.evaluation(author_test, year_venue_test, abstract_title_test)

100%|██████████| 3086/3086 [00:21<00:00, 144.02it/s]


In [6]:
# Convert the test labels with to_list() so they can be compared with the model's predictions.
y_test_list = to_list(y_test)

# NOTE(review): the recorded scores for this cell are all 0.0 — presumably the
# format of clf.evaluation's output differs from these label strings; confirm.
print_scores(y_test_list, y_pred_list)

The accuracy score of prediction is : 0.0
The recall   score of prediction is : 0.0
The f1       score of prediction is : 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Peek at the first five converted test labels.
y_test_list[:5]

['21', '16 93 97', '78', '-1', '82']

In [8]:
# Share of predictions that are exactly the string '-1'.
y_pred_list.count('-1') / len(y_pred_list)

0.0

In [10]:
# Re-create and re-fit the logistic-regression predictor on the training split.
clf = LogisticRegressionPredictModel()
# `abstract_title_train` is the name bound by the JSON-loading cell; the
# `abstracts_title_train` spelling is not defined anywhere in this notebook.
clf.train(author_train, year_venue_train, abstract_title_train, y_train)

100%|██████████| 6268/6268 [00:04<00:00, 1285.03it/s]


Score :  1.0


In [18]:
# Sanity-check the fitted model on a single hand-written three-value row.
au = clf.model.predict(
    np.array([[0.5, 0.0, 0.0]])
)
au

array([1.])