In [1]:
import os
import io
import json
import pandas as pd
import numpy as np
import torch
import joblib

from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Train/test split

In [2]:
# Load the pickled frame of papers (text fields, citation lists and a
# `BERT_embeddings` vector per paper) and preview its first rows.
# NOTE(review): read_pickle unpickles arbitrary objects - only point it at
# files you trust.
corpus_path = '../dataset/SS/clean/bert_embeddings.pkl'
df = pd.read_pickle(corpus_path)
df.head()

Unnamed: 0_level_0,title,abstract,relatedwork_text,main_text,rw_citations,full_citations,BERT_embeddings
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,We first look at how text excerpts are extract...,Abstract. We present a novel summarization fra...,"[5965756, 1599046, 3177797, 444032, 6884774]","[1599046, 11055565, 16393334, 2481864, 5965756...","[-0.5292661, 0.96171594, 0.7239495, 0.09007428..."
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Self-training has been applied to several natu...,Annotating training data for event extraction ...,"[1398439, 7419156, 11187670, 8336242, 15894892...","[1398439, 7419156, 7579604, 11187670, 8336242,...","[-0.7070601, 0.5793047, 0.30019873, 0.451895, ..."
1345235,Optimizing Sparse Matrix窶天ector Product Comput...,Large-scale scientific applications frequently...,A variety of different data and computation re...,Large-scale scientific applications frequently...,"[1794629, 9580801, 120335238, 15090599, 208582...","[1794629, 9580801, 120335238, 15090599, 208582...","[-0.72422814, 0.38941112, 0.50098884, -0.02656..."
2624639,Enhanced Chosen-Ciphertext Security and Applic...,We introduce and study a new notion of enhance...,ECCA is similar in spirit to coin-revealing se...,We introduce and study a new notion of enhance...,"[41204165, 19655, 235427, 226828, 3148885]","[443317, 41204165, 7113862, 10098664, 226828, ...","[-0.59993136, 0.84347814, 0.1958661, 0.2972666..."
146120525,ARSM: Augment-REINFORCE-Swap-Merge Estimator f...,To address the challenge of backpropagating th...,"For optimizing (1) for categorical z, the diff...",To address the challenge of backpropagating th...,"[3535369, 5859948, 1758804, 10756562, 19115634...","[121929631, 4043645, 5859948, 7195970, 1075656...","[-0.78408253, 0.022130227, 0.31313884, 0.46616..."


In [3]:
# Hold out 20% of the papers as the test set and keep the other 80% for
# training; the fixed seed makes the partition repeatable across runs.
split_options = {'test_size': 0.2, 'random_state': 1}
df_train, df_test = train_test_split(df, **split_options)

# Clusters

In [4]:
# Stack the per-paper BERT vectors of the training split into one 2-D array
# (one row per paper, rows in the same order as df_train).
# Converting the whole column at once avoids one label-based `.loc` lookup per
# row, and does not depend on the index labels being unique.
embeddings = np.array(df_train['BERT_embeddings'].tolist())
embeddings.shape

(21933, 768)

In [5]:
# Ensure the directory used for serialized models exists; exist_ok makes the
# cell safe to re-run.
model_dir = '../model'
os.makedirs(model_dir, exist_ok=True)

In [6]:
# Reduce the training embeddings to their first 12 principal components.
# The fitted estimator is kept in `pca_12` (instead of being thrown away) so
# the exact same projection can be applied to other rows via
# `pca_12.transform(...)`; it is seeded so that a randomized SVD solver, if
# scikit-learn selects one, yields repeatable components.
# NOTE: despite its name, `embeddings_test` holds the reduced *training*
# vectors here; the name is kept because later cells read it.
pca_12 = PCA(n_components=12, random_state=1)
embeddings_test = pca_12.fit_transform(embeddings)
embeddings_test.shape

(21933, 12)

In [7]:
# One-off training step, left commented out: the evaluation section loads the
# saved model from '../model/kmeans_5.model' instead of refitting it.
# NOTE(review): KMeans is created without random_state, so uncommenting these
# lines will generally not reproduce the saved clustering exactly.
# kmeans_5 = KMeans(n_clusters=5).fit(embeddings_test)
# joblib.dump(kmeans_5, '../model/kmeans_5.model')

# Evaluation

In [8]:
# Restore the saved K-means model and tally how many rows of
# `embeddings_test` fall into each cluster.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model
# files you trust.
kmeans_5 = joblib.load('../model/kmeans_5.model')
cluster_ids = kmeans_5.predict(embeddings_test)
Counter(cluster_ids)

Counter({1: 3644, 4: 3418, 3: 4883, 0: 5591, 2: 4397})

In [9]:
# Pairwise annotations for the training papers: each row names two papers
# (paper1_id, paper2_id) and carries several label columns; `label` is the
# one the evaluation function reads.
train_pairs_path = 'train.csv'
train_labels = pd.read_csv(train_pairs_path, index_col=0)
train_labels.head()

Unnamed: 0,paper1_id,paper2_id,label_1,label_2,label_3,label_4,label
0,7828885,208310034,0.0,,0.0,0.0,0
1,4837028,16155532,0.0,,1.0,1.0,1
2,18475456,17445278,0.0,,0.0,0.0,0
3,202775562,70299115,0.0,,1.0,0.0,0
4,4899384,182952605,0.0,,0.0,0.0,0


In [10]:
# Pairwise annotations for the held-out papers, same column layout as the
# training annotations (paper1_id, paper2_id, label_1..label_4, label).
test_pairs_path = 'test.csv'
test_labels = pd.read_csv(test_pairs_path, index_col=0)
test_labels.head()

Unnamed: 0,paper1_id,paper2_id,label_1,label_2,label_3,label_4,label
0,53829365,5392739,0.0,,0.0,0.0,0
1,3743029,9713252,0.0,,0.0,0.0,0
2,8459419,30644086,0.0,,0.0,0.0,0
3,11970283,6317007,0.0,,0.0,0.0,0
4,2859455,16124390,0.0,,0.0,0.0,0


In [11]:
def model_eval(model, train_labels, df_train, embeddings=None):
    """Score a clustering model against pairwise annotations.

    A pair of papers is predicted positive (1) when ``model`` assigns both
    papers to the same cluster, and negative (0) otherwise. The prediction is
    compared with the ``label`` column of ``train_labels``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)`` that returns one cluster id
        per row of ``X``.
    train_labels : pandas.DataFrame
        Pair annotations with columns ``paper1_id``, ``paper2_id`` and
        ``label`` (0 or 1). Despite the name, any split may be passed.
    df_train : pandas.DataFrame
        Papers of the split being evaluated, indexed by paper id. Only the
        index is used, to translate a paper id into a row position.
    embeddings : array-like of shape (len(df_train), n_features), optional
        Feature rows aligned with ``df_train`` (row ``k`` belongs to
        ``df_train.index[k]``). When omitted, the notebook-level
        ``embeddings_test`` is used, as in the original implementation; a
        ``RuntimeWarning`` is issued if its length does not match
        ``df_train``, because the resulting scores are then computed on the
        wrong rows.

    Returns
    -------
    (float, float)
        ``(accuracy, recall)``. A metric whose denominator is zero (no pairs,
        or no positive pairs) is returned as ``nan``.

    Raises
    ------
    ValueError
        If ``embeddings`` is passed explicitly and its length differs from
        ``len(df_train)``.
    KeyError
        If a paper id in ``train_labels`` is not present in ``df_train``.
    """
    import warnings

    explicit = embeddings is not None
    if not explicit:
        # Backward-compatible fallback to the notebook global.
        embeddings = embeddings_test
    embeddings = np.asarray(embeddings)

    if len(embeddings) != len(df_train):
        message = (
            "embeddings has %d rows but df_train has %d; row positions looked "
            "up in df_train will not address the matching embedding rows"
            % (len(embeddings), len(df_train))
        )
        if explicit:
            raise ValueError(message)
        warnings.warn(message, RuntimeWarning, stacklevel=2)

    # paper id (as string, matching the original str(id) comparison) -> row
    # position; the first occurrence wins, like `.index[0]` did.
    row_of = {}
    for position, paper_id in enumerate(df_train.index):
        row_of.setdefault(str(paper_id), position)

    first_ids = [str(value) for value in train_labels['paper1_id']]
    second_ids = [str(value) for value in train_labels['paper2_id']]

    missing = sorted({pid for pid in first_ids + second_ids if pid not in row_of})
    if missing:
        raise KeyError("paper ids not found in df_train: %s" % ", ".join(missing[:10]))

    if not first_ids:
        return float('nan'), float('nan')

    first_rows = np.array([row_of[pid] for pid in first_ids], dtype=int)
    second_rows = np.array([row_of[pid] for pid in second_ids], dtype=int)

    # One batched predict per side instead of one call per pair.
    first_clusters = np.asarray(model.predict(embeddings[first_rows]))
    second_clusters = np.asarray(model.predict(embeddings[second_rows]))
    same_cluster = first_clusters == second_clusters

    actual = train_labels['label'].to_numpy().astype(np.int8)
    positive = actual == 1
    negative = actual == 0

    tp = int(np.sum(positive & same_cluster))
    fn = int(np.sum(positive & ~same_cluster))
    tn = int(np.sum(negative & ~same_cluster))
    fp = int(np.sum(negative & same_cluster))

    total = tp + fp + tn + fn
    acc = (tp + tn) / total if total else float('nan')
    recall = tp / (tp + fn) if (tp + fn) else float('nan')

    return acc, recall

In [12]:
# Accuracy and recall of the same-cluster rule on the training pairs.
model_eval(model=kmeans_5, train_labels=train_labels, df_train=df_train)

(0.8075, 0.5076923076923077)

In [13]:
# Accuracy and recall of the same-cluster rule on the held-out pairs.
# NOTE(review): model_eval reads the notebook-level `embeddings_test`, which
# at this point still holds the PCA projection of the *training* rows, so row
# positions taken from df_test address training vectors - treat these numbers
# with caution.
model_eval(model=kmeans_5, train_labels=test_labels, df_train=df_test)

(0.79, 0.38461538461538464)

In [14]:
# Stack the BERT vectors of every paper in `df` (rows in df order); one
# column-level conversion replaces the per-row `.loc` loop.
embeddings = np.array(df['BERT_embeddings'].tolist())

# Per the commented-out training cell, kmeans_5 was fitted on a PCA projection
# learned from the training split only. Refitting PCA on the full corpus would
# produce a different basis from the one the cluster centres live in, so the
# projection is fitted on the training rows and then applied to all papers.
train_matrix = np.array(df_train['BERT_embeddings'].tolist())
embeddings_test = (
    PCA(n_components=12, random_state=1)
    .fit(train_matrix)
    .transform(embeddings)
)

In [15]:
# Attach each paper's K-means cluster id as a `label` column on a copy of df.
# A single batched predict replaces one predict call per row, and the column
# is created directly with an integer dtype instead of starting as the
# string '0' and being overwritten cell by cell.
cluster_ids = kmeans_5.predict(embeddings_test)
assert len(cluster_ids) == len(df), "embeddings_test must have one row per paper in df"
df_new = df.assign(label=cluster_ids)

df_new.head()

Unnamed: 0_level_0,title,abstract,relatedwork_text,main_text,rw_citations,full_citations,BERT_embeddings,label
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
859921,Summarizing Reviews with Variable-length Synta...,Abstract. We present a novel summarization fra...,We first look at how text excerpts are extract...,Abstract. We present a novel summarization fra...,"[5965756, 1599046, 3177797, 444032, 6884774]","[1599046, 11055565, 16393334, 2481864, 5965756...","[-0.5292661, 0.96171594, 0.7239495, 0.09007428...",1
129124,Can Document Selection Help Semi-supervised Le...,Annotating training data for event extraction ...,Self-training has been applied to several natu...,Annotating training data for event extraction ...,"[1398439, 7419156, 11187670, 8336242, 15894892...","[1398439, 7419156, 7579604, 11187670, 8336242,...","[-0.7070601, 0.5793047, 0.30019873, 0.451895, ...",1
1345235,Optimizing Sparse Matrix窶天ector Product Comput...,Large-scale scientific applications frequently...,A variety of different data and computation re...,Large-scale scientific applications frequently...,"[1794629, 9580801, 120335238, 15090599, 208582...","[1794629, 9580801, 120335238, 15090599, 208582...","[-0.72422814, 0.38941112, 0.50098884, -0.02656...",1
2624639,Enhanced Chosen-Ciphertext Security and Applic...,We introduce and study a new notion of enhance...,ECCA is similar in spirit to coin-revealing se...,We introduce and study a new notion of enhance...,"[41204165, 19655, 235427, 226828, 3148885]","[443317, 41204165, 7113862, 10098664, 226828, ...","[-0.59993136, 0.84347814, 0.1958661, 0.2972666...",2
146120525,ARSM: Augment-REINFORCE-Swap-Merge Estimator f...,To address the challenge of backpropagating th...,"For optimizing (1) for categorical z, the diff...",To address the challenge of backpropagating th...,"[3535369, 5859948, 1758804, 10756562, 19115634...","[121929631, 4043645, 5859948, 7195970, 1075656...","[-0.78408253, 0.022130227, 0.31313884, 0.46616...",0


In [18]:
# Optional export of the labelled frame, left disabled - presumably so that
# re-running the notebook does not overwrite an existing 'withlabel.pkl'.
# df_new.to_pickle('withlabel.pkl')