In [1]:
import pandas as pd
import numpy as np
import os
import os.path as path
from sklearn import preprocessing
from collections import Counter

dataset_dir = "~/Projects/BigData/hw3"

# Paper metadata table (abstract, category, year of each paper).
papers_file = path.join(dataset_dir, 'papers.csv.gz')
papers = pd.read_csv(papers_file, compression='gzip')

# Embedding vector of each paper, read without a header row and stored as float32.
feats_file = path.join(dataset_dir, 'feats.csv.gz')
feats = pd.read_csv(feats_file, compression='gzip', header=None).values.astype(np.float32)

# Citation relations between papers, read without a header row and stored as int32.
# The table is transposed so that its two columns can be unpacked as two arrays.
edges_file = path.join(dataset_dir, 'edges.csv.gz')
edges = pd.read_csv(edges_file, compression='gzip', header=None).values.T.astype(np.int32)
citer, citee = edges

# The title, abstract, category and year columns can be read from `papers`.
for loaded in (papers["title"], feats, edges):
    print(loaded)

0         Evasion Attacks against Machine Learning at Te...
1         How Hard is Computing Parity with Noisy Commun...
2         On the Absence of the Rip in Real World Applic...
3             A Promise Theory Perspective on Data Networks
4         Analysis of Asymptotically Optimal Sampling Ba...
                                ...                        
169338    Sentinet Detecting Localized Universal Attacks...
169339    Interpretable Mtl From Heterogeneous Domains U...
169340    Learning Compositional Rules via Neural Progra...
169341           Certified Defenses for Adversarial Patches
169342    Fauras a Proxy Based Framework for Ensuring th...
Name: title, Length: 169343, dtype: object
[[-0.057943 -0.05253  -0.072603 ...  0.173364 -0.172796 -0.140059]
 [-0.1245   -0.070665 -0.325202 ...  0.068524 -0.372111 -0.301036]
 [-0.080242 -0.023328 -0.183787 ...  0.109919  0.117589 -0.139883]
 ...
 [-0.22053  -0.036568 -0.402199 ...  0.11336  -0.161393 -0.145171]
 [-0.138236  0.040885 -0

In [5]:
# Split the dataset by publication year: <= 2017 train, 2018 validation, >= 2019 test.
train_data = papers[papers['year'] <= 2017]
val_data = papers[papers['year'] == 2018]
test_data = papers[papers['year'] >= 2019]

le = preprocessing.LabelEncoder()

# Extract features and labels.
# NOTE: indexing `feats` with the frame index assumes `papers` still carries its default
# positional index, so that index labels coincide with row positions in `feats`.
X_train = feats[train_data.index]
y_train_str = train_data['category'].values
# The encoder is fitted on the training categories only; this defines the integer codes.
y_train = le.fit_transform(y_train_str)

X_val = feats[val_data.index]
y_val_str = val_data['category']
# Reuse the mapping learned from the training labels. Refitting here (fit_transform)
# would renumber the classes whenever the validation categories differ from the
# training ones, making y_val incomparable with predictions expressed in training codes.
# transform() raises ValueError if a validation category never occurs in training.
y_val = le.transform(y_val_str)

X_test = feats[test_data.index]

In [6]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where the prediction equals the true label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned position-by-position with ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality test (``nan`` for empty input).
    """
    # Convert first: `==` between two plain Python lists yields a single bool for the
    # whole list, which would make the score 0.0 or 1.0 instead of a fraction.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)

def f1_score(y_true, y_pred):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores.

    Only the classes that occur in ``y_true`` are scored; a class that appears
    solely in ``y_pred`` contributes no term of its own to the average.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned position-by-position with ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the per-class F1 values.
    """
    per_class = []
    for cls in np.unique(y_true):
        truth_is_cls = y_true == cls
        pred_is_cls = y_pred == cls

        true_pos = np.sum(truth_is_cls & pred_is_cls)
        false_pos = np.sum((y_true != cls) & pred_is_cls)
        false_neg = np.sum(truth_is_cls & (y_pred != cls))

        # A ratio with a zero denominator is scored as 0 rather than left undefined.
        if true_pos + false_pos > 0:
            precision = true_pos / (true_pos + false_pos)
        else:
            precision = 0

        if true_pos + false_neg > 0:
            recall = true_pos / (true_pos + false_neg)
        else:
            recall = 0

        if precision + recall > 0:
            score = 2 * precision * recall / (precision + recall)
        else:
            score = 0
        per_class.append(score)

    return np.mean(per_class)

def KNN_predict(x_train, y_train, k, X, verbose=True):
    """Classify each row of ``X`` by majority vote among its ``k`` nearest training rows.

    Distances are Euclidean (``np.linalg.norm`` of the row differences). Votes are
    counted with ``np.bincount``, so labels must be non-negative integers; when
    several labels tie, the smallest label wins (first maximum of the counts).

    Parameters
    ----------
    x_train : array-like, 2-D
        Training feature rows.
    y_train : array-like of non-negative int
        Label of each training row.
    k : int
        Number of neighbours that vote; must be at least 1. If ``k`` exceeds the
        number of training rows, all training rows vote.
    X : array-like, 2-D
        Query rows to classify.
    verbose : bool, default True
        Print the number of queries and a progress line every 1000 queries.

    Returns
    -------
    numpy.ndarray
        Predicted label for each row of ``X``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # Without this check an empty neighbour set fails later inside argmax
        # with an error message that does not mention k.
        raise ValueError(f"k must be a positive integer, got {k}")

    # Accept lists and other array-likes, not only ndarrays.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    X = np.asarray(X)

    total = len(X)
    if verbose:
        print("len:", total)

    y_pred = []
    for i, x in enumerate(X):
        distances = np.linalg.norm(x_train - x, axis=1)
        nearest_neighbors = np.argsort(distances)[:k]
        nearest_labels = y_train[nearest_neighbors]
        y_pred.append(np.argmax(np.bincount(nearest_labels)))
        if verbose and i % 1000 == 0:
            print(f"Progress:{i}/{total}")
    return np.array(y_pred)

In [22]:
# Classify the validation papers with k=3 and score them against the encoded labels.
y_pred = KNN_predict(x_train=X_train, y_train=y_train, k=3, X=X_val)

accuracy_mean = accuracy_score(y_true=y_val, y_pred=y_pred)
f1_mean = f1_score(y_true=y_val, y_pred=y_pred)
print(f"Accuracy: {accuracy_mean}")
print(f"F1 mean: {f1_mean}")

len: 29799
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
Accuracy: 0.4670290949360717
F1 mean: 0.26861034238993187


In [7]:
# Predict integer-encoded categories for the test papers, with the same k=3 used for validation.
y_test_encoded = KNN_predict(x_train=X_train, y_train=y_train, k=3, X=X_test)

len: 48603
Progress:0/48603
Progress:1000/48603
Progress:2000/48603
Progress:3000/48603
Progress:4000/48603
Progress:5000/48603
Progress:6000/48603
Progress:7000/48603
Progress:8000/48603
Progress:9000/48603
Progress:10000/48603
Progress:11000/48603
Progress:12000/48603
Progress:13000/48603
Progress:14000/48603
Progress:15000/48603
Progress:16000/48603
Progress:17000/48603
Progress:18000/48603
Progress:19000/48603
Progress:20000/48603
Progress:21000/48603
Progress:22000/48603
Progress:23000/48603
Progress:24000/48603
Progress:25000/48603
Progress:26000/48603
Progress:27000/48603
Progress:28000/48603
Progress:29000/48603
Progress:30000/48603
Progress:31000/48603
Progress:32000/48603
Progress:33000/48603
Progress:34000/48603
Progress:35000/48603
Progress:36000/48603
Progress:37000/48603
Progress:38000/48603
Progress:39000/48603
Progress:40000/48603
Progress:41000/48603
Progress:42000/48603
Progress:43000/48603
Progress:44000/48603
Progress:45000/48603
Progress:46000/48603
Progress:47000/

In [11]:
# Refit the encoder on the training categories so that the integer codes produced by the
# classifier map back to the category names they were trained with.
le.fit(y_train_str)
y_test = le.inverse_transform(y_test_encoded)
print(y_test)

# Write the predicted categories into a copy of the metadata (rows with year >= 2019)
# and save the result next to the input files.
is_test_paper = papers['year'] >= 2019
papers_predicted = papers.copy()
papers_predicted.loc[is_test_paper, 'category'] = y_test
predicted_file = path.join(dataset_dir, 'paper_predicted.csv.gz')
papers_predicted.to_csv(predicted_file, index=False, compression='gzip')

['cs.AI' 'cs.CV' 'cs.CV' ... 'cs.CL' 'cs.LG' 'cs.MM']


In [19]:
# Show the title and predicted category of the test papers (year >= 2019).
papers_predicted.loc[papers['year'] >= 2019, ['title', 'category']]

Unnamed: 0,title,category
346,Factored Contextual Policy Search with Bayesia...,cs.AI
398,Simultaneous Merging Multiple Grid Maps Using ...,cs.CV
451,Reconstruction of Hidden Representation for Ro...,cs.CV
480,A Look at the Time Delays in Cvss Vulnerabilit...,cs.CY
488,Common Tangents of Two Disjoint Polygons in Li...,cs.CG
...,...,...
169338,Sentinet Detecting Localized Universal Attacks...,cs.CR
169339,Interpretable Mtl From Heterogeneous Domains U...,cs.CV
169340,Learning Compositional Rules via Neural Progra...,cs.CL
169341,Certified Defenses for Adversarial Patches,cs.LG
