In [3]:
#############
## Imports ##
#############

# external libraries
import random
import math
import igraph
import nltk
import importlib
import csv
import pickle
import copy
import multiprocessing as mp
import numpy as np
from functools import reduce
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing
import lightgbm as light


# our outsourced code
import features_nodewise as nw
import features_pairwise as pw
import preprocessing as prep
import multi_func as mf


In [4]:
###############
## Read data ##
###############

def _read_rows(path, split_on=None):
    """Read `path` with csv.reader and return its non-empty rows as lists.

    Parameters
    ----------
    path : str
        File to read.
    split_on : str or None
        When given, each row is replaced by its first csv field split on this
        separator (used for the space-separated pair files). When None the
        csv rows are returned unchanged.

    Returns
    -------
    list of list of str
    """
    # The csv module asks for files to be opened with newline="" so that it
    # can handle line endings (including ones inside quoted fields) itself.
    with open(path, "r", newline="") as f:
        # csv.reader yields [] for blank lines; drop them so row[0] below
        # cannot raise IndexError on e.g. a trailing empty line.
        rows = [row for row in csv.reader(f) if row]
    if split_on is None:
        return rows
    return [row[0].split(split_on) for row in rows]


# Training
training_set = _read_rows("./data/training_set.txt", split_on=" ")

# Testing -> we call it competition_set
competition_set = _read_rows("./data/testing_set.txt", split_on=" ")

# Node info
node_info = _read_rows("./data/node_information.csv")

# Per-node columns pulled out of node_info by position
abstracts = [row[5] for row in node_info]
titles = [row[2] for row in node_info]
IDs = [row[0] for row in node_info]
publication_years = prep.to_feature_shape([int(row[1]) for row in node_info])

# Third field of every training row, parsed as an int label
train_true_labels = [int(row[2]) for row in training_set]

In [5]:
#####################
## Preprocess data ##
#####################

# TF-IDF term-document matrices: titles, abstracts, and abstracts again with
# the n-gram settings below
t_titles = prep.tfidf(titles)
t = prep.tfidf(abstracts)
t_ngrams = prep.tfidf(
    abstracts,
    r=(2, 3),
    midf=2,
    madf=0.5,
    feats=150000,
    sublinear=True,
)

# Lower-dimensional LSA versions of the two abstract matrices
l = nw.LSA(t, n_components=100)
l_ngrams = nw.LSA(t_ngrams, n_components=300)

# KDTrees over the LSA matrices, to accelerate nearest-neighbour searches
kdtree = nw.KDTree(l)
kdtree_n = nw.KDTree(l_ngrams)

# Graph from the gold standard training data: only rows whose third field is
# '1' contribute an edge
train_IDs = np.array(IDs)
train_edges = [(row[0], row[1]) for row in training_set if row[2] == '1']
train_graph = prep.article_graph(train_IDs, train_edges)

# Lookup dicts to accelerate ID <-> index searches
node_dict = prep.to_dict(IDs, range(len(node_info)))
index_dict = prep.to_dict(range(len(IDs)), IDs)

# NLTK resources for later title & abstract processing
# ('punkt' is needed for tokenization)
for package in ('punkt', 'stopwords'):
    nltk.download(package)
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

n_samples: 27770, n_features: 2890
n_samples: 27770, n_features: 10000
n_samples: 27770, n_features: 150000
Performing dimensionality reduction using LSA
Explained variance of the SVD step: 16%
Performing dimensionality reduction using LSA
Explained variance of the SVD step: 7%
[nltk_data] Downloading package punkt to /home/lucas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lucas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
###################################
## Create dict to store features ##
###################################
# Starts empty; the feature-computation cell assigns the real dict later.
competition_features_dict = {}

# Names of every pairwise feature, used as keys of the feature dict.
all_feature_names = [
    'overlap_title',
    'comm_auth',
    'temp_diff',
    'citation_check',
    'max_sim',
    'peer_popularity',
    'succ_pred',
    'LSA_distance',
    'title_sim',
    'temporal_fit',
    'N_LSA_distance',
    'path_length',
    'node_degree',
    'reverse_max_sim',
]

In [7]:
#########################################
## Compute features on COMPETITION_SET ##
#########################################

# All arguments are passed positionally, so their order must match the
# signature of pw.compute_all_features.
competition_features_dict = pw.compute_all_features(
    competition_set,    # pairs to compute features for
    train_graph,
    IDs,
    node_info,
    stemmer,
    stpwds,
    kdtree,
    l,
    l_ngrams,
    t_titles,
    node_dict,
    index_dict,
    False,              # NOTE(review): meaning of this flag is not visible here - confirm against pw.compute_all_features
    publication_years,
)


0 / 5
0 / 2
1 / 2
0 / 1535
1000 / 1535
overlap_title 1535
comm_auth 1535
temp_diff 1535
citation_check 1535
max_sim 1535
peer_popularity 1535
succ_pred 1535
LSA_distance 1535
title_sim 1535
temporal_fit 1535
N_LSA_distance 1535
path_length 1535
node_degree 1535
reverse_max_sim 1535


In [8]:
######################
## Combine features ##
######################

# Join the per-feature arrays along axis 1. The column order is taken from
# all_feature_names so that it is defined in exactly one place instead of
# being repeated here by hand.
competition_features = np.concatenate(
    [competition_features_dict[name] for name in all_feature_names],
    axis=1,
)

In [9]:
###############################
## Load classifier & predict ##
###############################

# Select a subset of features to use: column indices 0..38 of
# competition_features, leaving out columns 4, 9 and 12.
dropped_columns = {4, 9, 12}
selection = [col for col in range(39) if col not in dropped_columns]

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model file that comes from a trusted source.
with open("lightgbm_model", "rb") as model_file:
    lgb = pickle.load(model_file)

# Predictions
preds_lgb = lgb.predict(competition_features[:, selection])

  if diff:


In [10]:
###############################
## Print predictions to file ##
###############################

# newline="" lets the csv writer control line endings itself; without it,
# platforms that translate "\n" on write end up with an extra "\r" per row.
with open("predictions.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(["id", "category"])
    # One row per prediction: its position in preds_lgb, then the prediction
    writer.writerows(enumerate(preds_lgb))
        