In [None]:
#############
## Imports ##
#############

# external libraries
import random
import math
import igraph
import nltk
import importlib
import csv
import pickle
import copy
import multiprocessing as mp
import numpy as np
from functools import reduce
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing

# our own helper code, factored out into local modules
import features_nodewise as nw
import features_pairwise as pw
import preprocessing as prep
import multi_func as mf


In [None]:
###############
## Read data ##
###############

# Training pairs: each line is read as one CSV row whose first field is then
# split on single spaces into its columns.
with open("./data/training_set.txt", "r") as f:
    training_set = [row[0].split(" ") for row in csv.reader(f)]

# Testing pairs -> referred to as competition_set throughout the notebook
with open("./data/testing_set.txt", "r") as f:
    competition_set = [row[0].split(" ") for row in csv.reader(f)]

# Node info: kept as the raw list of CSV rows
with open("./data/node_information.csv", "r") as f:
    node_info = list(csv.reader(f))


# Per-node columns picked out of node_info by position:
# 0 -> ID, 1 -> publication year, 2 -> title, 5 -> abstract
IDs = [record[0] for record in node_info]
publication_years = prep.to_feature_shape([int(record[1]) for record in node_info])
titles = [record[2] for record in node_info]
abstracts = [record[5] for record in node_info]

# Third column of every training pair, converted to int, is the label
train_true_labels = [int(pair[2]) for pair in training_set]

In [None]:
#####################
## Preprocess data ##
#####################

# TF-IDF matrices: titles, abstracts, and abstract n-grams (custom settings)
t_titles = prep.tfidf(titles)
t = prep.tfidf(abstracts)
t_ngrams = prep.tfidf(
    abstracts,
    r=(2, 3),
    midf=2,
    madf=0.5,
    feats=150000,
    sublinear=True,
)

# LSA reductions of the two abstract matrices (100 and 300 components)
l = nw.LSA(t, n_components=100)
l_ngrams = nw.LSA(t_ngrams, n_components=300)

# KD-trees over the reduced spaces, used for nearest-neighbour searches
kdtree = nw.KDTree(l)
kdtree_n = nw.KDTree(l_ngrams)

# Graph built from the training pairs whose label column equals '1'
train_IDs = np.array(IDs)
train_edges = [(pair[0], pair[1]) for pair in training_set if pair[2] == '1']
train_graph = prep.article_graph(train_IDs, train_edges)

# Lookup tables for ID <-> index searches
node_dict = prep.to_dict(IDs, range(len(node_info)))
index_dict = prep.to_dict(range(len(IDs)), IDs)

# nltk resources for later title & abstract processing
nltk.download('punkt')  # tokenizer data
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

In [None]:
####################################
## Create dicts to store features ##
####################################

# Empty containers; the feature-computation cells assign the real dicts later.
train_features_dict = {}
competition_features_dict = {}

# Names of the feature groups, in the order in which their columns are
# concatenated into the feature matrices.
all_feature_names = [
    'overlap_title',
    'comm_auth',
    'temp_diff',
    'citation_check',
    'max_sim',
    'peer_popularity',
    'succ_pred',
    'LSA_distance',
    'title_sim',
    'temporal_fit',
    'N_LSA_distance',
    'path_length',
    'node_degree',
    'reverse_max_sim',
]

In [None]:
######################################
## Compute features on TRAINING_SET ##
######################################

# All arguments are positional; their order must match the signature of
# pw.compute_all_features.
train_features_dict = pw.compute_all_features(
    training_set, train_graph, IDs, node_info,  # pairs, graph, node metadata
    stemmer, stpwds,                            # text-processing helpers
    kdtree, l, l_ngrams, t_titles,              # KD-tree, LSA and TF-IDF data
    node_dict, index_dict,                      # ID <-> index lookups
    True,  # NOTE(review): presumably signals that the pairs carry a label column - confirm in features_pairwise
    publication_years,
)

In [None]:
#########################################
## Compute features on COMPETITION_SET ##
#########################################

# Same call as for the training pairs, except for the pair list and the
# positional boolean flag.
competition_features_dict = pw.compute_all_features(
    competition_set, train_graph, IDs, node_info,  # pairs, graph, node metadata
    stemmer, stpwds,                               # text-processing helpers
    kdtree, l, l_ngrams, t_titles,                 # KD-tree, LSA and TF-IDF data
    node_dict, index_dict,                         # ID <-> index lookups
    False,  # NOTE(review): presumably signals that these pairs have no label column - confirm in features_pairwise
    publication_years,
)


In [None]:
######################
## Combine features ##
######################

# Stack the per-feature arrays side by side. The column order follows
# all_feature_names; the hard-coded `selection` indices used for training
# depend on this order.
train_features = np.concatenate(
    [train_features_dict[name] for name in all_feature_names],
    axis=1,
)

competition_features = np.concatenate(
    [competition_features_dict[name] for name in all_feature_names],
    axis=1,
)


In [None]:
################################
## Train classifier & predict ##
################################

import lightgbm as light
import time

t0 = time.time()  # start timestamp; not read anywhere in this cell

# Feature columns to use: every index from 0 to 38 except 4, 9 and 12
selection = [col for col in range(39) if col not in (4, 9, 12)]

# Fit on the labelled training pairs ...
lgb = light.LGBMClassifier()
lgb.fit(train_features[:, selection], train_true_labels)

# ... and predict a label for every competition pair
preds_lgb = lgb.predict(competition_features[:, selection])

In [None]:
###############################
## Extend the training graph ##
###############################

# The competition pairs that were just predicted as positive are added to the
# training graph as edges, so that the features can be recomputed on the
# extended graph and the classifier retrained.

predicted_edges = [
    (pair[0], pair[1])
    for idx, pair in enumerate(competition_set)
    if int(preds_lgb[idx]) == 1
]

# NOTE: mutates train_graph in place - running this cell twice adds the
# predicted edges twice.
train_graph.add_edges(predicted_edges)

In [None]:
############################
## REcompute all features ##
############################

# All features are recomputed on train_graph, which by now also contains the
# edges predicted in the first pass.

########################################
## REcompute features on TRAINING_SET ##
########################################

train_features_dict = pw.compute_all_features(training_set,
                                                train_graph,
                                                IDs,
                                                node_info,
                                                stemmer,
                                                stpwds,
                                                kdtree, 
                                                l, 
                                                l_ngrams,
                                                t_titles,
                                                node_dict, 
                                                index_dict,
                                                True,
                                                publication_years)

###########################################
## REcompute features on COMPETITION_SET ##
###########################################

# Attach the first-pass prediction (0/1) to every competition pair as an extra
# column. Per the original author's note, compute_all_features uses it to
# remove that edge from the graph when recomputing the features.
#
# New rows are built instead of appending to the rows of competition_set:
# the previous `competition_set_extended = competition_set` was only an alias,
# so the appends modified competition_set itself and every re-run of this cell
# added one more column to each pair.
competition_set_extended = [
    list(pair) + [preds_lgb[idx]]
    for idx, pair in enumerate(competition_set)
]

competition_features_dict = pw.compute_all_features(competition_set_extended,
                                                train_graph,
                                                IDs,
                                                node_info,
                                                stemmer,
                                                stpwds,
                                                kdtree, 
                                                l, 
                                                l_ngrams,
                                                t_titles,
                                                node_dict, 
                                                index_dict,
                                                True,
                                                publication_years)


In [None]:
##################################
## Combine & normalize features ##
##################################

# Stack the recomputed per-feature arrays side by side. The column order
# follows all_feature_names; the hard-coded `selection` indices used for
# training depend on this order.
train_features = np.concatenate(
    [train_features_dict[name] for name in all_feature_names],
    axis=1,
)

competition_features = np.concatenate(
    [competition_features_dict[name] for name in all_feature_names],
    axis=1,
)

# normalization
from sklearn.preprocessing import StandardScaler as SS
scaler = SS()
# Fit the scaler on the training features only ...
normalized_train_features = scaler.fit_transform(train_features)
# ... and apply those same statistics to the competition features. Calling
# fit_transform here would refit the scaler on the competition data and put
# the two matrices on different scales.
normalized_competition_features = scaler.transform(competition_features)

In [None]:
################################
## Train classifier & predict ##
################################

import lightgbm as light
import time

t0 = time.time()  # start timestamp; not read anywhere in this cell

# Feature columns to use: every index from 0 to 38 except 4, 9 and 12
selection = [col for col in range(39) if col not in (4, 9, 12)]

# Refit a fresh classifier on the recomputed (unnormalized) training features ...
lgb = light.LGBMClassifier()
lgb.fit(train_features[:, selection], train_true_labels)

# ... and produce the second-pass predictions for the competition pairs
preds_lgb_2 = lgb.predict(competition_features[:, selection])

In [None]:
###############################
## Print predictions to file ##
###############################

# Write one "id,category" row per second-pass prediction; the id is the
# position of the pair in the competition set.
# newline="" is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate "\n" on write (e.g. Windows) end up
# with an empty line after every row.
with open("predictions.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(["id","category"])
    for i,pred in enumerate(preds_lgb_2):
        writer.writerow([i,pred])
        
        