In [1]:
#############
## Imports ##
#############

# external libraries
import random
import math
import igraph
import nltk
import importlib
import csv
import pickle
import copy
import multiprocessing as mp
import numpy as np
from functools import reduce
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing

# our outsourced code
import features_nodewise as nw
import features_pairwise as pw
import preprocessing as prep
import multi_func as mf


In [2]:
###############
## Read data ##
###############

# Training
with open("./data/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)
    training_set = [element[0].split(" ") for element in training_set]
    
# Testing -> we call it competition_set
with open ("./data/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    competition_set = list(reader)
    competition_set = [element[0].split(" ") for element in competition_set]

# Node info
with open("./data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs_present = []
for edge in competition_set:
    IDs_present.append(edge[0])
    IDs_present.append(edge[1])
for edge in training_set:
    IDs_present.append(edge[0])
    IDs_present.append(edge[1])
IDs_present = list(set(IDs_present))

new_node_info = []
for info in node_info:
    if info[0] in IDs_present:
        new_node_info.append(info)
node_info = new_node_info

abstracts = [element[5] for element in node_info]
titles = [element[2] for element in node_info]
IDs = [element[0] for element in node_info]
publication_years = prep.to_feature_shape([int(info[1]) for info in node_info])
train_true_labels = [element[2] for element in training_set]

In [4]:
#####################
## Preprocess data ##
#####################

# Construct term-document-tfidf matrices
t_titles = prep.tfidf(titles)
t = prep.tfidf(abstracts)
t_ngrams = prep.tfidf(abstracts, r= (2,3), midf = 2, madf=0.5,feats=150000, sublinear = True)

# Reduce matrix dimensionality
l = nw.LSA(t,n_components=100)
l_ngrams = nw.LSA(t_ngrams,n_components=300)

# Build KDTrees to accelerate nearest-neighbour searches
kdtree = nw.KDTree(l)
kdtree_n = nw.KDTree(l_ngrams)

# Build graph from gold standard training data
train_IDs = np.array(IDs_present)
train_edges = [(element[0],element[1]) for element in training_set if element[2]=='1']
train_graph = prep.article_graph(train_IDs,train_edges)

# dicts to accelerate ID <-> index searches
node_dict = prep.to_dict(IDs,range(len(node_info)))
index_dict = prep.to_dict(range(len(IDs)),IDs)

# Load stemmer and stopwords for later title & abstract processing
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

n_samples: 6788, n_features: 1432
n_samples: 6788, n_features: 6088
n_samples: 6788, n_features: 61425
Performing dimensionality reduction using LSA
Explained variance of the SVD step: 19%
Performing dimensionality reduction using LSA
Explained variance of the SVD step: 13%
[nltk_data] Downloading package punkt to /home/lucas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lucas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
####################################
## Create dicts to store features ##
####################################
train_features_dict = dict()
competition_features_dict = dict()

In [45]:
######################################
## Compute features on TRAINING_SET ##
######################################

features_to_create = ['overlap_title',
                      'comm_auth',
                      'temp_diff',
                      'citation_check',
                      'max_sim',
                      'peer_popularity',
                      'succ_pred',
                      'LSA_distance',
                      'title_sim',
                      'temporal_fit',
                      'N_LSA_distance',
                      'path_length',
                      'node_degree']

# We insert features in a dictionary
insert_features_dict = train_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = training_set

# Compute some features in parallelized chunks.
p = mf.params(train_graph, kdtree, l, node_dict, index_dict, chunk_size = 1000)
grouped_set = [training_set[2000*i:2000*(i+1)] for i in range(math.ceil(len(training_set)/2000))]
pool = mp.Pool(mp.cpu_count())
path_dict_list = pool.map(p.all_paths_noparams, grouped_set, chunksize = 10)
chunked_output = zip(*pool.map(p.by_chunk_noparams, grouped_set, chunksize = 10))
pool.close()

# Recombine chunked output
all_path_dict = dict()
for i in IDs:
    all_path_dict[i] = dict()
    
for d in path_dict_list:
    for source_id in d:
        for target_id in d[source_id]:
            all_path_dict[source_id][target_id] = d[source_id][target_id]

list_chunked_output = list(chunked_output)
for feature_list in list_chunked_output[0]:
        insert_features_dict['succ_pred'].extend(feature_list)
for feature_list in list_chunked_output[1]:
        insert_features_dict['max_sim'].extend(feature_list)
for feature_list in list_chunked_output[2]:
        insert_features_dict['citation_check'].extend(feature_list)
for feature_list in list_chunked_output[3]:
        insert_features_dict['node_degree'].extend(feature_list)


# Compute features per node pair
for i,triple in enumerate(set_to_use):
    
    # Read basic data about the current node pair
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    # Creating features
    overlap_title = len(set(source_title).intersection(set(target_title)))
    insert_features_dict["overlap_title"].append(overlap_title)
    temp_diff = int(source_info[1]) - int(target_info[1])
    insert_features_dict["temp_diff"].append(temp_diff)
    comm_auth = len(set(source_auth).intersection(set(target_auth)))
    insert_features_dict["comm_auth"].append(comm_auth)

    peer_pop = pw.peer_popularity(train_graph,source,target)
    insert_features_dict["peer_popularity"].append(peer_pop)

    LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    insert_features_dict["LSA_distance"].append(LSA_dist)
    
    title_weighted = t_titles.getrow(index_source).dot(t_titles.getrow(index_target).transpose())[0,0]
    insert_features_dict["title_sim"].append(title_weighted)

    N_LSA_dist = pw.LSA_distance(source,target,node_dict,l_ngrams)
    insert_features_dict["N_LSA_distance"].append(N_LSA_dist)

    temporal_fit = pw.temp_fit(source,target,train_graph,node_dict,publication_years)
    insert_features_dict["temporal_fit"].append(temporal_fit)
    
    path_length = pw.path_length(source, target, all_path_dict)
    insert_features_dict["path_length"].append(path_length)
    
    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = prep.to_feature_shape(value)

0 / 6
0 / 2
0 / 2
1 / 2
0 / 1
0 / 2696
1000 / 2696
2000 / 2696
overlap_title 2696
comm_auth 2696
temp_diff 2696
citation_check 2696
max_sim 2696
peer_popularity 2696
succ_pred 2696
LSA_distance 2696
title_sim 2696
temporal_fit 2696
N_LSA_distance 2696
path_length 2696
node_degree 2696


In [None]:
importlib.reload(preprocessing)
importlib.reload(mf)

In [46]:
###########################################
## Construct features on COMPETITION_SET ##
###########################################

features_to_create = ['overlap_title',
                      'comm_auth',
                      'temp_diff',
                      'citation_check',
                      'max_sim',
                      'node_degree',
                      'peer_popularity',
                      'succ_pred',
                      'node_degree',
                      'LSA_distance',
                      'title_sim',
                      'temporal_fit',
                      'N_LSA_distance',
                      'path_length']

# Where to insert created features
insert_features_dict = competition_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = competition_set

# Compute some features in parallelized chunks.
p = mf.params(train_graph, kdtree, l, node_dict, index_dict, chunk_size=1000, pairs_subset_edges=False)
grouped_set = [competition_set[2000*i:2000*(i+1)] for i in range(math.ceil(len(competition_set)/2000))]
print(len(grouped_set))
print(mp.cpu_count())
pool = mp.Pool(mp.cpu_count())
path_dict_list = pool.map(p.all_paths_noparams, grouped_set, chunksize = 1)
chunked_output = zip(*pool.map(p.by_chunk_noparams, grouped_set, chunksize = 1))
pool.close()
# Recombine chunked output
all_path_dict = dict()
for i in IDs:
    all_path_dict[i] = dict()
    
for d in path_dict_list:
    for source_id in d:
        for target_id in d[source_id]:
            all_path_dict[source_id][target_id] = d[source_id][target_id]

list_chunked_output = list(chunked_output)
for feature_list in list_chunked_output[0]:
        insert_features_dict['succ_pred'].extend(feature_list)
for feature_list in list_chunked_output[1]:
        insert_features_dict['max_sim'].extend(feature_list)
for feature_list in list_chunked_output[2]:
        insert_features_dict['citation_check'].extend(feature_list)
for feature_list in list_chunked_output[3]:
        insert_features_dict['node_degree'].extend(feature_list)

# Compute features per node pair
for i,triple in enumerate(set_to_use):
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    # Creating features
    # Baseline #
    overlap_title = len(set(source_title).intersection(set(target_title)))
    insert_features_dict["overlap_title"].append(overlap_title)
    temp_diff = int(source_info[1]) - int(target_info[1])
    insert_features_dict["temp_diff"].append(temp_diff)
    comm_auth = len(set(source_auth).intersection(set(target_auth)))
    insert_features_dict["comm_auth"].append(comm_auth)

    peer_pop = pw.peer_popularity(train_graph,source,target)
    insert_features_dict["peer_popularity"].append(peer_pop)

    LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    insert_features_dict["LSA_distance"].append(LSA_dist)
    
    title_weighted = t_titles.getrow(index_source).dot(t_titles.getrow(index_target).transpose())[0,0]
    insert_features_dict["title_sim"].append(title_weighted)
    
    N_LSA_dist = pw.LSA_distance(source,target,node_dict,l_ngrams)
    insert_features_dict["N_LSA_distance"].append(N_LSA_dist)

    temporal_fit = pw.temp_fit(source,target,train_graph,node_dict,publication_years)
    insert_features_dict["temporal_fit"].append(temporal_fit)
    
    path_length = pw.path_length(source, target, all_path_dict)
    insert_features_dict["path_length"].append(path_length)

    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = prep.to_feature_shape(value)

1
4
0 / 5
0 / 2
1 / 2
0 / 1535
1000 / 1535
overlap_title 1535
comm_auth 1535
temp_diff 1535
citation_check 1535
max_sim 1535
node_degreepeer_popularity 0
succ_pred 1535
node_degree 1535
LSA_distance 1535
title_sim 1535
temporal_fit 1535
N_LSA_distance 1535
path_length 1535
peer_popularity 1535


In [50]:
##################################
## Combine & normalize features ##
##################################

train_features = np.concatenate([train_features_dict['overlap_title'],
                                 train_features_dict['comm_auth'],
                                 train_features_dict['temp_diff'],
                                 train_features_dict['citation_check'],
                                 train_features_dict['max_sim'],
                                 train_features_dict['peer_popularity'],
                                 train_features_dict['succ_pred'],
                                 train_features_dict['LSA_distance'],
                                 train_features_dict['title_sim'],
                                 train_features_dict['temporal_fit'],
                                 train_features_dict['N_LSA_distance'],
                                 train_features_dict['path_length'],
                                 train_features_dict['node_degree']]                                            
                                 ,axis = 1)

competition_features = np.concatenate( [competition_features_dict['overlap_title'],
                                        competition_features_dict['comm_auth'],
                                        competition_features_dict['temp_diff'],
                                        competition_features_dict['citation_check'],
                                        competition_features_dict['max_sim'],
                                        competition_features_dict['peer_popularity'],
                                        competition_features_dict['succ_pred'],
                                        competition_features_dict['LSA_distance'],
                                        competition_features_dict['title_sim'],
                                        competition_features_dict['temporal_fit'],
                                        competition_features_dict['N_LSA_distance'],
                                        competition_features_dict['path_length'],
                                        competition_features_dict['node_degree']]
                                        ,axis = 1)

# normalization
from sklearn.preprocessing import StandardScaler as SS
scaler = SS()
normalized_train_features = scaler.fit_transform(train_features)
normalized_competition_features = scaler.fit_transform(competition_features)

In [60]:
################################
## Train classifier & predict ##
################################

import lightgbm as light
import time

t0 = time.time()
# Select a subset of features to use
selection = [2,  3,  5,  6,  7, 10, 11, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
             30]

lgb = light.LGBMClassifier()
# Training
lgb.fit(train_features[:,selection], train_true_labels)
# Predictions
preds_lgb = lgb.predict(competition_features[:,selection])

  if diff:


In [None]:
CONTINUE FROM HERE

In [None]:
###############################
## Extend the training graph ##
###############################

# We now incorporate the predictions just made in the training set, recompute features and

predicted_edges = [tuple(competition_set[i]) for i in range(len(preds_lgb)) if int(preds_lgb[i]) == 1]
train_graph.add_edges(predicted_edges)

In [None]:
################################
## REcompute all features (2) ##
################################

# Based on the extended train_graph we recompute all features


########################################
## REcompute features on TRAINING_SET ##
########################################

features_to_create = ['overlap_title',
                      'comm_auth',
                      'temp_diff',
                      'citation_check',
                      'max_sim',
                      'peer_popularity',
                      'edge_check',
                      'succ_pred',
                      'LSA_distance',
                      'title_sim',
                      'temporal_fit',
                      'N_LSA_distance',
                      'path_length']

# We insert features in a dictionary
insert_features_dict = train_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = training_set

# Compute some features in parallelized chunks.
p = mf.params(train_graph, kdtree, l, node_dict, index_dict, chunk_size = 1000)
grouped_set = [training_set[2000*i:2000*(i+1)].tolist() for i in range(math.ceil(len(training_set)/2000))]
pool = mp.Pool(mp.cpu_count())
path_dict_list = pool.map(p.all_paths_noparams, grouped_set, chunksize = 10)
chunked_output = zip(*pool.map(p.by_chunk_noparams, grouped_set, chunksize = 10))
pool.close()
# Recombine chunked output
all_path_dict = dict()
for i in IDs:
    all_path_dict[i] = dict()
    
for d in path_dict_list:
    for source_id in d:
        for target_id in d[source_id]:
            all_path_dict[source_id][target_id] = d[source_id][target_id]

for feature_list in list_chunked_output[0]:
        insert_features_dict['succ_pred'].extend(feature_list)
for feature_list in list_chunked_output[1]:
        insert_features_dict['Max_Sim'].extend(feature_list)
for feature_list in list_chunked_output[2]:
        insert_features_dict['Citation_Check'].extend(feature_list)
for feature_list in list_chunked_output[3]:
        insert_features_dict['node_degree'].extend(feature_list)


# Compute features per node pair
for i,triple in enumerate(set_to_use):
    
    # Read basic data about the current node pair
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    # Creating features
    overlap_title = len(set(source_title).intersection(set(target_title)))
    insert_features_dict["overlap_title"].append(overlap_title)
    temp_diff = int(source_info[1]) - int(target_info[1])
    insert_features_dict["temp_diff"].append(temp_diff)
    comm_auth = len(set(source_auth).intersection(set(target_auth)))
    insert_features_dict["comm_auth"].append(comm_auth)

    peer_pop = pw.peer_popularity(train_graph,source,target)
    insert_features_dict["peer_popularity"].append(peer_pop)

    LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    insert_features_dict["LSA_distance"].append(LSA_dist)
    
    title_weighted = t_titles.getrow(index_source)*t_titles.getrow(index_target).transpose()
    insert_features_dict["title_sim"].append(title_weighted)

    N_LSA_dist = pw.LSA_distance(source,target,node_dict,l_ngrams)
    insert_features_dict["N_LSA_distance"].append(N_LSA_dist)

    temporal_fit = pw.temp_fit(source,target,train_graph,node_dict,publication_years)
    insert_features_dict["temporal_fit"].append(temporal_fit)
    
    path_length = pw.path_length(source, target, all_path_dict)
    insert_features_dict["path_length"].append(path_length)
    
    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = to_feature_shape(value)
    


###########################################
## REcompute features on COMPETITION_SET ##
###########################################
features_to_create = ['overlap_title',
                      'comm_auth',
                      'temp_diff',
                      'citation_check',
                      'max_sim',
                      'peer_popularity',
                      'edge_check',
                      'succ_pred',
                      'node_degree',
                      'LSA_distance',
                      'title_sim',
                      'temporal_fit',
                      'N_LSA_distance',
                      'path_length']

# Where to insert created features
insert_features_dict = competition_features_dict
for feat in features_to_create:
    insert_features_dict[feat] = []
set_to_use = competition_set

# Compute some features in parallelized chunks.
p = mf.params(train_graph, kdtree, l, node_dict, index_dict, chunk_size=1000, pairs_subset_edges=False)
grouped_set = [competition_set[2000*i:2000*(i+1)].tolist() for i in range(math.ceil(len(competition_set)/2000))]
pool = mp.Pool(mp.cpu_count())
path_dict_list = pool.map(p.all_paths_noparams, grouped_set, chunksize = 1)
chunked_output = zip(*pool.map(p.by_chunk_noparams, grouped_set, chunksize = 1))
pool.close()
# Recombine chunked output
all_path_dict = dict()
for i in IDs:
    all_path_dict[i] = dict()
    
for d in path_dict_list:
    for source_id in d:
        for target_id in d[source_id]:
            all_path_dict[source_id][target_id] = d[source_id][target_id]

for feature_list in list_chunked_output[0]:
        insert_features_dict['succ_pred'].extend(feature_list)
for feature_list in list_chunked_output[1]:
        insert_features_dict['Max_Sim'].extend(feature_list)
for feature_list in list_chunked_output[2]:
        insert_features_dict['Citation_Check'].extend(feature_list)
for feature_list in list_chunked_output[3]:
        insert_features_dict['node_degree'].extend(feature_list)

# Compute features per node pair
for i,triple in enumerate(set_to_use):
    source = triple[0]
    target = triple[1]
    index_source = node_dict[source]
    index_target = node_dict[target]
    
    source_info = node_info[index_source]
    target_info = node_info[index_target]

    # convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
    # remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]

    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]

    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",") 
    
    # Creating features
    # Baseline #
    overlap_title = len(set(source_title).intersection(set(target_title)))
    insert_features_dict["overlap_title"].append(overlap_title)
    temp_diff = int(source_info[1]) - int(target_info[1])
    insert_features_dict["temp_diff"].append(temp_diff)
    comm_auth = len(set(source_auth).intersection(set(target_auth)))
    insert_features_dict["comm_auth"].append(comm_auth)

    peer_pop = pw.peer_popularity(train_graph,source,target)
    insert_features_dict["peer_popularity"].append(peer_pop)

    edge_check = pw.edge_check(source,target,train_graph)
    insert_features_dict["edge_check"].append(edge_check)

    LSA_dist = pw.LSA_distance(source,target,node_dict,l)
    insert_features_dict["LSA_distance"].append(LSA_dist)
    
    title_weighted = title_sim[index_source,index_target]
    insert_features_dict["title_sim"].append(title_weighted)
    
    N_LSA_dist = pw.LSA_distance(source,target,node_dict,l_ngrams)
    insert_features_dict["N_LSA_distance"].append(N_LSA_dist)

    temporal_fit = pw.temp_fit(source,target,train_graph,node_dict,publication_years)
    insert_features_dict["temporal_fit"].append(temporal_fit)
    
    path_length = pw.path_length(source, target, all_path_dict)
    insert_features_dict["path_length"].append(path_length)

    if i%1000==0:
        print(i,"/",len(set_to_use))

# Reshape features into np column arrays, one row per node pair
for (name,value) in insert_features_dict.items():
    print(name,len(value))
    insert_features_dict[name] = to_feature_shape(value)

In [None]:
##########################################
## Combine & normalize features (again) ##
##########################################

train_features_dict.keys()
test_features_dict.keys()
for key,feat in train_features_dict.items():
    print(key,feat.shape)
train_features = np.concatenate([train_features_dict['overlap_title'],
                                 train_features_dict['comm_auth'],
                                 train_features_dict['temp_diff'],
                                 train_features_dict['citation_check'],
                                 train_features_dict['max_sim'],
                                 train_features_dict['peer_popularity'],
                                 train_features_dict['edge_check'],
                                 train_features_dict['succ_pred'],
                                 train_features_dict['LSA_distance'],
                                 train_features_dict['title_sim'],
                                 train_features_dict['temporal_fit'],
                                 train_features_dict['N_LSA_distance'],
                                 train_features_dict['path_length'],
                                 train_features_dict['node_degree']]                                            
                                 ,axis = 1)

competition_features = np.concatenate( [competition_features_dict['overlap_title'],
                                        competition_features_dict['comm_auth'],
                                        competition_features_dict['temp_diff'],
                                        competition_features_dict['citation_check'],
                                        competition_features_dict['max_sim'],
                                        competition_features_dict['peer_popularity'],
                                        competition_features_dict['edge_check'],
                                        competition_features_dict['succ_pred'],
                                        competition_features_dict['LSA_distance'],
                                        competition_features_dict['title_sim'],
                                        competition_features_dict['temporal_fit'],
                                        competition_features_dict['N_LSA_distance'],
                                        competition_features_dict['path_length'],
                                        competition_features_dict['node_degree']]
                                        ,axis = 1)

# normalization (again)
from sklearn.preprocessing import StandardScaler as SS
scaler = SS()
normalized_train_features = scaler.fit_transform(training_features)
normalized_competition_features = scaler.fit_transform(competition_features)

In [None]:
########################################
## Train classifier & predict (again) ##
########################################

import lightgbm as light
import time

t0 = time.time()
# Select a subset of features to use
selection = [2,  3,  5,  6,  7, 10, 11, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
             30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]

lgb = light.LGBMClassifier()
# Training
lgb.fit(training_features[:,selection], train_true_labels)
# Predictions
preds_lgb = lgb.predict(competition_features[:,selection])

# Statistics
t1 = time.time()-t0
print(t1)
acc = metrics.accuracy_score(list(map(int,train_true_labels)), list(map(int,preds_lgb)))
f1 = metrics.f1_score(list(map(int,train_true_labels)), list(map(int,preds_lgb)))
print('acc:',acc,'f1:',f1)

In [85]:
#######################
## Other classifiers ##
#######################

## SVM ##
from sklearn import svm
import copy

classifier = svm.LinearSVC(loss='hinge')
selection = list(range(train_features_reduced.shape[1]))
svm_s = copy.copy(selection)

# classifier.fit(train_features_reduced[:,selection], train_true_labels_reduced)
# preds_svm = list(classifier.predict(test_features[:,selection]))
classifier.fit(normalized_train_features_reduced[:,selection], train_true_labels_reduced)
svm_c = classifier
preds_svm = list(classifier.predict(normalized_test_features[:,selection]))

acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_svm)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_svm)))
print('acc:',acc,'f1:',f1)



acc: 0.9707108849121655 f1: 0.9728685707333395


In [95]:
from sklearn.linear_model import LogisticRegression as lr 
selection = [ 1,  7,  9, 10, 11, 15, 16, 19, 21, 22, 23]
selection.extend([6,25,26,27])
selection.append(31)
lr_s = copy.copy(selection)
# selection = [i for i in range(16)]
# selection.extend([41,103,115])

model = lr(penalty='l1').fit(train_features_reduced[:,selection], train_true_labels_reduced[:])
lr_c = model
preds_lg = list(model.predict(test_features[:,selection]))

# model = lr(penalty='l1').fit(train_features_reduced, train_true_labels)
# preds_lg = list(model.predict(test_features))

acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_lg)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_lg)))
print('acc:',acc,'f1:',f1)



acc: 0.9649333333333333 f1: 0.9672028931288191


In [15]:
from sklearn.neighbors import KNeighborsClassifier
selection = [1, 2, 6, 13, 16, 20, 21, 22]
knn_s = copy.copy(selection)

nNhbr = KNeighborsClassifier(n_neighbors=9,weights='distance')
nNhbr.fit(train_features_reduced[:,selection],train_true_labels_reduced) # do Ytrain.ravel() for length one Y values
knn_c = nNhbr
preds_knn = nNhbr.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_knn)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_knn)))
print('acc:',acc,'f1:',f1)

acc: 0.9581021178788376 f1: 0.9613848202396804


In [19]:
from sklearn.tree import DecisionTreeClassifier
# selection = [0, 1, 2, 8, 13, 16, 20, 22]
selection = [ 0,  2,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 19, 20, 21, 22,23, 25, 26, 27]
dt_s = copy.copy(selection)

dTree = DecisionTreeClassifier()
dTree.fit(train_features_reduced[:,selection],train_true_labels_reduced) # do Ytrain.ravel() for length one Y values
dt_c = dTree

preds_dt = dTree.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_dt)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_dt)))
print('acc:',acc,'f1:',f1)

acc: 0.9557872270563126 f1: 0.9595274951532184


In [57]:
test = np.concatenate((probs_lr.reshape(-1,1),probs_dt.reshape(-1,1)),axis=1)
np.concatenate((test,probs_dt.reshape(-1,1)),axis=1)

array([[2.50986568e-06, 0.00000000e+00, 0.00000000e+00],
       [9.99997490e-01, 1.00000000e+00, 1.00000000e+00],
       [9.43410527e-01, 1.00000000e+00, 1.00000000e+00],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [9.24313612e-01, 1.00000000e+00, 1.00000000e+00],
       [7.56863879e-02, 0.00000000e+00, 0.00000000e+00]])

In [16]:
probs_svm = svm_c.decision_function(normalized_test_features[:,svm_s])
probs_lr = lr_c.predict_proba(test_features[:,lr_s])[:,0]
probs_knn = knn_c.predict_proba(test_features[:,knn_s])[:,0]
all_probs = [probs_svm,probs_lr,probs_knn]

probs_features = all_probs[0].reshape(-1,1)
for i in range(1,len(all_probs)):
    probs_features = np.concatenate((probs_features,all_probs[i].reshape(-1,1)),axis=1)

In [17]:
joint_model = lr(penalty='l1').fit(probs_features[:30000,:], test_true_labels[:30000])
preds_lg = list(joint_model.predict(probs_features[30000:,:]))

acc = metrics.accuracy_score(list(map(int,test_true_labels[30000:])), list(map(int,preds_lg)))
f1 = metrics.f1_score(list(map(int,test_true_labels[30000:])), list(map(int,preds_lg)))
print('acc:',acc,'f1:',f1)

acc: 0.9712067292138467 f1: 0.9731976148888756




In [27]:
# Joined forces
joined_DTree = DecisionTreeClassifier()

preds_test_svm = np.reshape(preds_svm,(len(preds_svm),1))
preds_test_lg = np.reshape(preds_lg,(len(preds_lg),1))
preds_test_knn = np.reshape(preds_knn,(len(preds_knn),1))
preds_test_dt = np.reshape(preds_dt,(len(preds_dt),1))
combined_preds = np.concatenate([preds_test_svm,preds_test_lg,preds_test_knn,preds_test_dt],axis=1)

joined_DTree.fit(combined_preds[0:50000,:], test_true_labels[0:50000])
preds_joined = joined_DTree.predict(combined_preds[50000:,:])

acc = metrics.accuracy_score(list(map(int,test_true_labels[50000:])), list(map(int,preds_joined)))
f1 = metrics.f1_score(list(map(int,test_true_labels[50000:])), list(map(int,preds_joined)))
print("acc:",acc,"f1:",f1)

acc: 0.970852428964253 f1: 0.9730234136409909


In [None]:
# Which features are important?
# Histogram of the feature frequency for all selections that reached > 90% acc
frequency = [0]*total_num_features

num_good_preds = 0
min_acc = 0.93
for i,acc in enumerate(accs):
    if acc > min_acc:
        num_good_preds += 1
        for f in feature_selections[i]:
            frequency[f] += 1
#frequency = [freq/num_good_preds for freq in frequency]
print("")
print("number of classifiers: ",len(accs))
print("number of accs >",min_acc,": ",sum([1 for acc in accs if acc > min_acc]))
plt.figure()
plt.bar(x=range(len(frequency)),height=frequency)
plt.show()

In [116]:
# Adaboost DecisionTrees
from sklearn.ensemble import AdaBoostClassifier
selection =  [ 0,  1,  2,  4,  5,  6,  7, 10, 15, 16, 18, 19, 20, 21, 22, 23, 25, 26]

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4, min_samples_leaf = 1, min_samples_split = 2),
                         n_estimators=750,learning_rate=0.01)
ada.fit(train_features[:100000,selection],train_true_labels[:100000])
preds_ada = ada.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_ada)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_ada)))
print("acc:",acc,"f1:",f1)
#0.9761527670935336

acc: 0.9740436709899852 f1: 0.9761527670935336


In [61]:
# ExtraTreesClassifier
#fiddle with n_estimators and min_samples_leaf

from sklearn.ensemble import ExtraTreesClassifier
# selection = [1, 2, 6, 13, 16, 20, 22, 23]
selection = [1, 2,  5,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]

#add one to min_sample_leaf for full train_features (or change to 4)
extraTrees = ExtraTreesClassifier(n_estimators=750,max_depth=90,min_samples_split=10,min_samples_leaf=0.00001)
extraTrees.fit(train_features_reduced[:,selection],train_true_labels_reduced)
preds_extra = extraTrees.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels)), list(map(int,preds_extra)))
f1 = metrics.f1_score(list(map(int,test_true_labels)), list(map(int,preds_extra)))
print("acc:",acc,"f1:",f1)

acc: 0.9645706780495813 f1: 0.9675674050918271


In [77]:
from sklearn.ensemble import RandomForestClassifier

#f1: 0.96989, takes ~35 mins
selection = [2,  3,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]

randForest = RandomForestClassifier(n_estimators = 750, min_samples_split = 5, min_samples_leaf = 4, max_depth = 60)

randForest.fit(train_features[:,selection],train_true_labels)
preds_randForest = randForest.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels[:])), list(map(int,preds_randForest)))
f1 = metrics.f1_score(list(map(int,test_true_labels[:])), list(map(int,preds_randForest)))
print("acc:",acc,"f1:",f1)

acc: 0.9724019044491873 f1: 0.9746398129290187


In [39]:
selection = [2,  3,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
preds_randForest = randForest.predict(competition_features[:,selection])

In [40]:
preds_rf = list(preds_randForest)

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
preds_rf = list(zip(range(len(competition_set)), preds_rf))

with open("rf_predictions0.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in preds_rf:
        csv_out.writerow(row)
        
# with open('random_forest_model', 'wb') as file:
#         pickle.dump(randForest,file)

In [122]:
from xgboost.sklearn import XGBClassifier

selection = [2,22, 23, 25, 26, 7, 15, 21, 10, 5, 6,  16, 19, 27, 28, 29, 30, 31]
# selection = [i for i in range(16)]
# selection.extend([ 95, 102])

xgb = XGBClassifier(objective= 'binary:logistic', subsample = 0.8, colsample_bytree=0.8, learning_rate=0.01, 
                     max_depth=5, min_child_weight = 4, gamma=0, reg_lambda=2)

xgb.fit(train_features[:100000,selection],train_true_labels[:100000])
preds_xgb = xgb.predict(test_features[:,selection])
acc = metrics.accuracy_score(list(map(int,test_true_labels[:])), list(map(int,preds_xgb)))
f1 = metrics.f1_score(list(map(int,test_true_labels[:])), list(map(int,preds_xgb)))
print("acc:",acc,"f1:",f1)

acc: 0.9716795271712363 f1: 0.9739366926040642


In [25]:
from xgboost.sklearn import XGBClassifier
xgb = XGBClassifier(objective= 'binary:logistic', subsample = 0.8, colsample_bytree=0.8, learning_rate=0.01, 
                     max_depth=5, min_child_weight = 4, gamma=0, reg_lambda=2)

In [134]:
selection = [0,2,7,8,9,13,16,19,20,21,22,23,24,25,26,27]
selection = [18,19,20,21]
xgb = XGBClassifier(objective= 'binary:logistic', subsample = 0.8, colsample_bytree=0.8, learning_rate=0.01, 
                     max_depth=5, min_child_weight = 4, gamma=0, reg_lambda=2)

xgb.fit(train_features[:25000,selection],train_true_labels[:25000])
preds_xgb = xgb.predict(train_features[25000:30000,selection])
acc = metrics.accuracy_score(list(map(int,train_true_labels[25000:30000])), list(map(int,preds_xgb)))
f1 = metrics.f1_score(list(map(int,train_true_labels[25000:30000])), list(map(int,preds_xgb)))
print("acc:",acc,"f1:",f1)

acc: 0.9834 f1: 0.9846438482886217


In [28]:
preds_xgb = list(preds_xgb)

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
preds_xgb = list(zip(range(len(competition_set)), preds_xgb))

with open("xgb_predictions.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in preds_xgb:
        csv_out.writerow(row)

In [66]:
from sklearn.ensemble import VotingClassifier

#### NEEDS TO BE FIXED
preds_test_svm = np.reshape(preds_svm,(len(preds_svm),1))
preds_test_lg = np.reshape(preds_lg,(len(preds_lg),1))
preds_test_knn = np.reshape(preds_knn,(len(preds_knn),1))
preds_test_et = np.reshape(preds_extra,(len(preds_extra),1))
preds_test_xgb = np.reshape(preds_xgb,(len(preds_xgb),1))

combined_preds = np.concatenate([preds_test_svm,preds_test_lg,preds_test_knn,preds_test_et,preds_test_xgb],axis=1)

VotingClassifier.fit(combined_preds[0:50000,:], test_true_labels[0:50000])
preds_joined = VotingClassifier.predict(combined_preds[50000:,:])

acc = metrics.accuracy_score(list(map(int,test_true_labels[50000:])), list(map(int,preds_joined)))
f1 = metrics.f1_score(list(map(int,test_true_labels[50000:])), list(map(int,preds_joined)))
print("acc:",acc,"f1:",f1)

TypeError: fit() missing 1 required positional argument: 'y'

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

#{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
# tuned_parameters = [{'penalty': ['l1'], 'C': [1, 10], 'loss': ['squared_hinge'],'dual': [False],
#                      'max_iter': [1000,5000]}]
tuned_parameters = [{'n_estimators': [500],'criterion': ['gini'],'max_depth': [None,50,100],
                    'min_samples_split': [2,10], 'min_samples_leaf': [4,0.00001], 'bootstrap': [True, False]}]
selection = [1, 2, 6, 13, 16, 20, 22, 23]

scores = ['f1_macro'] #'accuracy_score'

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(ExtraTreesClassifier(), tuned_parameters, cv=5,
                       scoring='%s' % score)
    clf.fit(train_features_reduced[:,selection], train_true_labels_reduced)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = test_true_labels, clf.predict(test_features[:,selection])
    print(classification_report(y_true, y_pred))
    print()

In [None]:
base_estimator = ...
splitter = ['best', 'random']
max_depth =  [int(x) for x in np.linspace(1, 110, num = 15)].append([None])
min_weight_fraction_leaf = [0, 0.00001, 0.000001]
max_features = [None, 'sqrt', 'log2']
max_leaf_nodes = [None, 10, 100, 1000]

n_estimators = [10, 20, 50, 100, 200]
learning_rate = [0.001, 0.01, 0.1, 1, 10]

In [47]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

adaBoost = AdaBoostClassifier()
selection = [ 0,  1,  2,  4,  5,  6,  7, 10, 15, 16, 18, 19, 20, 21, 22, 23, 25, 26]

#n_estimators: 100
#learning_rate: 0.01

#max_depth = 5
#min_samples_leaf = 1
#min_samples_split = 2
#min_weight_Fraction_leaf = 0


base_estimator = [DecisionTreeClassifier(splitter = 'best', max_depth = 1, min_weight_fraction_leaf = 0.00001, max_features = 'sqrt', max_leaf_nodes = 10), 
                 DecisionTreeClassifier(splitter = 'best', max_depth = 4, min_weight_fraction_leaf = 0.00001, max_features = 'sqrt', max_leaf_nodes = 50),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 10, min_weight_fraction_leaf = 0.00001, max_features = None, max_leaf_nodes = 10),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 4, min_weight_fraction_leaf = 0.000001, max_features = 'sqrt', max_leaf_nodes = 100),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 7, min_weight_fraction_leaf = 0, max_features = 'sqrt', max_leaf_nodes = 10),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 2, min_weight_fraction_leaf = 0.00001, max_features = 'sqrt', max_leaf_nodes = 25),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 7, min_weight_fraction_leaf = 0.00001, max_features = None, max_leaf_nodes = None),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 7, min_weight_fraction_leaf = 0, max_features = 'sqrt', max_leaf_nodes = None),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 10, min_weight_fraction_leaf = 0, max_features = None, max_leaf_nodes = None),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 4, min_weight_fraction_leaf = 0, max_features = None, max_leaf_nodes = None),
                  DecisionTreeClassifier(splitter = 'best', max_depth = 4, min_weight_fraction_leaf = 0, max_features = None, max_leaf_nodes = None)
                 ]

n_estimators = [ 50, 100, 200, 500, 750]
learning_rate = [0.005, 0.01, 0.1, 0.05]

# Create the random grid
random_grid = {'base_estimator': base_estimator, 
              'n_estimators': n_estimators,
              'learning_rate': learning_rate}

# Random search of parameters, using 3 fold cross validation, 
ada_random = RandomizedSearchCV(estimator = adaBoost, param_distributions = random_grid, n_iter = 65,
                              cv = 2, verbose = 10, random_state=42, n_jobs = -1)

ada_random.fit(train_features_reduced[:,selection], train_true_labels_reduced)

Fitting 2 folds for each of 65 candidates, totalling 130 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed: 13.6min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
          fit_params=None, iid='warn', n_iter=65, n_jobs=-1,
          param_distributions={'base_estimator': [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features='sqrt', max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...itter='best')], 'n_estimators': [50, 100, 200, 500, 750], 'learning_rate': [0.005, 0.01, 0.1, 0.05]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [53]:
from scipy.stats import mode
top = np.argsort(ada_random.cv_results_['rank_test_score'])
# top[:5]
# xgb_random.best_index_
# mode([rf_random.cv_results_['params'][i]['bootstrap'] for i in top[:15]])
[ada_random.cv_results_['params'][i]['learning_rate'] for i in top[:10]]
#n_estimators: 750
#learning_rate: 0.01

#max_depth = 4
#min_samples_leaf = 1
#min_samples_split = 2
#min_weight_Fraction_leaf = 1e-05


[0.01, 0.01, 0.01, 0.05, 0.05, 0.005, 0.05, 0.01, 0.05, 0.05]

In [None]:
len(range(27,))

In [112]:
from sklearn.linear_model import LogisticRegression as lr 

classifier = XGBClassifier(objective= 'binary:logistic', subsample = 0.8, colsample_bytree=0.8, learning_rate=0.01, 
                     max_depth=5, min_child_weight = 4, gamma=0, reg_lambda=2)

weights = WRANDSEARCH(classifier, train_sp[:60000], train_true_labels[:60000], [1,  2,  4,  6,  9, 11, 16, 17, 20, 21, 22, 23, 24, 25, 26, 27], 
                      num_features = 21, iterations = 50)
weights

0 / 50
1 / 50
2 / 50
3 / 50
4 / 50
5 / 50
6 / 50
7 / 50
8 / 50
9 / 50
10 / 50
11 / 50
12 / 50
13 / 50
14 / 50
15 / 50
16 / 50
17 / 50
18 / 50
19 / 50
20 / 50
21 / 50
22 / 50
23 / 50
24 / 50
25 / 50
26 / 50
27 / 50
28 / 50
29 / 50
30 / 50
31 / 50
32 / 50
33 / 50
34 / 50
35 / 50
36 / 50
37 / 50
38 / 50
39 / 50
40 / 50
41 / 50
42 / 50
43 / 50
44 / 50
45 / 50
46 / 50
47 / 50
48 / 50
49 / 50


array([1.        , 1.89805278, 1.79589894, 1.        , 1.78291101,
       1.        , 1.87606643, 1.        , 1.        , 1.83137194,
       1.        , 1.90559243, 1.        , 1.        , 1.        ,
       1.        , 1.92520885, 1.93640245, 1.        , 1.        ,
       1.93564076, 1.93544767, 1.8534672 , 1.9270696 , 1.9209067 ,
       1.91657448, 1.82074365, 1.80740913, 1.        , 1.00834313,
       1.0032723 , 1.        , 1.00834313, 1.        , 1.00504387,
       1.        , 1.03667017, 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.78414592, 1.        ,
       1.00625022, 1.        , 1.        , 1.        , 1.00049745,
       1.00040459, 1.        , 1.        , 1.        , 1.        ,
       1.00696388, 1.        , 1.9463327 , 1.        , 1.        ,
       1.        , 1.0042149 , 1.        , 1.        , 1.        ,
       1.00142482, 1.        , 1.00464836, 1.        , 1.     

In [23]:
np.where(selector.ranking_==1)

(array([ 0,  1,  2,  4,  5,  6,  7, 10, 15, 16, 18, 19, 20, 21, 22, 23, 25,
        26]),)

In [134]:
t0 = time.time()
(ms,cc) = pw.by_chunk(training_set[:10000],train_graph,kdtree,l,node_dict,index_dict)
t1 = time.time()-t0

0 / 10
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10


ValueError: too many values to unpack (expected 2)

In [36]:
competition_paths_dict = prep.all_paths(competition_set,train_graph,pairs_subset_edges=False)

0 / 94
20 / 94
40 / 94
60 / 94
80 / 94


In [144]:
file_path = './features_train/'+'max_sim'+'_reducedgraph'
try:
    this_feat = read_feature(file_path)
except:
    print(1)