In [None]:
import random
import pickle
import sklearn
import numpy as np

from feature_extractor import *
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

## Run this the first time only (builds and caches `matrix.pickle`)

In [None]:
# Build the adjacency mapping: the first integer on each line is the key and
# the remaining integers on that line are stored as a set under that key.
# Lines with no integers after the first one are skipped.
# NOTE(review): `lines` is not defined in any visible cell; it must already
# hold the raw text lines before this cell runs. Confirm where it is read.
matrix = {}
for line in tqdm(lines, ascii=True):
    ids = [int(token) for token in line.split()]
    targets = ids[1:]
    if targets:
        matrix[ids[0]] = set(targets)

# Cache the mapping so later sessions can reload it instead of rebuilding.
with open("matrix.pickle", "wb") as cache_file:
    pickle.dump(matrix, cache_file)

## Flag 1

In [None]:
# Reload the cached adjacency mapping written by the cell that dumps
# "matrix.pickle".
# NOTE(review): pickle.load can execute arbitrary code; only load a
# matrix.pickle that this notebook produced.
with open("matrix.pickle", "rb") as cache_file:
    matrix = pickle.load(cache_file)

In [None]:
def feature_generator(pairs):
    """Build one feature row for every candidate node pair.

    Parameters
    ----------
    pairs : iterable of 2-item sequences
        Each item unpacks to the two node ids ``(a, b)``.

    Returns
    -------
    list of list
        One row per pair, in input order, laid out as
        ``[a_in, a_out, b_in, b_out, neighbour, jac, p_a, cos, adar]``.

    Notes
    -----
    Reads the module-level ``matrix`` and relies on the helper functions
    star-imported from ``feature_extractor``.

    NOTE(review): ``b_out`` used to be computed with ``indegree`` and was
    therefore a duplicate of ``b_in``. Any cached feature arrays or fitted
    models that were built with the old column need to be regenerated so
    training and prediction features stay consistent.
    """
    X = []
    for a, b in tqdm(pairs):
        # node feature
        a_in = indegree(a, matrix)
        a_out = outdegree(a, matrix)
        b_in = indegree(b, matrix)
        # Fixed copy-paste bug: out-degree of b, not a second in-degree.
        b_out = outdegree(b, matrix)

        # neighbouring feature
        neighbour = common_neighbour(a, b, matrix)
        jac = jaccard(neighbour, a, b, matrix)
        p_a = pref_attach(a, b, matrix)
        cos = cosine_sim(neighbour, p_a)
        adar = adamic_adar(a, b, matrix)

        # path feature (currently disabled)
        #sim_r = sim_rank(a, b, matrix, 0)

        X.append([a_in, a_out, b_in, b_out, neighbour, jac, p_a, cos, adar])

    return X

## Flag 2

In [None]:
# Load the cached train/test feature matrices and label vectors.
# NOTE(review): no visible cell creates these four files; confirm how they
# were produced (and with which version of feature_generator).
X_train, X_test, y_train, y_test = (
    np.load(array_path) for array_path in ("Xtrain", "Xtest", "ytrain", "ytest")
)

In [None]:
# Baseline: random forest on the cached features; prints the mean accuracy
# on the held-out split.
# random_state is pinned so the fitted forest and the printed score are
# reproducible after a kernel restart (the forest is otherwise re-seeded
# on every run).
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1, random_state=42)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))

In [None]:
# NOTE(review): this import would normally live in the top import cell; it is
# kept here because later cells depend on this cell having been run.
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters; prints the mean accuracy
# on the held-out split. random_state is pinned for reproducible results.
gbdt = GradientBoostingClassifier(verbose=1, random_state=42)
gbdt.fit(X_train, y_train)
print(gbdt.score(X_test, y_test))

In [None]:
# Exhaustive grid search over gradient-boosting hyper-parameters
# (5 learning rates x 4 depths x 3 leaf sizes = 60 candidates).
# random_state is pinned on the base estimator so candidates are compared
# under the same seed and the search is reproducible.
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# classifier's default score (accuracy), while a later cell reports ROC AUC
# and the submission contains probabilities. Consider scoring="roc_auc" if
# AUC is the target metric.
base = GradientBoostingClassifier(random_state=42)
parameters = {"learning_rate": [0.05, 0.1, 0.15, 0.2, 0.3],
              "max_depth": [2, 3, 5, 7],
              "min_samples_leaf": [1, 3, 5]}
model = GridSearchCV(base, parameters, n_jobs=-1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

In [None]:
# ROC AUC on the held-out split, using the probability in column 1 of
# predict_proba. The previous np.squeeze wrapper was dropped: the output is
# already 2-D, and squeezing a single-row result would collapse it to 1-D and
# make the [:, 1] indexing fail.
print(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

In [None]:
# Display the estimator that the grid search selected and refitted.
model.best_estimator_

In [None]:
# Display the selected estimator's per-feature importances (one value per
# column of the feature rows it was fitted on).
model.best_estimator_.feature_importances_

In [None]:
# Read the public test file and collect the node pair from each data line.
with open("test-public.txt", "r") as f:
    test = f.readlines()

X_t = []
for row in test[1:]:  # the first line is skipped (treated as a header)
    temp = list(map(int, row.split()))
    if not temp:
        # Blank line (e.g. a trailing newline at the end of the file):
        # nothing to parse, and indexing temp[1] would raise IndexError.
        continue
    # Columns 1 and 2 are the two node ids; column 0 is not used here.
    X_t.append([temp[1], temp[2]])
print(len(X_t))

In [None]:
# Turn the test node pairs into a feature array.
# NOTE(review): X_t is overwritten in place (pairs in, feature rows out), so
# this cell is not re-runnable on its own: running it a second time would pass
# 9-value feature rows to feature_generator, which unpacks each item as (a, b).
X_t = np.array(feature_generator(X_t))

In [None]:
# Class-label predictions for the test pairs from the selected estimator.
y_pred = model.best_estimator_.predict(X_t)

In [None]:
# Per-class probabilities for the test pairs from the selected estimator
# (one row per pair, one column per class).
y_pred_prob = model.best_estimator_.predict_proba(X_t)

In [None]:
# Write the submission file: a header row, then one "<id>,<probability>" line
# per test pair. Ids are sequential starting at 1; the probability is taken
# from column 1 of each predict_proba row.
with open("pred.csv", "w") as out_file:
    out_file.write("Id,Prediction\n")
    for row_id, class_probs in enumerate(y_pred_prob, start=1):
        out_file.write(str(row_id) + "," + str(class_probs[1]) + "\n")

In [None]:
# Sanity check: display the last row of X_t.
X_t[-1]

In [None]:
# Sanity check: number of test pairs whose predicted label equals 0.
(y_pred == 0).sum()

In [None]:
# Display the class labels known to the selected estimator. The submission
# cell writes column 1 of predict_proba, so the label at index 1 here should
# be the positive class. TODO confirm.
model.best_estimator_.classes_