In [1]:
import random
import pickle
import sklearn
import numpy as np

from feature_extractor import *
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

## Run this first time

In [2]:
matrix = {}
for line in tqdm(lines, ascii = True):
    temp = list(map(int, line.split()))
    if len(temp[1:]) > 0:
        matrix[temp[0]] = set(temp[1:])

with open("matrix.pickle","wb") as f:
    pickle.dump(matrix, f)

Flag 1

In [None]:
with open("raw.pickle","rb") as f:
    data = pickle.load(f)

In [3]:
with open("matrix.pickle","rb") as f:
    matrix = pickle.load(f)

In [17]:
def feature_generator(pairs):
    
    X = []
    for datum in tqdm(pairs):
        a, b = datum

        # node feature
        a_in = indegree(a, matrix)
        a_out = outdegree(a, matrix)
        b_in = indegree(b, matrix)
        b_out = indegree(b, matrix)

        # neighbouring feature
        neighbour = common_neighbour(a, b, matrix)
        jac = jaccard(neighbour, a, b, matrix)
        p_a = pref_attach(a, b, matrix)
        cos = cosine_sim(neighbour, p_a)
        adar = adamic_adar(a, b, matrix)

        # path feature
        #sim_r = sim_rank(a, b, matrix, 0)

        X.append([a_in,a_out,b_in,b_out,neighbour,jac,p_a,cos,adar])
        
    return X

## Flag 2

In [5]:
X_train = np.load("Xtrain")
X_test = np.load("Xtest")
y_train = np.load("ytrain")
y_test = np.load("ytest")

In [6]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


0.6296666666666667


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

gbdt = GradientBoostingClassifier(verbose=1)
gbdt.fit(X_train, y_train)
print(gbdt.score(X_test, y_test))

      Iter       Train Loss   Remaining Time 
         1           1.3596            2.27s
         2           1.3378            2.10s
         3           1.3198            2.23s
         4           1.3047            2.13s
         5           1.2917            2.05s
         6           1.2808            2.00s
         7           1.2713            1.96s
         8           1.2623            1.96s
         9           1.2546            1.93s
        10           1.2484            1.90s
        20           1.2097            1.41s
        30           1.1913            1.13s
        40           1.1801            0.98s
        50           1.1692            0.79s
        60           1.1610            0.63s
        70           1.1544            0.48s
        80           1.1499            0.31s
        90           1.1462            0.16s
       100           1.1428            0.00s
0.6653333333333333


In [13]:
base = GradientBoostingClassifier()
parameters = {"learning_rate":[0.05,0.1,0.15,0.2,0.3],
              "max_depth":[2,3,5,7],
              "min_samples_leaf":[1,3,5]}
model = GridSearchCV(base, parameters, n_jobs=-1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.6623333333333333


In [15]:
print(roc_auc_score(y_test, np.squeeze(model.predict_proba(X_test))[:,1]))

0.7404098187596537


In [14]:
model.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [11]:
model.best_estimator_.feature_importances_

array([0.16155703, 0.22354075, 0.1681103 , 0.14384539, 0.02819839,
       0.04747582, 0.11487595, 0.08677268, 0.0256237 ])

In [16]:
with open("test-public.txt","r") as f:
    test = f.readlines()

X_t = []
for l in test[1:]:
    temp = list(map(int, l.split()))
    X_t.append([temp[1], temp[2]])
print(len(X_t))

2000


In [18]:
X_t = np.array(feature_generator(X_t))

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:24<00:00, 80.91it/s]


In [19]:
y_pred = model.best_estimator_.predict(X_t)

In [21]:
y_pred_prob = model.best_estimator_.predict_proba(X_t)

In [22]:
with open("pred.csv","w") as f:
    f.write("Id,Prediction\n")
    for i in range(1,len(y_pred_prob)+1):
        f.write(str(i) + "," + str(y_pred_prob[i-1][1]) + "\n")

In [None]:
X_t[-1]

In [20]:
(y_pred == 0).sum()

1131

In [None]:
model.best_estimator_.classes_