# Day1 - Traditional ML methods for KG completion

In this lab you will learn: 
- how to read a Knowledge Graph using NetworkX
- how to build a simple threshold model based on graph topological features
- how to train a classifier using the topological features of the graph

In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
from tqdm import tqdm

In [2]:
#Given a dataframe, generate a list with all the triples and another list with the "negative" relations
def generate_pos_neg_relations(df, number_neg_rel_triple=1):
    pairs=defaultdict(list)
    relations=defaultdict(list)
    for i in df.iterrows():
        s = i[1]['source']
        t = i[1]['tail']
        r = i[1][1]
      #  print(s,t,r)
        pairs[(s,t)].append(r)
        relations[r].append((s,t))
    
    sourceList=set()
    targetList=set()
    for i in list(pairs.keys()):
        if type(i)==int:
            break
        sourceList.add(i[0])
        targetList.add(i[1])
    
    negative_relations = defaultdict(list)
    for rel, p in relations.items():
        rel_targets = set()
        for i in p:
            rel_targets.add(i[1])
        candidate_targets = targetList.difference(rel_targets)
        
        for i,j in p:
            negative_targets = random.choices(list(candidate_targets),
                                          k=(min(len(candidate_targets),number_neg_rel_triple)))
            for t in negative_targets:
                negative_relations[rel].append((i,t))
    return relations,negative_relations

In [3]:
# Given a graph, the corresponding undirected graph and the corresponding undirected and 
# unweighted graph, it computes some topological node features
def reachable_from_rel(s,r):
    reachable=set()
    if s in Gtrain:
        for n,i in Gtrain[s].items():
            if r==i[0][1]:
                reachable.add(n)
    return reachable



def compute_features(G,GUnd,GUndSimple,i,j,rel=None):
    cn=aa=jc=ind_i = outd_i = ind_j = outd_j =0
    
    if ((i in GUnd) and (j in GUnd)):
        cn = len(list(nx.common_neighbors(GUnd, i,j)))
   # if ((i in GUndSimple) and (j in GUndSimple)):
   #     aa =[aa for u,v,aa in nx.adamic_adar_index(GUndSimple, [(i,j)])][0]
    if ((i in GUndSimple) and (j in GUndSimple)):
        jc = [jc for u,v,jc in nx.jaccard_coefficient(GUndSimple, [(i,j)])][0]
    if (i in G):
        ind_i = G.in_degree(i)
        outd_i = G.out_degree(i)
    if (j in G):
        ind_j = G.in_degree(j)
        outd_j = G.out_degree(j)
    '''
    if rel:
        rel_s=len(reachable_from_rel(i,rel))
        rel_t=len(reachable_from_rel(j,rel))
        common_rel_s_t=len(reachable_from_rel(i,rel).intersection(reachable_from_rel(j,rel)))
        union_rel_s_t=len(reachable_from_rel(i,rel).union(reachable_from_rel(j,rel)))
        jaccard_rel_s_t=0
        if union_rel_s_t>0:
            jaccard_rel_s_t=(union_rel_s_t-common_rel_s_t)/union_rel_s_t
    '''   
    #print(i,j,cn,aa,jc,ind_i,ind_j,outd_i,outd_j)
    return cn,jc,ind_i,ind_j,outd_i,outd_j


In [4]:
def computeF1(df_res):
    tn = df_res['tn'].sum()
    tp = df_res['tp'].sum()
    fn = df_res['fn'].sum()
    fp = df_res['fp'].sum() 
    precision = tp/(tp+fp)
    recall = tp / (tp+fn)
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1

In [5]:
#Given a triple, compute some node features
def compute_triples_features(G,GUnd,GUndSimple,rel,triple,label):
    curr_triples_feature={}
    curr_triples_feature['relation'] = rel
    curr_triples_feature['label'] = label
    curr_triples_feature['source'] = triple[0]
    curr_triples_feature['target'] = triple[1]
    for ix,feature in enumerate(compute_features(G,GUnd,GUndSimple,triple[0],triple[1],rel)):
        curr_triples_feature['f'+str(ix)] = feature
    return curr_triples_feature

In [6]:
#Compute featurees for each positive and negative relation
def generate_df_features(G,relations,negative_relations=None):
    triples=[]
    GUnd = G.to_undirected()
    GUndSimple = nx.Graph(GUnd)
    for rel in relations.keys():
        for posTriple in relations[rel]:
            triples.append(compute_triples_features(G,GUnd,GUndSimple,rel,posTriple,1))
        if negative_relations:
            for negTriple in negative_relations[rel]:
                triples.append(compute_triples_features(G,GUnd,GUndSimple,rel,negTriple,0))
    df_rel = pd.DataFrame(triples)
    return df_rel

# Read dataset

We use the Kinship KG, you can download it at https://github.com/ZhenfengLei/KGDatasets/tree/master/Kinship

In [7]:
#train = pd.read_csv('KGDatasets/FB15k-237/train.txt',sep='\t',header=None)
#validation =  pd.read_csv('KGDatasets/FB15k-237/valid.txt',sep='\t',header=None)
#test =  pd.read_csv('KGDatasets/FB15k-237/test.txt',sep='\t',header=None)
#entity2id_df =  pd.read_csv('KGDatasets/FB15k-237/entity2id.txt',sep='\t',header=None)
#entity2id =  dict(zip(entity2id_df[0], entity2id_df[1]))


In [8]:
train = pd.read_csv('KGDatasets/Kinship/train.txt',sep='\t',header=None)
validation =  pd.read_csv('KGDatasets/Kinship/valid.txt',sep='\t',header=None)
test =  pd.read_csv('KGDatasets/Kinship/test.txt',sep='\t',header=None)
entity2id_df =  pd.read_csv('KGDatasets/Kinship/entity2id.txt',sep='\t',header=None)
entity2id =  dict(zip(entity2id_df[0], entity2id_df[1]))


In [9]:
train.sample(5)

Unnamed: 0,0,1,2
7168,person88,term8,person21
822,person94,term22,person71
7736,person50,term5,person14
7372,person20,term0,person39
3282,person5,term15,person94


In [10]:
test.sample(5)

Unnamed: 0,0,1,2
666,person33,term12,person35
530,person33,term8,person16
799,person75,term9,person25
1006,person78,term22,person70
657,person50,term8,person41


In [11]:
validation.sample(5)

Unnamed: 0,0,1,2
1065,person57,term15,person68
857,person24,term9,person32
173,person47,term18,person58
339,person82,term16,person40
914,person47,term13,person93


In [12]:
train['source'] = train[0].apply(lambda x: entity2id[x])
train['tail'] = train[2].apply(lambda x: entity2id[x])
test['source'] = test[0].apply(lambda x: entity2id[x])
test['tail'] = test[2].apply(lambda x: entity2id[x])
validation['source'] = validation[0].apply(lambda x: entity2id[x])
validation['tail'] = validation[2].apply(lambda x: entity2id[x])


# Create the train MultiDigraph

We represent the KG as a Multi-Directed graph. We use NetworkX (full documentation at https://networkx.org/documentation/stable/ ) to convert the Pandas DataFrame in a graph.

In [13]:
Gtrain=nx.from_pandas_edgelist(train, source='source', target='tail', edge_attr=1, create_using=nx.MultiDiGraph)

In [14]:
print('Train-> Nodes:{} Edges:{}'.format(Gtrain.number_of_nodes(),Gtrain.number_of_edges()))

Train-> Nodes:104 Edges:8544


In NetworkX, some functions (e.g. Common Neighbors) are only defined on simple and undirected graphs. 
For this reason, we transform the train MultiDigraph in a (simple) undirected graph

In [15]:
GtrainUnd = Gtrain.to_undirected()
GtrainSimple = nx.DiGraph(Gtrain)
GtrainUndSimple = nx.Graph(GtrainUnd)

## Add negative relations to train, test and validation set

In order to properly train and thest our models, out training, validation and test set should include negative examples i.e. triples not in the graph. In the following, ee generate such triples.

Notice that in the dataset Kaggle competition both training and test set have already some negative examples (however, you might want to increase the negative riples in the training set)

In [16]:
number_neg_rel_triple = 1
print("Generate list of positive and negative relations in the training set..")
rel_train,negrel_train = generate_pos_neg_relations(train,number_neg_rel_triple)
print("Generate list of positive and negative relations in the validation set..")
rel_validation,negrel_validation = generate_pos_neg_relations(validation,number_neg_rel_triple)
print("Generate list of positive and negative relations in the test set..")
rel_test,negrel_test = generate_pos_neg_relations(test,number_neg_rel_triple)


Generate list of positive and negative relations in the training set..
Generate list of positive and negative relations in the validation set..
Generate list of positive and negative relations in the test set..


## Generate node features for the training set
In the following, we generate some topological features for training, validation and test sets.


### Training set

In [17]:
print("Generate edge fetures..")
df_train = generate_df_features(Gtrain,rel_train,negrel_train).sample(frac=1)
df_train.sample(5)

Generate edge fetures..


Unnamed: 0,relation,label,source,target,f0,f1,f2,f3,f4,f5
9537,term1,1,43,92,98,0.942308,85,87,81,84
11746,term13,0,64,18,98,0.942308,78,83,89,83
15066,term3,1,38,17,97,0.932692,84,72,88,83
6589,term17,1,58,101,92,0.884615,84,81,80,83
5174,term8,0,101,68,92,0.884615,81,82,83,76


### Validation set

In [18]:
print("Generate edge fetures..")
df_validation = generate_df_features(Gtrain,rel_validation,negrel_validation).sample(frac=1)
df_validation.sample(5)

Generate edge fetures..


Unnamed: 0,relation,label,source,target,f0,f1,f2,f3,f4,f5
1214,term3,1,84,47,94,0.921569,81,70,85,82
761,term11,1,75,58,92,0.884615,80,84,81,80
2014,term22,1,59,63,93,0.894231,77,91,78,79
1001,term8,0,63,40,98,0.942308,91,85,79,89
840,term11,0,84,103,95,0.931373,81,80,85,76


### Test set

In [19]:
print("Generate edge fetures..")
df_test = generate_df_features(Gtrain,rel_test,negrel_test).sample(frac=1) #shuffle
df_test.sample(5)

Generate edge fetures..


Unnamed: 0,relation,label,source,target,f0,f1,f2,f3,f4,f5
1467,term17,1,37,6,97,0.932692,87,79,80,82
383,term10,1,90,13,96,0.923077,87,78,87,86
509,term18,1,77,72,95,0.931373,83,85,84,85
1041,term15,0,8,59,90,0.865385,77,77,83,78
85,term11,1,10,15,91,0.90099,85,80,75,80


## 1) Combine node features for each relation



In [20]:

def get_threshold_score(r,df_train,df_validation,feature_name): 
    df_train_rel = df_train[(df_train.relation==r) & (df_train.label==1)]
    #
    df_train_rel['score']=df_train_rel['label']*df_train_rel[feature_name]
    train_score = df_train_rel['score'].mean(axis=0)
    
    
    y_validation = df_validation[df_validation.relation==r]['label']
    X_validation = df_validation[df_validation.relation==r][feature_name]
    #print(df_validation[df_validation.relation==r].apply(lambda x: 1 if (x['f0']>train_score) else 0))
    pred_class_validation = X_validation>train_score
   # print(pred_class_validation)
   # print(y_validation,pred_class_validation)
    tn, fp, fn, tp = metrics.confusion_matrix(y_validation,pred_class_validation).ravel()
    return {'train':df_train_rel.shape[0],'test':y_validation.shape[0],
            'tn':tn,'fp':fp,'fn':fn,'tp':tp,
            'precision':metrics.precision_score(y_validation,pred_class_validation),
            'recall':metrics.recall_score(y_validation,pred_class_validation),
            'accuracy':metrics.accuracy_score(y_validation,pred_class_validation),
            'F1':metrics.f1_score(y_validation,pred_class_validation)}
    
    


In [21]:
features = ['f0','f1','f2','f3','f4','f5']
threshold_score =[]
for feat in features:
    baseline_validation = []
    for r in rel_validation.keys():
        res = get_threshold_score(r,df_train,df_validation,feat)
        print("{}: train {}, test {}, tn {},fp {},fn {},tp {},precision {}, recall {},accuracy {}, f1-score {}"
              .format(r,res['train'],res['test'],res['tn'],res['fp'],res['fn'],res['tp']
                      ,res['precision'],res['recall'],res['accuracy'],res['F1']))
        baseline_validation.append(res)
    f1 = computeF1(pd.DataFrame(baseline_validation))
    threshold_score.append((feat,f1))
threshold_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


term12: train 236, test 66, tn 22,fp 11,fn 17,tp 16,precision 0.5925925925925926, recall 0.48484848484848486,accuracy 0.5757575757575758, f1-score 0.5333333333333333
term5: train 404, test 124, tn 34,fp 28,fn 35,tp 27,precision 0.4909090909090909, recall 0.43548387096774194,accuracy 0.49193548387096775, f1-score 0.4615384615384615
term15: train 757, test 190, tn 44,fp 51,fn 40,tp 55,precision 0.5188679245283019, recall 0.5789473684210527,accuracy 0.5210526315789473, f1-score 0.5472636815920399
term16: train 1004, test 242, tn 52,fp 69,fn 52,tp 69,precision 0.5, recall 0.5702479338842975,accuracy 0.5, f1-score 0.5328185328185328
term1: train 384, test 92, tn 19,fp 27,fn 19,tp 27,precision 0.5, recall 0.5869565217391305,accuracy 0.5, f1-score 0.54
term11: train 600, test 134, tn 34,fp 33,fn 32,tp 35,precision 0.5147058823529411, recall 0.5223880597014925,accuracy 0.5149253731343284, f1-score 0.5185185185185184
term8: train 637, test 156, tn 41,fp 37,fn 43,tp 35,precision 0.48611111111111

term4: train 393, test 110, tn 30,fp 25,fn 30,tp 25,precision 0.5, recall 0.45454545454545453,accuracy 0.5, f1-score 0.47619047619047616
term18: train 460, test 108, tn 28,fp 26,fn 28,tp 26,precision 0.5, recall 0.48148148148148145,accuracy 0.5, f1-score 0.49056603773584906
term6: train 370, test 82, tn 25,fp 16,fn 25,tp 16,precision 0.5, recall 0.3902439024390244,accuracy 0.5, f1-score 0.4383561643835617
term22: train 153, test 46, tn 8,fp 15,fn 8,tp 15,precision 0.5, recall 0.6521739130434783,accuracy 0.5, f1-score 0.5660377358490566
term21: train 106, test 40, tn 8,fp 12,fn 8,tp 12,precision 0.5, recall 0.6,accuracy 0.5, f1-score 0.5454545454545454
term20: train 209, test 56, tn 17,fp 11,fn 17,tp 11,precision 0.5, recall 0.39285714285714285,accuracy 0.5, f1-score 0.44
term12: train 236, test 66, tn 19,fp 14,fn 19,tp 14,precision 0.5, recall 0.42424242424242425,accuracy 0.5, f1-score 0.4590163934426229
term5: train 404, test 124, tn 30,fp 32,fn 34,tp 28,precision 0.4666666666666667, 

term13: train 367, test 64, tn 22,fp 10,fn 18,tp 14,precision 0.5833333333333334, recall 0.4375,accuracy 0.5625, f1-score 0.5
term17: train 320, test 70, tn 14,fp 21,fn 14,tp 21,precision 0.5, recall 0.6,accuracy 0.5, f1-score 0.5454545454545454
term3: train 299, test 90, tn 29,fp 16,fn 21,tp 24,precision 0.6, recall 0.5333333333333333,accuracy 0.5888888888888889, f1-score 0.5647058823529412
term7: train 663, test 142, tn 33,fp 38,fn 39,tp 32,precision 0.45714285714285713, recall 0.4507042253521127,accuracy 0.45774647887323944, f1-score 0.45390070921985815
term9: train 370, test 106, tn 25,fp 28,fn 19,tp 34,precision 0.5483870967741935, recall 0.6415094339622641,accuracy 0.5566037735849056, f1-score 0.591304347826087
term2: train 183, test 54, tn 13,fp 14,fn 17,tp 10,precision 0.4166666666666667, recall 0.37037037037037035,accuracy 0.42592592592592593, f1-score 0.39215686274509803
term10: train 392, test 98, tn 25,fp 24,fn 33,tp 16,precision 0.4, recall 0.32653061224489793,accuracy 0.4

[('f0', 0.5250463821892394),
 ('f1', 0.5650821856952465),
 ('f2', 0.5046382189239332),
 ('f3', 0.4424695977549112),
 ('f4', 0.44258872651356995),
 ('f5', 0.5070821529745042)]

In [22]:
best_feature,score = max(threshold_score,key=lambda x:x[1])
best_feature,score

('f1', 0.5650821856952465)

In [23]:
baseline_test = []
for r in rel_test.keys():
    res = get_threshold_score(r,df_train,df_test,best_feature)
    print("{}: train {}, test {}, precision {}, recall {},accuracy {}, f1-score {}"
          .format(r,res['train'],res['test'],res['precision'],res['recall'],res['accuracy'],res['F1']))
    baseline_test.append(res)


term21: train 106, test 32, precision 0.6875, recall 0.6875,accuracy 0.6875, f1-score 0.6875
term11: train 600, test 144, precision 0.5119047619047619, recall 0.5972222222222222,accuracy 0.5138888888888888, f1-score 0.5512820512820512
term7: train 663, test 166, precision 0.5529411764705883, recall 0.5662650602409639,accuracy 0.5542168674698795, f1-score 0.5595238095238094
term10: train 392, test 128, precision 0.5492957746478874, recall 0.609375,accuracy 0.5546875, f1-score 0.5777777777777777
term18: train 460, test 110, precision 0.53125, recall 0.6181818181818182,accuracy 0.5363636363636364, f1-score 0.5714285714285714


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


term6: train 370, test 84, precision 0.5098039215686274, recall 0.6190476190476191,accuracy 0.5119047619047619, f1-score 0.5591397849462365
term8: train 637, test 180, precision 0.47474747474747475, recall 0.5222222222222223,accuracy 0.4722222222222222, f1-score 0.4973544973544973
term12: train 236, test 60, precision 0.5, recall 0.6333333333333333,accuracy 0.5, f1-score 0.5588235294117647
term15: train 757, test 182, precision 0.4528301886792453, recall 0.5274725274725275,accuracy 0.44505494505494503, f1-score 0.4873096446700508
term5: train 404, test 84, precision 0.5348837209302325, recall 0.5476190476190477,accuracy 0.5357142857142857, f1-score 0.5411764705882354
term16: train 1004, test 262, precision 0.45806451612903226, recall 0.5419847328244275,accuracy 0.45038167938931295, f1-score 0.49650349650349646
term17: train 320, test 74, precision 0.6136363636363636, recall 0.7297297297297297,accuracy 0.6351351351351351, f1-score 0.6666666666666666
term1: train 384, test 118, precision

In [24]:
pd.DataFrame(baseline_test).mean()

train        371.130435
test          93.391304
tn            20.739130
fp            25.956522
fn            19.956522
tp            26.739130
precision      0.516904
recall         0.596834
accuracy       0.517639
F1             0.549827
dtype: float64

In [25]:
print('Test F1-score: {}'.format(computeF1(pd.DataFrame(baseline_test))))

Test F1-score: 0.5380577427821522


## 2) Train and test a classifier for each relation

### Train with grid search on validation set

In [26]:
from sklearn.model_selection import ParameterGrid
param_grid = {
    'max_depth': [20,40,60,80],
    'n_estimators': [50,100, 200]
}
list(ParameterGrid(param_grid))

[{'max_depth': 20, 'n_estimators': 50},
 {'max_depth': 20, 'n_estimators': 100},
 {'max_depth': 20, 'n_estimators': 200},
 {'max_depth': 40, 'n_estimators': 50},
 {'max_depth': 40, 'n_estimators': 100},
 {'max_depth': 40, 'n_estimators': 200},
 {'max_depth': 60, 'n_estimators': 50},
 {'max_depth': 60, 'n_estimators': 100},
 {'max_depth': 60, 'n_estimators': 200},
 {'max_depth': 80, 'n_estimators': 50},
 {'max_depth': 80, 'n_estimators': 100},
 {'max_depth': 80, 'n_estimators': 200}]

In [27]:
def train_model_parameter(r,df_train,df_validation,param):
    print(param)
    y_train = df_train[df_train.relation==r]['label']
    X_train = df_train[df_train.relation==r][['f0','f1','f2','f3','f4','f5']]
    y_validation = df_validation[df_validation.relation==r]['label']
    X_validation = df_validation[df_validation.relation==r][['f0','f1','f2','f3','f4','f5']]
    model = RandomForestClassifier(**param, random_state=42)
    model.fit(X_train, y_train)
    pred_validation = model.predict_proba(X_validation)
    pred_class_validation = model.predict(X_validation)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation, pred_validation[:,0], pos_label=1)
    tn, fp, fn, tp = metrics.confusion_matrix(y_validation,pred_class_validation).ravel()
    return {'train':y_train.shape[0],'test':y_validation.shape[0],
            'tn':tn,'fp':fp,'fn':fn,'tp':tp,
            'AUC':metrics.auc(fpr, tpr),'precision':metrics.precision_score(y_validation,pred_class_validation),
            'recall':metrics.recall_score(y_validation,pred_class_validation),
            'accuracy':metrics.accuracy_score(y_validation,pred_class_validation),
            'F1':metrics.f1_score(y_validation,pred_class_validation)}

We perform a grid search in orderto find the best paramters for the Random forest model.

It is also possibile to select a set of parameters for each model and to explore different classification models (e.g. gradient boosting).

In [28]:
score_param = []
for param in list(ParameterGrid(param_grid)):
    auc_validation = []
    for r in rel_validation.keys():
        res = train_model_parameter(r,df_train,df_validation,param)
        print("{}: train {}, test {}, auc {}, precision {},recall {}, accuracy {}, f1-score {}"
              .format(r,res['train'],res['test'],res['AUC'],res['precision'],res['recall'],res['accuracy'],res['F1']))
        auc_validation.append(res)
    f1 = pd.DataFrame(auc_validation).mean()['F1']
    score_param.append((param,f1))
score_param

{'max_depth': 20, 'n_estimators': 50}
term12: train 472, test 66, auc 0.12626262626262624, precision 0.7435897435897436,recall 0.8787878787878788, accuracy 0.7878787878787878, f1-score 0.8055555555555556
{'max_depth': 20, 'n_estimators': 50}
term5: train 808, test 124, auc 0.1757284079084287, precision 0.7313432835820896,recall 0.7903225806451613, accuracy 0.75, f1-score 0.7596899224806203
{'max_depth': 20, 'n_estimators': 50}
term15: train 1514, test 190, auc 0.38764542936288093, precision 0.5652173913043478,recall 0.8210526315789474, accuracy 0.5947368421052631, f1-score 0.6695278969957081
{'max_depth': 20, 'n_estimators': 50}
term16: train 1004, test 242, auc 0.5, precision 0.5,recall 1.0, accuracy 0.5, f1-score 0.6666666666666666
{'max_depth': 20, 'n_estimators': 50}
term1: train 768, test 92, auc 0.5278827977315691, precision 0.5063291139240507,recall 0.8695652173913043, accuracy 0.5108695652173914, f1-score 0.6400000000000001
{'max_depth': 20, 'n_estimators': 50}
term11: train 12

term12: train 472, test 66, auc 0.1271808999081726, precision 0.725,recall 0.8787878787878788, accuracy 0.7727272727272727, f1-score 0.7945205479452054
{'max_depth': 20, 'n_estimators': 200}
term5: train 808, test 124, auc 0.1677939646201873, precision 0.7272727272727273,recall 0.7741935483870968, accuracy 0.7419354838709677, f1-score 0.7500000000000001
{'max_depth': 20, 'n_estimators': 200}
term15: train 1514, test 190, auc 0.3809418282548477, precision 0.5563380281690141,recall 0.8315789473684211, accuracy 0.5842105263157895, f1-score 0.6666666666666667
{'max_depth': 20, 'n_estimators': 200}
term16: train 1004, test 242, auc 0.5, precision 0.5,recall 1.0, accuracy 0.5, f1-score 0.6666666666666666
{'max_depth': 20, 'n_estimators': 200}
term1: train 768, test 92, auc 0.5439508506616257, precision 0.5128205128205128,recall 0.8695652173913043, accuracy 0.5217391304347826, f1-score 0.6451612903225806
{'max_depth': 20, 'n_estimators': 200}
term11: train 1200, test 134, auc 0.56237469369570

term21: train 212, test 40, auc 0.33375000000000005, precision 0.5454545454545454,recall 0.6, accuracy 0.55, f1-score 0.5714285714285713
{'max_depth': 40, 'n_estimators': 50}
term20: train 418, test 56, auc 0.42538265306122447, precision 0.6071428571428571,recall 0.6071428571428571, accuracy 0.6071428571428571, f1-score 0.6071428571428571
{'max_depth': 40, 'n_estimators': 100}
term12: train 472, test 66, auc 0.12672176308539942, precision 0.75,recall 0.9090909090909091, accuracy 0.803030303030303, f1-score 0.821917808219178
{'max_depth': 40, 'n_estimators': 100}
term5: train 808, test 124, auc 0.1750780437044745, precision 0.7205882352941176,recall 0.7903225806451613, accuracy 0.7419354838709677, f1-score 0.7538461538461538
{'max_depth': 40, 'n_estimators': 100}
term15: train 1514, test 190, auc 0.3914127423822714, precision 0.5579710144927537,recall 0.8105263157894737, accuracy 0.5842105263157895, f1-score 0.6609442060085836
{'max_depth': 40, 'n_estimators': 100}
term16: train 1004, t

term22: train 306, test 46, auc 0.10302457466918713, precision 0.8,recall 0.8695652173913043, accuracy 0.8260869565217391, f1-score 0.8333333333333333
{'max_depth': 40, 'n_estimators': 200}
term21: train 212, test 40, auc 0.31125, precision 0.5925925925925926,recall 0.8, accuracy 0.625, f1-score 0.6808510638297872
{'max_depth': 40, 'n_estimators': 200}
term20: train 418, test 56, auc 0.3877551020408163, precision 0.5666666666666667,recall 0.6071428571428571, accuracy 0.5714285714285714, f1-score 0.5862068965517241
{'max_depth': 60, 'n_estimators': 50}
term12: train 472, test 66, auc 0.12350780532598712, precision 0.7435897435897436,recall 0.8787878787878788, accuracy 0.7878787878787878, f1-score 0.8055555555555556
{'max_depth': 60, 'n_estimators': 50}
term5: train 808, test 124, auc 0.17169614984391257, precision 0.7428571428571429,recall 0.8387096774193549, accuracy 0.7741935483870968, f1-score 0.787878787878788
{'max_depth': 60, 'n_estimators': 50}
term15: train 1514, test 190, auc 0

term6: train 740, test 82, auc 0.19928613920285543, precision 0.72,recall 0.8780487804878049, accuracy 0.7682926829268293, f1-score 0.7912087912087912
{'max_depth': 60, 'n_estimators': 100}
term22: train 306, test 46, auc 0.11153119092627599, precision 0.8,recall 0.8695652173913043, accuracy 0.8260869565217391, f1-score 0.8333333333333333
{'max_depth': 60, 'n_estimators': 100}
term21: train 212, test 40, auc 0.3175, precision 0.5833333333333334,recall 0.7, accuracy 0.6, f1-score 0.6363636363636365
{'max_depth': 60, 'n_estimators': 100}
term20: train 418, test 56, auc 0.4113520408163265, precision 0.5862068965517241,recall 0.6071428571428571, accuracy 0.5892857142857143, f1-score 0.5964912280701754
{'max_depth': 60, 'n_estimators': 200}
term12: train 472, test 66, auc 0.1271808999081726, precision 0.725,recall 0.8787878787878788, accuracy 0.7727272727272727, f1-score 0.7945205479452054
{'max_depth': 60, 'n_estimators': 200}
term5: train 808, test 124, auc 0.16298126951092612, precision 

term18: train 920, test 108, auc 0.49417009602194795, precision 0.5051546391752577,recall 0.9074074074074074, accuracy 0.5092592592592593, f1-score 0.6490066225165563
{'max_depth': 80, 'n_estimators': 50}
term6: train 740, test 82, auc 0.2046400951814396, precision 0.68,recall 0.8292682926829268, accuracy 0.7195121951219512, f1-score 0.7472527472527474
{'max_depth': 80, 'n_estimators': 50}
term22: train 306, test 46, auc 0.0992438563327032, precision 0.7727272727272727,recall 0.7391304347826086, accuracy 0.7608695652173914, f1-score 0.7555555555555555
{'max_depth': 80, 'n_estimators': 50}
term21: train 212, test 40, auc 0.33375000000000005, precision 0.5454545454545454,recall 0.6, accuracy 0.55, f1-score 0.5714285714285713
{'max_depth': 80, 'n_estimators': 50}
term20: train 418, test 56, auc 0.42538265306122447, precision 0.6071428571428571,recall 0.6071428571428571, accuracy 0.6071428571428571, f1-score 0.6071428571428571
{'max_depth': 80, 'n_estimators': 100}
term12: train 472, test 

term4: train 786, test 110, auc 0.3223140495867769, precision 0.5487804878048781,recall 0.8181818181818182, accuracy 0.5727272727272728, f1-score 0.656934306569343
{'max_depth': 80, 'n_estimators': 200}
term18: train 920, test 108, auc 0.4885116598079561, precision 0.5051546391752577,recall 0.9074074074074074, accuracy 0.5092592592592593, f1-score 0.6490066225165563
{'max_depth': 80, 'n_estimators': 200}
term6: train 740, test 82, auc 0.20047590719809638, precision 0.6923076923076923,recall 0.8780487804878049, accuracy 0.7439024390243902, f1-score 0.7741935483870966
{'max_depth': 80, 'n_estimators': 200}
term22: train 306, test 46, auc 0.10302457466918713, precision 0.8,recall 0.8695652173913043, accuracy 0.8260869565217391, f1-score 0.8333333333333333
{'max_depth': 80, 'n_estimators': 200}
term21: train 212, test 40, auc 0.31125, precision 0.5925925925925926,recall 0.8, accuracy 0.625, f1-score 0.6808510638297872
{'max_depth': 80, 'n_estimators': 200}
term20: train 418, test 56, auc 0

[({'max_depth': 20, 'n_estimators': 50}, 0.6854724456598276),
 ({'max_depth': 20, 'n_estimators': 100}, 0.6960738811617405),
 ({'max_depth': 20, 'n_estimators': 200}, 0.6965001194230355),
 ({'max_depth': 40, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 40, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 40, 'n_estimators': 200}, 0.696406999466795),
 ({'max_depth': 60, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 60, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 60, 'n_estimators': 200}, 0.696406999466795),
 ({'max_depth': 80, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 80, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 80, 'n_estimators': 200}, 0.696406999466795)]

In [29]:
score_param

[({'max_depth': 20, 'n_estimators': 50}, 0.6854724456598276),
 ({'max_depth': 20, 'n_estimators': 100}, 0.6960738811617405),
 ({'max_depth': 20, 'n_estimators': 200}, 0.6965001194230355),
 ({'max_depth': 40, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 40, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 40, 'n_estimators': 200}, 0.696406999466795),
 ({'max_depth': 60, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 60, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 60, 'n_estimators': 200}, 0.696406999466795),
 ({'max_depth': 80, 'n_estimators': 50}, 0.6856548743030275),
 ({'max_depth': 80, 'n_estimators': 100}, 0.6951670483586356),
 ({'max_depth': 80, 'n_estimators': 200}, 0.696406999466795)]

In [30]:
best_params,score = max(score_param,key=lambda x:x[1])
best_params,score

({'max_depth': 20, 'n_estimators': 200}, 0.6965001194230355)

### Test the best parameters

In [31]:
auc_test = []
for r in rel_test.keys():
    res = train_model_parameter(r,df_train,df_test,best_params)
    print("{}: train {}, test {}, auc {}, accuracy {}, f1-score {}".format(r,res['train'],res['test'],res['AUC'],
                                                              res['precision'],res['recall'],res['accuracy'],res['F1']))
    auc_test.append(res)


{'max_depth': 20, 'n_estimators': 200}
term21: train 212, test 32, auc 0.35546875, accuracy 0.6190476190476191, f1-score 0.8125
{'max_depth': 20, 'n_estimators': 200}
term11: train 1200, test 144, auc 0.5732060185185185, accuracy 0.5, f1-score 0.9027777777777778
{'max_depth': 20, 'n_estimators': 200}
term7: train 1326, test 166, auc 0.13775584264769922, accuracy 0.7741935483870968, f1-score 0.8674698795180723
{'max_depth': 20, 'n_estimators': 200}
term10: train 784, test 128, auc 0.1082763671875, accuracy 0.8115942028985508, f1-score 0.875
{'max_depth': 20, 'n_estimators': 200}
term18: train 920, test 110, auc 0.5193388429752067, accuracy 0.5, f1-score 0.8727272727272727
{'max_depth': 20, 'n_estimators': 200}
term6: train 740, test 84, auc 0.13973922902494332, accuracy 0.7358490566037735, f1-score 0.9285714285714286
{'max_depth': 20, 'n_estimators': 200}
term8: train 1274, test 180, auc 0.11944444444444442, accuracy 0.8409090909090909, f1-score 0.8222222222222222
{'max_depth': 20, 'n_e

In [32]:
pd.DataFrame(auc_test).sample(5)

Unnamed: 0,train,test,tn,fp,fn,tp,AUC,precision,recall,accuracy,F1
19,306,34,11,6,5,12,0.304498,0.666667,0.705882,0.676471,0.685714
21,68,4,2,0,1,1,0.5,1.0,0.5,0.75,0.666667
7,472,60,18,12,3,27,0.171667,0.692308,0.9,0.75,0.782609
13,598,70,13,22,5,30,0.405714,0.576923,0.857143,0.614286,0.689655
14,418,70,22,13,10,25,0.367755,0.657895,0.714286,0.671429,0.684932


In [33]:
pd.DataFrame(auc_test).mean()

train        698.608696
test          93.391304
tn            20.391304
fp            26.304348
fn             5.956522
tp            40.739130
AUC            0.338421
precision      0.645144
recall         0.814537
accuracy       0.658531
F1             0.704911
dtype: float64

In [34]:
df_res_test = pd.DataFrame(auc_test)

In [35]:
F1 = computeF1(df_res_test)

In [36]:
print('Test F1-score: {}'.format(F1))

Test F1-score: 0.7163608562691132
