# Unicredit training camp

---

## Imports

In [366]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict
import random
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.inspection import permutation_importance
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
# Define seed for replicability
seed = 0

# Define functions

In [367]:
#Given a dataframe, generate a list with all the triples and another list with the "negative" relations
def generate_pos_neg_relations(df, number_neg_rel_triple=1):
    pairs=defaultdict(list)
    relations=defaultdict(list)
    for i in df.iterrows():
        s = i[1]['source']
        t = i[1]['tail']
        r = i[1]['relation']
      #  print(s,t,r)
        pairs[(s,t)].append(r)
        relations[r].append((s,t))
    
    sourceList=set()
    targetList=set()
    for i in list(pairs.keys()):
        if type(i)==int:
            break
        sourceList.add(i[0])
        targetList.add(i[1])
    
    negative_relations = defaultdict(list)
    for rel, p in relations.items():
        rel_targets = set()
        for i in p:
            rel_targets.add(i[1])
        candidate_targets = targetList.difference(rel_targets)
        
        for i,j in p:
            negative_targets = random.choices(list(candidate_targets),
                                          k=(min(len(candidate_targets),number_neg_rel_triple)))
            for t in negative_targets:
                negative_relations[rel].append((i,t))
    return relations,negative_relations

In [368]:
def compute_features(G,GUnd,GUndSimple,i,j,rel=None):
    cn=aa=jc=ind_i = outd_i = ind_j = outd_j =0
    
    if ((i in GUnd) and (j in GUnd)):
        cn = len(list(nx.common_neighbors(GUnd, i,j)))
   # if ((i in GUndSimple) and (j in GUndSimple)):
   #     aa =[aa for u,v,aa in nx.adamic_adar_index(GUndSimple, [(i,j)])][0]
    if ((i in GUndSimple) and (j in GUndSimple)):
        jc = [jc for u,v,jc in nx.jaccard_coefficient(GUndSimple, [(i,j)])][0]
    if (i in G):
        ind_i = G.in_degree(i)
        outd_i = G.out_degree(i)
    if (j in G):
        ind_j = G.in_degree(j)
        outd_j = G.out_degree(j)
    '''
    if rel:
        rel_s=len(reachable_from_rel(i,rel))
        rel_t=len(reachable_from_rel(j,rel))
        common_rel_s_t=len(reachable_from_rel(i,rel).intersection(reachable_from_rel(j,rel)))
        union_rel_s_t=len(reachable_from_rel(i,rel).union(reachable_from_rel(j,rel)))
        jaccard_rel_s_t=0
        if union_rel_s_t>0:
            jaccard_rel_s_t=(union_rel_s_t-common_rel_s_t)/union_rel_s_t
    '''   
    #print(i,j,cn,aa,jc,ind_i,ind_j,outd_i,outd_j)
    return cn,jc,ind_i,ind_j,outd_i,outd_j

In [369]:
#Given a triple, compute some node features
def compute_triples_features(G,GUnd,GUndSimple,rel,triple,label):
    curr_triples_feature={}
    curr_triples_feature['relation'] = rel
    curr_triples_feature['label'] = label
    curr_triples_feature['source'] = triple[0]
    curr_triples_feature['target'] = triple[1]
    for ix,feature in enumerate(compute_features(G,GUnd,GUndSimple,triple[0],triple[1],rel)):
        curr_triples_feature['f'+str(ix)] = feature
    return curr_triples_feature

In [370]:
#Compute featurees for each positive and negative relation
def generate_df_features(G,relations,negative_relations=None):
    triples=[]
    GUnd = G.to_undirected()
    GUndSimple = nx.Graph(GUnd)
    for rel in relations.keys():
        for posTriple in relations[rel]:
            triples.append(compute_triples_features(G,GUnd,GUndSimple,rel,posTriple,1))
        if negative_relations:
            for negTriple in negative_relations[rel]:
                triples.append(compute_triples_features(G,GUnd,GUndSimple,rel,negTriple,0))
    df_rel = pd.DataFrame(triples)
    return df_rel

In [409]:
def computeF1(df_res):
    tn = df_res['tn'].sum()
    tp = df_res['tp'].sum()
    fn = df_res['fn'].sum()
    fp = df_res['fp'].sum() 
    precision = tp/(tp+fp)
    recall = tp / (tp+fn)
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1

## Data loading

In [371]:
train_set = pd.read_csv('./data/kg_train.csv')
# train_set = train_set.sample(frac=0.5)
val_set = pd.read_csv('./data/kg_validation.csv')
# val_set = val_set.sample(frac=0.5)
test_set = pd.read_csv('./data/kg_test_nolabel.csv')
# test_set = test_set.sample(frac=0.5)

In [372]:
print("======================================================================================")
print("Train shape:",train_set.shape)
print("Validation shape:",val_set.shape)
print("Test shape:",test_set.shape)

Train shape: (91802, 3)
Validation shape: (22952, 3)
Test shape: (28690, 2)


In [373]:
train_set.head()

Unnamed: 0,Id,Triple,Label
0,0,34881 intercommunality 14230,0
1,1,9387 ownerOper 39573,0
2,2,12480 coach 24064,0
3,3,6871 branches 22010,0
4,4,13789 damsire 33095,0


In [374]:
# Split source, relation and tail from the Triple
train_set[['source', 'relation', 'tail']] = train_set['Triple'].str.split(' ', 2, expand=True)
val_set[['source', 'relation', 'tail']] = val_set['Triple'].str.split(' ', 2, expand=True)
test_set[['source', 'relation', 'tail']] = test_set['Triple'].str.split(' ', 2, expand=True)

In [375]:
train_set.head()

Unnamed: 0,Id,Triple,Label,source,relation,tail
0,0,34881 intercommunality 14230,0,34881,intercommunality,14230
1,1,9387 ownerOper 39573,0,9387,ownerOper,39573
2,2,12480 coach 24064,0,12480,coach,24064
3,3,6871 branches 22010,0,6871,branches,22010
4,4,13789 damsire 33095,0,13789,damsire,33095


In [376]:
val_set.head()

Unnamed: 0,Id,Triple,Label,source,relation,tail
0,91802,41074 host 9832,0,41074,host,9832
1,91803,12583 primaryLanguages 388,0,12583,primaryLanguages,388
2,91804,22259 seasonTopscorer 22261,1,22259,seasonTopscorer,22261
3,91805,8408 jurisdiction 646,0,8408,jurisdiction,646
4,91806,32 combatant 20690,0,32,combatant,20690


In [377]:
test_set.head()

Unnamed: 0,Id,Triple,source,relation,tail
0,114754,1322 operatingSystem 14477,1322,operatingSystem,14477
1,114755,5210 parent 11412,5210,parent,11412
2,114756,38658 leadersSeat 35321,38658,leadersSeat,35321
3,114757,41457 cableServ 4591,41457,cableServ,4591
4,114758,21579 visitorConference 214,21579,visitorConference,214


In [378]:
# Get number of different values for the relation field
print(len(train_set.relation.value_counts()))
print(len(val_set.relation.value_counts()))
print(len(test_set.relation.value_counts()))

# Get sizes for the datasets
print(train_set.shape)
print(val_set.shape)
print(test_set.shape)

2533
1174
1222
(91802, 6)
(22952, 6)
(28690, 5)


In [379]:
# Get the number and percentage of values in the response variable
print(train_set.Label.value_counts())
print(train_set.Label.value_counts()/train_set.shape[0])

1    45901
0    45901
Name: Label, dtype: int64
1    0.5
0    0.5
Name: Label, dtype: float64


## Feature engineering

In [380]:
# generate graph from list of edges
Gtrain=nx.from_pandas_edgelist(train_set, source='source', target='tail', edge_attr='relation', create_using=nx.MultiDiGraph)

In [381]:
# Show number of nodes and edges of the graph
print('Train-> Nodes:{} Edges:{}'.format(Gtrain.number_of_nodes(),Gtrain.number_of_edges()))

Train-> Nodes:43990 Edges:91802


In [382]:
GtrainUnd = Gtrain.to_undirected()
GtrainSimple = nx.DiGraph(Gtrain)
GtrainUndSimple = nx.Graph(GtrainUnd)

In [383]:
# Set number of negative relations to 0 for being able to replicate the code presented in the training session
number_neg_rel_triple = 0

print("Generate list of positive and negative relations in the training set..")
rel_train,negrel_train = generate_pos_neg_relations(train_set,number_neg_rel_triple)
print("Generate list of positive and negative relations in the validation set..")
rel_validation,negrel_validation = generate_pos_neg_relations(val_set,number_neg_rel_triple)
print("Generate list of positive and negative relations in the test set..")
rel_test,negrel_test = generate_pos_neg_relations(test_set,number_neg_rel_triple)


Generate list of positive and negative relations in the training set..
Generate list of positive and negative relations in the validation set..
Generate list of positive and negative relations in the test set..


In [384]:
print("Generate edge fetures..")
df_train = generate_df_features(Gtrain,rel_train)
# Retrieve the Id and Label from the initial data
df_train = pd.merge(train_set,df_train,how='inner',left_on=['source','relation','tail'],right_on=['source','relation','target']).drop_duplicates()
df_train.head(5)

Generate edge fetures..


Unnamed: 0,Id,Triple,Label,source,relation,tail,label,target,f0,f1,f2,f3,f4,f5
0,0,34881 intercommunality 14230,0,34881,intercommunality,14230,1,14230,0,0.0,2,2,2,1
1,1,9387 ownerOper 39573,0,9387,ownerOper,39573,1,39573,0,0.0,2,2,2,0
2,2,12480 coach 24064,0,12480,coach,24064,1,24064,0,0.0,4,4,1,3
3,3,6871 branches 22010,0,6871,branches,22010,1,22010,0,0.0,4,3,21,1
4,4,13789 damsire 33095,0,13789,damsire,33095,1,33095,0,0.0,0,3,17,0


In [385]:
print("Generate edge fetures..")
df_validation = generate_df_features(Gtrain,rel_validation)
# Retrieve the Id and Label from the initial data
df_validation = pd.merge(val_set,df_validation,how='inner',left_on=['source','relation','tail'],right_on=['source','relation','target']).drop_duplicates()
df_validation.head(5)

Generate edge fetures..


Unnamed: 0,Id,Triple,Label,source,relation,tail,label,target,f0,f1,f2,f3,f4,f5
0,91802,41074 host 9832,0,41074,host,9832,1,9832,0,0.0,3,1,1,1
1,91803,12583 primaryLanguages 388,0,12583,primaryLanguages,388,1,388,0,0.0,2,2,0,9
2,91804,22259 seasonTopscorer 22261,1,22259,seasonTopscorer,22261,1,22261,0,0.0,0,3,3,0
3,91805,8408 jurisdiction 646,0,8408,jurisdiction,646,1,646,0,0.0,3,1,3,2
4,91806,32 combatant 20690,0,32,combatant,20690,1,20690,0,0.0,2,2,1,0


In [386]:
print("Generate edge fetures..")
df_test = generate_df_features(Gtrain,rel_test)
# Retrieve the Id and Label from the initial data
df_test = pd.merge(test_set,df_test,how='inner',left_on=['source','relation','tail'],right_on=['source','relation','target']).drop_duplicates()
df_test.head(5)

Generate edge fetures..


Unnamed: 0,Id,Triple,source,relation,tail,label,target,f0,f1,f2,f3,f4,f5
0,114754,1322 operatingSystem 14477,1322,operatingSystem,14477,1,14477,0,0.0,1,6,3,0
1,114755,5210 parent 11412,5210,parent,11412,1,11412,0,0.0,2,3,0,1
2,114756,38658 leadersSeat 35321,38658,leadersSeat,35321,1,35321,0,0.0,0,1,4,4
3,114757,41457 cableServ 4591,41457,cableServ,4591,1,4591,0,0.0,0,4,5,2
4,114758,21579 visitorConference 214,21579,visitorConference,214,1,214,0,0.0,2,24,7,1


## Modeling with Random Forest classifier

In [391]:
def train_model_parameter(r,df_train,df_validation,param):
#     print(param)
    y_train = df_train[df_train.relation==r]['Label']
    X_train = df_train[df_train.relation==r][['f0','f1','f2','f3','f4','f5']]
    y_validation = df_validation[df_validation.relation==r]['Label']
    X_validation = df_validation[df_validation.relation==r][['f0','f1','f2','f3','f4','f5']]
    model = RandomForestClassifier(**param, random_state=42)
    model.fit(X_train, y_train)
    pred_validation = model.predict_proba(X_validation)
    pred_class_validation = model.predict(X_validation)
    fpr, tpr, thresholds = metrics.roc_curve(y_validation, pred_validation[:,0], pos_label=1)
    tn, fp, fn, tp = metrics.confusion_matrix(y_validation,pred_class_validation).ravel()
    return {'train':y_train.shape[0],'test':y_validation.shape[0],
            'tn':tn,'fp':fp,'fn':fn,'tp':tp,
            'AUC':metrics.auc(fpr, tpr),'precision':metrics.precision_score(y_validation,pred_class_validation),
            'recall':metrics.recall_score(y_validation,pred_class_validation),
            'accuracy':metrics.accuracy_score(y_validation,pred_class_validation),
            'F1':metrics.f1_score(y_validation,pred_class_validation)}

In [388]:
param_grid = {
    'max_depth': [20,40,60,80],
    'n_estimators': [50,100, 200]
}
list(ParameterGrid(param_grid))

[{'max_depth': 20, 'n_estimators': 50},
 {'max_depth': 20, 'n_estimators': 100},
 {'max_depth': 20, 'n_estimators': 200},
 {'max_depth': 40, 'n_estimators': 50},
 {'max_depth': 40, 'n_estimators': 100},
 {'max_depth': 40, 'n_estimators': 200},
 {'max_depth': 60, 'n_estimators': 50},
 {'max_depth': 60, 'n_estimators': 100},
 {'max_depth': 60, 'n_estimators': 200},
 {'max_depth': 80, 'n_estimators': 50},
 {'max_depth': 80, 'n_estimators': 100},
 {'max_depth': 80, 'n_estimators': 200}]

In [392]:
score_param = []
for param in list(ParameterGrid(param_grid)):
    auc_validation = []
    for r in rel_validation.keys():
        res = train_model_parameter(r,df_train,df_validation,param)
#         print("{}: train {}, test {}, auc {}, precision {},recall {}, accuracy {}, f1-score {}"
#               .format(r,res['train'],res['test'],res['AUC'],res['precision'],res['recall'],res['accuracy'],res['F1']))
        auc_validation.append(res)
    f1 = pd.DataFrame(auc_validation).mean()['F1']
    score_param.append((param,f1))
# score_param

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [403]:
best_params,score = max(score_param,key=lambda x:x[1])
best_params,score

({'max_depth': 20, 'n_estimators': 200}, 0.6341227656097995)

In [404]:
auc_val = []
for r in rel_validation.keys():
    res = train_model_parameter(r,df_train,df_validation,best_params)
#     print("{}: train {}, test {}, auc {}, accuracy {}, f1-score {}".format(r,res['train'],res['test'],res['AUC'],
#                                                               res['precision'],res['recall'],res['accuracy'],res['F1']))
    auc_val.append(res)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [405]:
pd.DataFrame(auc_val).sample(5)

Unnamed: 0,train,test,tn,fp,fn,tp,AUC,precision,recall,accuracy,F1
722,44,8,3,1,1,3,0.1875,0.75,0.75,0.75,0.75
395,26,6,2,1,0,3,0.222222,0.75,1.0,0.833333,0.857143
200,102,20,5,5,3,7,0.27,0.583333,0.7,0.6,0.636364
299,32,14,6,1,5,2,0.418367,0.666667,0.285714,0.571429,0.4
555,8,6,3,0,2,1,0.333333,1.0,0.333333,0.666667,0.5


In [406]:
pd.DataFrame(auc_val).mean()

train        68.136286
test         19.550256
tn            5.741056
fp            4.034072
fn            2.459966
tp            7.315162
AUC           0.310966
precision     0.590541
recall        0.732035
accuracy      0.632425
F1            0.634123
dtype: float64

In [407]:
df_res_val = pd.DataFrame(auc_val)

In [410]:
F1 = computeF1(df_res_val)

In [411]:
print('Test F1-score: {}'.format(F1))

Test F1-score: 0.6925806451612904


In [394]:
# # Define features to be used
# vars_select = ['f0','f1','f2','f3','f4','f5']
# # vars_select = ['f1','f2','f3','f4','f5']

# x_train = df_train.loc[:, df_train.columns.isin(vars_select)].copy()
# y_train = df_train.loc[:, df_train.columns == "Label"].copy()

# x_val = df_validation.loc[:, df_validation.columns.isin(vars_select)].copy()
# y_val = df_validation.loc[:, df_validation.columns == "Label"].copy()

# x_test = df_test.loc[:, df_test.columns.isin(vars_select)].copy()

# print("======================================================================================")
# print("X Train shape:",x_train.shape)
# print("Y Train shape:",y_train.shape)
# print("X Validation shape:",x_val.shape)
# print("Y Validation shape:",y_val.shape)
# print("X Test shape:",x_test.shape)


In [395]:
# #Define the grid to be searched in
# xgb1 = XGBClassifier(random_state = seed)
# parameters = {'max_depth':range(3,10,3),
#               'min_child_weight':range(1,6,3),
#               'learning_rate': [0.0001, 0.001, 0.01, 0.1]}

# #Execute cross-validated random grid search
# xgb_grid = RandomizedSearchCV(xgb1, parameters, cv = 5, n_jobs = -1, verbose=False, random_state = seed)
# # Train Model
# xgb_grid.fit(x_train,y_train)

In [396]:
# print("======================================================================================")
# print("Best parameters found: ")
# print(xgb_grid.best_params_)

# print("======================================================================================")
# # Predict values of regression with train
# ypred_train = pd.DataFrame(xgb_grid.best_estimator_.predict(x_train))

In [397]:
# # Define the optimal cutoff (using validation set)
# predicted_prob_val = xgb_grid.best_estimator_.predict_proba(x_val)[:, 1]
# false_pos_rate, true_pos_rate, proba = roc_curve(y_val, predicted_prob_val)

# # To define the cut off it's used the maximum difference btw true positive rate and false positive rate
# optimal_proba_cutoff = sorted(list(zip(np.abs(true_pos_rate - false_pos_rate), proba)), key=lambda i: i[0], reverse=True)[0][1]

# # Predict in train
# predicted_prob_train = xgb_grid.best_estimator_.predict_proba(x_train)[:, 1]
# y_train_predicted = [1 if i >= optimal_proba_cutoff else 0 for i in predicted_prob_train]

# # Predict in validation
# predicted_prob_val = xgb_grid.best_estimator_.predict_proba(x_val)[:, 1]
# y_val_predicted = [1 if i >= optimal_proba_cutoff else 0 for i in predicted_prob_val]

# # Predict in test
# predicted_prob_test = xgb_grid.best_estimator_.predict_proba(x_test)[:, 1]
# y_test_predicted = [1 if i >= optimal_proba_cutoff else 0 for i in predicted_prob_test]

# print("F1 Score for train, validation: {}, {}".format(f1_score(y_train, y_train_predicted), f1_score(y_val, y_val_predicted)))
# print("Accuracy Score for train, validation: {}, {}".format(accuracy_score(y_train, y_train_predicted), accuracy_score(y_val, y_val_predicted)))
# print("Precision Score for train, validation: {}, {}".format(precision_score(y_train, y_train_predicted), precision_score(y_val, y_val_predicted)))
# print("Recall Score for train, validation: {}, {}".format(recall_score(y_train, y_train_predicted), recall_score(y_val, y_val_predicted)))
# print("AUROC for train, validation: {}, {}".format(roc_auc_score(y_train, predicted_prob_train), roc_auc_score(y_val, predicted_prob_val)))
# print("GINI for train, validation: {}, {}".format(roc_auc_score(y_train, predicted_prob_train) * 2 -1, roc_auc_score(y_val, predicted_prob_val) * 2 -1))

## Feature importance

In [398]:
# # Feature importance calculation
# imps = permutation_importance(xgb_grid.best_estimator_, x_train, y_train)

# # Create dummie variables and stack them with the previous dataframe
# importance = pd.concat([pd.DataFrame(x_train.columns), pd.DataFrame(imps.importances_mean)], axis=1)
# importance.columns = ["variable","importance"]

# importance = importance.sort_values(by='importance', ascending=False)
# importance

## Prepare final result (scoring of test) to export

In [215]:
# export_result = df_test.copy()
# export_result['Expected'] = y_test_predicted
# export_result = export_result[['Id','Expected']]
# export_result.to_csv('JeDiS_xgboost.csv', index=False)