In [None]:
# Mount Google Drive so that files under /content/drive/ can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Base folder on the mounted drive; data files used below are addressed by prefixing this path.
gDrivePath = "/content/drive/My Drive/Link prediction/"

In [None]:
!pip install -q pymorphy2
!pip install -q bigartm10
!pip install -q catboost

In [None]:
import pandas as pd
import pickle
import json, os, re
from tqdm import tqdm as tq
import pymorphy2
import artm
import numpy as np
from scipy import spatial
import networkx as nx
from itertools import combinations
import catboost as cgb
from sklearn.model_selection import train_test_split
import scipy
import matplotlib.pyplot   as plt
from scipy import sparse
from sklearn.metrics import roc_auc_score, precision_score, balanced_accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from catboost.utils import get_roc_curve, select_threshold
from sklearn.metrics import balanced_accuracy_score, precision_recall_curve, auc, recall_score, f1_score, precision_score, classification_report

## Чистим текст

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into a single flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def cleanNames(names_list):
    """Extract author names from raw strings and return them as "I.O. Surname".

    Every string in ``names_list`` is scanned for names written either as
    "I.O. Surname" or as "Surname I.O." (Cyrillic letters only). Matches are
    returned in order of appearance; the surname-first form is reordered so
    that the initials always come first.
    """
    expr = r'[А-ЯЁ]\.[А-ЯЁ]\. [А-ЯЁ][а-яё]*|[А-ЯЁ][а-яё]* [А-ЯЁ]\.[А-ЯЁ]\.'
    normalized = []
    for raw_name in names_list:
        for match in re.findall(expr, raw_name):
            # Every match contains exactly one space: initials and surname.
            first, second = match.split(' ')
            initials_first = first.count('.') == 2
            normalized.append(match if initials_first else ' '.join([second, first]))
    return normalized

In [None]:
# Load every article JSON from the "geofix" folder into one DataFrame.
geofix_dir = os.path.join(gDrivePath, "geofix")
json_files = sorted(os.listdir(geofix_dir))

# Collect the rows first and build the frame once: enlarging a DataFrame with
# ``df.loc[index] = ...`` re-allocates it on every iteration.
records = []
for js in json_files:
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(os.path.join(geofix_dir, js), encoding='utf-8') as json_file:
        json_text = json.load(json_file)
    records.append({
        'authors_raw': json_text['authors_cleaned'],
        'year': json_text['year'],
        'geo': json_text['geo_tags'],
        'geo_full': json_text['geo_tags_full'],
        'text': json_text['text'],
    })

# Row i corresponds to the i-th file name in sorted order, as before.
df = pd.DataFrame(records, columns=['authors_raw', 'year', 'geo', 'geo_full', 'text'])

In [None]:
# Parse the raw author strings into normalized names and make the year numeric.
df['authors'] = df['authors_raw'].apply(cleanNames)
df['year'] = df['year'].astype(int)
df.head()

In [None]:
import ast

# Reload the pre-processed snapshot; this replaces the frame built from the JSON files above.
df = pd.read_csv(gDrivePath + 'initial_data_2015.csv')
# The dict-valued columns come back from CSV as text. literal_eval parses
# Python literals only, whereas eval would execute whatever the file contains.
df['updated_geo'] = df['updated_geo'].apply(ast.literal_eval)
df['geo_full'] = df['geo_full'].apply(ast.literal_eval)
#df.to_csv(gDrivePath + 'initial_data.csv')
#df.to_csv(gDrivePath + 'initial_data_2015.csv')

In [None]:
# Load a previously saved theta matrix (the file is written by the
# `F.to_csv(gDrivePath + 'F_2015.csv', ...)` cell further down).
F = pd.read_csv(gDrivePath + 'F_2015.csv')

In [None]:
df[df.year==2016].shape

In [None]:
from collections import Counter

def set_geos(geos):
  """Fill in missing affiliations of one paper's authors.

  ``geos`` maps author name -> affiliation. Values that are not strings
  (e.g. None or NaN) count as missing and are replaced by the affiliation that
  occurs most often among the paper's other authors; on a tie the one seen
  first wins. If no author has a string affiliation, every author gets ''.

  Returns a new dict with the same keys in the same order. The input is left
  untouched, so applying this to ``df.geo_full`` no longer rewrites that
  column as a side effect.
  """
  known = [geo for geo in geos.values() if isinstance(geo, str)]
  if not known:
    return {author: '' for author in geos}
  frequent_geo = max(Counter(known).items(), key=lambda kvp: kvp[1])[0]
  return {author: (geo if isinstance(geo, str) else frequent_geo)
          for author, geo in geos.items()}

# Per paper: author -> affiliation with the missing affiliations filled in by set_geos.
df['updated_geo'] = df.geo_full.apply(set_geos)

In [None]:
def remove_useless(row, analyzer=None):
  """Turn one article into a Vowpal Wabbit line "<index> |words <lemma> <lemma> ...".

  Only the body of the article is kept: the text after the last keywords
  heading ("Key words" / "Ключевые слова") and before the first reference
  heading ("Reference" / "Список литературы") that follows it. A heading that
  is absent is ignored, so the body then extends to the start or end of the text.

  Cyrillic tokens longer than two characters are lower-cased, 'ё' is replaced
  by 'е', the token is lemmatized, and lemmas from the stop-word list are dropped.

  Parameters
  ----------
  row : mapping with keys 'text' (article text) and 'index' (document id).
  analyzer : object with ``parse(word)[0].normal_form``; defaults to the
      module-level ``morph`` analyzer.
  """
  text = row['text']
  index = row['index']
  if analyzer is None:
    analyzer = morph  # module-level analyzer; it must exist before the first call

  # Start right after whichever keywords heading occurs last in the text.
  begin = 0
  last_heading_pos = -1
  for heading in ("Key words", "Ключевые слова"):
    pos = text.find(heading)
    if pos > last_heading_pos:
      last_heading_pos = pos
      begin = pos + len(heading)

  # Stop at the first reference heading after the body start. find() returns
  # -1 when a heading is missing; that value must never be used as a slice bound.
  end = len(text)
  for heading in ("Reference", "Список литературы"):
    pos = text.find(heading, begin)
    if pos != -1:
      end = min(end, pos)

  stop_words = {"при", "для", "быть", "что", "рис", "это", "где"}
  words = []
  for token in re.findall(r'[ЁА-Яа-яё]+', text[begin:end]):
    if len(token) <= 2:
      continue
    lemma = analyzer.parse(token.lower().replace(u'ё', u'е'))[0].normal_form
    # Compare the lemma, not the raw token, so "При" and "был" are filtered too.
    if lemma not in stop_words:
      words.append(lemma)
  return str(index) + " |words " + " ".join(words)

In [None]:
# Morphological analyzer; remove_useless reads it through the module-level name `morph`.
morph = pymorphy2.MorphAnalyzer()
# reset_index() exposes the frame index as an 'index' column, which remove_useless uses as the document id.
vw = df.reset_index().apply(remove_useless, axis=1)
vw.head(3)

In [None]:
# with open(gDrivePath + 'vw_rp_2015.txt', 'w') as fout:
#     for line in vw:
#         fout.write(line + '\n')

# Строим тематические модели

In [None]:
# batch_vectorizer = artm.BatchVectorizer(data_path=gDrivePath + 'vw_rp_2015.txt', data_format='vowpal_wabbit', collection_name='vw', target_folder=gDrivePath + 'batches_rp_2015')

In [None]:
batch_vectorizer = artm.BatchVectorizer(data_path=gDrivePath + 'batches_rp_2015', data_format = 'batches')

In [None]:
# Build the BigARTM dictionary from the batches the vectorizer points at.
dictionary = artm.Dictionary()
dictionary.gather(data_path=batch_vectorizer.data_path)
dictionary

In [None]:
def print_topic_top_words(model, metric):
    """Print "<topic>: token, token, ..." for every topic of ``model``.

    ``metric`` is the name of the top-tokens score registered on the model; the
    tokens are read from ``model.score_tracker[metric].last_tokens``. A topic
    whose tokens cannot be read gets a placeholder message instead.
    """
    for topic_name in model.topic_names:
        # end='' keeps the tokens on the same line as the topic name (the old
        # Python 2 style trailing comma did not do that in Python 3).
        print(topic_name + ': ', end='')
        try:
            print(", ".join(model.score_tracker[metric].last_tokens[topic_name]))
        except Exception:
            # Best effort: keep printing the other topics. Unlike a bare
            # ``except``, this lets KeyboardInterrupt stop the cell.
            print("Not enough unigrams in a topic")
            print()

## PLSA

In [None]:
# PLSA baseline: 10 topics and no regularizers.
# cache_theta=True keeps the document-topic matrix so get_theta() can be called below;
# theta_columns_naming='title' labels theta columns by document title.
model_plsa = artm.ARTM(topic_names=['topic_{}'.format(i) for i in range(10)],
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)],
                       cache_theta=True, reuse_theta=True, theta_columns_naming='title')

In [None]:
model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScoreP', class_id='words', eps=1e-5))
model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScoreP', eps=1e-5))
model_plsa.scores.add(artm.TopTokensScore(name='TopTokensScoreP', num_tokens=10, class_id='words'))

In [None]:
model_plsa.initialize(dictionary=dictionary)

In [None]:
%%time
model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)

In [None]:
print_topic_top_words(model_plsa, 'TopTokensScoreP')

In [None]:
# Last recorded sparsity of Phi and Theta for the PLSA model.
print(model_plsa.score_tracker['SparsityPhiScoreP'].last_value)
print(model_plsa.score_tracker['SparsityThetaScoreP'].last_value)
theta0 = model_plsa.get_theta()
# Number of theta columns whose entries are all zero.
print('Num zeros col in theta: ', sum([(theta0[i] == 0).all() for i in theta0.columns]))

In [None]:
# F = model_plsa.get_theta()
# F.to_csv(gDrivePath+f'/data/plsa_theta.csv')

## LDA

In [None]:
# "LDA" variant: 14 topics, an ARTM model with SmoothSparse regularizers on Theta and Phi.
# NOTE(review): both taus are negative; in BigARTM a negative tau on these
# regularizers presumably sparsifies rather than smooths — confirm this is the intended LDA-like setup.
model_lda = artm.ARTM(topic_names=['topic_{}'.format(i) for i in range(14)],
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)],
                       regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.7),
                                     artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1)],
                       cache_theta=True, reuse_theta=True, theta_columns_naming='title')

In [None]:
model_lda.scores.add(artm.SparsityPhiScore(name='SparsityPhiScoreL', class_id='words', eps=1e-5))
model_lda.scores.add(artm.SparsityThetaScore(name='SparsityThetaScoreL', eps=1e-5))
model_lda.scores.add(artm.TopTokensScore(name='TopTokensScoreL', num_tokens=15, class_id='words'))
model_lda.initialize(dictionary=dictionary)

In [None]:
%%time
model_lda.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)

In [None]:
print(model_lda.score_tracker['SparsityPhiScoreL'].last_value)
print(model_lda.score_tracker['SparsityThetaScoreL'].last_value)
theta0 = model_lda.get_theta()
print('Num zeros col in theta: ', sum([(theta0[i] == 0).all() for i in theta0.columns]))

In [None]:
print_topic_top_words(model_lda, 'TopTokensScoreL')

In [None]:
# F = model_lda.get_theta()
# F.to_csv(gDrivePath+'/data/lda_theta.csv')

## ARTM

In [None]:
# ARTM variant: 9 topics with SmoothSparse regularizers on Theta (tau=-0.5) and Phi (tau=-0.001);
# a Phi decorrelator is added to this model in the next cell.
model_artm = artm.ARTM(topic_names=['topic_{}'.format(i) for i in range(9)],
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    dictionary=dictionary)],
                       regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.5),
                                     artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.001)],
                       cache_theta=True, reuse_theta=True, theta_columns_naming='title')

In [None]:
# NOTE(review): tau is negative here (-50e3), while the hierarchical levels further down use
# positive decorrelator taus (5e3 and 15e4) — confirm that the sign is intended.
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=-50e3))

In [None]:
model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScoreA', class_id='words', eps=1e-5))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScoreA', eps=1e-5))
model_artm.scores.add(artm.TopTokensScore(name='TopTokensScoreA', num_tokens=15, class_id='words'))

In [None]:
model_artm.initialize(dictionary=dictionary)

In [None]:
%%time
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)

In [None]:
print(model_artm.score_tracker['SparsityPhiScoreA'].last_value)
print(model_artm.score_tracker['SparsityThetaScoreA'].last_value)
theta0 = model_artm.get_theta()
print('Num zeros col in theta: ', sum([(theta0[i] == 0).all() for i in theta0.columns]))

In [None]:
print_topic_top_words(model_artm, 'TopTokensScoreA')

In [None]:
# Document-topic matrix of the ARTM model; F is re-assigned from the hierarchical model's level 1 further down.
F = model_artm.get_theta()
# F.to_csv(gDrivePath+f'/data/artm_theta.csv')

## Иерархическая

In [None]:
# Hierarchical topic model; the levels are added one at a time in the following cells.
hier = artm.hARTM(scores=[artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary)],
                  cache_theta=True, reuse_theta=True, theta_columns_naming='title')

In [None]:
# Top level of the hierarchy: 9 topics.
level0 = hier.add_level(num_topics=9)
level0.initialize(dictionary)
# Regularizers: Phi decorrelation plus SmoothSparse on Theta and Phi.
level0.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi0', tau=5e3))
level0.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.5))
level0.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))

# Sparsity scores, read back in the next cell.
level0.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore0', class_id='words', eps=1e-5))
level0.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore0', eps=1e-5))
%time level0.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

In [None]:
# Report the last recorded sparsity of level 0 and the number of all-zero theta columns.
print('level0')
print('SparsityPhi: ', level0.score_tracker['SparsityPhiScore0'].last_value)
print('SparsityTheta: ', level0.score_tracker['SparsityThetaScore0'].last_value)
theta0 = level0.get_theta()
print('Num zeros col in theta: ', sum([(theta0[i] == 0).all() for i in theta0.columns]))

In [None]:
# Second level of the hierarchy (child topics of level 0).
# NOTE(review): num_topics=50 disagrees with the 25 explicit topic_names; presumably the
# names take precedence and the level ends up with 25 topics — confirm and align the two.
level1 = hier.add_level(num_topics=50, topic_names=['child_topic_' + str(i) for i in range(25)], parent_level_weight=1)
level1.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore1', class_id='words', eps=1e-5))
level1.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore1', eps=1e-5))
level1.scores.add(artm.TopTokensScore(name='TopTokensScore1', num_tokens=15, class_id='words'))
# overwrite=True replaces a regularizer with the same name if one is already registered.
level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=2),overwrite=True)
level1.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1),overwrite=True)

In [None]:
# Initialize level 1 and add the Theta sparsing and Phi decorrelation regularizers before fitting.
level1.initialize(dictionary)
level1.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.01),overwrite=True)
level1.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi1', tau=15e4),overwrite=True)
%time level1.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

In [None]:
print('level1')
print('SparsityPhi: ', level1.score_tracker['SparsityPhiScore1'].last_value)
print('SparsityTheta: ', level1.score_tracker['SparsityThetaScore1'].last_value)
theta1 = level1.get_theta()
print('Num zeros col in theta: ',sum([(theta1[i] == 0).all() for i in theta1.columns]))

In [None]:
print_topic_top_words(level1, 'TopTokensScore1')

## Визуализатор

In [None]:
# Transposed matrices of level 1, in the orientation passed to pyLDAvis.prepare below.
theta_T = level1.get_theta().T
phi_T = level1.get_phi().T
# Keep only the second element of each column label.
# NOTE(review): presumably the labels are (class_id, token) pairs — confirm.
phi_T.columns = phi_T.columns.to_series().apply(lambda name: name[1])
phi_T.head()

In [None]:
!pip install -q pyldavis
import pyLDAvis

In [None]:
# Token count per document, read from the Vowpal Wabbit file: each line is
# "<doc id> |words tok tok ...", so the first two fields are skipped.
# One length is needed per row of theta_T; taking the count from the matrix
# replaces the former hard-coded 1459.
n_docs = theta_T.shape[0]
with open(gDrivePath + 'vw_rp_2015.txt', 'r') as file:
    doc_lengths = [len(line.split()[2:]) for line, _ in zip(file, range(n_docs))]
# NOTE(review): this assumes the file lists documents in the same order as the rows of theta_T — confirm.

In [None]:
# Build the pyLDAvis visualisation for level 1 and save it as a standalone HTML file.
# NOTE(review): term_frequency is read through private BigARTM attributes (`_master`, `_name`);
# confirm that its token order matches `phi_T.columns`, since pyLDAvis pairs them by position.
html = pyLDAvis.prepare(topic_term_dists = phi_T.values, 
                        doc_topic_dists = theta_T.values, 
                        doc_lengths = doc_lengths,
                        vocab = phi_T.columns,
                        term_frequency = dictionary._master.get_dictionary(dictionary._name).token_tf,
                        R = 15)

# Written to the current working directory, not to the Drive folder.
pyLDAvis.save_html(html, 'hARTM.html')

In [None]:
# Theta of level 1 is the feature matrix F used in the rest of the notebook.
F = level1.get_theta()
# index=False: the row labels of theta are not written to the file.
F.to_csv(gDrivePath + 'F_2015.csv', index=False)
F.head()

In [None]:
# theta0.to_csv(gDrivePath+'/data/hier_theta_0.csv')
# theta1.to_csv(gDrivePath+'/data/hier_theta_1.csv')
# psi = level1.get_psi()
# psi.to_csv(gDrivePath+'/data/hier_psi_1.csv')

In [None]:
# with open(gDrivePath+'/data/phi1.batch', 'w') as f:
#     f.write('phi1.batch')

# Авторы

In [None]:
def normalize_geo(geo):
  """Reduce an affiliation string to a short key.

  Punctuation is removed and the string is split on whitespace. The key is the
  first three words when the first word is "Institute", otherwise the first
  two words. An empty string is returned when nothing is left or when the
  first word is "et" or "at".
  """
  stripped = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_«»“”’*…/_.\\`{|}~"""), '', geo).strip()
  tokens = stripped.split()
  if not tokens:
    return ""
  head = tokens[0]
  if head in ("et", "at"):
    return ""
  keep = 3 if head == "Institute" else 2
  return " ".join(tokens[:keep])

In [None]:
def create_graph(frame):
    """Build the undirected co-authorship graph of the papers in ``frame``.

    Each row's 'updated_geo' dict (author -> affiliation) contributes one node
    per author, with the normalized affiliation stored in the node attribute
    'geo', and one edge for every pair of co-authors. When an author appears in
    several papers, the affiliation of the last paper processed wins.
    """
    def _node_geo(raw_geo):
        # A missing affiliation (None/NaN) makes re.sub inside normalize_geo
        # raise TypeError; AttributeError is still caught as before.
        try:
            return normalize_geo(raw_geo)
        except (AttributeError, TypeError):
            return ''

    G = nx.Graph()
    for _, row in frame.iterrows():
        authors = list(row['updated_geo'].items())
        # Nodes first, in author order, then every co-author pair as an edge.
        for name, raw_geo in authors:
            G.add_node(name, geo=_node_geo(raw_geo))
        for (name1, _), (name2, _) in combinations(authors, 2):
            G.add_edge(name1, name2)
    return G

In [None]:
# Row label of df -> list of that paper's author names.
paper_to_authors = df.geo_full.apply(lambda geos: list(geos.keys())).to_dict()
# Every distinct author name across all papers.
unique_authors = []
for authors in df.geo_full.apply(lambda geos: geos.keys()).tolist():
  unique_authors += authors
unique_authors = list(set(unique_authors))
# Author -> None for now; the next cell replaces None with the author's paper vectors.
author_to_papers = dict.fromkeys(unique_authors)

In [None]:
# Collect, for each author, the topic vectors (columns of F) of the papers they wrote.
# Each column label of F is converted with int() to look the paper up in paper_to_authors.
for paper in F.columns:
  for author in paper_to_authors[int(paper)]:
    if author_to_papers[author] is None:
      author_to_papers[author] = [F[paper].tolist()]
    else:
      author_to_papers[author].append(F[paper].tolist())
# Lists become arrays with one row per paper.
# NOTE(review): an author with no paper among the columns of F stays None here — verify this cannot happen.
for author in author_to_papers.keys():
  author_to_papers[author] = np.array(author_to_papers[author])

In [None]:
# Author profile: the mean topic vector over the author's papers.
# NOTE(review): the mean runs over every paper present in F; if F also covers the validation
# and test years, the profiles carry information from those years — confirm this is acceptable.
author_to_vec = dict()
for author in author_to_papers.keys():
  author_to_vec[author] = np.mean(author_to_papers[author], axis = 0)

In [None]:
def S(u_vec, v_vec):
  """Similarity of two vectors, computed as 1 / exp(cosine distance)."""
  distance = spatial.distance.cosine(u_vec, v_vec)
  return 1 / np.exp(distance)

In [None]:
def SIM(u_name, v_name, G, F):
  """Topic similarity of two authors, weighted by their common neighbours in G.

  The base value is S() of the two authors' profile vectors (module-level
  ``author_to_vec``). For every common neighbour z, S() is also computed
  between the mean topic vector of the papers u shares with z and that of the
  papers v shares with z (papers come from the module-level
  ``paper_to_authors``, vectors from the columns of ``F``). The result is the
  base value multiplied by the average of these per-neighbour similarities, or
  the base value alone when their sum is zero (e.g. no common neighbours).
  """
  def shared_topic_mean(author, neighbor):
    # Mean topic vector over all papers co-authored by `author` and `neighbor`.
    papers = np.array([F[str(paper)] for paper in paper_to_authors.keys()
                       if neighbor in paper_to_authors[paper]
                       and author in paper_to_authors[paper]])
    return np.mean(papers, axis=0)

  base = S(author_to_vec[u_name], author_to_vec[v_name])
  neighbor_count = 0
  neighbor_total = 0
  for neighbor in nx.common_neighbors(G, u_name, v_name):
    neighbor_count += 1
    neighbor_total += S(shared_topic_mean(u_name, neighbor),
                        shared_topic_mean(v_name, neighbor))
  if neighbor_total == 0:
    return base
  return base / neighbor_count * neighbor_total

In [None]:
def add_sim(G, F):
  """Store SIM(u, v, G, F) in the 'sim' attribute of every edge of G; returns the same graph."""
  for u, v in G.edges():
    G[u][v]['sim'] = SIM(u, v, G, F)
  return G

In [None]:
df.updated_geo.head()

In [None]:
# Save author -> index of the largest component of the author's mean topic vector.
with open(gDrivePath+'a2t.pickle', 'wb') as f:
  pickle.dump(dict([(k, np.argmax(v)) for (k, v) in author_to_vec.items()]), f)

# Train test split

In [None]:
def get_geos_encode(df):
  """Map every normalized affiliation found in ``df.updated_geo`` to an integer code.

  Codes follow the sorted order of the affiliation keys, so the mapping is the
  same on every run (the order of ``list(set(...))`` is not). A non-string
  affiliation is encoded as '', the same fallback create_graph uses, so looking
  up a node's 'geo' attribute in the result cannot fail for such authors.
  """
  unique_geos = set()
  for geos in df.updated_geo:
    for geo in geos.values():
      unique_geos.add(normalize_geo(geo) if isinstance(geo, str) else '')
  return {geo: code for code, geo in enumerate(sorted(unique_geos))}

In [None]:
# Co-authorship graphs per period: 2012-2015 for training, 2016 for validation, 2017 for test.
# NOTE(review): the feature tables below are built from `df.year < 2016` with no lower bound,
# whereas G_train starts at 2012 — confirm the two are meant to cover the same period.
G_train = create_graph(df[(df.year >= 2012) & (df.year < 2016)])
G_val = create_graph(df[df.year == 2016])
G_test = create_graph(df[df.year == 2017])
# Authors present both in the training graph and in the validation / test graph.
common_authors_val = set(G_train.nodes).intersection(set(G_val.nodes))
common_authors_test = set(G_train.nodes).intersection(set(G_test.nodes))

In [None]:
geos_encode = get_geos_encode(df)

def get_data(df, F, geos_encode):
  """Build one row per unordered author pair of the graph made from ``df``.

  Columns: both author names, both encoded affiliations, the SIM score and a
  label that is 1 when the pair is connected in the graph, else 0.
  """
  G = add_sim(create_graph(df), F)
  rows = []
  for u, v in combinations(G.nodes(), 2):
    linked = G.has_edge(u, v)
    geo_u = geos_encode[G.nodes[u]['geo']]
    geo_v = geos_encode[G.nodes[v]['geo']]
    # Connected pairs already carry their score as the 'sim' edge attribute.
    sim = G[u][v]['sim'] if linked else SIM(u, v, G, F)
    rows.append([u, v, geo_u, geo_v, sim, int(linked)])
  return pd.DataFrame(rows, columns=['author_1', 'author_2', 'author_1_a', 'author_2_a',  'sim', 'label'])

# Pair tables with encoded affiliations for train (< 2016), validation (2016) and test (2017).
# These three names are assigned again by the get_data_with_n_count cell further down.
train_data = get_data(df[df.year < 2016], F, geos_encode)
val_data = get_data(df[df.year == 2016], F, geos_encode)
test_data = get_data(df[df.year == 2017], F, geos_encode)

In [None]:
def get_data_without_aff(df, F):
  """Like get_data, but with a single boolean 'same_a' (equal normalized affiliation) instead of two codes."""
  G = add_sim(create_graph(df), F)
  rows = []
  for u, v in combinations(G.nodes(), 2):
    linked = G.has_edge(u, v)
    same_affiliation = G.nodes[u]['geo'] == G.nodes[v]['geo']
    # Connected pairs already carry their score as the 'sim' edge attribute.
    sim = G[u][v]['sim'] if linked else SIM(u, v, G, F)
    rows.append([u, v, same_affiliation, sim, int(linked)])
  return pd.DataFrame(rows, columns=['author_1', 'author_2', 'same_a',  'sim', 'label'])

In [None]:
def get_data_with_n_count(df, F):
  """Pair table with 'same_a' as 0/1 plus 'cn', the number of common neighbours of the pair in the graph."""
  G = add_sim(create_graph(df), F)
  rows = []
  for u, v in combinations(G.nodes(), 2):
    linked = G.has_edge(u, v)
    same_affiliation = int(G.nodes[u]['geo'] == G.nodes[v]['geo'])
    shared_neighbors = len(list(nx.common_neighbors(G, u, v)))
    # Connected pairs already carry their score as the 'sim' edge attribute.
    sim = G[u][v]['sim'] if linked else SIM(u, v, G, F)
    rows.append([u, v, same_affiliation, shared_neighbors, sim, int(linked)])
  return pd.DataFrame(rows, columns=['author_1', 'author_2', 'same_a', 'cn',  'sim', 'label'])

In [None]:
# Pair tables with the common-neighbour count feature; these replace the tables built with get_data above.
# NOTE(review): 'cn' and 'sim' are computed on the same graph that defines 'label' — confirm
# this is intended for the validation and test periods.
train_data = get_data_with_n_count(df[df.year < 2016], F)
val_data = get_data_with_n_count(df[df.year == 2016], F)
test_data = get_data_with_n_count(df[df.year == 2017], F)

In [None]:
# Persist the tables; without index=False the row index is written as the first, unnamed column.
train_data.to_csv(gDrivePath + 'train_with_n.csv')
val_data.to_csv(gDrivePath + 'val_with_n.csv')
test_data.to_csv(gDrivePath + 'test_with_n.csv')

In [None]:
#train_data_a = get_data_without_aff(df[df.year < 2016], F)
#val_data_a = get_data_without_aff(df[df.year == 2016], F)
#test_data_a = get_data_without_aff(df[df.year == 2017], F)
#test_data_a = test_data_a[test_data_a.author_1.isin(common_authors_test) & test_data_a.author_2.isin(common_authors_test)]
#val_data_a = val_data_a[val_data_a.author_1.isin(common_authors_val) & val_data_a.author_2.isin(common_authors_val)]
# Load the saved '*_a.csv' tables instead of recomputing them (the commented-out lines above);
# this replaces the tables built by get_data_with_n_count.
train_data = pd.read_csv(gDrivePath + 'train_a.csv')
test_data = pd.read_csv(gDrivePath + 'test_a.csv')
val_data = pd.read_csv(gDrivePath + 'val_a.csv')
print(test_data.shape, val_data.shape, train_data.shape)

In [None]:
import seaborn as sns

# Distribution of the similarity score by "same affiliation" flag, split by label.
# `train_data` holds the table loaded from 'train_a.csv' in the previous cell.
# The former `train_data_a` was never defined (its assignment is commented
# out), and the 'dummy' column it received was not used by the plot.
plt.figure(figsize=(14, 10))
plt.xticks(rotation = 90)
sns.violinplot(y="sim", x="same_a", hue="label", data=train_data, palette="pastel", split=True)

In [None]:
# train_data_a.to_csv(gDrivePath + 'train_a.csv', index=False)
# test_data_a.to_csv(gDrivePath + 'test_a.csv', index=False)
# val_data_a.to_csv(gDrivePath + 'val_a.csv', index=False)

In [None]:
# Keep only pairs whose two authors also appear in the training graph.
# NOTE(review): a later cell re-reads test_a.csv / val_a.csv into the same names, which
# discards this filtering before the models are evaluated — confirm that is intended.
test_data = test_data[test_data.author_1.isin(common_authors_test) & test_data.author_2.isin(common_authors_test)]
val_data = val_data[val_data.author_1.isin(common_authors_val) & val_data.author_2.isin(common_authors_val)]
print(test_data.shape, val_data.shape, train_data.shape)

In [None]:
train_data.to_csv(gDrivePath + 'train.csv', index=False)
test_data.to_csv(gDrivePath + 'test.csv', index=False)
val_data.to_csv(gDrivePath + 'val.csv', index=False)

In [None]:
# Reload the unfiltered '*_a.csv' tables; from here on the models use these frames.
train_data = pd.read_csv(gDrivePath + 'train_a.csv')
test_data = pd.read_csv(gDrivePath + 'test_a.csv')
val_data = pd.read_csv(gDrivePath + 'val_a.csv')

In [None]:
test_data.shape

# Threshold method

In [None]:
def calculateSim(G, F):
  """Return a sparse (LIL) matrix of SIM scores between all ordered pairs of distinct nodes of G.

  Rows and columns follow the iteration order of ``G.nodes``; the entry at
  (row of n2, column of n1) is SIM(n1, n2, G, F) and the diagonal stays zero.
  """
  nodes = list(G.nodes)
  size = G.number_of_nodes()
  d = sparse.lil_matrix((size, size), dtype=float)
  for col, n1 in enumerate(nodes):
    for row, n2 in enumerate(nodes):
      if n1 == n2:
        continue
      d[row, col] = SIM(n1, n2, G, F)
  return d

In [None]:
# Pairwise SIM scores on the training graph, labelled by author name.
sims = calculateSim(G_train, F)
train_nodes = list(G_train.nodes)
df_train_sim = pd.DataFrame(sims.todense(), columns=train_nodes, index=train_nodes)
# Restrict to authors present in both train and validation. pandas >= 2.0
# rejects a set as a .loc indexer; list() keeps the set's own iteration order,
# so rows and columns line up with other frames sliced by the same set.
df_train_sim_ca = df_train_sim.loc[list(common_authors_val), list(common_authors_val)]

In [None]:
# Adjacency matrix of the validation graph
test_adj = nx.adjacency_matrix(G_val).todense()
# The same matrix, labelled by author name
val_nodes = list(G_val.nodes)
df_test_adj = pd.DataFrame(test_adj, columns=val_nodes, index=val_nodes)
# Restricted to authors present in both train and validation. pandas >= 2.0
# rejects a set as a .loc indexer; list() keeps the set's own iteration order.
df_test_adj_ca = df_test_adj.loc[list(common_authors_val), list(common_authors_val)]

In [None]:
# Adjacency matrix of the test graph
test_adj = nx.adjacency_matrix(G_test).todense()
# The same matrix, labelled by author name
test_nodes = list(G_test.nodes)
df_test_adj = pd.DataFrame(test_adj, columns=test_nodes, index=test_nodes)
# Restricted to authors present in both train and test. pandas >= 2.0 rejects
# a set as a .loc indexer; list() keeps the set's own iteration order, so the
# two frames below stay aligned element by element.
common_test_list = list(common_authors_test)
df_test_adj_ca = df_test_adj.loc[common_test_list, common_test_list]
df_train_sim_ca = df_train_sim.loc[common_test_list, common_test_list]

In [None]:
# Similarity value above which a pair of authors is predicted to become linked
# (the threshold is applied to the SIM score of the training graph).
threshold = 0.791
# Predicted adjacency matrix: 1 where the training similarity exceeds the threshold
mask = (df_train_sim_ca > threshold).astype(int)

# Precision and recall of the predicted links against the actual adjacency matrix
precision = precision_score(df_test_adj_ca.values.flatten(), mask.values.flatten())
recall = recall_score(df_test_adj_ca.values.flatten(), mask.values.flatten())
# F-beta with beta = 3: (1 + 9) * P * R / (9 * P + R)
f3 = 10 * recall * precision / (9 * precision + recall)
print('Precision ' + str(precision))
print('Recall ' + str(recall))
print('F3 ' + str(f3))

# TM + affiliations

In [None]:
# Features are every column except the two author names and the label.
X_train = train_data.drop(['author_1', 'author_2', 'label'], axis=1)
y_train = train_data['label']
X_test = test_data.drop(['author_1', 'author_2', 'label'], axis=1)
y_test = test_data['label']
X_val = val_data.drop(['author_1', 'author_2', 'label'], axis=1)
y_val = val_data['label']

# CatBoost pools; 'same_a' is declared as a categorical feature.
cat_feat = ['same_a']
test = cgb.Pool(data=X_test, label=y_test, cat_features=cat_feat)
val = cgb.Pool(data=X_val, label=y_val, cat_features=cat_feat)
train = cgb.Pool(data=X_train, label=y_train, cat_features=cat_feat)

In [None]:
def score(params):
  """Hyperopt objective: fit CatBoost with ``params`` on ``train`` and return minus the last validation F1."""
  model = cgb.CatBoostClassifier(**params, task_type="GPU", logging_level='Silent')
  model.fit(train, eval_set=val)
  metrics = model.eval_metrics(val, ['F1:use_weights=false'])

  # hyperopt minimizes the loss, hence the sign flip.
  return {'loss': -metrics['F1:use_weights=false'][-1], 'status': STATUS_OK}

# Search space for the CatBoost objective; plain values are fixed, hp.* entries are sampled.
space = {
    'loss_function': 'Logloss',
    'eval_metric': 'F1:use_weights=false',
    'iterations': hp.choice('iterations', np.arange(50, 500, 50)),
    'learning_rate': hp.uniform('learning_rate', 0.1, 1),
    'random_seed': 42,
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0, 20),
    'depth': hp.choice('depth', np.arange(3, 16, 1)),
    # NOTE(review): sampled as a float — confirm CatBoost accepts a non-integer min_data_in_leaf.
    'min_data_in_leaf': hp.uniform('min_data_in_leaf', 0, 10),
    #'max_leaves': hp.choice('max_leaves', np.arange(20, 64, 1)),
    'auto_class_weights': hp.choice('auto_class_weights', ['Balanced', 'SqrtBalanced']),
          }

def optimize(trials, space):
  """Run 400 TPE evaluations of ``score`` over ``space`` and return fmin's best point.

  ``trials`` is handed to fmin so that every evaluation is recorded in it;
  previously the argument was accepted but never used, leaving it empty.
  For hp.choice parameters fmin reports the index of the chosen option, not
  the option itself; use ``space_eval(space, best)`` to get actual values.
  """
  best = fmin(score, space, algo = tpe.suggest, max_evals = 400, trials = trials)
  return best

trials = Trials()
# fmin reports hp.choice parameters as option indices; space_eval maps the
# result back onto the search space so best_params holds usable values.
best_params = space_eval(space, optimize(trials, space))

# Классификатор (CatBoost)

In [None]:
# Final CatBoost model. The hyperparameters are written out here; `best_params` from the
# search above is not used by this cell.
model = cgb.CatBoostClassifier(loss_function='Logloss',
                               eval_metric='F1:use_weights=false',
                               random_seed=42,
                               auto_class_weights='SqrtBalanced',
                               iterations = 70,
                               logging_level='Verbose')
model.fit(train, eval_set=val)

In [None]:
def findBestTresholdForF1(probs, y):
  """Find the probability cut-off that maximizes F1 of the positive class.

  A sample is predicted positive when its probability is strictly greater
  than the cut-off. Candidate cut-offs are the distinct values of ``probs``.

  Parameters
  ----------
  probs : array-like of float, predicted probability of class 1.
  y : array-like of 0/1 labels, same length as ``probs``.

  Returns
  -------
  (best_f1, threshold). ``(0, 0)`` when no cut-off gives a positive F1. When
  several cut-offs share the best F1, the smallest one is returned.

  All candidates are scored from one sort and one cumulative sum, instead of
  one full F1 computation per distinct probability.
  """
  probs = np.asarray(probs, dtype=float)
  labels = np.asarray(y).astype(int)
  if probs.size == 0:
    return 0, 0

  order = np.argsort(probs, kind='stable')
  sorted_probs = probs[order]
  sorted_labels = labels[order]

  thresholds, first_idx, counts = np.unique(sorted_probs, return_index=True, return_counts=True)
  # Samples strictly above thresholds[k] start at this position of the sorted arrays.
  above_start = first_idx + counts

  # positives_before[i] = number of positives among the first i sorted samples.
  positives_before = np.concatenate(([0], np.cumsum(sorted_labels)))
  total_pos = positives_before[-1]

  tp = total_pos - positives_before[above_start]
  predicted_pos = probs.size - above_start
  # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives)
  denom = predicted_pos + total_pos
  f1 = np.where(denom > 0, 2.0 * tp / np.maximum(denom, 1), 0.0)

  best = int(np.argmax(f1))
  if f1[best] <= 0:
    return 0, 0
  return float(f1[best]), float(thresholds[best])

In [None]:
# Class-1 probabilities on the test pool and the cut-off that maximizes F1 on it.
# NOTE(review): the cut-off is chosen on the test labels themselves, so this F1 is an
# optimistic estimate — consider choosing it on the validation pool.
probs = model.predict_proba(test)
f1, tresh = findBestTresholdForF1(probs[:, 1], test.get_label())
preds = (probs[:, 1] > tresh).astype(int)
f1

In [None]:
model.eval_metrics(test, ['PRAUC:use_weights=false', 'F1:use_weights=false', 'AUC:use_weights=false', 'Precision:use_weights=false', 'Recall:use_weights=false'])

In [None]:
test.get_label().sum()

In [None]:
cgb.utils.get_confusion_matrix(model, test)

In [None]:
metrics = model.eval_metrics(val, ['AUC', 'Precision', 'BalancedAccuracy'])

In [None]:
def predict(model, train, test):
  """Predict on ``test`` with a probability boundary selected from the ROC curve on ``train``.

  Returns (predicted 0/1 labels, class-1 probabilities) for ``test``.
  """
  roc_curve_values = get_roc_curve(model, train)
  # NOTE(review): the boundary is requested for FPR=0.9 on the training curve — confirm this target rate is intended.
  boundary = select_threshold(model, curve=roc_curve_values, FPR=0.9)
  print(boundary)
  proba = model.predict_proba(test)
  predicted = (proba[:, 1] >  boundary).astype(int)
  return predicted, proba[:, 1]

predicted, proba = predict(model, train, val)

In [None]:
# Flip predictions and labels (0 <-> 1) so that "no link" is treated as the positive class.
# Re-running this cell flips `predicted` back again.
predicted = -1 * predicted + 1
actual = -1 * val.get_label() + 1
# Pairs that are positive in both the flipped predictions and the flipped labels.
tp = (predicted * actual).sum()
# ROC AUC of the flipped problem and precision of the flipped predictions.
print(roc_auc_score(actual, 1 - proba), tp / predicted.sum())
print(predicted.sum(), tp.sum())
print(actual.sum())

In [None]:
# Validation Precision and AUC per boosting iteration (values from model.eval_metrics on `val`).
plt.figure(figsize = (14,10))
plt.plot(np.arange(len(metrics['Precision'])), metrics['Precision'], label = 'Pr')
plt.plot(np.arange(len(metrics['AUC'])), metrics['AUC'], label = 'AUC')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# NOTE(review): this cell repeats the previous plot exactly — remove it or plot the intended other metric.
plt.figure(figsize = (14,10))
plt.plot(np.arange(len(metrics['Precision'])), metrics['Precision'], label = 'Pr')
plt.plot(np.arange(len(metrics['AUC'])), metrics['AUC'], label = 'AUC')
plt.legend()
plt.grid(True)
plt.show()

# Classifier (Decision tree)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

In [None]:
# Baseline decision tree with default hyperparameters and balanced class weights.
clf = DecisionTreeClassifier(random_state=42, class_weight = 'balanced')
clf.fit(X_train, y_train)

In [None]:
train_data[train_data.label == 0].shape[0]

In [None]:
# Decision-tree probabilities on the test set, the F1-maximizing cut-off, and ROC AUC.
# NOTE(review): as for CatBoost above, the cut-off is selected on the test labels themselves.
probs = clf.predict_proba(X_test)
f1, tresh = findBestTresholdForF1(probs[:, 1], y_test)
preds = (probs[:, 1] > tresh).astype(int)
roc_auc_score(y_test, probs[:, 1])

In [None]:
classification_report(y_val, clf.predict(X_val), output_dict=True)['1']['f1-score']

In [None]:
def score(params):
  """Hyperopt objective for the decision tree: minus the validation F1 of class 1.

  ``params`` (one point of the search space) is forwarded to the classifier.
  Previously it was ignored, so every trial fitted the same default tree and
  the search could not improve anything. This definition replaces the CatBoost
  objective of the same name defined earlier in the notebook.
  """
  clf = DecisionTreeClassifier(random_state=42, class_weight = 'balanced', **params)
  clf.fit(X_train, y_train)
  f1 = classification_report(y_val, clf.predict(X_val), output_dict=True)['1']['f1-score']
  # hyperopt minimizes the loss, hence the sign flip.
  return {'loss': -f1, 'status': STATUS_OK}

# Search space for the decision tree; the keys are DecisionTreeClassifier argument names.
# This replaces the CatBoost search space of the same name defined earlier.
space = {
    'max_depth': hp.choice('max_depth', np.arange(3, 16, 1)),
    'min_samples_split': hp.choice('min_samples_split', np.arange(2, 5, 1)),
    'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(1, 5, 1)),
    'max_features': hp.choice('max_features', [1, 2]),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', range(5, 64, 5)),
    'ccp_alpha': hp.uniform('ccp_alpha', 0, 10)
}

def optimize(trials, space):
  """Run 1000 TPE evaluations of ``score`` over ``space`` and return fmin's best point.

  ``trials`` is handed to fmin so that every evaluation is recorded in it;
  previously the argument was accepted but never used, leaving it empty.
  For hp.choice parameters fmin reports the index of the chosen option, not
  the option itself; use ``space_eval(space, best)`` to get actual values.
  """
  best = fmin(score, space, algo = tpe.suggest, max_evals = 1000, trials = trials)
  return best

trials = Trials()
# fmin reports hp.choice parameters as option indices; space_eval maps the
# result back onto the search space so best_params holds usable values.
best_params = space_eval(space, optimize(trials, space))