## Search Terms

In [None]:
SEARCH_QUERY = 'dna methylation clock'

## Initialization

In [None]:
import logging

from bokeh.plotting import show, output_notebook
from matplotlib import pyplot as plt

from pysrc.papers.config import PubtrendsConfig
from pysrc.papers.pm_loader import PubmedLoader
from pysrc.papers.ss_loader import SemanticScholarLoader
from pysrc.papers.analyzer_experimental import ExperimentalAnalyzer
from pysrc.papers.plotter import Plotter
from pysrc.papers.utils import SORT_MOST_CITED

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
output_notebook()
%matplotlib inline

In [None]:
# Build the analysis pipeline on top of the Semantic Scholar loader.
config = PubtrendsConfig(test=False)
loader = SemanticScholarLoader(config)
# ExperimentalAnalyzer is the analyzer class imported by this notebook
# (pysrc.papers.analyzer_experimental); KeyPaperAnalyzer is not imported
# anywhere here and would raise NameError.
analyzer = ExperimentalAnalyzer(loader, config)

try:
    # Search for SEARCH_QUERY (sorted with SORT_MOST_CITED) and analyze the hits.
    ids = analyzer.search_terms(SEARCH_QUERY, sort=SORT_MOST_CITED)
    analyzer.analyze_papers(ids, SEARCH_QUERY)
finally:
    # Release resources whether or not the analysis succeeded.
    loader.close_connection()
    analyzer.teardown()

## PageRank dynamics

In [None]:
# TODO Pubmed uses neo4j for now
# Open a separate PubmedLoader and reuse the VALUES list stored on the
# analyzer's loader; it is interpolated below as the list of pmids.
loader = PubmedLoader(config)
loader.values = analyzer.loader.values

# The statement first materializes the pmids into the temporary table
# TEMP_PMIDS, then returns (pmid_out, pmid_in, year) rows for citations whose
# pmid_in is in that table; the year is taken from the publication date of
# the pmid_out paper.
# NOTE(review): `SELECT ... INTO temporary table` presumably fails when the
# cell is re-run on a connection where TEMP_PMIDS already exists — confirm.
# NOTE(review): loader.values is pasted into the SQL text as-is; this assumes
# it only ever contains ids produced by the analyzer itself.
with loader.conn.cursor() as cursor:
    cursor.execute(f"""
    WITH vals(pmid) AS (VALUES {loader.values})
    SELECT pmid INTO temporary table TEMP_PMIDS FROM vals;
    SELECT C.pmid_out, C.pmid_in, date_part('year', P.date)
    FROM TEMP_PMIDS T
    JOIN PMCitations C
    ON C.pmid_in = T.pmid
    JOIN PMPublications P
    ON C.pmid_out = P.pmid;
    """)
    data = cursor.fetchall()

In [None]:
import pandas as pd

# Load the fetched rows into a DataFrame with string ids and integer years.
citations = pd.DataFrame(data, columns=['id_out', 'id_in', 'year']).astype(
    {'id_in': str, 'id_out': str, 'year': int})

In [None]:
min_year, max_year = int(citations['year'].min()), int(citations['year'].max())

In [None]:
import networkx as nx

# PageRank of the citation graph built from all citations up to each year.
# The range starts at min_year + 1, so min_year itself gets no snapshot.
pr = {}
first_year, last_year = min_year + 1, max_year
for year in range(first_year, last_year + 1):
    up_to_year = citations['year'] <= year
    G = analyzer.build_citation_graph(citations[up_to_year])
    pr[year] = nx.pagerank(G)

In [None]:
pr_df = pd.concat([pd.Series(v) for v in pr.values()], axis=1, sort=True)
pr_df = pr_df.rename(columns=dict(enumerate(pr.keys()))).reset_index().rename(columns={'index': 'id'})

In [None]:
pr_df = pr_df.fillna(0)
pr_df.head()

In [None]:
import matplotlib.pyplot as plt

# Plot the PageRank of every paper over time, starting from the paper's own
# year as recorded in analyzer.df and ending at max_year.
plt.figure(figsize=(20, 10))
for index, row in pr_df.iterrows():
    # Year value(s) of this paper in analyzer.df; papers that are not present
    # there are skipped.
    paper_year = analyzer.df[analyzer.df['id'] == row['id']]['year'].values
    if len(paper_year) > 0:
        years = range(paper_year[0], max_year + 1)
        # NOTE(review): x (offset from the paper's year) is computed but never
        # used; the curve below is plotted against calendar years — confirm
        # which of the two axes was intended.
        x = [year - paper_year[0] for year in years]
        # Years that have no PageRank entry in the row are drawn as 0.
        y = [row[year] if year in row else 0 for year in years]
        plt.plot(years, y)
plt.grid(True)
plt.show()

## Explanation of New Topics in Evolution Diagram 

In [None]:
plotter = Plotter(analyzer)
show(plotter.topic_evolution())

In [None]:
import pandas as pd

prev = 2014
now = 2019

ct = pd.crosstab(analyzer.evolution_df[prev], analyzer.evolution_df[now])

In [None]:
percentage_ct = ct.to_numpy() / ct.to_numpy().sum(axis=0)

In [None]:
# A topic (column of the crosstab) is considered new when more than THRESHOLD
# of its column-normalized share sits in the first row of the crosstab.
# NOTE(review): row 0 is presumably component '-1' (papers without a cluster in
# `prev`); that only holds if -1 actually occurs in evolution_df[prev] — confirm.
THRESHOLD = 0.5
new_topics = ct.columns[percentage_ct[0, :] > THRESHOLD].values

In [None]:
G_reversed = analyzer.G.reverse()

In [None]:
import numpy as np
import networkx as nx

# A paper is reported for a new topic when the DFS tree rooted at it in the
# reversed citation graph contains at least EXPLAIN_THRESHOLD * len(new_ids)
# papers of the same topic, where new_ids are the topic's papers published
# after `prev`.
EXPLAIN_THRESHOLD = 0.4

# Component of every paper, built once instead of filtering the whole
# DataFrame for each visited node. Keeping the first row per id gives the same
# answer as `df[df['id'] == node]['comp'].values[0]`.
comp_by_id = analyzer.df.drop_duplicates(subset='id').set_index('id')['comp'].to_dict()

for topic in new_topics:
    new_ids = analyzer.df[np.logical_and(analyzer.df['comp'] == topic,
                                         analyzer.df['year'] > prev)]['id'].values
    print(f'Topic {topic}')
    for pid in new_ids:
        # Check membership with exactly the key the traversal starts from, so
        # a non-string pid cannot pass the check and then fail in dfs_tree.
        source = str(pid)
        if source not in G_reversed:
            continue
        # Papers reachable from `source` that belong to the same topic.
        nodes = [node for node in nx.dfs_tree(G_reversed, source=source)
                 if comp_by_id.get(node) == topic]
        if len(nodes) >= EXPLAIN_THRESHOLD * len(new_ids):
            print(pid, len(nodes) / len(new_ids))

## Graph Analysis

In [None]:
import pandas as pd
import re

# Fetch every citation row where either side (pmid_in or pmid_out) is one of
# the analyzed papers; both ids are cast to text in the query.
loader = PubmedLoader(config)
loader.values = analyzer.loader.values
# Substitute the VALUES list into the query template via loader.VALUES_REGEX.
# NOTE(review): presumably VALUES_REGEX matches the $VALUES$ placeholder — confirm.
query = re.sub(loader.VALUES_REGEX, loader.values, '''
SELECT CAST(C.pmid_out AS TEXT), CAST(C.pmid_in AS TEXT)
FROM PMCitations C
JOIN (VALUES $VALUES$) AS CT(pmid) ON (C.pmid_in = CT.pmid) OR (C.pmid_out = CT.pmid);
''')

with loader.conn.cursor() as cursor:
    cursor.execute(query)

    # Result columns follow the SELECT order: pmid_out first, then pmid_in.
    cit_df = pd.DataFrame(cursor.fetchall(), columns=['id_out', 'id_in'])

In [None]:
papers = set(cit_df['id_out'].values) | set(cit_df['id_in'].values)
papers = [int(pid) for pid in papers]

In [None]:
# Rebuild the VALUES list as "(id), (id), ..." over the sorted paper ids.
loader.values = ', '.join(f'({pid})' for pid in sorted(papers))

## Text Preprocessing & Clustering

In [None]:
from papers.utils import tokenize, build_corpus, vectorize, lda_topics

In [None]:
corpus = build_corpus(analyzer.pub_df)
tfidf, vectorizer = vectorize(corpus, n_words=1000)

## Text Clustering - Cosine Similarity

It looks like the pairwise distances are almost equal, possibly because of the curse of dimensionality.

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

sim = cosine_similarity(tfidf)
euclidean = euclidean_distances(tfidf)

In [None]:
plt.hist(sim.flatten())

In [None]:
plt.hist(euclidean.flatten())

## Text Clustering

Latent Dirichlet Allocation + Perplexity-based Selection of the Optimal Number of Topics

In [None]:
from scipy.optimize import minimize_scalar
from sklearn.decomposition import LatentDirichletAllocation

SEED = 20190816

def lda_optimal_topics(counts):
    """Fit LDA with a number of topics chosen by minimizing perplexity.

    The number of components is searched with bounded scalar minimization on
    [1, min(counts.shape[0], 100)]. Every candidate is rounded to the nearest
    integer and scored by the mean perplexity of several LDA fits on `counts`.
    The final model is produced by `lda_topics` for the best candidate.

    :param counts: document-term matrix passed to LatentDirichletAllocation
    :return: tuple (topics, lda) exactly as returned by `lda_topics`
    """
    # Store results during optimization to avoid re-calculation of perplexity
    # for the same number of components
    results = {}

    def lda_evaluate(counts, n_comps, n_times=10):
        # The optimizer proposes real numbers; LDA needs an integer.
        actual = int(round(n_comps))
        logging.info(f'Trying {n_comps} - {actual}')

        if actual not in results:
            p = 0
            for run in range(n_times):
                # Use a different but reproducible seed for every run: with a
                # single fixed random_state all n_times fits are the same
                # model, so averaging them adds cost but no information.
                lda = LatentDirichletAllocation(n_components=actual, random_state=SEED + run)
                lda.fit(counts)
                p += lda.perplexity(counts)
            results[actual] = p / n_times

        return results[actual]

    upper_bound = min(counts.shape[0], 100)
    res = minimize_scalar(lambda x: lda_evaluate(counts, x), bounds=(1, upper_bound), method='bounded',
                          options={'xatol': 1, 'maxiter': 10})

    opt = int(round(res.x))
    logging.info(f'Found {opt} topics')
    topics, lda = lda_topics(counts, n_topics=opt)

    return topics, lda

In [None]:
corpus = build_corpus(analyzer.df)

In [None]:
counts, vectorizer = vectorize(corpus)

In [None]:
topics, lda = lda_optimal_topics(counts)

In [None]:
from papers.utils import explain_lda_topics

explain_lda_topics(lda, vectorizer)

## Co-citation Clusters vs Text-based Clusters

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
labels = topics.argmax(axis=1)

In [None]:
# Co-citation components (negative ids excluded) versus text-based topic labels.
graph_labels = [c for c in set(analyzer.df['comp'].values) if c >= 0]
text_labels = list(set(labels))

# cm[i, j] = number of papers in graph component i that got text label j
cm = np.zeros((len(graph_labels), len(text_labels)))
for i, ci in enumerate(graph_labels):
    in_component = analyzer.df['comp'] == ci
    for j, cj in enumerate(text_labels):
        cm[i, j] = np.logical_and(in_component, labels == cj).sum()

In [None]:
cm

## Topic Evolution based on LDA clustering

In [None]:
import numpy as np
import pandas as pd

from papers.utils import explain_lda_topics

def lda_topic_evolution(df, step=5, min_papers=0, current=0):
    """Cluster papers with LDA at several points in time.

    Years are sampled from the latest year in `df` down to the earliest one
    with the given `step`. For every sampled year the papers with
    `year <= sampled year` are vectorized, clustered with
    `lda_optimal_topics`, and the resulting partition is passed through
    `analyzer.merge_components`.

    :param df: DataFrame with at least 'year' and 'id' columns, plus whatever
        `build_corpus` reads
    :param step: distance in years between two sampled years
    :param min_papers: currently unused, kept for interface compatibility
    :param current: currently unused, kept for interface compatibility
    :return: tuple (partition, explanation, year_range) where `partition` and
        `explanation` are dicts keyed by sampled year; (None, None, None) when
        fewer than two years can be sampled
    """
    min_year = int(df['year'].min())
    max_year = int(df['year'].max())
    year_range = list(np.arange(max_year, min_year - 1, step=-step).astype(int))

    # Cannot analyze evolution
    if len(year_range) < 2:
        logging.info(f'Year step is too big to analyze evovution of topics in {min_year} - {max_year}')
        # Same arity as the regular return value, so callers that unpack
        # three values do not fail with ValueError on this path.
        return None, None, None

    logging.info(f'Studying evolution of topics in {min_year} - {max_year}')
    logging.info(f"Years when topics are studied: {', '.join([str(year) for year in year_range])}")

    partition = {}
    explanation = {}
    for year in year_range:
        # Only papers published up to the sampled year take part in clustering.
        valid = df[df['year'] <= year]
        corpus = build_corpus(valid)
        counts, vectorizer = vectorize(corpus, terms=analyzer.terms, n_words=len(valid) * 3)
        topics, lda = lda_optimal_topics(counts)
        explanation[year] = explain_lda_topics(lda, vectorizer, n_top_words=20)
        # Each paper is assigned to its highest-scoring topic.
        partition[year] = dict(zip(list(valid['id'].values), topics.argmax(axis=1)))
        partition[year], _ = analyzer.merge_components(partition[year])

    return partition, explanation, year_range

In [None]:
p, explanation, year_range = lda_topic_evolution(analyzer.pub_df)

In [None]:
analyzer.evolution_year_range = year_range

# One Series of component labels per studied year, side by side.
evolution_series = [pd.Series(p[year]) for year in year_range]
evolution = pd.concat(evolution_series, axis=1)
evolution = evolution.rename(columns=dict(enumerate(year_range)))

# Reverse the column order.
evolution = evolution[list(evolution.columns)[::-1]]

# Assign -1 to articles that do not belong to any cluster at some step
evolution = evolution.fillna(-1.0).astype(int)

# Move the index into a regular string column named 'id'.
evolution = evolution.reset_index().rename(columns={'index': 'id'})
evolution['id'] = evolution['id'].astype(str)

analyzer.evolution_df = evolution

In [None]:
# Components of the latest studied year. Take the year from year_range rather
# than a literal, which raises KeyError whenever the data does not end in
# that exact year.
analyzer.components = set(p[max(year_range)].values())

In [None]:
# For every studied year: component -> list of the second element of each
# explanation entry.
analyzer.evolution_kwds = {
    year: {c: [el[1] for el in words] for c, words in explanation[year].items()}
    for year in year_range
}

In [None]:
plotter = Plotter(analyzer)
show(plotter.topic_evolution())

# Raw Ideas

## Stable Clustering

Make clustering more stable by aggregating output of several runs.

In [None]:
import networkx as nx
import community

N_TIMES = 10
THRESHOLD = 0.66

nodes = list(analyzer.CG.nodes())
n_nodes = len(nodes)

# CG accumulates, for every pair of nodes, the number of runs (out of
# N_TIMES) in which both nodes were put into the same community.
CG = nx.Graph()
for t in range(N_TIMES):
    p = community.best_partition(analyzer.CG, randomize=True)
    for i, v in enumerate(nodes):
        for u in nodes[i + 1:]:
            if p[v] != p[u]:
                continue
            if CG.has_edge(v, u):
                CG[v][u]['weight'] += 1
            else:
                CG.add_edge(v, u, weight=1)

In [None]:
# Keep the pairs whose co-clustering count exceeds N_TIMES * THRESHOLD.
reliable_edges = [
    (v, u)
    for v, u, attrs in CG.edges(data=True)
    if attrs['weight'] > N_TIMES * THRESHOLD
]

In [None]:
len(reliable_edges)

In [None]:
len(CG.edges)

In [None]:
CG_reliable = CG.edge_subgraph(reliable_edges)

In [None]:
CG_reliable.number_of_nodes()

In [None]:
analyzer.CG.number_of_nodes()

In [None]:
# Every connected component of the reliable subgraph becomes one community.
reliable_partition = {}

n_comps = 0
for comp_id, members in enumerate(nx.connected_components(CG_reliable)):
    n_comps = comp_id + 1
    reliable_partition.update(dict.fromkeys(members, comp_id))

# Nodes without reliable links all share one extra community id
for node in CG.nodes():
    reliable_partition.setdefault(node, n_comps)

In [None]:
init = community.best_partition(CG, partition=reliable_partition)

In [None]:
# `init` maps node -> community id, so the number of communities is the number
# of distinct VALUES; iterating the dict itself would count nodes instead.
n_comps = len(set(init.values()))

# Nodes without reliable links: put them all into one extra community
for v in analyzer.CG.nodes():
    if v not in init:
        init[v] = n_comps

In [None]:
final = community.best_partition(analyzer.CG, partition=init)

In [None]:
len(set(reliable_partition.values()))

In [None]:
len(set(community.best_partition(analyzer.CG).values()))

## MST with PageRank

In [None]:
import networkx as nx

pr = nx.pagerank(analyzer.G)

In [None]:
G_undirected = analyzer.G.to_undirected()

In [None]:
for v, u in G_undirected.edges():
    G_undirected[v][u]['weight'] = pr[v] - pr[u] - 10

In [None]:
comp = list(nx.connected_components(G_undirected))[1]

In [None]:
G_comp = G_undirected.subgraph(comp)

In [None]:
T = nx.minimum_spanning_tree(G_comp)

In [None]:
nx.is_tree(T)

In [None]:
A = nx.nx_agraph.to_agraph(T)
A.layout('sfdp')
A.draw('mst.png')

## Citation Graph for a Certain Author

In [None]:
AUTHOR = 'Horvath'

In [None]:
author_df = analyzer.df[analyzer.df['authors'].apply(lambda x: AUTHOR in x)]

In [None]:
author_G = analyzer.G.subgraph(author_df['id'].values)

In [None]:
author_G.number_of_nodes()

In [None]:
import networkx as nx
import pygraphviz as pgv
from IPython.display import Image

def draw(graph):
    """Convert `graph` to a pygraphviz AGraph, draw it as PNG with the 'sfdp'
    program and wrap the result in an IPython Image."""
    agraph = nx.nx_agraph.to_agraph(graph)
    rendered = agraph.draw(format='png', prog='sfdp')
    return Image(rendered)

In [None]:
draw(author_G)