# Grid search optimization of clustering

This notebook analyzes the results of the grid search used to optimize the clustering of papers.
It covers the following clustering methods:
* LDA (Latent Dirichlet Allocation)
* Louvain community detection algorithm, followed by merging tiny clusters
* Hierarchical clustering of word2vec-based embeddings for citation graph and texts
* DBSCAN of embeddings, followed by merging tiny clusters

In [None]:
# Base name (without file extension) of the results CSV read by this notebook;
# it is also used as the prefix of every file written to `results/`.
OUTPUT_NAME = 'grid_search_2021_11_02'

## Imports

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import seaborn as sns
from IPython.display import display

sns.set_style("whitegrid")
import matplotlib.pyplot as plt

import logging
import pandas as pd

from sklearn.metrics.cluster import adjusted_mutual_info_score, v_measure_score

from utils.io import load_analyzer, load_clustering, get_review_pmids
from utils.preprocessing import preprocess_clustering, get_clustering_level

In [None]:
# Configure logging.
# `basicConfig` installs a stream handler on the root logger when it has none yet.
# Without any handler, INFO records only reach logging's last-resort handler, which
# emits WARNING and above, so `logger.info(...)` would print nothing.
# The call does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

## Analyze ground truth clustering

In [None]:
# Ids of the reviews, as returned by `get_review_pmids`, and how many there are
review_pmids = get_review_pmids()
n_reviews = len(review_pmids)

# Empty containers; `results_df` is replaced once the results CSV is loaded further down
results_df = pd.DataFrame()
partitions_overall = list()

In [None]:
from tqdm.auto import tqdm

logger.info('Computing ground truth clustering features')

# One (pmid, level, number of distinct clusters) record per review and hierarchy level.
# Records are collected in a list and the frame is built once at the end,
# instead of enlarging the DataFrame one row at a time.
ground_truth_records = []
for pmid in tqdm(review_pmids):
    clustering = load_clustering(pmid)
    # NOTE(review): `analyzer` is not used in this cell; the load is kept in case it has side effects
    analyzer = load_analyzer(pmid)

    # NOTE(review): `range` stops before `get_clustering_level(clustering)`;
    # confirm that excluding this last value is intended
    for level in range(1, get_clustering_level(clustering)):
        clusters = preprocess_clustering(
            clustering, level, include_box_sections=False, uniqueness_method='unique_only'
        )
        ground_truth_records.append((pmid, level, len(set(clusters.values()))))

ground_truth_clusters_df = pd.DataFrame(
    ground_truth_records, columns=['Pmid', 'Level', 'Clusters'], dtype=object
)
display(ground_truth_clusters_df.head())

In [None]:
import os

# Create the output directory from Python: unlike `! mkdir results` this does not
# report an error when the directory already exists (e.g. when the notebook is re-run)
# and it does not depend on the shell.
os.makedirs('results', exist_ok=True)

# Histogram of the number of ground truth clusters, one polygon per level
sns.histplot(data=ground_truth_clusters_df, x='Clusters', hue='Level', element='poly')
plt.title('Ground truth clusters number')
plt.savefig(f'results/{OUTPUT_NAME}_ground_truth_clusters.png')
plt.show()

## Grid search
See `grid_search.py` to launch the parameter grid search in parallel with Celery.

In [None]:
def reg_v_score(labels_true, labels_pred, reg=0.01):
    """
    V-measure score with a linear penalty on the number of predicted clusters.

    :param labels_true: reference labels, passed to `v_measure_score`
    :param labels_pred: predicted labels, passed to `v_measure_score`
    :param reg: amount subtracted for every distinct value in `labels_pred`
    :return: `v_measure_score(labels_true, labels_pred)` minus `reg` times
             the number of distinct predicted labels
    """
    score = v_measure_score(labels_true, labels_pred)
    penalty = reg * len(set(labels_pred))
    return score - penalty


In [None]:
# Scoring functions; their `__name__`s are used below as the names of the score columns
metrics = [adjusted_mutual_info_score, reg_v_score]

## Visualization

In [None]:
# Load the results table from `<OUTPUT_NAME>.csv`
# (presumably written by `grid_search.py` -- confirm)
results_df = pd.read_csv(f'{OUTPUT_NAME}.csv')

#### Extract parameter columns

In [None]:
# Every column that is neither a score nor one of 'level', 'n_clusters', 'pmid'
# is treated as a grid search parameter.
# Iterating over `results_df.columns` keeps the CSV column order, so `param_columns`
# is the same on every run; a list built from a set has an arbitrary order, which
# would change the column order of the tables and CSV files produced below.
score_columns = {m.__name__ for m in metrics}
non_param_columns = score_columns | {'level', 'n_clusters', 'pmid'}
param_columns = [col for col in results_df.columns if col not in non_param_columns]
print(param_columns)

#### Number of clusters and adjusted mutual information

In [None]:
# Box plot of the `n_clusters` column of all results, one box per method
ax = sns.boxplot(data=results_df, x='method', y='n_clusters', hue='method')
ax.set(title='Mean clusters number', xlabel='Method', ylabel='Clusters')
plt.savefig(f'results/{OUTPUT_NAME}_mean_clusters_number.png')
plt.show()

In [None]:
# Box plot of the adjusted mutual information of all results, per method and level
ax = sns.boxplot(data=results_df, x='method', y='adjusted_mutual_info_score', hue='level')
ax.set(title='Mean adjusted mutual information', xlabel='Method', ylabel='AMI')
plt.savefig(f'results/{OUTPUT_NAME}_mean_adjusted_mutual_information.png')
plt.show()

#### Best scores

In [None]:
# Keep, for every (method, pmid) pair, the single row with the highest AMI:
# after the descending sort the first occurrence of each pair is its best row.
best_df = (
    results_df
    .sort_values(by='adjusted_mutual_info_score', ascending=False)
    .drop_duplicates(subset=['method', 'pmid'], keep='first')
)

In [None]:
# Box plot of the `n_clusters` column of `best_df`, one box per method
ax = sns.boxplot(data=best_df, x='method', y='n_clusters', hue='method')
ax.set(title='Clusters number for best params', xlabel='Method', ylabel='Clusters')
plt.savefig(f'results/{OUTPUT_NAME}_best_clusters_number.png')
plt.show()

In [None]:
# Box plot of the adjusted mutual information in `best_df`, per method and level
ax = sns.boxplot(data=best_df, x='method', y='adjusted_mutual_info_score', hue='level')
ax.set(title='Adjusted mutual information for best params', xlabel='Method', ylabel='AMI')
plt.savefig(f'results/{OUTPUT_NAME}_best_adjusted_mutual_information.png')
plt.show()

#### Average Scores 

In [None]:
def get_top_parameter_sets_for_method(score_df, param_cols, method, target_col, n=5):
    """
    Rank the parameter sets of one method by their mean target score.

    :param score_df: results table with a `method` column, the `param_cols` columns,
                     `target_col` and `n_clusters`
    :param param_cols: columns that together identify one parameter set
    :param method: value of the `method` column to select
    :param target_col: score column used for ranking
    :param n: number of parameter sets to return
    :return: DataFrame with at most `n` rows (best first) holding `param_cols`,
             the mean of `target_col` and the mean of `n_clusters`
    """
    method_rows = score_df.loc[score_df['method'] == method]
    mean_scores = method_rows.groupby(param_cols)[[target_col, 'n_clusters']].mean()
    ranked = mean_scores.sort_values(by=target_col, ascending=False)
    return ranked.head(n).reset_index()

In [None]:
def get_top_mean_score_for_method(score_df, param_cols, method, target_col):
    """
    Best mean target score reached by any parameter set of one method.

    :param score_df: results table with a `method` column, the `param_cols` columns
                     and `target_col`
    :param param_cols: columns that together identify one parameter set
    :param method: value of the `method` column to select
    :param target_col: score column to average per parameter set
    :return: the largest per-parameter-set mean of `target_col`
    :raises IndexError: if no row belongs to `method`
    """
    method_rows = score_df.loc[score_df['method'] == method]
    mean_scores = method_rows.groupby(param_cols)[target_col].mean()
    ranked = mean_scores.sort_values(ascending=False)
    return ranked.values[0]

In [None]:
import numpy as np

target_col = 'adjusted_mutual_info_score'

# For every method, collect the result rows that used one of its top-ranked
# parameter sets and tag them with the rank of that set (`top` = 1 is the best).
tops = []
for method in results_df.method.unique():
    top_score = get_top_mean_score_for_method(results_df, param_columns, method, target_col)
    print(method, ':', target_col, top_score, '\n')
    top_params_df = get_top_parameter_sets_for_method(results_df, param_columns, method, target_col)
    display(top_params_df)

    scores_df = results_df[results_df.method == method].copy()
    for i, row in top_params_df[param_columns].iterrows():
        # A row matches when every parameter column equals the value in this parameter set
        row_mask = np.ones(len(scores_df), dtype=bool)
        for param in param_columns:
            row_mask = row_mask & (scores_df[param] == row[param]).to_numpy()
        matched_df = scores_df.loc[row_mask].assign(method=method, top=i + 1)
        tops.append(matched_df)

top_df = pd.concat(tops)
ax = sns.boxplot(data=top_df, x='method', y='adjusted_mutual_info_score', hue='top')
ax.set(title='Adjusted mutual information', xlabel='Method', ylabel='AMI')
plt.savefig(f'results/{OUTPUT_NAME}_top_adjusted_mutual_information.png')
plt.show()

In [None]:
# One tuple per method: the method name followed by its best mean score for every metric
mean_score_data = [
    (
        method,
        *(
            get_top_mean_score_for_method(results_df, param_columns, method, metric.__name__)
            for metric in metrics
        ),
    )
    for method in results_df.method.unique()
]

In [None]:
# Table of the best mean scores: a `method` column plus one column per metric
metric_names = [metric.__name__ for metric in metrics]
mean_score_df = pd.DataFrame(mean_score_data, columns=['method'] + metric_names)
mean_score_df.head(4)

In [None]:
# Save the per-method score table to `results/` without the index column
mean_score_df.to_csv(f'results/{OUTPUT_NAME}_mean_scores_per_method.csv', index=False)

In [None]:
# Bar chart with one group of bars (one bar per metric) for every method
ax = mean_score_df.plot.bar(x='method', y=metric_names)
ax.get_figure().savefig(f'results/{OUTPUT_NAME}_mean_scores_per_method.png')

#### Best parameters visualization

In [None]:
import plotly.graph_objects as go

# Parameter columns shown on the axes of the radar chart
categories = [
    'similarity_bibliographic_coupling',
    'similarity_cocitation',
    'similarity_citation',
]

fig = go.Figure()
for method in results_df.method.unique():
    t = get_top_parameter_sets_for_method(results_df, param_columns, method, target_col)
    # Values of the best parameter set (first row), one per radar axis
    r = tuple(t[category].values[0] for category in categories)
    if method == 'lda':
        # No trace is drawn for LDA
        continue
    fig.add_trace(go.Scatterpolar(r=r, theta=categories, fill='toself', name=method))

fig.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
    showlegend=False,
)
fig.write_image(f'results/{OUTPUT_NAME}_params.png')
fig.show()

#### Average Scores for Different Clustering Levels

In [None]:
def get_top_parameter_sets_for_level_and_method(score_df, param_cols, level, method, target_col, n=5):
    """
    Rank the parameter sets of one method at one level by their mean target score.

    :param score_df: results table with `method` and `level` columns, the `param_cols`
                     columns, `target_col` and `n_clusters`
    :param param_cols: columns that together identify one parameter set
    :param level: value of the `level` column to select
    :param method: value of the `method` column to select
    :param target_col: score column used for ranking
    :param n: number of parameter sets to return
    :return: DataFrame with at most `n` rows (best first) holding `param_cols`,
             the mean of `target_col` and the mean of `n_clusters`
    """
    selection = (score_df['method'] == method) & (score_df['level'] == level)
    mean_scores = score_df[selection].groupby(param_cols)[[target_col, 'n_clusters']].mean()
    ranked = mean_scores.sort_values(by=target_col, ascending=False)
    return ranked.head(n).reset_index()

In [None]:
def get_top_mean_score_for_level_and_method(score_df, param_cols, level, method, target_col):
    """
    Best mean target score reached by any parameter set of one method at one level.

    :param score_df: results table with `method` and `level` columns, the `param_cols`
                     columns and `target_col`
    :param param_cols: columns that together identify one parameter set
    :param level: value of the `level` column to select
    :param method: value of the `method` column to select
    :param target_col: score column to average per parameter set
    :return: the largest per-parameter-set mean of `target_col`
    :raises IndexError: if no row matches both `method` and `level`
    """
    selection = (score_df['method'] == method) & (score_df['level'] == level)
    mean_scores = score_df[selection].groupby(param_cols)[target_col].mean()
    ranked = mean_scores.sort_values(ascending=False)
    return ranked.values[0]

In [None]:
target_col = 'adjusted_mutual_info_score'

# For every level: rank the parameter sets of each method, save the ranking, and plot
# the AMI of the result rows that used one of the top-ranked sets (`top` = 1 is the best).
for level in results_df.level.unique():
    print(f'LEVEL {level}')
    tops = []
    for method in results_df.method.unique():
        top_score = get_top_mean_score_for_level_and_method(results_df, param_columns, level, method, target_col)
        print(method, ':', target_col, top_score, '\n')
        top_params_df = get_top_parameter_sets_for_level_and_method(results_df, param_columns, level, method, target_col)
        display(top_params_df)
        top_params_df.to_csv(f'results/{OUTPUT_NAME}_top_params_{method}_{level}.csv', index=False)

        level_method_rows = (results_df.method == method) & (results_df.level == level)
        scores_df = results_df[level_method_rows].copy()
        for i, row in top_params_df[param_columns].iterrows():
            # A row matches when every parameter column equals the value in this parameter set
            row_mask = np.ones(len(scores_df), dtype=bool)
            for param in param_columns:
                row_mask = row_mask & (scores_df[param] == row[param]).to_numpy()
            matched_df = scores_df.loc[row_mask].assign(method=method, top=i + 1)
            tops.append(matched_df)

    top_df = pd.concat(tops)
    ax = sns.boxplot(data=top_df, x='method', y='adjusted_mutual_info_score', hue='top')
    ax.set(title=f'Adjusted mutual information level {level}', xlabel='Method', ylabel='AMI')
    plt.savefig(f'results/{OUTPUT_NAME}_level_{level}_top_adjusted_mutual_information.png')
    plt.show()

In [None]:
# One tuple per (level, method) pair: level, method name and the best mean score of every metric
level_mean_score_data = [
    (
        level,
        method,
        *(
            get_top_mean_score_for_level_and_method(results_df, param_columns, level, method, metric.__name__)
            for metric in metrics
        ),
    )
    for level in results_df.level.unique()
    for method in results_df.method.unique()
]

In [None]:
# Table of the best mean scores: `level` and `method` columns plus one column per metric
metric_names = [metric.__name__ for metric in metrics]
level_mean_score_df = pd.DataFrame(level_mean_score_data, columns=['level', 'method'] + metric_names)
level_mean_score_df

In [None]:
# Save the per-level, per-method score table to `results/` without the index column
level_mean_score_df.to_csv(f'results/{OUTPUT_NAME}_mean_scores_per_method_and_level.csv', index=False)

In [None]:
# One bar chart per level, with one group of bars (one bar per metric) for every method
for level in level_mean_score_df.level.unique():
    level_rows = level_mean_score_df[level_mean_score_df.level == level]
    ax = level_rows.plot.bar(x='method', y=metric_names, title=f'Level {level}')
    ax.get_figure().savefig(f'results/{OUTPUT_NAME}_mean_scores_per_method_level_{level}.png')

In [None]:
import plotly.graph_objects as go

# Parameter columns shown on the axes of the radar charts
categories = [
    'similarity_bibliographic_coupling',
    'similarity_cocitation',
    'similarity_citation',
]

# One radar chart per level, with one trace per method except LDA
for level in results_df.level.unique():
    fig = go.Figure()
    print(f'LEVEL {level}')
    for method in results_df.method.unique():
        t = get_top_parameter_sets_for_level_and_method(results_df, param_columns, level, method, target_col)
        # Values of the best parameter set (first row), one per radar axis
        r = tuple(t[category].values[0] for category in categories)
        if method == 'lda':
            # No trace is drawn for LDA
            continue
        fig.add_trace(go.Scatterpolar(r=r, theta=categories, fill='toself', name=method))

    fig.update_layout(
        polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
        showlegend=False,
    )
    fig.write_image(f'results/{OUTPUT_NAME}_params_{level}.png')
    fig.show()

In [None]:
# Marker printed once all cells above have run
print('Visualization - Done')