# Semantic Scholar loading time analysis

This notebook analyses Semantic Scholar loading times extracted from a log file.

In [None]:
import logging

import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
% matplotlib inline
% config InlineBackend.figure_format = 'retina'

In [None]:
import os
import re
import datetime

# Durations (datetime.timedelta) of each stage, one entry per *completed* batch.
# The four lists are always appended to together, so they stay the same length
# and can be combined column-wise into a single DataFrame.
store_batch_times = []
insert_papers_times = []
update_tsv_times = []
insert_citations_times = []

# Location of the log file to analyse.
LOG_PATH = os.path.expanduser('~/Desktop/ss_update.txt')

# Timestamp pattern searched for in every INFO line; compiled once, outside the loop.
timestamp_re = re.compile('[\\d-]+ [\\d:]+,\\d+')

# Start timestamps of the stages of the batch currently being parsed.
# None means "this stage has not been seen yet for the current batch".
store_batch_start = None
insert_papers_start = None
update_tsv_start = None
insert_citations_start = None

# Durations already measured for the current batch, waiting for the batch to finish.
papers_time = None
tsv_time = None

# Number of 'finished' markers that could not be matched to a full set of stages.
skipped_batches = 0

with open(LOG_PATH) as f:
    # Iterate the file lazily instead of loading every line into memory first.
    for line in f:
        if 'INFO' not in line:
            continue
        search_date = timestamp_re.search(line)
        if search_date is None:
            continue
        date = datetime.datetime.strptime(search_date.group(0), '%Y-%m-%d %H:%M:%S,%f')

        if 'Store batch transaction started' in line:
            store_batch_start = date
        elif 'Batch insert articles' in line:
            insert_papers_start = date
        elif 'Update TSV vector' in line:
            # The start of the TSV update marks the end of the paper insertion.
            if insert_papers_start is not None:
                papers_time = date - insert_papers_start
            insert_papers_start = None
            update_tsv_start = date
        elif 'Batch insert citations list' in line:
            # The start of the citation insertion marks the end of the TSV update.
            if update_tsv_start is not None:
                tsv_time = date - update_tsv_start
            update_tsv_start = None
            insert_citations_start = date
        elif 'Store batch transaction finished' in line:
            stages = (store_batch_start, papers_time, tsv_time, insert_citations_start)
            if None in stages:
                # Truncated or out-of-order batch (e.g. the log starts mid-batch):
                # skip it rather than fail on a missing start time or leave the
                # result lists with different lengths.
                skipped_batches += 1
            else:
                insert_papers_times.append(papers_time)
                update_tsv_times.append(tsv_time)
                insert_citations_times.append(date - insert_citations_start)
                store_batch_times.append(date - store_batch_start)
            # Reset all per-batch state so nothing leaks into the next batch.
            store_batch_start = None
            insert_papers_start = None
            update_tsv_start = None
            insert_citations_start = None
            papers_time = None
            tsv_time = None

if skipped_batches:
    logging.warning('Skipped %d incomplete batch(es) while parsing %s', skipped_batches, LOG_PATH)

In [None]:
# Combine the per-stage duration lists into one table:
# one row per batch, one column per operation.
timings_by_operation = {
    'batch': store_batch_times,
    'papers': insert_papers_times,
    'tsv': update_tsv_times,
    'citations': insert_citations_times,
}
df = pd.DataFrame(timings_by_operation)
df

In [None]:
# Reshape to long format: one row per (operation, duration) pair.
dfm = pd.melt(df, id_vars=[], value_vars=['batch', 'papers', 'tsv', 'citations'])
# Convert durations to seconds with total_seconds(): the `.seconds` attribute is
# only the seconds *component* of a timedelta, so it drops whole days and
# truncates the fractional part. to_timedelta() keeps this working when the
# frame is empty and the column has no timedelta dtype yet.
dfm['value'] = pd.to_timedelta(dfm['value']).dt.total_seconds()
# Rename by assignment instead of mutating the frame in place.
dfm = dfm.rename(columns=dict(variable='operation', value='time'))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the duration distribution for each operation, read from the
# long-format frame by column name.
sns.boxplot(data=dfm, x='operation', y='time')
plt.show()