# General description of the solution

TBD

## Preparation
### Dependency installation

In [None]:
!pip install spacy feedparser levenshtein pandas

Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 3.5 MB/s 
[?25hCollecting levenshtein
  Downloading Levenshtein-0.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[K     |████████████████████████████████| 110 kB 31.5 MB/s 
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Collecting rapidfuzz<1.9,>=1.8.2
  Downloading rapidfuzz-1.8.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (854 kB)
[K     |████████████████████████████████| 854 kB 21.0 MB/s 
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=e1cb52e4aef7da945fc88fe816be8691edfc6dc3b694c67fd7b2c10d1e69069d
  Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa
S

In [None]:
# !pip install textract

In [None]:
import os
import textract
import requests
import shutil
import spacy
import pandas as pd
from tqdm import tqdm

### Data location

Below we set up a working folder and a data source.

In [None]:
# This helper is only needed when the notebook runs in Google Colab with Google Drive
def mount_gdrive():
    """Mount Google Drive at ``/gdrive`` using the Colab helper."""
    # The import sits inside the function, so google.colab is only required
    # when this helper is actually called.
    from google.colab import drive as colab_drive
    colab_drive.mount('/gdrive')

In [None]:
def init_folders():
    """Create the data-source and results folders if they do not exist yet.

    Reads the module-level ``DATA_SOURCE_FOLDER`` and ``RESULTS_FOLDER`` paths.
    Missing parent directories are created too, and an already existing folder
    is not an error.
    """
    import os
    print("Initializing folder for a pipeline")
    for folder in (DATA_SOURCE_FOLDER, RESULTS_FOLDER):
        # makedirs handles nested paths and avoids the exists()/mkdir() race.
        os.makedirs(folder, exist_ok=True)

## Reading the source files

In [None]:
def _data_for_year(year):
    """Parse the Scopus spreadsheet of one year into a list of paper records.

    Reads ``<DATA_SOURCE_FOLDER>/<year>.xlsx``, takes the sheet named after the
    year and processes every cell of its second column. In each cell the
    authors are the ``', '``-separated text before the first ``,"`` and the
    title is the text between that ``,"`` and the next ``"``. For the 2018
    sheet the title is taken from the following ``,"``...``"`` pair instead.

    Returns:
        list of ``{'title': str, 'authors': list[str], 'data': str}`` dicts,
        where ``'data'`` holds the year as a string.
    """
    import os
    import pandas as pd
    filename = os.path.join(DATA_SOURCE_FOLDER, f"{year}.xlsx")
    # The context manager closes the workbook even if parsing raises.
    with pd.ExcelFile(filename) as xls:
        sheet = xls.parse(f"{year}")
    result = []
    for row in sheet[sheet.columns[1]]:
        # Empty spreadsheet cells are read as NaN (a float) and cannot be parsed.
        if not isinstance(row, str):
            continue
        p1 = row.find(',"')
        p2 = row.find('"', p1 + 2)
        authors = row[:p1].split(', ')

        # HOTFIX for existing table: in the 2018 sheet the title is the
        # second quoted field, so search again after the first one.
        if str(year) == '2018':
            p1 = row.find(',"', p2)
            p2 = row.find('"', p1 + 2)

        title = row[p1+2:p2]
        result.append({'title': title, 'authors': authors, 'data': str(year)})
    return result


def load_all_scopus_titles(_range):
    """Collect the parsed Scopus records for every year in ``_range``.

    Returns a dict keyed by the year converted to ``str``; each value is
    whatever ``_data_for_year`` returns for that year.
    """
    return {str(one_year): _data_for_year(one_year) for one_year in _range}

### Saving the dump

In [None]:
import pickle, os

def save_index(ind, foldername, prefix=None):
    """Pickle every entry of an index into its own file.

    Args:
        ind: mapping ``year -> data``; each value is dumped to a file named
            ``str(year)``.
        foldername: folder (relative to ``prefix``) that receives the files.
            It may be nested; missing directories are created.
        prefix: root folder; defaults to the module-level ``RESULTS_FOLDER``.
    """
    if prefix is None:
        prefix = RESULTS_FOLDER
    folder = os.path.join(prefix, foldername)
    # makedirs copes with nested/missing parents and with an existing folder.
    os.makedirs(folder, exist_ok=True)
    for year, payload in ind.items():
        with open(os.path.join(folder, str(year)), 'wb') as f:
            pickle.dump(payload, f)


def load_index(foldername,
         years=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
         prefix=None,
         with_raw_texts=False):
    """Load the per-year pickles written by ``save_index``.

    Args:
        foldername: folder (relative to ``prefix``) holding one file per year.
        years: years to look for; years without a file are silently skipped.
        prefix: root folder; defaults to the module-level ``RESULTS_FOLDER``.
        with_raw_texts: accepted for backward compatibility but currently
            ignored - raw texts are always restored when available.

    Returns:
        dict ``str(year) -> list of items``. Items that reference an existing
        ``'textfile'`` and have no ``'raw'`` entry get ``'raw'`` filled from
        that file.
    """
    from tqdm import tqdm
    if prefix is None:
        prefix = RESULTS_FOLDER
    result = {}
    for year in years:
        key = str(year)
        filename = os.path.join(prefix, foldername, key)
        print("Checking filename:", filename)
        if not os.path.exists(filename):
            continue
        print("Loading year", year)
        # NOTE: unpickling can execute arbitrary code - only load dumps that
        # this pipeline produced itself.
        with open(filename, 'rb') as f:
            result[key] = pickle.load(f)
        # hack: restore raw texts that were not stored inside the pickle
        for item in tqdm(result[key]):
            # Indexes saved before text extraction have no 'textfile' key yet.
            textfile = item.get('textfile')
            if textfile and 'raw' not in item and os.path.exists(textfile):
                with open(textfile, 'r') as f:
                    item['raw'] = f.read()
        print(f"Loaded for year {year}: {len(result[key])} items")
    return result

## Find Arxiv papers relevant to our titles

In [None]:
def _get_relevant(title, n=100):
    """Ask the arXiv API for up to ``n`` entries matching ``title``.

    Sleeps three seconds before every request. Returns whatever
    ``feedparser.parse`` produces for the query URL.
    """
    import time
    import feedparser
    from urllib.parse import quote_plus

    api = f"http://export.arxiv.org/api/query?max_results={n}&search_query="

    # 3 seconds delay is due to arxiv API requirements
    time.sleep(3)
    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#', '+' or non-ASCII letters that would
    # otherwise truncate or corrupt the query string.
    return feedparser.parse(api + quote_plus(title))


def _to_papers(feed):
    result = []
    for e in feed["entries"]:
        id = e['id'][21:].replace('/', '_')
        page = e['id']
        year = e['published'].split('-')[0]
        pdfurl = [l['href'] for l in e['links'] if l['type'] == 'application/pdf'][0]
        title = e['title'].replace('\n', '').replace('  ', ' ')
        authors = [a['name'] for a in e.authors]
        result.append({
            'id': id,
            'url': page,
            'year': year,
            'pdfurl': pdfurl,
            'title': title,
            'authors': authors
        })
    return result


def _filter_relevant_papers(feed, item, LD=10, IOU=.01):
    import itertools
    import Levenshtein
    title = item['title'].lower()

    def author_set(authors):
        return set([name.lower() for name 
                    in itertools.chain(*[i.split() for i in authors]) if '.' not in name])

    s1 = author_set(item['authors'])
    result = []
    for paper in feed:
        dist = Levenshtein.distance(paper['title'].lower(), title)
        s2 = author_set(paper['authors'])
        iou = len(set.intersection(s1, s2)) / len(set.union(s1, s2))
        if iou >= IOU and dist <= LD:
            paper['source'] = item['title']
            result.append(paper)

    return result


def collect_paper_meta(sources, idx_name='index_filtered'):
    """Find arXiv candidates for every source title, year by year.

    Years already present in the saved ``idx_name`` index are skipped. For the
    other years a partial index (``idx_name + "_parts"``) is reloaded if it
    exists, the remaining titles are queried, and progress is checkpointed
    every 100 titles.

    Args:
        sources: mapping ``year -> list of records`` with ``'title'`` and
            ``'authors'``.
        idx_name: folder name of the index used for loading and saving.

    Returns:
        dict ``str(year) -> list of matched papers``. Keys are always strings,
        matching what ``load_index`` returns, whatever key type ``sources`` uses.
    """
    from tqdm import tqdm
    result = load_index(idx_name)
    for year in sources:
        key = str(year)
        print("Collecting metainfo for year", year)
        if key in result:
            continue
        pidx = load_index(idx_name + "_parts", [year])
        result[key] = pidx.get(key, [])
        # Titles whose matches were already stored in the partial index.
        done_titles = {item['source'] for item in result[key]}
        for i, paper in enumerate(tqdm(sources[year])):
            # these papers are already loaded for this year in parts
            if paper['title'] in done_titles:
                continue
            # This is the key step: query arXiv for the title, then keep
            # only the candidates that match the title and authors.
            feed = _get_relevant(paper['title'])
            candidates = _to_papers(feed)
            filtered = _filter_relevant_papers(candidates, paper)
            result[key] += filtered
            if (i + 1) % 100 == 0:
                save_index({key: result[key]}, idx_name + "_parts")
        save_index(result, idx_name)
        print("index for year", year, "saved to", idx_name)
    return result


def reorder_by_year(index):
    """Regroup all items of ``index`` under their own ``'year'`` value.

    The keys of the input are ignored; the returned dict maps every distinct
    ``item['year']`` to the list of items carrying it, in encounter order.
    """
    regrouped = {}
    for bucket in index.values():
        for entry in bucket:
            regrouped.setdefault(entry['year'], []).append(entry)
    return regrouped

## Report papers collected

In [None]:
def report_index_size(index, filter_by_year=True):
    """Print a table with the number of items per year.

    The second column is the total per key. With ``filter_by_year`` the third
    column counts only the items for which ``int(key) - int(item['year'])`` is
    below 2; otherwise it repeats the total.
    """
    print(" year\t|\titems\t|\t=year")
    print("-------------------------------------")
    for year in sorted(index.keys()):
        entries = index[year]
        total = len(entries)
        if filter_by_year:
            close = sum(1 for entry in entries if int(year) - int(entry['year']) < 2)
        else:
            close = total
        print(f" {year}\t|\t{total:5}\t|\t{close:5}")

## Download files

We will download only the papers that match all criteria, including an exact year match.

In [None]:
def download_papers(index, folder="pdf", strict_year=False, delay=3):
    """Download the PDF of every indexed paper.

    Files go to ``<RESULTS_FOLDER>/<folder>/<year>/<id>.pdf`` and the target
    path is stored in ``item['pdffile']`` (also when the download fails).
    After each year the year's items are saved to the ``index_with_pdf`` index.

    Args:
        index: mapping ``year -> list of items`` with ``'id'``, ``'year'`` and
            ``'pdfurl'``.
        folder: sub-folder of ``RESULTS_FOLDER`` for the PDFs.
        strict_year: if true, only items whose ``'year'`` equals the index key
            are downloaded; the others are skipped.
        delay: pause in seconds before each request.

    Raises:
        Exception: when the server answers 403.
    """
    from tqdm import tqdm
    import requests
    import time

    fullfolder = os.path.join(RESULTS_FOLDER, folder)
    os.makedirs(fullfolder, exist_ok=True)
    for year in index:
        print(f"Downloading {year} year")
        yearfolder = os.path.join(fullfolder, str(year))
        os.makedirs(yearfolder, exist_ok=True)

        for item in tqdm(index[year]):
            # Strict mode keeps exact year matches only.
            if strict_year and item['year'] != str(year):
                continue
            # + '.pdf' - hack
            url = item['pdfurl'].replace('http:', 'https:') + '.pdf'
            filename = os.path.join(yearfolder, item['id'] + '.pdf')
            item['pdffile'] = filename
            # hack for the cases file was downloaded partially: anything of
            # 16 KiB or less is fetched again
            if os.path.exists(filename) and os.path.getsize(filename) > 16 * 1024:
                continue
            time.sleep(delay)

            with requests.get(url, stream=True, allow_redirects=True) as r:
                if 400 <= r.status_code < 600:
                    print(f"Error: {r.status_code}, {r.url}")
                    if r.status_code == 403:
                        raise Exception("We are banned by arxiv :(")
                else:
                    # Write to a temporary name and rename at the end, so an
                    # interrupted transfer never leaves a truncated PDF under
                    # the final name. iter_content (unlike r.raw) also undoes
                    # gzip/deflate content encoding.
                    partial = filename + '.part'
                    with open(partial, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024 * 1024 * 5):
                            f.write(chunk)
                    os.replace(partial, filename)

        # todo remove
        save_index({year: index[year]}, "index_with_pdf")

## Recognize all texts


In [None]:
def recognize_texts(index, dest_folder='txt'):
    """Extract plain text from the downloaded PDFs.

    For every item with a ``'pdffile'`` the text is written to
    ``<RESULTS_FOLDER>/<dest_folder>/<year>/<pdf name>.txt``; the path is
    stored in ``item['textfile']``. Freshly extracted text is also kept in
    ``item['raw']``. Existing text files are not re-created. Items without a
    ``'pdffile'`` entry are skipped.

    Args:
        index: mapping ``year -> list of items``.
        dest_folder: sub-folder of ``RESULTS_FOLDER`` for the text files.
    """
    import os, textract

    full_dest_folder = os.path.join(RESULTS_FOLDER, dest_folder)
    os.makedirs(full_dest_folder, exist_ok=True)

    for year in index:
        year_dest_folder = os.path.join(full_dest_folder, str(year))
        os.makedirs(year_dest_folder, exist_ok=True)
        failed = 0
        for item in tqdm(index[year]):
            pdf = item.get('pdffile')
            if not pdf:
                # No download was attempted for this item, nothing to convert.
                continue
            stem = pdf.split('/')[-1][:-4]
            txtfile = os.path.join(year_dest_folder, stem + '.txt')
            item['textfile'] = txtfile
            if os.path.exists(txtfile):
                continue
            try:
                extracted = textract.process(pdf, method='pdfminer')
            except Exception as e:
                # Exception (not BaseException) so that Ctrl-C still stops
                # the run instead of being counted as a failed document.
                print(e)
                failed += 1
                continue
            text = str(extracted, encoding="utf8")
            item['raw'] = text
            with open(txtfile, 'w') as f:
                f.write(item['raw'])
        print(f"Year {year} failed {failed}")

## Prepare clean dataset

In [None]:
def _clean_numbers(string):
    import re
    string = re.sub('\\|\\d+i', '', string)
    string = re.sub('\\|\\d+>', '', string)
    string = re.sub('\\[[\\d+\\.\\-−–\\s,]+\\]', '', string)
    string = re.sub('\\(\\d+\\)', '', string)
    string = re.sub('\\b\\d+(\\.\\d+)?%?', '', string)
    string = string.replace('(cid:)', '')
    string = re.sub("\\(\\s?\\)", "", string)
    string = re.sub("\\[\\s?\\]", "", string)
    return string


def _remove_greek(string):
    import re
    string = re.sub("[ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω⊗†↓→∞↑↓=↔]+", "", string)
    return string


def prepare_clean_dataset(index):
    """Turn every item's ``'raw'`` text into a cleaned ``'clean'`` text, in place.

    Steps per item: cut the tail (acknowledgements, references, ...) at the
    earliest tail marker found in the second half of the text, drop blank and
    very short lines, expand typographic ligatures, remove numbers and Greek
    or math symbols, re-join hyphenated line breaks and flatten the text to a
    single line. The ``'raw'`` entry is deleted afterwards; items without
    ``'raw'`` are left untouched.

    Args:
        index: mapping ``year -> list of item dicts``.
    """
    from tqdm import tqdm

    # Lower-case markers that start the part of a paper we do not want.
    tails = ['acknowledgements\n', 'references\n', 'bibliography\n',
         'conflicts of interest\n', "acknowledges support", "are grateful to the funding",
         'acknowledgements.', 'acknowledgment –', 'we thank', 'is gratefully acknowledged', 'appendix a', 'references .',
         'research was supported by', 'acknowledges support by', 'we acknowledge funding', 'work is supported by',
         'was supported by the funding', 'is supported by the funding', ]

    for year in index:
        print("Year", year)
        # TODO restore partial load, but as this stage this is not critical
        for item in tqdm(index[year]):
            if 'raw' not in item:
                continue
            raw = item['raw']
            lowered = raw.lower()
            half = len(raw) // 2
            # Only markers in the second half count; an early hit (e.g. the
            # word "references" in the introduction) must not block the cut.
            cut_points = [pos for pos in (lowered.rfind(tail) for tail in tails)
                          if pos > half]
            if cut_points:
                raw = raw[:min(cut_points)]
            raw = raw.replace('\r\n\r\n', ' ').replace('\n\n', ' ')
            # Lines with three or fewer non-space characters are dropped.
            raw = '\n'.join([line for line in raw.split('\n') if len(line.replace(' ', '')) > 3])
            raw = raw.replace("ﬃ", "ffi").replace("ﬁ", "fi").replace("ﬂ", "fl").replace('ﬀ', 'ff')
            raw = _clean_numbers(raw)
            raw = _remove_greek(raw)
            # Re-join words hyphenated across a line break, then flatten.
            raw = raw.replace("-\n", "")
            raw = raw.replace("\n", " ")
            item['clean'] = raw
            del item['raw']


def dataset_unload(index, key, folder):
    """Write ``item[key]`` of every item to its own text file.

    Files are named ``<RESULTS_FOLDER>/<folder>/<year>/<id>.txt``; items
    lacking ``key`` are skipped. Missing directories (including parents) are
    created.

    Args:
        index: mapping ``year -> list of item dicts`` with an ``'id'`` entry.
        key: name of the text entry to export, e.g. ``'clean'``.
        folder: destination folder relative to ``RESULTS_FOLDER``.
    """
    import os
    full_dest_folder = os.path.join(RESULTS_FOLDER, folder)
    os.makedirs(full_dest_folder, exist_ok=True)

    for year, items in index.items():
        year_folder = os.path.join(full_dest_folder, str(year))
        os.makedirs(year_folder, exist_ok=True)
        for item in items:
            if key in item:
                with open(os.path.join(year_folder, f"{item['id']}.txt"), 'w') as ff:
                    ff.write(item[key])

## Do some NLP magic

In [None]:
def get_wic(sent, nlp, context_radius=5):
    """Describe every token of ``sent`` together with its surrounding words.

    ``nlp`` is called on the sentence; for each resulting token a dict with
    its lemma (``'lem'``), part of speech (``'pos'``) and a context string
    (``'ctx'``) is produced. The context joins the texts of up to
    ``context_radius`` tokens on either side, plus the token itself, with
    single spaces.
    """
    doc = nlp(sent)
    size = len(doc)
    described = []
    for position, token in enumerate(doc):
        window_start = max(0, position - context_radius)
        window_stop = min(size, position + context_radius + 1)
        window = doc[window_start:window_stop]
        described.append({
            'lem': token.lemma_,
            'pos': token.pos_,
            'ctx': " ".join(neighbour.text for neighbour in window),
        })
    return described
    

def text_to_lemmas(text, nlp):
    """Split ``text`` on ``'.'`` and tag every non-empty piece with ``nlp``.

    Returns a list with one entry per piece; each entry is a list of
    ``{'lem': lemma, 'txt': surface text, 'pos': part of speech}`` dicts, one
    per token returned by ``nlp``.
    """
    pieces = [chunk.strip() for chunk in text.split('.')]
    return [
        [{'lem': token.lemma_, 'txt': token.text, 'pos': token.pos_}
         for token in nlp(piece)]
        for piece in pieces
        if piece
    ]


def prepare_lemmas(years, index_source_folder, index_dest_folder):
    """Lemmatise the cleaned texts of the given years and save them per year.

    For every year the index in ``index_source_folder`` is loaded, each item
    with a ``'clean'`` text gets a ``'tokens'`` entry (``text_to_lemmas``
    output) in exchange for ``'clean'``, and the result is saved to
    ``index_dest_folder``. Years whose destination file already exists and
    years without a source index are skipped.

    Args:
        years: iterable of years (any type convertible with ``str``).
        index_source_folder: index folder to read cleaned items from.
        index_dest_folder: index folder to write tokenised items to.
    """
    import os
    from tqdm import tqdm
    import spacy

    nlp = spacy.load("en_core_web_sm")

    for year in years:
        year = str(year)
        print("Lemmatizing year", year)

        # Hack: an existing output file means the year was processed earlier.
        if os.path.exists(os.path.join(RESULTS_FOLDER, index_dest_folder, year)):
            print(f"Year {year} already processed")
            continue

        index = load_index(index_source_folder, years=[year])
        print(index.keys())
        if year not in index:
            # Nothing is saved, so the year is picked up again once its
            # source index exists.
            print(f"Year {year} has no source index, skipping")
            continue

        nu_index = {year: []}
        for item in tqdm(index[year]):
            if 'clean' not in item:
                continue
            item["tokens"] = text_to_lemmas(item['clean'], nlp)
            del item['clean']
            nu_index[year].append(item)
        save_index(nu_index, index_dest_folder)

## Count keyword stats

In [None]:
def keyword_stats(index_folder, years=None, allowed=['ADJ', 'ADV', 'NOUN', 'VERB']):
    """Count ``(lower-cased lemma, POS)`` pairs over the tokenised papers.

    Args:
        index_folder: index folder holding items with a ``'tokens'`` entry
            (list of sentences, each a list of token dicts).
        years: years to include; defaults to 2010-2020. Years without an
            index file are reported and skipped.
        allowed: POS tags to count; other tokens are ignored.

    Returns:
        ``collections.Counter`` keyed by ``(lemma, pos)`` tuples.
    """
    from collections import Counter
    from tqdm import tqdm
    result = Counter()
    if years is None: years = range(2010, 2021)
    for year in years:
        print("Counting stats in", year)
        year = str(year)
        idx = load_index(index_folder, [year])
        if year not in idx:
            print("No index found for year", year)
            continue
        for item in tqdm(idx[year]):
            if 'tokens' not in item:
                continue
            for sent in item['tokens']:
                result.update(
                    (token['lem'].lower(), token['pos'])
                    for token in sent
                    if token['pos'] in allowed
                )
    return result

In [None]:
def blacklist(counter):
    """Select the keys of ``counter`` that should be dropped from the stats.

    Keys are ``(lemma, pos)`` tuples. A key is blacklisted when its lemma
    contains one of the listed symbols, is a single character, ends with a
    hyphen, or is one of the stop words.

    Returns:
        set of the blacklisted keys (``counter`` itself is not modified).
    """
    symbols = set(',[]~˜!@#$%^&*()+`"№;%:?*(){}/\\|<>\'?×.≡≤·∈∇∆”“𝜅𝐴𝑝′±ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω⊗†↓→∞↑↓=↔−ˆ\uf8f7')
    # NOTE(review): 'tree' sits between 'new' and 'four' - possibly meant to
    # be 'three'; kept as is, confirm with the author.
    stop_words = {'--', "the", "of", "a", 'in', 'to', 'is', 'for', 'that',
                  'we', 'as', 'with', 'by', 'be', 'are', 'on', 'this', '−',
                  '+', 'can', 'an', 'at', 'where', 'not', 'our', 'out', 'fig', 'it',
                  'one', 'two', 'or', 'eq', 'may', 'have', 'such', 'also',
                  'while', 'each', 'all', 'only', 'more', 'if', 'these',
                  'has', 'thus', 'its', 'ii', 'iii', 'vi', 'iv', 'vii',
                  'viii', 'xi', 'ix', 'there', 'their', 'into', 'any',
                  'through', 'so', 'they', 'under', 'now', 'per', 'could', 'can',
                  'does', 'do', 'use', 'new', 'tree', 'four', 'some',
                  "was", "from", "were", "which", "but", "who", "and", "her", "them", "many", "both", "my", "after", "she",
                  "about", "other", "his", "he", "than", "had", "when",
                  "will", "been", "what", "would", "between", "most", "no"}
    result = set()
    for key in counter:
        lemma = key[0]
        if (len(lemma) == 1
                # The hyphen test has to look at the lemma; the last tuple
                # element is the POS tag and never equals '-'.
                or lemma.endswith('-')
                or lemma in stop_words
                or not symbols.isdisjoint(lemma)):
            result.add(key)
    return result

In [None]:
def save_stats(counter, filename, prefix=None):
    """Write a counter to a tab-separated file, most frequent entries first.

    Each line holds the elements of the key, the count and the count's share
    of the grand total (five decimals); every field is followed by a tab
    except the last one. ``prefix`` defaults to the module-level
    ``RESULTS_FOLDER``.
    """
    import os
    grand_total = sum(counter.values())
    target_dir = RESULTS_FOLDER if prefix is None else prefix

    with open(os.path.join(target_dir, filename), 'w') as out:
        for key, count in counter.most_common():
            key_cells = "".join(f"{part}\t" for part in key)
            out.write(key_cells)
            out.write(f"{count}\t{count / grand_total:.5f}\n")


def save_stats_from_list(freq, filename, prefix=None):
    """Write ``(key, value)`` pairs to a tab-separated file, largest value first.

    Each line holds the elements of the key followed by the value, all
    tab-separated. Pairs with equal values keep their input order.
    ``prefix`` defaults to the module-level ``RESULTS_FOLDER``.
    """
    import os
    target_dir = RESULTS_FOLDER if prefix is None else prefix
    target = os.path.join(target_dir, filename)
    ranked = sorted(freq, key=lambda pair: -pair[1])
    with open(target, 'w') as out:
        for key, score in ranked:
            out.write("".join(f"{part}\t" for part in key))
            out.write(f"{score}\n")

In [None]:
from math import log2

def to_freq(counter):
    """Convert absolute counts into relative frequencies.

    Returns a plain dict with the same keys; every value is the count divided
    by the sum of all counts.
    """
    grand_total = 0
    for count in counter.values():
        grand_total += count
    return {key: count / grand_total for key, count in counter.items()}


def partial_kld(px, qx):
    """Contribution ``px * log2(px / qx)`` of one outcome to a KL divergence.

    Returns 0 when either probability is 0.
    """
    # TODO: qx == 0 means the word is not present in the second
    # distribution; it is counted as 0 for now.
    if px == 0 or qx == 0:
        return 0
    return px * log2(px / qx)


def KLD(p, q):
    """Sum of ``partial_kld`` over every key that occurs in ``p`` or ``q``.

    A key missing from one of the mappings is treated as probability 0 there.
    """
    total = 0.
    for outcome in set(p.keys()).union(q.keys()):
        total += partial_kld(p.get(outcome, 0), q.get(outcome, 0))
    return total
    

def get_from_first_top_KL_outliers(p, q, threshold, include_new=True):
    """Pick the keys of ``p`` whose KL contribution against ``q`` is large.

    Keys present in both mappings are kept when ``partial_kld`` exceeds
    ``threshold``. Keys found only in ``p`` are reported with the value
    ``inf`` when ``include_new`` is true. Keys found only in ``q`` are ignored.

    Returns:
        dict ``key -> contribution``.
    """
    outliers = {}
    for term in set(p.keys()).union(q.keys()):
        if term not in p:
            continue
        if term not in q:
            if include_new:
                outliers[term] = float('inf')
            continue
        contribution = partial_kld(p[term], q[term])
        if contribution > threshold:
            outliers[term] = contribution
    return outliers

# Run the pipeline

In [None]:
# Working folders on the mounted Google Drive: the Scopus spreadsheets
# (<year>.xlsx) are read from DATA_SOURCE_FOLDER; indexes, PDFs, texts and
# reports are written below RESULTS_FOLDER.
DATA_SOURCE_FOLDER = "/gdrive/MyDrive/data/physics/scopus/"
RESULTS_FOLDER = "/gdrive/MyDrive/data/physics/results"

In [None]:
import os

# Mount Google Drive (Colab only) and make sure the working folders exist,
# then parse the Scopus spreadsheets for 2010-2020 and print the number of
# titles per year.
mount_gdrive()
init_folders()
references = load_all_scopus_titles(range(2010, 2021))
report_index_size(references, False)

Mounted at /gdrive
Initializing folder for a pipeline
 year	|	items	|	=year
-------------------------------------
 2010	|	 1557	|	 1557
 2011	|	 1549	|	 1549
 2012	|	 1610	|	 1610
 2013	|	 1660	|	 1660
 2014	|	 1658	|	 1658
 2015	|	 1848	|	 1848
 2016	|	 2000	|	 2000
 2017	|	 2000	|	 2000
 2018	|	 1999	|	 1999
 2019	|	 2000	|	 2000
 2020	|	 1609	|	 1609


In [None]:
# Query arXiv for every Scopus title (one request per title, each preceded by
# a 3-second pause), then store and summarise the matched papers.
meta = collect_paper_meta(references)
save_index(meta, 'index_0_meta')
report_index_size(meta)

In [None]:
# Regroup the matches by each paper's own 'year' field (taken from the arXiv
# 'published' date) instead of the Scopus year they were found for.
meta_reordered = reorder_by_year(meta)
save_index(meta_reordered, "index_1_meta_reordered")
report_index_size(meta_reordered)

In [None]:
# Fetch the PDFs, pausing 3 s before each request; the target path of every
# file is stored in item['pdffile'].
download_papers(meta_reordered, delay=3)

In [None]:
# Persist the index now that the items carry their 'pdffile' path.
save_index(meta_reordered, "index_2_with_pdf_path")

In [None]:
# Extract plain text from every PDF; items get 'textfile', and newly
# converted ones also get 'raw'.
recognize_texts(meta_reordered)

In [None]:
# Persist the index together with the extracted texts.
save_index(meta_reordered, "index_3_raw_texts")

In [None]:
# Reload the dump (load_index re-reads any missing 'raw' text from the item's
# text file) and clean the texts in place: 'raw' is replaced by 'clean'.
index_reordered = load_index("index_3_raw_texts", with_raw_texts=True)
prepare_clean_dataset(index_reordered)

In [None]:
# Persist the cleaned texts.
save_index(index_reordered, "index_4_clean")

In [None]:
# NOTE(review): this repeats the cleaning call made right after loading
# index_3_raw_texts. prepare_clean_dataset skips items without 'raw', so on
# already cleaned items this looks like a no-op - confirm before removing.
prepare_clean_dataset(index_reordered)

In [None]:
# Also export every cleaned text to
# <RESULTS_FOLDER>/txt_corpus_clean/<year>/<id>.txt.
dataset_unload(index_reordered, 'clean', 'txt_corpus_clean')

In [None]:
# Reload the cleaned index from disk.
# NOTE(review): index_clean is not referenced by any later cell shown here.
index_clean = load_index("index_4_clean")

In [None]:
# Lemmatise and POS-tag the cleaned texts of 2010-2020 with spaCy; years whose
# output file already exists in index_5_tokens are skipped.
prepare_lemmas(range(2010, 2021), "index_4_clean", "index_5_tokens")

In [None]:
# Spot check: display one tokenised item (position 50) of the year 2020.
load_index('index_5_tokens', years=[2020])['2020'][50]

In [None]:
# Count (lemma, POS) pairs for the year 2011 only and show the ten most
# frequent ones.
stats = keyword_stats('index_5_tokens', years=[2011])
print(stats.most_common(10))

Counting stats in 2011
Checking filename: /gdrive/MyDrive/data/physics/results/index_5_tokens/2011
Loading year 2011


100%|██████████| 473/473 [00:00<00:00, 3364.96it/s]


Loaded for year 2011: 473 items


100%|██████████| 473/473 [00:01<00:00, 268.99it/s]


[(('state', 'NOUN'), 15958), (('can', 'VERB'), 11645), (('use', 'VERB'), 8882), (('system', 'NOUN'), 8870), (('show', 'VERB'), 6743), (('where', 'ADV'), 6575), (('time', 'NOUN'), 6198), (('field', 'NOUN'), 5995), (('b', 'NOUN'), 4937), (('t', 'NOUN'), 4834)]


In [None]:
# Remove stop words, single characters and lemmas containing symbols from
# the counts (stats is modified in place).
black1 = blacklist(stats)
for b in black1: del stats[b]

In [None]:
# Write the remaining counts and their relative frequencies to
# <RESULTS_FOLDER>/most_common_lemmas.txt.
save_stats(stats, 'most_common_lemmas.txt')

In [None]:
import spacy
# Tokenise the comparison text with the same spaCy model and helper used for
# the papers. Presumably text_acad.txt is a general academic reference
# corpus - confirm its origin.
nlp = spacy.load("en_core_web_sm")
with open('/gdrive/MyDrive/data/physics/text_acad.txt', 'r') as f:
    lemmas = text_to_lemmas(f.read(), nlp)

In [None]:
# Store the comparison text as a one-item index under the pseudo-year 'acad'
# so that keyword_stats can count it like a regular year. The empty
# 'textfile' entry is there because load_index looks up item['textfile'].
# The same blacklist filtering as for the paper counts is applied afterwards.
save_index({"acad": [{"tokens": lemmas, 'textfile' : ""}] }, "academ_corpus")
acad_stats = keyword_stats("academ_corpus", years=['acad'])
black2 = blacklist(acad_stats)
for b in black2: del acad_stats[b]

Counting stats in acad
Checking filename: /gdrive/MyDrive/data/physics/results/academ_corpus/acad
Loading year acad


100%|██████████| 1/1 [00:00<00:00, 4644.85it/s]


Loaded for year acad: 1 items


100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


In [None]:
# Turn both count tables into relative frequencies.
uni_texts = to_freq(stats)
uni_acad = to_freq(acad_stats)

In [None]:
# Lemmas whose KL contribution (paper corpus vs. comparison corpus) exceeds
# 0.0001. The second variant also keeps lemmas that are absent from the
# comparison corpus, scored as inf.
outliers = get_from_first_top_KL_outliers(uni_texts, uni_acad, 0.0001, False)
outliers_with_new = get_from_first_top_KL_outliers(uni_texts, uni_acad, 0.0001, True)

In [None]:
# Save both outlier lists to RESULTS_FOLDER, sorted by descending score.
save_stats_from_list(list(outliers.items()), "outliers.txt")
save_stats_from_list(list(outliers_with_new.items()), "outliers_with_new.txt")