##### This notebook helps to gather meta-data of research papers from five information scources, including ACM, Scopus, Springer, IEEE Xplore, and Scholar. For this to work you need to recieve api keys from Scopus, Springer, and IEEE Xplore and replace them in the code. Also, you need to change the search phrase in accordance with your needs. Note that since Scholar, bans the access at some point, it may take some hours to gather its data. It also, lets you to combine different results and merge repeating records.

In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm.notebook import tqdm
import time
import numpy as np
import datetime
html_doc = requests.get('https://dl.acm.org/action/doSearch?AllField=%28%22time+series%22+OR+%22time-series%22%29+AND+%22anomaly+detection%22%29&startPage=0&pageSize=50')

In [39]:
d = datetime.datetime.now().date().strftime('%Y-%m-%d')

# ACM

In [4]:
soup = BeautifulSoup(html_doc.text, 'html.parser')
paper_count = soup.find_all('span', attrs={'class':'hitsLength'})
paper_count = int(paper_count[0].text.replace(',', ''))

In [5]:
def title_doi_year(html_doc):
     time.sleep(np.random.randint(5))
     soup = BeautifulSoup(html_doc.text, 'html.parser')
     tit_class = "hlFld-Title"
     s = soup.find_all('span', attrs={'class': re.compile(r'hlFld-Title|hlFld-ContentGroupTitle')})
     bdois = [x.find('a').attrs['href'] for x in s]
     btitles = [x.text for x in s]
     c = 'dot-separator'
     soup = BeautifulSoup(html_doc.text, 'html.parser')
     s = soup.find_all('div', attrs={'class': 'issue-item__citation'})
     byears = [re.findall('([2019]{2}\d{2})', x.text)[0] for x in s]
     return btitles, bdois, byears

In [None]:
ps = int(paper_count/50)
titles, dois, years = [],[], []
for p in tqdm(range(0, ps+2)):
     html_doc = requests.get(f'https://dl.acm.org/action/doSearch?AllField=%28%22time+series%22+OR+%22time-series%22%29+AND+%22anomaly+detection%22%29&startPage={p}&pageSize=50')
     ttitles, tdois, tyears = title_doi_year(html_doc)
     titles.extend(ttitles)
     dois.extend(tdois)
     if len(ttitles) != len(tyears):
        print(p, len(ttitles), len(tyears))
        break
     years.extend(tyears)

  0%|          | 0/36 [00:00<?, ?it/s]

In [None]:
df = pd.DataFrame(({'title': titles, 'doi': dois, 'year': years}))
df.to_excel(f'raw_output/acm - {d}.xlsx')

# Scopus

In [None]:
url = "https://api.elsevier.com/content/search/scopus?query=TITLE-ABS-KEY((%7Btime-series%7D%20OR%20%7Btime%20series%7D)%20AND%20%7Banomaly%20detection%7D)&apiKey=ce09555ff6bb6918ae44f5c2d0bfa3d6&start=0"

In [None]:
import json
res = requests.get(url)
y = json.loads(res.text)

In [None]:
ps = int(int(y['search-results']['opensearch:totalResults'])/25)+1

In [None]:
titles, dois, years = [], [], []
for p in tqdm(range(0, ps)):
     html_doc = requests.get(f'https://api.elsevier.com/content/search/scopus?query=TITLE-ABS-KEY((%7Btime-series%7D%20OR%20%7Btime%20series%7D)%20AND%20%7Banomaly%20detection%7D)&start={p*25}&apiKey=ce09555ff6bb6918ae44f5c2d0bfa3d6')
     y = json.loads(html_doc.text)
     ttitles = [x['dc:title'] for x in y['search-results']['entry']]
     tdois = [x.get('prism:doi') for x in y['search-results']['entry']]
     tyears = [x.get('prism:coverDate')[:4] for x in y['search-results']['entry']]
     titles.extend(ttitles)
     dois.extend(tdois)
     years.extend(tyears)

In [None]:
df = pd.DataFrame(({'title': titles, 'doi': dois, 'year': years}))
df.to_excel(f'raw_output/scopus - {d}.xlsx')

# Springer

In [None]:
data = requests.get('http://api.springernature.com/metadata/json?q=(%22anomaly%20detection%22%20AND%20(%22time%20series%22%20OR%20%22time-series%22))&api_key=c74deaf5b45b6c8c99fdb3cdba6636c8&p=50&s=1').text

In [None]:
total_recs = int(json.loads(data)['result'][0]['total'])

In [None]:
c = int(total_recs/50)+1

In [None]:
titles, dois, years = [], [], []
from tqdm.notebook import tqdm
for i in tqdm(range(c)):
    start = (i*50)+1
    url = f'http://api.springernature.com/metadata/json?q=(%22anomaly%20detection%22%20AND%20(%22time%20series%22%20OR%20%22time-series%22))&api_key=c74deaf5b45b6c8c99fdb3cdba6636c8&p=50&s={start}'
    data = requests.get(url).text
    ttitles = [x['title'] for x in json.loads(data)['records']]
    tdois = [x['doi'] for x in json.loads(data)['records']]
    tyears = [int(x['publicationDate'][:4]) for x in json.loads(data)['records']]
    titles.extend(ttitles)
    dois.extend(tdois)
    years.extend(tyears)

In [None]:
df = pd.DataFrame(({'title': titles, 'doi': dois, 'year': years}))
df.to_excel(f'raw_output/springer - {d}.xlsx')

# IEEE Xplore

In [None]:
page = requests.get('https://ieeexploreapi.ieee.org/api/v1/search/articles?parameter&apikey=vfcz6t7cawnewumrf7yp2gyd&querytext=(%22anomaly%20detection%22%20and%20(%22time%20series%22%20or%20%22time-series%22))&start_record=24&max_records=200')
j = json.loads(page.text)
totals = j['total_records']
c = int(totals/200)+1
titles, dois, years, abss = [], [], [], []
for i in tqdm(range(c)):
    page = requests.get(f'https://ieeexploreapi.ieee.org/api/v1/search/articles?parameter&apikey=vfcz6t7cawnewumrf7yp2gyd&querytext=(%22anomaly%20detection%22%20and%20(%22time%20series%22%20or%20%22time-series%22))&start_record={i*200}&max_records=200')
    j = json.loads(page.text)
    titles.extend([x['title'] for x in j["articles"]])
    dois.extend([x.get('doi', '') for x in j["articles"]])
    years.extend([x['publication_year'] for x in j["articles"]])
    abss.extend([x['abstract'] for x in j["articles"]])

In [None]:
df = pd.DataFrame(({'title': titles, 'doi': dois, 'year': years, 'abstract': abss}))
df.to_excel(f'raw_output/ieee xplore - {d}.xlsx')

# Scholar

In [30]:
def cat(xtext):
    try:
        return re.findall('\xa0- (.*), ', xtext)[0]
    except:
        return None

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import numpy as np
import time
import re

data = requests.get('https://scholar.google.com/scholar?q=%22anomaly+detection%22+AND+(%22time-series%22+OR+%22time+series%22)')
soup = BeautifulSoup(data.text, 'html.parser')
ttitles = soup.find_all('h3', attrs={'class':'gs_rt'})

In [35]:
titles, years, journals = [], [], []
for i in tqdm(range(100)):
    t = np.random.randint(20)
    if soup.find_all('form', attrs={'id':'captcha-form'}):
        print('Robot, I have to hide!')
        time.sleep(60*60*4)
    time.sleep(t)
    s = i*10
    data = requests.get(f'https://scholar.google.com/scholar?q=%22anomaly+detection%22+AND+(%22time-series%22+OR+%22time+series%22)&hl=en&start={i*10}&as_sdt=0,5&as_vis=1')
    soup = BeautifulSoup(data.text, 'html.parser')
    ttitles = soup.find_all('h3', attrs={'class':'gs_rt'})
    ttitles = [x.text for x in ttitles]
    gs = soup.find_all('div', attrs={'class':'gs_a'})
    tyears = [re.findall('[,-] ([1920]{2}\d{2})', x.text)[0] for x in gs]
    tjournals = [cat(x.text) for x in gs]
    titles.extend(ttitles)
    years.extend(tyears)
    journals.extend(tjournals)

  0%|          | 0/100 [00:00<?, ?it/s]

Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!
Robot, I have to hide!


In [41]:
import pandas as pd
df = pd.DataFrame(({'title': titles, 'year': years, 'journal': journals}))
df.to_excel(f'raw_output/scholar - {d}.xlsx')

ValueError: All arrays must be of the same length

In [42]:
len(journals)

100

# Merging results

In [None]:
acm = pd.read_excel(f'raw_output/acm.xlsx', index_col = None)
ieee = pd.read_excel(f'raw_output/ieee xplore - {d}.xlsx')
scopus = pd.read_excel(f'raw_output/scopus - {d}.xlsx')
springer = pd.read_excel(f'raw_output/springer - {d}.xlsx')
acm['doi'] = acm['doi'].str[5:]
df = acm.append(ieee).append(scopus).append(springer)
df_doi = df.drop_duplicates('doi')
df_doi['title_'] = df_doi['title'].str.replace(' ', '').str.lower().str.replace('\n', '')
df_doi_title = df_doi.drop_duplicates('title_').sort_values('title_')
df_doi = df.drop_duplicates('doi')
df_doi['title_'] = df_doi['title'].str.replace(' ', '').str.lower().str.replace('\n', '')
df_doi_title = df_doi.drop_duplicates('title_').sort_values('title_')
df_doi_title[df_doi_title['year']>2017]

In [None]:
import scipy.stats as ss
ss.entropy([2,2]), ss.entropy([1,1,1]), ss.entropy([1,1,1,1])

In [None]:
s = []
l = []
for i in range(1,10000):
    s.append(ss.entropy([1]*i))
    l.append(np.log(i))

In [None]:
import matplotlib.pyplot as plt
plt.plot(s)
plt.plot(l)