In [1]:
import os
import re
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [7]:
# Configurations.
debug = False
homeurl = "https://dl.acm.org/citation.cfm?id=J1566&picked=prox"
re_issue = re.compile(".*\n(?P<issue>.*)\n.*")
re_toc = re.compile("Table of Contents")
re_paper = re.compile("https://doi.org/(?P<doi>.*)")
re_title = re.compile(".*citation.cfm\?id=(?P<id>)\d+")

# Load data if exported.
df = pd.read_csv("./IMWUT.csv")
df.head(2)

Unnamed: 0.1,Unnamed: 0,doi,authors,title,abstract,issue
0,0,10.1145/3314388,"Sayma Akther, Nazir Saleheen, Shahin Alan Sami...",mORAL: An mHealth Model for Inferring Oral Hyg...,We address the open problem of reliably detect...,"Volume 3 Issue 1, March 2019"
1,1,10.1145/3314389,"Ling Chen, Yifang Ding, Dandan Lyu, Xiaoze Liu...",Deep Multi-Task Learning Based Urban Air Quali...,Obtaining comprehensive air quality informatio...,"Volume 3 Issue 1, March 2019"


In [4]:
# Start a webdriver instance.
driver = webdriver.Chrome("./chromedriver")

In [5]:
# Navigate to the home page.
driver.get(homeurl)

# Request TOC.
buttons = driver.find_elements_by_tag_name("button")
for button in buttons:
    inner_html = button.get_attribute("innerHTML")
    if re_toc.search(inner_html):
        button.click()

# Wait until TOC loaded.
try:
    toc = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@id='cf_layoutareaprox']/descendant::a"))
    )
except TimeoutException as e:
    print("{} time out.".format(homeurl))

# Get issue urls.
issue_urls = driver.find_elements_by_xpath("//div[@id='cf_layoutareaprox']/descendant::a")
issue_hrefs = [url.get_attribute("href") for url in issue_urls]
print("{} issues found.".format(len(issue_hrefs)))

9 issues found.


In [23]:
# Iterating issues to get the titles and abstracts.
issue_list = []
doi_list = []
title_list = []
abstract_list = []
author_list = []

for issue_href in issue_hrefs:
    # Navigate to the issue page.
    driver.get(issue_href)
    
    # Request TOC.
    buttons = driver.find_elements_by_tag_name("button")
    for button in buttons:
        inner_html = button.get_attribute("innerHTML")
        if re_toc.search(inner_html):
            button.click()
    
    # Wait until TOC loaded.
    try:
        toc = WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, "//div[@id='cf_layoutareaprox']/descendant::p"))
        )
    except TimeoutException as e:
        print("{} time out.".format(issue_href))
        continue
    
    # Get issue name.
    heading = driver.find_element_by_xpath("//h1[text()[contains(., 'Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies')]]/..")
    re_result = re_issue.match(heading.get_attribute("innerHTML")[10:-10])
    issue_name = re_result.group("issue")
    print(issue_name)
    
    # Get title, doi, and abstact for current issue.
    tr_elements = driver.find_elements_by_xpath("//div[@id='cf_layoutareaprox']/descendant::table/tbody/tr")
    idx_tr = 0
    counter = 0
    while idx_tr < len(tr_elements):

        # Start search when hitting a title.
        tr = tr_elements[idx_tr]
        try:
            href = tr.find_element_by_tag_name("a").get_attribute("href")
            re_result = re_title.match(href)
            if re_result:
                # Hit a title.
                title = tr.text
                if debug:
                    print(title)
                idx_tr += 1
                tr = tr_elements[idx_tr]

                # Authors.
                authors = tr.text
                if debug:
                    print(authors)
                idx_tr += 1
                tr = tr_elements[idx_tr]

                # Article number, skip.
                if not tr.text.startswith("Article No"):
                    continue
                idx_tr += 1
                tr = tr_elements[idx_tr]

                # doi.
                doi = tr.find_element_by_tag_name("a").text
                if debug:
                    print(doi)
                idx_tr += 1
                tr = tr_elements[idx_tr]

                # Full text (pdf), skip.
                idx_tr += 1
                tr = tr_elements[idx_tr]

                # Abstract.
                abstract = "\n".join([p.get_attribute("innerHTML") for p in tr.find_elements_by_tag_name("p")])

                # Add to database.
                author_list.append(authors)
                issue_list.append(issue_name)
                doi_list.append(doi)
                title_list.append(title)
                abstract_list.append(abstract)
                counter += 1

            else:
                pass
        except NoSuchElementException as e:
            print(e)

        idx_tr += 1

    print("{} papers extracted.".format(counter))
    print("{} {} {} {}".format(len(doi_list), len(author_list), len(title_list), len(abstract_list)))
    time.sleep(3)

Volume 3 Issue 1, March 2019
34 papers extracted.
34 34 34 34
Volume 2 Issue 4, December 2018
50 papers extracted.
84 84 84 84
Volume 2 Issue 3, September 2018
64 papers extracted.
148 148 148 148
Volume 2 Issue 2, June 2018
32 papers extracted.
180 180 180 180
Volume 2 Issue 1, March 2018
56 papers extracted.
236 236 236 236
Volume 1 Issue 4, December 2017
55 papers extracted.
291 291 291 291
Volume 1 Issue 3, September 2017
90 papers extracted.
381 381 381 381
Volume 1 Issue 2, June 2017
29 papers extracted.
410 410 410 410
Volume 1 Issue 1, March 2017
Message: no such element: Unable to locate element: {"method":"tag name","selector":"a"}
  (Session info: chrome=73.0.3683.86)
  (Driver info: chromedriver=73.0.3683.68 (47787ec04b6e38e22703e856e101e840b65afe72),platform=Mac OS X 10.14.4 x86_64)

3 papers extracted.
413 413 413 413


In [25]:
# From dataframe and export.
df = pd.DataFrame({"doi": doi_list,
                   "authors": author_list,
                   "title": title_list,
                   "abstract": abstract_list,
                   "issue": issue_list})
df.to_csv("IMWUT.csv")

In [16]:
# Author stat.
author_papers = {}

for authors in df.authors:
    names = authors.split(", ")
    for name in names:
        if name in author_papers:
            author_papers[name] += 1
        else:
            author_papers[name] = 1
            
ser_author_papers = pd.Series(author_papers)
ser_author_papers = ser_author_papers.sort_values(ascending=False)
ser_author_papers.head(10)

Yong Li              12
Vassilis Kostakos    11
Daqing Zhang          8
Cecilia Mascolo       8
Uichin Lee            7
Jie Xiong             7
Jorge Goncalves       7
Tao Gu                7
Wei Wang              7
Niels van Berkel      6
dtype: int64

In [17]:
# Word count.
import string
import nltk
from pattern.text.en import singularize, pluralize, lemma

from collections import Counter
combined_text = " ".join(df.abstract).lower()

# Preprocessing.
# Remove punctuation.
combined_text.translate(str.maketrans("", "", string.punctuation))
# Remove prepositions, conjunctions, etc. 
tokens = nltk.word_tokenize(combined_text)
tagged = nltk.pos_tag(tokens)
black_list = ["%", "result",]
white_list = ["NN", "NNS", "NNP", "NNPS"]
filtered = list(filter(lambda x: x[1] in white_list, tagged))
filtered = [token[0] for token in filtered]
filtered = list(filter(lambda x: x not in black_list, filtered))
filtered = [singularize(word) for word in filtered]

wordcount = Counter(filtered)
ser_wordcount = pd.Series(wordcount)
ser_wordcount = ser_wordcount.sort_values(ascending=False)
ser_wordcount.head(10)

user        544
datum       511
system      501
device      361
study       293
accuracy    228
method      223
model       223
approach    221
paper       220
dtype: int64

In [23]:
# Word count.
import string
import nltk
from pattern.text.en import singularize, pluralize, lemma

from collections import Counter
combined_text = " ".join(df.title).lower()

# Preprocessing.
# Remove punctuation.
combined_text.translate(str.maketrans("", "", string.punctuation))
# Remove prepositions, conjunctions, etc. 
tokens = nltk.word_tokenize(combined_text)
tagged = nltk.pos_tag(tokens)
black_list = ["%", "result",]
white_list = ["NN", "NNS", "NNP", "NNPS"]
filtered = list(filter(lambda x: x[1] in white_list, tagged))
filtered = [token[0] for token in filtered]
filtered = list(filter(lambda x: x not in black_list, filtered))
filtered = [singularize(word) for word in filtered]

wordcount = Counter(filtered)
ser_wordcount = pd.Series(wordcount)
ser_wordcount = ser_wordcount.sort_values(ascending=False)
ser_wordcount.head(10)

datum           33
sensor          32
system          29
device          28
recognition     25
activity        24
interaction     19
smartphone      19
detection       14
localization    13
dtype: int64

In [92]:
keywords = ["data", "sensor", "system", "device", "recognition", "activity", "interaction", "smartphone", "detection", "localization"]
keyword_context_list = []
keyword_list = []

for keyword in keywords:
    for title in df.title.tolist():
        title_words = singularize(title).lower().split()
        # Search keywods.
        try:
            index_keyword_list = [i for i, x in enumerate(title_words) if x == keyword]
            for index_keyword in index_keyword_list:
                index_keyword = title_words.index(keyword)
                if index_keyword > 0:
                    context_phrase = " ".join([title_words[index_keyword-1], keyword])
                    keyword_list.append(keyword)
                    keyword_context_list.append(context_phrase)
        except ValueError as e:
            pass
        
df_keyword_context = pd.DataFrame({"keyword": keyword_list,
                                   "context": keyword_context_list})
ser_count = df_keyword_context.groupby(df_keyword_context.columns.tolist(),as_index=False).size()
print(ser_count.loc["data"])
print(ser_count.loc["sensor"])
print(ser_count.loc["device"])


# keyword_context_counter = Counter(keyword_context_list)
# ser_keyword_context = pd.Series(keyword_context_counter).sort_index()
# ser_keyword_context

context
and data                   2
clinical data              1
coarse-grained data        1
cohort data                1
dyadic data                1
gps data                   1
improves data              1
intimate data              1
mobility data              1
on data                    1
personal data              1
privacy-preserving data    1
sensing data               1
spatio-temporal data       1
urban data                 1
use data                   1
dtype: int64
context
activity sensor            1
algorithmic sensor         1
ear-mounted sensor         1
inertial sensor            1
magnetic sensor            1
mobile sensor              2
motion sensor              1
of sensor                  1
physiological sensor       1
quality sensor             1
respiration sensor         1
smartphone-based sensor    1
soft sensor                1
unaided sensor             1
vibration sensor           2
wearable sensor            5
dtype: int64
context
hmd device            

In [89]:
["word", "word"].index("word")

0