# Preprocessing of the ACM Periodicals Database

In [39]:
import re
import glob
import sys
import json
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from xml.dom import minidom
from xml.parsers.expat import ExpatError
from tqdm import tqdm

# Scratch check: the depth of a CCS path is its number of "->" separators
# (number of split parts minus one); this example evaluates to 2.
len("CCS->Mathematics of computing->Mathematical software".split("->"))-1

2

content -> (article_rec) -> (categories | ccs2012 | keywords | abstract) 
ccs2012
keywords
abstract
concept_significance <br>

keywords -> kw <br>
abstract -> par

In [None]:
# Gather the path of every XML file located exactly one directory below
# ``periodicals/``.
all_files = [xml_path for xml_path in glob.iglob("periodicals/*/*.xml")]
all_files

In [None]:
def kw_preproc(kw):
    """Normalise a keyword by converting it to lower case."""
    lowered = kw.lower()
    return lowered

In [None]:
def _node_text(node):
    """Return the ``data`` of ``node``'s first child, or ``None``.

    ``None`` is returned when the element is empty or when its first child
    carries no ``data`` attribute (e.g. a nested element), instead of raising.
    """
    child = node.firstChild
    if child is None:
        return None
    return getattr(child, "data", None)


def _first_text(nodes):
    """Return the text of the first element of ``nodes``, or ``None`` if absent."""
    if not nodes:
        return None
    return _node_text(nodes[0])


# Column-oriented article table: the five lists always grow together so that
# index ``i`` in every list describes the same article.
articles_dict = {"doi": [], "abstract": [], "ft_body": [], "ccs": [], "keywords": []}
count_ccs = {}       # concept path   -> number of articles tagged with it
count_ccs_root = {}  # concept root   -> number of (article, concept) pairs
count_ccs_kw = {}    # concept path   -> {keyword -> co-occurrence count}
bad_data_count = 0   # anomalies: duplicated / missing / empty fields
bad_xml_count = 0    # files that could not be parsed as XML
count_abstract = 0   # articles for which an abstract text was stored
count_body = 0       # articles for which a full-text body was stored

for path in tqdm(all_files):

    try:
        mydoc = minidom.parse(path)
    except ExpatError:
        bad_xml_count += 1
        continue

    for article in mydoc.getElementsByTagName('article_rec'):
        # --- Identifier: DOI first, article id as a fallback -------------
        doi_nodes = article.getElementsByTagName('doi_number')
        if len(doi_nodes) > 1:
            bad_data_count += 1

        identifier = _first_text(doi_nodes)
        if identifier is None:
            identifier = _first_text(article.getElementsByTagName('article_id'))
        if identifier is None:
            # Nothing to key the article on: skip it entirely.
            continue

        # --- Abstract: text of the first <par>, else of <abstract> -------
        abstract_nodes = article.getElementsByTagName('abstract')
        abstract_text = None
        if abstract_nodes:
            abstract_text = _first_text(abstract_nodes[0].getElementsByTagName('par'))
            if abstract_text is None:
                abstract_text = _node_text(abstract_nodes[0])
        if len(abstract_nodes) != 1 or abstract_text is None:
            bad_data_count += 1
        if abstract_text is None:
            abstract_text = np.nan
        else:
            count_abstract += 1

        # --- Full-text body ----------------------------------------------
        body_nodes = article.getElementsByTagName('ft_body')
        body_text = _first_text(body_nodes)
        if len(body_nodes) != 1 or body_text is None:
            bad_data_count += 1
        if body_text is None:
            body_text = np.nan
        else:
            count_body += 1

        # --- Keywords (empty <kw> elements are ignored) ------------------
        kw_list = []
        for kw_node in article.getElementsByTagName('kw'):
            kw_text = _node_text(kw_node)
            if kw_text is not None:
                kw_list.append(kw_preproc(kw_text))

        # --- CCS concepts ------------------------------------------------
        ccs_dict = {}
        for concept in article.getElementsByTagName('concept'):
            concept_tree = _first_text(concept.getElementsByTagName('concept_desc'))
            raw_significance = _first_text(concept.getElementsByTagName('concept_significance'))
            if concept_tree is None or raw_significance is None:
                bad_data_count += 1
                continue
            try:
                significance = int(raw_significance)
            except ValueError:
                bad_data_count += 1
                continue

            # Root of the concept: second segment of a "CCS->A->B" path,
            # first segment of an "A~B" path, the whole string otherwise.
            if '->' in concept_tree:
                concept_root = concept_tree.split("->")[1]
            elif '~' in concept_tree:
                concept_root = concept_tree.split("~")[0]
            else:
                concept_root = concept_tree

            if concept_tree in ccs_dict:
                ## Concept already seen for this article: keep the MAX sign level
                ccs_dict[concept_tree] = max(significance, ccs_dict[concept_tree])
                continue

            ## We count concepts only once !
            count_ccs_root[concept_root] = count_ccs_root.get(concept_root, 0) + 1
            count_ccs[concept_tree] = count_ccs.get(concept_tree, 0) + 1

            # A concept only gets an entry in count_ccs_kw once an article
            # carrying keywords has been seen for it.
            if kw_list:
                kw_counts = count_ccs_kw.setdefault(concept_tree, {})
                for kw in kw_list:
                    kw_counts[kw] = kw_counts.get(kw, 0) + 1

            ## Adds concept to ccs_dict with sign level
            ccs_dict[concept_tree] = significance

        # Append all five fields together so the columns stay aligned.
        articles_dict["doi"].append(identifier)
        articles_dict["abstract"].append(abstract_text)
        articles_dict["ft_body"].append(body_text)
        articles_dict["keywords"].append(kw_list if kw_list else np.nan)
        articles_dict["ccs"].append(ccs_dict)

print(bad_data_count, bad_xml_count)

In [None]:
# Report how many abstracts and bodies were counted during extraction.
for label, total in (("Number of abstracts: ", count_abstract),
                     ("Number of bodies: ", count_body)):
    print(label, total)

#### Save the created data:

In [None]:
# Write each aggregate to its own JSON file under ``Data/``.
json_outputs = (
    ('Data/count_ccs.json', count_ccs),
    ('Data/count_ccs_root.json', count_ccs_root),
    ('Data/count_ccs_kw.json', count_ccs_kw),
    ('Data/articles_dict.json', articles_dict),
)

for json_path, payload in json_outputs:
    with open(json_path, 'w') as fp:
        json.dump(payload, fp)

### Stats on the ACM DataBase

##### Load Data

In [2]:
def _read_json(json_path):
    """Load and return the JSON document stored at ``json_path``."""
    with open(json_path, 'r') as fp:
        return json.load(fp)


count_ccs = _read_json('Data/count_ccs.json')

# Print the type of data variable
print("Type:", type(count_ccs))

count_ccs_root = _read_json('Data/count_ccs_root.json')
count_ccs_kw = _read_json('Data/count_ccs_kw.json')
articles_dict = _read_json('Data/articles_dict.json')


Type: <class 'dict'>


##### Descr of the Created DataFrame

In [3]:
# Turn the dict of columns into a DataFrame and show its last five rows.
articles_df = pd.DataFrame(data=articles_dict)
articles_df.tail(5)

Unnamed: 0,doi,abstract,ft_body,ccs,keywords
107908,10.1145/2767134,<p>Online social media allow users to interact...,\n Detection of Political Manipulation in Onli...,{'CCS->Information systems->World Wide Web->We...,"[online social media, machine learning, opinio..."
107909,10.1145/2767135,<p>A classifier that determines if a webpage i...,\n Improving Researcher Homepage Classi.cation...,{'CCS->Information systems->Information retrie...,"[researcher homepage classification, co-traini..."
107910,10.1145/2789211,<p>There has been a recent swell of interest i...,\n Diversionary Comments under Blog Posts JING...,{'Mathematics of computing~Bayesian networks':...,"[diversionary comments, classification, corefe..."
107911,10.1145/2790304,<p>This work addresses the problem of estimati...,\n Estimating Clustering Coef.cients and Size ...,{'CCS->Theory of computation->Design and analy...,"[estimation, clustering coefficient, sampling,..."
107912,10.1145/2812812,<p>The Web 2.0 brought new requirements to the...,\n Fona: Quantitative Metric to Measure Focus ...,{'CCS->Hardware': 500},"[aria, focus navigation, web accessibility]"


In [None]:
# Boolean mask of the articles that have no keywords. The method must be
# called: without the parentheses the cell only displays the bound method.
articles_df.keywords.isna()

In [None]:
# Inspect the value of the ``ccs`` column for the second row.
articles_df['ccs'].iloc[1]

##### Most popular concepts

In [None]:
# Rank the concepts by number of occurrences and show the N most frequent.
N = 15
concept_names = np.array(list(count_ccs.keys()))
concept_counts = list(count_ccs.values())

# Indices of the concepts, ordered from most to least frequent.
ccs_ids = np.argsort(concept_counts)[::-1]
popular_ccs = concept_names[ccs_ids][:N]
top_counts = np.sort(concept_counts)[::-1][:N]

print("Here is a list of the most popular concepts: ")
print(popular_ccs)
print("And here are their occurences:", top_counts)
print("There is {0} concepts with keywords out of {1} concepts.".format(len(count_ccs_kw), len(count_ccs)))

##### Stats on Width of Concepts

In [None]:
# Number of distinct CCS concepts attached to each article. The identifier
# column holds DOIs, so it is labelled "doi" (it was mislabelled "ccs_root").
depth_df = pd.DataFrame({"doi": articles_df['doi'], "occurences": [len(dic) for dic in articles_df['ccs']]})
depth_df.describe()

In [None]:
# Histogram of the per-article counts, leaving out rows whose count is zero.
has_concepts = depth_df['occurences'] > 0
depth_df[has_concepts].hist(bins=30)

##### Stats on Depth of Concepts

In [None]:
def _concept_depth(ccs):
    """Return the separator count of a '->' or '~' concept path, else 1."""
    if '->' in ccs:
        return len(ccs.split("->")) - 1
    if '~' in ccs:
        return len(ccs.split("~")) - 1
    return 1


depth_of_concepts = [_concept_depth(ccs) for ccs in count_ccs.keys()]

# Median, mean, max and min depth over all distinct concepts.
depth_stats = (
    np.median(depth_of_concepts),
    np.mean(depth_of_concepts),
    np.max(depth_of_concepts),
    np.min(depth_of_concepts),
)
print(*depth_stats)

The theoretical minimum depth is 1 and the theoretical maximum depth is 6.

##### Number of different CCS roots:

In [None]:
# One row per CCS root, with the number of times that root was counted.
root_names = count_ccs_root.keys()
root_counts = count_ccs_root.values()
ccs_root_df = pd.DataFrame({"ccs_root": root_names, "occurences": root_counts})
ccs_root_df.describe()

In [None]:
# Roots counted more than five times, most frequent first.
frequent_roots = ccs_root_df[ccs_root_df['occurences'] > 5]
frequent_roots.sort_values(by=['occurences'], ascending=False)

#### Keywords Stats

##### On a single ccs concept

In [None]:
ccs_name = 'CCS->Human-centered computing->Human computer interaction (HCI)'


def create_kw_occ_df(ccs_name):
    """Return a frame of the keyword counts recorded for concept ``ccs_name``.

    Raises ``KeyError`` when ``ccs_name`` has no entry in ``count_ccs_kw``.
    """
    kw_counts = count_ccs_kw[ccs_name]
    return pd.DataFrame({"keywords": kw_counts.keys(), "occurences": kw_counts.values()})


kw_occ_df = create_kw_occ_df(ccs_name)
kw_occ_df.describe()

In [None]:
# Histogram (30 bins, half-transparent bars) of the keyword counts in kw_occ_df.
kw_occ_df.hist(bins=30, alpha=0.5)

In [None]:
# Keywords counted more than five times for this concept, most frequent first.
frequent_kw = kw_occ_df[kw_occ_df['occurences'] > 5]
frequent_kw.sort_values(by=['occurences'], ascending=False)

##### Stats on overall keywords

In [None]:
# Aggregate statistics over every (concept, keyword) entry of count_ccs_kw.
kw_count_avg = 0   # running mean of the per-entry counts
kw_count_max = 0   # largest per-entry count
kw_entry_num = 0   # number of (concept, keyword) entries seen so far
kw_agg_dict = {}   # keyword -> number of distinct concepts it appears under

for kw_counts in count_ccs_kw.values():
    for kw, count in kw_counts.items():
        # Increment first so that the counter is both the divisor of the
        # incremental mean and, after the loop, the exact number of entries
        # (starting it at 1 made the printed total one too large).
        kw_entry_num += 1
        kw_count_avg += (count - kw_count_avg) / kw_entry_num
        kw_count_max = max(kw_count_max, count)

        kw_agg_dict[kw] = kw_agg_dict.get(kw, 0) + 1

print(kw_count_max, kw_count_avg, kw_entry_num, len(kw_agg_dict))

In [None]:
# One row per distinct keyword with its aggregated count; show the last rows.
keyword_names = kw_agg_dict.keys()
keyword_counts = kw_agg_dict.values()
kw_agg_df = pd.DataFrame({"keyword_names": keyword_names, "occurences": keyword_counts})
kw_agg_df.tail()

In [None]:
# How many keywords have an aggregated count of at least three.
recurring_kw = kw_agg_df[kw_agg_df['occurences'] >= 3]
len(recurring_kw)

### Vectorization of Keywords in a Doc for Hierarchical Clustering

#### BOW Binary

In [None]:
def sentence_vector(sentence, vocab_words):
    """Return a binary presence vector of ``vocab_words`` in ``sentence``.

    The result has one entry per vocabulary token, in vocabulary order:
    1 when ``token in sentence`` holds, 0 otherwise.
    """
    return [int(token in sentence) for token in vocab_words]

In [None]:
## Filters the dataframe only to include articles with keywords.
## ``.copy()`` yields an independent frame, so the column assignments made on
## it do not trigger pandas' chained-assignment (SettingWithCopy) warning.
filtered_articles_df = articles_df.dropna(subset=['keywords']).copy()

## Sets the number of keywords used for vectorization
nb_key_words = int(1e4)
## Vocabulary = the nb_key_words keywords with the highest aggregated count
kw_vocab = kw_agg_df.sort_values("occurences")['keyword_names'][::-1][:nb_key_words].tolist()

## Computes the vector representation for each document
filtered_articles_df["BOWrepresentation_kw"] = filtered_articles_df["keywords"].apply(
    lambda sentence: sentence_vector(sentence, kw_vocab)
)
filtered_articles_df.tail()

##### Hierarchical Clustering

In [None]:
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
plt.title("Article Dendrogram for Binary BOW")

# Select the binary BOW vectors by column name rather than by position, so
# the cell keeps working whatever other columns the frame currently holds.
# The linkage algorithm expects a 2D array (one observation vector per row).
selected_data = np.array(filtered_articles_df["BOWrepresentation_kw"].tolist())
clusters = shc.linkage(selected_data,
                       method='ward',
                       metric="euclidean")  # Euclidean distance
shc.dendrogram(Z=clusters)
plt.show()

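The metric can be ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulczynski1’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’. Note that the ‘ward’ linkage method used above is only correctly defined for the ‘euclidean’ metric, so switching to another metric also requires another linkage method.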

#### Tf-Idf BOW

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# The vocabulary is made of whole keywords, many of which are multi-word or
# hyphenated phrases. The default word tokenizer applied to space-joined text
# can never emit such a phrase as one token, which left those vocabulary
# columns at zero. Keywords are therefore joined with a separator that does
# not occur in normal text and split back on it, so every keyword is matched
# as a whole (the same matching rule as the binary BOW above).
KW_SEPARATOR = "\x1f"


def _split_keywords(joined_keywords):
    """Split a ``KW_SEPARATOR``-joined string back into its keywords."""
    return joined_keywords.split(KW_SEPARATOR)


filtered_articles_df["keywords_str"] = filtered_articles_df["keywords"].apply(KW_SEPARATOR.join)

pipe_kw = Pipeline([('count', CountVectorizer(vocabulary=kw_vocab, analyzer=_split_keywords)),
                    ('tfidf', TfidfTransformer())]).fit(filtered_articles_df["keywords_str"])

Compute the TF-IDF representation 

In [None]:
## Dense TF-IDF matrix: one row per article, one column per vocabulary keyword.
tf_idf_matrix = pipe_kw.transform(filtered_articles_df["keywords_str"]).toarray()

## List of arrays is a good solution to assign arrays as value in dataframe
tf_idf_list = list(tf_idf_matrix)

## Read the shape from the matrix itself instead of re-assembling a second
## dense copy from the list of rows.
tf_idf_matrix.shape

In [None]:
# Store the TF-IDF vectors in a new column, one array from tf_idf_list per row.
filtered_articles_df["tf-idf_kw"] = tf_idf_list

In [None]:
# Show both keyword representations side by side for the last rows.
filtered_articles_df[["doi", "BOWrepresentation_kw", "tf-idf_kw"]].tail()

##### Hierarchical Clustering

In [None]:
import scipy.cluster.hierarchy as shc
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
plt.title("Article Dendrogram for TF-IDF BOW")

# Select the TF-IDF vectors by column name. The previous positional index
# (6) pointed at the "keywords_str" text column, not at "tf-idf_kw".
# The linkage algorithm expects a 2D array (one observation vector per row).
selected_data = np.array(filtered_articles_df["tf-idf_kw"].tolist())
clusters = shc.linkage(selected_data,
                       method='ward',
                       metric="euclidean")  # Euclidean distance
shc.dendrogram(Z=clusters)
plt.show()