In [None]:
# All imports
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
import mwparserfromhell
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scripts.const import CITATION_TEMPLATES

import findspark
findspark.init('/Users/harshdeep/Downloads/spark-2.4.3-bin-hadoop2.7/')
from pyspark import SparkContext, SQLContext

import warnings
warnings.filterwarnings("ignore")

## Setting up various datasets

In [2]:
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
# Read dataset which contains content for each page - from which features can be extracted
page_content = sqlContext.read.parquet('data/citations_content.parquet/')

In [4]:
# Read dataset which has IDs such as DOI, ISBN
citation_with_ids = pd.read_csv('data/citations_with_ids.csv')
citation_with_ids.head()

Unnamed: 0.1,Unnamed: 0,id,page_title,citation,id_list,authors,citation_title,citation_type,publisher_name,sections,...,Vol.59,PMID,+lin,No.3(Summer,+david+w.,NewMexico,ISMN,9780896082755,ZBL,DOI
0,0,1831197,Ray Sharkey,u'{{cite journal|last=Gliatto|first=Tom|date=J...,"[['ISSN', '0093-7673']]","[{last=Gliatto, first=Tom}]",Fatal Deceit,cite journal,,Initial Section,...,,,,,,,,,,
1,1,1831220,Protamine sulfate,u'{{cite book|title=WHO Model Formulary 2008|d...,"[['ISBN', '9789241547659']]",,WHO Model Formulary 2008,cite book,World Health Organization,Initial Section,...,,,,,,,,,,
2,2,1831220,Protamine sulfate,u'{{cite journal | author= Kenneth Cornetta | ...,"[['PMID', '2786000'], ['DOI', '10.1016/0166-09...","[{last=Kenneth Cornetta}, {last=W.French Ander...",Protamine sulfate as an effective alternative ...,cite journal,,Initial Section,...,,2786000.0,,,,,,,,10.1016/0166-0934(89)90132-8
3,3,1831220,Protamine sulfate,u'{{cite journal|last=Sorgi|first=FL|author2=B...,"[['PMID', '9349433'], ['DOI', '10.1038/sj.gt.3...","[{last=Sorgi, first=FL}, {last=Bhattacharya, S...",Protamine sulfate enhances lipid-mediated gene...,cite journal,,Initial Section,...,,9349433.0,,,,,,,,10.1038/sj.gt.3300484
4,4,1831220,Protamine sulfate,u'{{cite journal|last=Walker|first=WS|author2=...,"[['PMID', '6743419'], ['PMC', '481594'], ['DOI...","[{last=Walker, first=WS}, {last=Reid, KG}, {la...",Successful cardiopulmonary bypass in diabetics...,cite journal,,Initial Section,...,,6743419.0,,,,,,,,10.1136/hrt.52.1.112


In [5]:
# Loading the NLP spacy model for similarity detection etc.
nlp = spacy.load('en_core_web_lg')

In [6]:
# Making an additional column for labeling purposes of our dataset
citation_with_ids['scientific_or_not'] = None

## Making a dataset which is `scientific` in nature

Make a dataset in which there are IDs - **1 million**.

1. It would be safe to say that all citations with PMCs and PMIDs are scientific in nature since they have been published within the biomedical and life sciences journal literature.

In [7]:
pmc_pmid_mask = citation_with_ids['PMID'].notnull() | citation_with_ids['PMC'].notnull()

citation_with_pmc_pmid = citation_with_ids[pmc_pmid_mask]
print('The total number of citations with PMC and PMID: {}'.format(len(citation_with_pmc_pmid)))

The total number of citations with PMC and PMID: 640538


As we can see that the citations with a defined PMC or PMID have the type of `cite journal`, but also we can see that the citation types which are considered to be in gray areas - `cite web` or `cite book` also have defined PMC/PMID.

In [8]:
citation_with_pmc_pmid.groupby('citation_type').count()['id']

citation_type
citation               4549
cite book               839
cite conference          11
cite encyclopedia         1
cite interview            1
cite journal         634700
cite news                99
cite report               5
cite web                333
Name: id, dtype: int64

Also, 80% of the citations with PMC/PMID have a defined DOI, and some of them have a defined ISBN as well.

In [9]:
print('Total number of citations with PMC/PMID with a non null DOI: {}'.format(citation_with_pmc_pmid['DOI'].count()))
print('Total number of citations with PMC/PMID with a non null ISBN: {}'.format(citation_with_pmc_pmid['ISBN'].count()))

Total number of citations with PMC/PMID with a non null DOI: 576486
Total number of citations with PMC/PMID with a non null ISBN: 1587


In [10]:
## Set the label for being scientific_or_not as True
citation_with_ids.loc[citation_with_pmc_pmid.index, 'scientific_or_not'] = True

2. Also there is a small set of `citations` with the type `cite conference` - which means they have been published in a conference. Conferences mainly happen for scientific events. We are only considering the citations which have a `defined DOI` so that they are more scientifcally inclined.

In [11]:
mask = citation_with_ids['citation_type'].isin(['cite conference']) & citation_with_ids['DOI'].notnull()

citation_in_conference = citation_with_ids[mask]

In [12]:
citation_with_ids.loc[citation_in_conference.index, 'scientific_or_not'] = True

3. Now we will look for other citations and try to compare the similarity of the title of the citation with the word `scientific` and check if they have a very high similarity, we can label them as scientific - since title of citation would be more robust than title of the page.

In [13]:
def get_similarity_p_name(c_title):
    try:
        c_title = str(c_title)
    except UnicodeDecodeError:
        return 0.0
    search_doc = nlp(c_title)
    main_doc = nlp(u'scientific')
    return main_doc.similarity(search_doc)

In [14]:
citation_with_ids['similarity'] = None

In [15]:
# Get all citation titles which have not been labeled yet but have defined DOIs
mask = citation_with_ids['scientific_or_not'].isnull() & citation_with_ids['DOI'].notnull()
citation_titles = citation_with_ids[mask]['citation_title'].unique().tolist()

In [16]:
# Get similarity from the method as defined above
results = []
for i in tqdm(citation_titles):
    results.append((i, get_similarity_p_name(i)))

In [17]:
# citation_title_similarity = pd.read_csv('data/citation_title_similarity.csv')
citation_title_similarity = pd.DataFrame(results, columns=['citation_title', 'similarity'])

This was just for DOIs where we take titles which have more than 0.45 similarity with the word scientific and label it `True` as far as scientific_or_not is concerned

In [18]:
# Titles which have similarity of more than 0.45 or more being scientific in nature
titles_which_are_scientific = citation_title_similarity[
    citation_title_similarity['similarity'] > 0.45]['citation_title']

In [19]:
citation_with_ids.loc[
    citation_with_ids['citation_title'].isin(titles_which_are_scientific), 'scientific_or_not'] = True

4. Now lets take a lot at some ISBNs since we are a bit short than 1 million data points to be scientific or not..

We apply the same methodology as above as we did for the citation titles with valid DOIs.

In [20]:
mask = citation_with_ids['scientific_or_not'].isnull() & citation_with_ids['ISBN'].notnull()
citation_titles_with_isbn = citation_with_ids[mask]['citation_title'].unique().tolist()

In [21]:
results = []
for i in tqdm(citation_titles_with_isbn):
    results.append((i, get_similarity_p_name(i)))

In [22]:
citation_title_isbn_similarity = pd.DataFrame(results, columns=['citation_title', 'similarity'])

In [23]:
# Titles which have similarity of more than 0.44 or more being scientific in nature
titles_which_are_isbn_scientific = citation_title_isbn_similarity[
    citation_title_isbn_similarity['similarity'] > 0.45]['citation_title']

In [24]:
citation_with_ids.loc[
    citation_with_ids['citation_title'].isin(titles_which_are_isbn_scientific), 'scientific_or_not'] = True

In [25]:
print('The dataset which has been labeled as scientific: {}'.format(
    len(citation_with_ids[citation_with_ids['scientific_or_not'] == True])))
print('Total number of unique pages associated with the scientific titles: {}'.format(
    citation_with_ids[citation_with_ids['scientific_or_not'] == True]['page_title'].nunique()
))

The dataset which has been labeled as scientific: 1154252
Total number of unique pages associated with the scientific titles: 294490


In [26]:
scientific_dataset = citation_with_ids[citation_with_ids['scientific_or_not'] == True]

In [27]:
# citation_title_isbn_similarity.to_csv('data/citation_title_isbn_similarity.csv')
# citation_title_similarity.to_csv('data/citation_title_similarity.csv')
# scientific_dataset.to_csv('data/scientific_dataset.csv')

In [28]:
scientific_dataset.head()

Unnamed: 0.1,Unnamed: 0,id,page_title,citation,id_list,authors,citation_title,citation_type,publisher_name,sections,...,+lin,No.3(Summer,+david+w.,NewMexico,ISMN,9780896082755,ZBL,DOI,scientific_or_not,similarity
2,2,1831220,Protamine sulfate,u'{{cite journal | author= Kenneth Cornetta | ...,"[['PMID', '2786000'], ['DOI', '10.1016/0166-09...","[{last=Kenneth Cornetta}, {last=W.French Ander...",Protamine sulfate as an effective alternative ...,cite journal,,Initial Section,...,,,,,,,,10.1016/0166-0934(89)90132-8,True,
3,3,1831220,Protamine sulfate,u'{{cite journal|last=Sorgi|first=FL|author2=B...,"[['PMID', '9349433'], ['DOI', '10.1038/sj.gt.3...","[{last=Sorgi, first=FL}, {last=Bhattacharya, S...",Protamine sulfate enhances lipid-mediated gene...,cite journal,,Initial Section,...,,,,,,,,10.1038/sj.gt.3300484,True,
4,4,1831220,Protamine sulfate,u'{{cite journal|last=Walker|first=WS|author2=...,"[['PMID', '6743419'], ['PMC', '481594'], ['DOI...","[{last=Walker, first=WS}, {last=Reid, KG}, {la...",Successful cardiopulmonary bypass in diabetics...,cite journal,,Initial Section,...,,,,,,,,10.1136/hrt.52.1.112,True,
5,5,1831220,Protamine sulfate,u'{{cite journal|last=Campbell|first=FW|author...,"[['PMID', '6334459'], ['DOI', '10.1097/0000054...","[{last=Campbell, first=FW}, {last=Goldstein, M...",Management of the patient with protamine hyper...,cite journal,,Initial Section,...,,,,,,,,10.1097/00000542-198412000-00021,True,
6,6,1831220,Protamine sulfate,u'{{cite journal|last=Welsby|first=IJ|author2=...,"[['PMID', '15681944'], ['DOI', '10.1097/000005...","[{last=Welsby, first=IJ}, {last=Newman, MF}, {...",Hemodynamic changes after protamine administra...,cite journal,,Initial Section,...,,,,,,,,10.1097/00000542-200502000-00011,True,


## Making a dataset which is `non-scientific` in nature

In [29]:
from pyspark.sql.functions import col

In [30]:
# Read dataset which contains each column of the citation separated from STEP 3 of data extraction
citations_separated = sqlContext.read.parquet('data/citations_separated.parquet/')

1. For the non scientific case, we are first taking citations which have a URL only but there are no IDs present in the ID_list. Too make it more specific, we will using the type of citation as the ones which have citation type as `cite web` since it will boil it down to only citations which are associated with less scientific publication and more newspapers etc.

In [31]:
citations_with_url_no_id = citations_separated.filter(
    col('URL').isNotNull() & col('ID_list').isNull() & (col('type_of_citation') == 'cite web') & col('Authors').isNull()
)

In [32]:
# Sampling the data rows and getting only the necessary columns and converting it to Pandas
sampled_rows_url_no_id = citations_with_url_no_id.sample(False, 0.065, 0)
sampled_rows_url_no_id = sampled_rows_url_no_id.select(
    'id', 'page_title', 'citations', 'Title', 'type_of_citation', 'sections').toPandas()

In [33]:
sampled_rows_url_no_id.groupby('type_of_citation').count()

Unnamed: 0_level_0,id,page_title,citations,Title
type_of_citation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cite web,690728,690728,690728,688524


2. We will not also consider cases where ID_list is present, such as the case for ISBN and use the similarity ratio which we figured out in the previous section and use it to get `non-scientific` citations.

In [34]:
titles_which_are_isbn_non_scientific = citation_title_isbn_similarity[
    citation_title_isbn_similarity['similarity'] < 0.2]['citation_title']

In [35]:
citation_with_ids.loc[
    citation_with_ids['citation_title'].isin(titles_which_are_isbn_non_scientific), 'scientific_or_not'] = False

In [36]:
print('Total number of citations which have ISBN but non scientific: {}'.format(
    len(citation_with_ids[citation_with_ids['scientific_or_not'] == False]['citation_title'])))

Total number of citations which have ISBN but non scientific: 242148


In [37]:
citation_with_ids[citation_with_ids['scientific_or_not']==False].groupby('citation_type').count()['id']

citation_type
citation                14046
cite arxiv                  1
cite av media              64
cite av media notes        12
cite book              215678
cite conference            65
cite dvd notes              2
cite encyclopedia        6618
cite episode                3
cite interview              8
cite journal             3136
cite map                 1181
cite news                 182
cite newsgroup              1
cite report                25
cite thesis                32
cite web                 1089
harvnb                      5
Name: id, dtype: int64

3. We will not also consider cases where ID_list is present, such as the case for DOI and use the similarity ratio which we figured out in the previous section and use it to get `non-scientific` citations.

In [38]:
titles_which_are_non_scientific = citation_title_similarity[
    (citation_title_similarity['similarity'] > 0.08) & (citation_title_similarity['similarity'] < 0.23)]['citation_title']

In [39]:
citation_with_ids.loc[
    citation_with_ids['citation_title'].isin(titles_which_are_non_scientific), 'scientific_or_not'] = False

In [40]:
print('Total number of citations which have DOI but non-scientific: {}'.format(
    len(citation_with_ids[citation_with_ids['scientific_or_not'] == False]['citation_title'])))

Total number of citations which have DOI but non-scientific: 267562


4. We will now consider other `citation types` to make the dataset more balanced..

We are considering `cite news` since it is not present in other sections for which we have `non-scientific` data

In [41]:
citations_news_no_id= citations_separated.filter(
    col('URL').isNotNull() & col('ID_list').isNull() & 
    (col('type_of_citation') == 'cite news') & col('Authors').isNull()
)

In [42]:
# Sampling the data rows and getting only the necessary columns and converting it to Pandas
sampled_rows_news_url_no_id = citations_news_no_id.sample(False, 0.065, 0)
sampled_rows_news_url_no_id = sampled_rows_news_url_no_id.select(
    'id', 'page_title', 'citations', 'Title', 'type_of_citation', 'sections').toPandas()

In [43]:
sampled_rows_news_url_no_id.groupby('type_of_citation').count()

Unnamed: 0_level_0,id,page_title,citations,Title
type_of_citation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cite news,131972,131972,131972,131706


### Making an aggregated dataset which is non scientific

In [44]:
non_scientific_part_one = pd.concat([sampled_rows_url_no_id, sampled_rows_news_url_no_id])

In [45]:
non_scientific_part_two = citation_with_ids[
    citation_with_ids['scientific_or_not'] == False][
    ['id', 'page_title', 'citation', 'citation_title', 'citation_type', 'sections', 'DOI', 'ISBN', 'PMC', 'PMID']]
non_scientific_part_one.columns = ['id', 'page_title', 'citation', 'citation_title', 'citation_type']

In [46]:
non_scientific_dataset = pd.concat([non_scientific_part_one, non_scientific_part_two], axis=0, ignore_index=True)

## Aggregating the scientific and non scientific dataset

In [47]:
scientific_dataset = citation_with_ids[
    citation_with_ids['scientific_or_not'] == True][
    ['id', 'page_title', 'citation', 'citation_title', 'citation_type', 'sections', 'DOI', 'ISBN', 'PMC', 'PMID']]

In [48]:
scientific_dataset['scientific_or_not'] = True
non_scientific_dataset['scientific_or_not'] = False

In [49]:
# Aggregating the scientific and non scientific dataset by stacking rows
dataset = pd.concat([scientific_dataset, non_scientific_dataset], axis=0)

In [50]:
print('The total number of citations in the dataset are: {}'.format(len(dataset)))
print('The total number of unique pages in the dataset are: {}'.format(dataset['page_title'].nunique()))

The total number of citations in the dataset are: 2243762
The total number of unique pages in the dataset are: 883610


In [51]:
# Remove the unicode quotes
dataset['citation'] = dataset['citation'].apply(lambda x: x[2:len(x) - 1])

In [52]:
# Saving the dataset for further use
# dataset.to_csv('data/dataset.csv')

## Verification of the dataset

So, we have 2,243,960 million data points out of which 2,028,821 million data citations are unique. The others are not. Since, we labeled them using similarity techniques, we can check if there any labels which contradict each other and hence this would act as a verification step.

In [53]:
print('Number of unique elements in the dataset are: {}'.format(dataset['citation'].nunique()))

Number of unique elements in the dataset are: 2028821


In [54]:
# Aggrgeating the boolean labels for citations which are not unique and getting their length
any_conflicting_labels = dataset[['citation', 'scientific_or_not']].groupby('citation').agg(
    {'scientific_or_not': lambda x: tuple(set(x))}
)
any_conflicting_labels['len'] = any_conflicting_labels['scientific_or_not'].apply(lambda x: len(x))

No labels are found to have conflicting labels i.e. no citation is scientific and non scientific at the same time.

In [55]:
any_conflicting_labels[any_conflicting_labels['len'] != 1]

Unnamed: 0_level_0,scientific_or_not,len
citation,Unnamed: 1_level_1,Unnamed: 2_level_1


## Getting features from the dataset

Now for each of the unique pages, we get the content and get all the features.

In [56]:
scientific_or_not_titles = dataset['page_title'].unique().tolist()
titles_df = pd.DataFrame(scientific_or_not_titles, columns=['titles'])
# titles_df.to_csv('titles_df.csv')

The features were extracted on the DLAB - EPFL Cluster using the scripts in the features folder.

Now we will need to have a dataframe for **citations +  titles** - so that we get the features only for the citations we need.

Note: Incase one faces a TypeError while converting a Pandas Dataframe into a Spark Dataframe:

```
TypeError: field page_title: Can not merge type <class 'pyspark.sql.types.StringType'> and <class 'pyspark.sql.types.DoubleType'>
```

Convert the columns of the Pandas Dataframe into what is their defined type - for example where citation can be converted into a string

```
df['citation'] = df['citation'].astype(str)
```

In [68]:
# page_content = page_content.filter(page_content['page_title'].isin(scientific_or_not_titles))
titles_df_spark = sqlContext.createDataFrame(titles_df, StringType())
titles_df_spark.write.mode('overwrite').parquet('./titles_df.parquet')

In [74]:
# ids_and_citations = dataset[['id', 'citation']]
# ids_and_citations['page_title'] = ids_and_citations['page_title'].astype(str)
# ids_and_citations['citation'] = ids_and_citations['citation'].astype(str)

In [79]:
# ids_and_citations_spark = sqlContext.createDataFrame(ids_and_citations)
# ids_and_citations_spark.write.mode('overwrite').parquet('./ids_and_citations.parquet')

Now let's load the csv file directory which we got from the cluster and combine it into one so that we can merge it with the `dataset` variable above to get the labels associated with them.

In [57]:
# Read the csv file which contains the citations and the features
citations_features = pd.read_parquet('./citations_features.parquet/', engine='pyarrow') # pip install pyarrow

In [58]:
# citations_features.drop('Unnamed: 0', axis=1, inplace=True)
citations_features.head()

Unnamed: 0,id,citation,page_title,page_id,ref_index,total_words,neighboring_words,neighboring_tags
0,25,"{{cite journal | vauthors = Myers SM, Johnson ...",Autism,25,1229,25544,"[disorders, journal, Pediatrics, volume, 120, ...","[NNS, JJ, NNPS, NN, CD, NN, NNS, CD, NN, CD, N..."
1,308,{{cite book |last1=Garver |first1=Eugene |titl...,Aristotle,308,16337,33810,"[of, Chicago, Press, isbn978-0-226-28425-5, pa...","[IN, NNP, NNP, JJ, NN, JJ, NN, NN, NN, JJ, NN,..."
2,573,{{Cite book|title=The Jewish Alchemists: A His...,Alchemy,573,10659,14747,"[in, France, as, the, bain-marie, ,, is, said,...","[IN, NNP, IN, DT, NN, ,, VBZ, VBD, TO, VB, VBN..."
3,597,{{Citation |last1=Chaubey|first1=G.|last2=Mets...,Austroasiatic languages,597,5952,6935,"[,, Byomkes, ., 1994, ., ''A, Comparative, Stu...","[,, NNP, ., CD, ., CC, NNP, NNP, IN, NNP, CC, ..."
4,627,{{cite journal |last1=Denham |first1=T. P. |ti...,Agriculture,627,842,26410,"[origin, ,, and, Papua, New, Guinea, area, ,, ...","[NN, ,, CC, NNP, NNP, NNP, NN, ,, NN, VBD, VBN..."


Now lets merge the 2 variables `dataset` and `citations_features`...

In [59]:
dataset_with_features = pd.merge(
    dataset, citations_features, how='inner', left_on=['id','citation'], right_on = ['id','citation']
)
dataset_with_features.drop('page_title_y', axis=1, inplace=True)

In [60]:
dataset_with_features.head()

Unnamed: 0,DOI,ISBN,PMC,PMID,citation,citation_title,citation_type,id,page_title_x,scientific_or_not,sections,page_id,ref_index,total_words,neighboring_words,neighboring_tags
0,10.1016/0166-0934(89)90132-8,,,2786000,{{cite journal | author= Kenneth Cornetta | au...,Protamine sulfate as an effective alternative ...,cite journal,1831220,Protamine sulfate,True,Initial Section,1831220,941,1661,"[Methods, year, 1989, volume, 23, issue, pages...","[NNP, NN, CD, NN, CD, NN, NNS, CD, JJ, VBD, JJ..."
1,10.1038/sj.gt.3300484,,,9349433,{{cite journal|last=Sorgi|first=FL|author2=Bha...,Protamine sulfate enhances lipid-mediated gene...,cite journal,1831220,Protamine sulfate,True,Initial Section,1831220,1025,1661,"[journallastSorgifirstFLauthor2Bhattacharya, ,...","[NN, ,, NNP, ,, NNP, NN, NNS, JJ, NN, NN, VBD,..."
2,10.1136/hrt.52.1.112,,481594.0,6743419,{{cite journal|last=Walker|first=WS|author2=Re...,Successful cardiopulmonary bypass in diabetics...,cite journal,1831220,Protamine sulfate,True,Initial Section,1831220,1187,1661,"[,, FE, ., titleSuccessful, cardiopulmonary, b...","[,, NNP, ., JJ, JJ, NN, IN, NNS, IN, JJ, NNS, ..."
3,10.1097/00000542-198412000-00021,,,6334459,{{cite journal|last=Campbell|first=FW|author2=...,Management of the patient with protamine hyper...,cite journal,1831220,Protamine sulfate,True,Initial Section,1831220,1267,1661,"[,, MFauthor3Atkins, ,, PC, ., titleManagement...","[,, NNP, ,, NN, ., NN, IN, DT, NN, IN, JJ, NN,..."
4,10.1097/00000542-200502000-00011,,,15681944,{{cite journal|last=Welsby|first=IJ|author2=Ne...,Hemodynamic changes after protamine administra...,cite journal,1831220,Protamine sulfate,True,Initial Section,1831220,1364,1661,"[titleHemodynamic, changes, after, protamine, ...","[JJ, NNS, IN, JJ, NN, NN, IN, NN, IN, JJ, NN, ..."


In [69]:
# dataset_with_features.to_csv('data/dataset_with_features.csv')