In [1]:
import glob
import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from itertools import chain
from keras.models import Model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.manifold import TSNE
from gensim.models import FastText
from sklearn.decomposition import PCA
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split 
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Initializing tqdm for pandas
tqdm.pandas()

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos if x.device_type == 'GPU'])

[]


In [3]:
np.random.seed(0)

In [4]:
citations_features = pd.read_parquet('./citations_features.parquet/', engine='pyarrow')
dataset = pd.read_csv('dataset.csv')

In [5]:
# Merging the citation and their corresponding features which have been extracted
book_journal_features = pd.merge(
    dataset, citations_features, how='inner', left_on=['id','citation'], right_on = ['id','citation']
)
book_journal_features.drop('page_title_y', axis=1, inplace=True)
book_journal_features.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
book_journal_features.shape

(2202646, 16)

In [7]:
# Only consider unique citations so that the dataset is more varied
book_journal_features = book_journal_features.set_index(['id', 'citation'])
book_journal_features = book_journal_features[~book_journal_features.index.duplicated(keep='first')]
book_journal_features = book_journal_features.reset_index()

## Get auxiliary features and divide them into labels

1. `ref_index`
2. `total_words`
3. `tags`
4. `type_of_citation`

#### can include `section` of the page in which the citation belongs to

In [8]:
book_journal_features['actual_label'] = 'rest'

In [9]:
book_journal_features.loc[~pd.isna(book_journal_features['PMC']), ['actual_label']] = 'journal'
book_journal_features.loc[~pd.isna(book_journal_features['PMID']), ['actual_label']] = 'journal'

In [10]:
only_doi = (
    ~pd.isna(book_journal_features['DOI']) & 
    pd.isna(book_journal_features['PMC']) & 
    pd.isna(book_journal_features['PMID']) &
    pd.isna(book_journal_features['ISBN'])
)
book_journal_features.loc[only_doi, ['actual_label']] = 'journal'

In [11]:
only_book = (
    ~pd.isna(book_journal_features['ISBN']) & 
    pd.isna(book_journal_features['PMC']) & 
    pd.isna(book_journal_features['PMID']) &
    pd.isna(book_journal_features['DOI'])
)
book_journal_features.loc[only_book, ['actual_label']] = 'book'

In [12]:
both_book_and_doi_journal = (
    ~pd.isna(book_journal_features['ISBN']) & 
    ~pd.isna(book_journal_features['DOI']) & 
    pd.isna(book_journal_features['PMID']) &
    pd.isna(book_journal_features['PMC']) &
    book_journal_features['citation_type'].isin(['cite journal', 'cite conference'])
)
book_journal_features.loc[both_book_and_doi_journal, ['actual_label']] = 'journal'

In [13]:
both_book_and_doi_book = (
    ~pd.isna(book_journal_features['ISBN']) & 
    ~pd.isna(book_journal_features['DOI']) & 
    pd.isna(book_journal_features['PMID']) &
    pd.isna(book_journal_features['PMC']) &
    book_journal_features['citation_type'].isin(['cite book', 'cite encyclopedia'])
)
book_journal_features.loc[both_book_and_doi_book, ['actual_label']] = 'book'

In [14]:
## Made the dataset which contains citations book and journal labeled
book_journal_features = book_journal_features[book_journal_features['actual_label'].isin(['book', 'journal'])]
book_journal_features = book_journal_features[[
    'sections', 'citation_type', 'citation', 'id', 'ref_index',
     'total_words', 'neighboring_tags', 'actual_label', 'neighboring_words'
]]
book_journal_features.shape

(1402511, 9)

In [15]:
## loading the dataset of newspapers which was generated from the citations_separated dataset
li = []
all_files = glob.glob('/dlabdata1/harshdee/newspapers_citations_features.csv/' + "/*.csv")

for filename in all_files:
    df = pd.read_csv(filename, header=None, sep='\t')
    li.append(df)

newspaper_data = pd.concat(li, axis=0, ignore_index=True)
newspaper_data.shape

(1388908, 35)

In [16]:
newspaper_data = newspaper_data[[0, 1, 2, 3, 4, 28, 32, 33]]
newspaper_data.rename({
    0: 'citation', 1: 'ref_index', 2: 'total_words',
    3: 'neighboring_words', 4: 'neighboring_tags', 
    28: 'id', 32: 'sections', 33: 'citation_type'}, axis=1, inplace=True)
newspaper_data['actual_label'] = 'rest'

In [17]:
dataset_with_features = pd.concat([book_journal_features, newspaper_data])
dataset_with_features.shape

(2791419, 9)

In [18]:
le = preprocessing.LabelEncoder()
le.fit(dataset_with_features['actual_label'])
dataset_with_features['label_category'] = le.transform(dataset_with_features['actual_label'])

In [19]:
dataset_with_features[dataset_with_features['actual_label'] == 'rest']

Unnamed: 0,actual_label,citation,citation_type,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,label_category
0,rest,{{Citation | date = 10 December 2012 | url = h...,citation,469553,"[CD,NN,NN,CD,NN,NN,NN,'',NNP,NNP,'',NN,NN,CD,N...","[99,issue,page,463,ref,ref,name=,'',Lovelace,G...",1427,Initial Section,2322,2
1,rest,{{Citation | last = | first = | author-link ...,citation,566081,"[RB,VBD,VBN,.,NNP,VBD,PRP,MD,VB,IN,PRP,TO,NNP,...","[Keely,had,gone,.,Edey,said,he,would,speak,abo...",3926,Initial Section,8729,2
2,rest,{{Citation | last = ''[[The New York Times]]''...,citation,28571997,"['',DT,NNP,NNP,NNP,'',NNP,RB,VBP,JJ,JJ,JJ,NN,M...","['',The,New,York,Times,'',Staff,first,author-l...",554,Initial Section,1680,2
3,rest,{{Citation | last = Schwartz | first = John |...,citation,17608461,"[FW,NNP,'',NN,JJ,NNP,RB,NNP,JJ,NN,:,NN,.,VBD,C...","[name=,NYT,'',Citation,last,Schwartz,first,Joh...",679,Controversies,2392,2
4,rest,{{Citation | title = Doctor Who recap series 3...,citation,43275595,"[TO,VB,JJR,IN,PRP,MD,VB,VBN,.,'',PRP,VBD,RP,DT...","[to,be,better,than,we,might,have,expected,.,''...",2664,Initial Section,3333,2
5,rest,"{{Citation | title = Tom Dolan, Husband of Aar...",citation,2850681,"[IN,NNP,JJ,NN,NN,NNP,'',DT,NNP,NNP,'',DT,NNP,N...","[by,Brooklyn,graffiti,art,ist,BAMN,'',The,Huff...",14537,Software developments,26326,2
6,rest,{{Citation | url = https://www.washingtonpost....,citation,44330919,"[NN,NNP,CD,NN,NNP,RB,VBD,DT,FW,NNP,'',NNP,NNP,...","[accessdate,October,2015,ref,Carson,also,told,...",11001,Political and related positions,22209,2
7,rest,{{Citation | url=https://www.theguardian.com/b...,citation,47896187,"[VBN,RB,RB,IN,NN,IN,NN,TO,DT,NNS,NN,.,NN,NN,''...","[taken,so,far,by,government,in,reaction,to,the...",22638,Consequences,37031,2
8,rest,{{Citation |contribution-url=http://www.bbc.co...,citation,37669,"[DT,JJ,TO,VB,VBN,IN,NNP,CC,VBD,NN,IN,JJ,JJ,NNS...","[the,first,to,be,printed,in,English,and,became...",918,Initial Section,9687,2
9,rest,{{Citation |first=Andrea |last=Levy |url=https...,citation,711824,"[WDT,PRP,VBP,JJ,JJ,CC,JJ,NNS,.,NN,NN,'',DT,VBZ...","[which,they,negotiate,racial,cultural,and,nati...",191,Initial Section,2486,2


In [20]:
## clearing up memory
del citations_features
del dataset
del book_journal_features
del newspaper_data

import gc
gc.collect()

35

In [23]:
## Remove rows which have duplicate ID and citations since they are just the same examples
dataset_with_features = dataset_with_features.drop_duplicates(subset=['id', 'citation']) ## keeps first row

In [88]:
dataset_with_features.shape

(2453971, 10)

### Taking the unique `sections` and one hot encoding it to get a vector

In [26]:
# Only processing auxiliary features which are going to be used in the neural network
auxiliary_features = dataset_with_features[
    ['sections', 'citation_type', 'citation', 'id', 'ref_index',
     'total_words', 'neighboring_tags', 'label_category']]

In [27]:
auxiliary_features['sections'] = auxiliary_features['sections'].astype(str)
auxiliary_features['sections'] = auxiliary_features['sections'].apply(lambda x: x.split(', '))

In [28]:
section_counts = pd.Series(Counter(chain.from_iterable(x for x in auxiliary_features.sections)))
largest_sections = section_counts.nlargest(150)

In [29]:
# Change section to `OTHERS` if occurence of the section is not in the 150 largest sections
auxiliary_features['sections'] = auxiliary_features['sections'].progress_apply(
    lambda x: list(set(['Others' if i not in largest_sections else i for i in x]))
)

100%|██████████| 2453971/2453971 [00:07<00:00, 320080.88it/s]


In [30]:
section_dummies = pd.get_dummies(auxiliary_features.sections.apply(pd.Series).stack())

In [31]:
auxiliary_features = auxiliary_features.join(section_dummies.sum(level=0))

In [32]:
auxiliary_features.drop('sections', axis=1, inplace=True)
auxiliary_features.head()

Unnamed: 0,citation_type,citation,id,ref_index,total_words,neighboring_tags,label_category,20th century,21st century,Activities,...,Taxonomy,Terminology,Timeline,Transfers,Treatment,Types,Usage,Uses,Work,Works
0,cite journal,{{cite journal | author= Kenneth Cornetta | au...,1831220,946,1649,"[IN, DT, JJ, NN, TO, VB, IN, JJ, NN, :, NNS, I...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,citation,{{Citation | date = 10 December 2012 | url = h...,469553,1427,2322,"[CD,NN,NN,CD,NN,NN,NN,'',NNP,NNP,'',NN,NN,CD,N...",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,cite journal,{{cite journal|last=Sorgi|first=FL|author2=Bha...,1831220,1034,1649,"[CD, JJ, NN, CD, NN, CC, JJ, NN, ., VBG, JJ, N...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,citation,{{Citation | last = | first = | author-link ...,566081,3926,8729,"[RB,VBD,VBN,.,NNP,VBD,PRP,MD,VB,IN,PRP,TO,NNP,...",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,cite journal,{{cite journal|last=Walker|first=WS|author2=Re...,1831220,1194,1649,"[JJ, NNS, VBG, NN, CC, VBD, CC, JJ, NNS, ., NN...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Taking the `type of citations` and one hot encoding it to get a vector

In [33]:
# Get one hot encoding of citation_type column
citation_type_encoding = pd.get_dummies(auxiliary_features['citation_type'])

In [34]:
# Drop column citation_type as it is now encoded and join it
auxiliary_features = auxiliary_features.drop('citation_type', axis = 1)

In [35]:
# Concat columns of the dummies along the axis with the matching index
auxiliary_features = pd.concat([auxiliary_features, citation_type_encoding], axis=1)
auxiliary_features.head()

Unnamed: 0,citation,id,ref_index,total_words,neighboring_tags,label_category,20th century,21st century,Activities,Adverse effects,...,cite podcast,cite press release,cite report,cite serial,cite speech,cite sports-reference,cite techreport,cite thesis,cite web,harvnb
0,{{cite journal | author= Kenneth Cornetta | au...,1831220,946,1649,"[IN, DT, JJ, NN, TO, VB, IN, JJ, NN, :, NNS, I...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,{{Citation | date = 10 December 2012 | url = h...,469553,1427,2322,"[CD,NN,NN,CD,NN,NN,NN,'',NNP,NNP,'',NN,NN,CD,N...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,{{cite journal|last=Sorgi|first=FL|author2=Bha...,1831220,1034,1649,"[CD, JJ, NN, CD, NN, CC, JJ, NN, ., VBG, JJ, N...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,{{Citation | last = | first = | author-link ...,566081,3926,8729,"[RB,VBD,VBN,.,NNP,VBD,PRP,MD,VB,IN,PRP,TO,NNP,...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,{{cite journal|last=Walker|first=WS|author2=Re...,1831220,1194,1649,"[JJ, NNS, VBG, NN, CC, VBD, CC, JJ, NNS, ., NN...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


As we can see for the feature `total_number_of_words`, the mean and median **(since it is more robust in nature!)** are pretty high for articles which are `not` journal or books

In [36]:
print('Total mean length of journal articles: {}'.format( ## Journal - length is less
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].mean()))
print('Total median length of journal articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].median()))

Total mean length of journal articles: 6599.744173897159
Total median length of journal articles: 3011.0


In [37]:
print('Total mean length of book articles: {}'.format( ## Rest of the article have larger length
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].mean()))
print('Total median length of book articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].median()))

Total mean length of book articles: 11543.160542483785
Total median length of book articles: 5151.0


In [38]:
print('Total mean length of book articles: {}'.format( ## Books - length is less
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].mean()))
print('Total median length of book articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].median()))

Total mean length of book articles: 6832.2215568965275
Total median length of book articles: 2758.0


### Taking the `neighboring_tags` and making an encoder dictionary for it

To have more info about how what tag mean what: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [39]:
citation_tag_features = dataset_with_features[['id', 'citation', 'neighboring_tags']]

In [40]:
citation_tag_features['neighboring_tags'].iloc[0]

array(['IN', 'DT', 'JJ', 'NN', 'TO', 'VB', 'IN', 'JJ', 'NN', ':', 'NNS',
       'IN', 'JJ', 'NN', 'JJ', 'JJ', 'NNP', 'IN', 'NNP', 'NNP', 'NN',
       'CD', 'NN', 'CD', 'NN', 'NN', 'CD', 'NN', ':', 'JJ', 'JJ', 'CD',
       'JJ', 'NN', 'CD', 'NN', 'NN', 'NN', 'NNP', 'CD'], dtype=object)

In [41]:
# Get the count for each POS tag so that we have an estimation as to how many are there
tag_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_tag_features.neighboring_tags)))

In [42]:
# Considering the 10 smallest tags and checking which one does not have resemblance
tag_counts.nsmallest(10) 

L         2
``      234
UH      650
`       678
SYM    1426
Y      1466
U      1503
H      1503
WP$    2106
PDT    3416
dtype: int64

We are going to replace `LS`, `the 2 backquotes` and the `the dollar symbol` since they do not have too much use case and do not give too much information about the context of the neighboring citation text.

In [43]:
OTHER_TAGS = ['LS', '``', '$']
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: [i if i not in OTHER_TAGS else 'Others' for i in x]
)

100%|██████████| 2453971/2453971 [00:46<00:00, 53093.96it/s] 


Now, we can use the `count vectorizer` to represent the `POS tags` as a vector where each element of the vector represents the count of that tag in that particular citation.

In [44]:
cv = CountVectorizer() # Instantiate the vectorizer

In [45]:
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 2453971/2453971 [00:04<00:00, 493203.69it/s]


In [46]:
transformed_neighboring_tags = cv.fit_transform(citation_tag_features['neighboring_tags'])
transformed_neighboring_tags = pd.DataFrame(transformed_neighboring_tags.toarray(), columns=cv.get_feature_names())

In [47]:
citation_tag_features = pd.concat([citation_tag_features, transformed_neighboring_tags], join='inner', axis=1)

In [48]:
citation_tag_features.drop('neighboring_tags', axis=1, inplace=True)
citation_tag_features.head()

Unnamed: 0,id,citation,cc,cd,dt,ex,fw,in,jj,jjr,...,vb,vbd,vbg,vbn,vbp,vbz,wdt,wikicode,wp,wrb
0,1831220,{{cite journal | author= Kenneth Cornetta | au...,0,6,1,0,0,4,8,0,...,1,0,0,0,0,0,0,0,0,0
1,1831220,{{cite journal|last=Sorgi|first=FL|author2=Bha...,1,5,0,0,0,0,6,0,...,0,1,1,0,1,0,0,0,0,0
2,1831220,{{cite journal|last=Walker|first=WS|author2=Re...,2,2,0,0,0,2,5,0,...,1,1,1,0,0,0,0,0,0,0
3,1831220,{{cite journal|last=Campbell|first=FW|author2=...,0,8,1,0,0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
4,1831220,{{cite journal|last=Welsby|first=IJ|author2=Ne...,0,4,0,0,0,3,6,0,...,0,0,0,0,0,0,0,0,0,0


## Features for the LSTM - more time sequence related

### Citation's original text features

In [49]:
# Create a separate dataframe for preprocessing citation text
citation_text_features = dataset_with_features[['id', 'citation', 'label_category']]

In [50]:
# Convert the citation into a list by breaking it down into characters
citation_text_features['characters'] = citation_text_features['citation'].progress_apply(lambda x: list(x))

100%|██████████| 2453971/2453971 [00:40<00:00, 61044.87it/s] 


In [51]:
# Get the character counts for each unique character
char_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_text_features.characters)))
char_counts.index

Index(['{', 'c', 'i', 't', 'e', ' ', 'j', 'o', 'u', 'r', 'n', 'a', 'l', '|',
       'h', '=', 'K', 'C', '2', 'W', '.', 'F', 'A', 'd', 's', 'P', 'm', 'f',
       'v', 'p', 'y', 'b', '-', 'g', ':', 'J', 'V', 'M', '1', '9', '8', '3',
       '7', '\', '0', '4', '/', 'w', '6', '(', ')', '}', 'S', 'L', 'B', ',',
       'H', 'G', 'k', 'R', 'D', 'I', '5', 'E', 'N', 'z', 'T', 'x', 'O', '&',
       'Z', '?', 'Y', 'q', 'U', 'Q', '_', 'X', ';', '[', ']', '+', '#', '%',
       '!', ''', '"', '~', '<', '>', '*', '`', '^', '@', '$'],
      dtype='object')

In [52]:
print('The max length of the longest citation in terms of characters is: {}'.format(
    max(citation_text_features.characters.apply(lambda x: len(x)))))

print('The mean length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).mean()))

print('The median length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).median()))

The max length of the longest citation in terms of characters is: 34353
The mean length of the longest citation in terms of characters is: 293.35183708364934
The median length of the longest citation in terms of characters is: 271.0


In [53]:
# Make a dictionary for creating a mapping between the char and the corresponding index
char2ind = {char: i for i, char in enumerate(char_counts.index)}
ind2char = {i: char for i, char in enumerate(char_counts.index)}

In [54]:
# Map each character into the citation to its corresponding index and store it in a list
X_char = []
for citation in citation_text_features.citation:
    citation_chars = []
    for character in citation:
        citation_chars.append(char2ind[character])
        
    X_char.append(citation_chars)

Since the median length of the citation is 282, we have padded the input till 400 to get extra information which would be fed into the character embedding neural network.

In [55]:
with tf.device('/gpu:0'):
    X_char = pad_sequences(X_char, maxlen=400)

In [56]:
# Append the citation character list with their corresponding lists for making a dataset
# for getting the character embeddings
data = []
for i in tqdm(range(len(X_char))):
    data.append((X_char[i], int(citation_text_features.iloc[i]['label_category'])))

100%|██████████| 2453971/2453971 [08:04<00:00, 5069.97it/s]


In [57]:
# Separate out the training data and labels for further verification use
training_data = [i[0] for i in data]
training_labels = [i[1] for i in data]

We are going to feed in the 400 character input since our median length comes out to be approximately 282 and train it on a dummy task - if the citation is scientific or not and get the embedding layer which would contain the representation for each character.

In [58]:
from keras.utils import to_categorical

categorical_labels = to_categorical(training_labels, num_classes=3)

In [59]:
def citation_embedding_model():
    """
    Citation embedding generator model where the dimension of the embedding is 50.
    """
    main_input = Input(shape=(400, ), name='characters')
    # input dim is basically the vocab size
    emb = Embedding(input_dim=95, output_dim = 300, name='citation_embedding')(main_input)
    rnn = Bidirectional(LSTM(20))
    x = rnn(emb)
    de = Dense(3, activation='softmax')(x)
    model = Model(inputs = main_input, outputs = de)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [60]:
# Instantiate the model and generate the summary
model = citation_embedding_model()
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
characters (InputLayer)      (None, 400)               0         
_________________________________________________________________
citation_embedding (Embeddin (None, 400, 300)          28500     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 123       
Total params: 79,983
Trainable params: 79,983
Non-trainable params: 0
_________________________________________________________________


In [61]:
def generator(features, labels, batch_size):
    """
    Generator to create batches of data so that processing is easy.
    
    :param: features: the features of the model.
    :param: labels: the labels of the model.
    :param: batch_size: the size of the batch
    """
    # Create empty arrays to contain batch of features and labels
    batch_features = np.zeros((batch_size, 400))
    batch_labels = np.zeros((batch_size, 3))
    while True:
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features), 1)[0]
            batch_features[i] = features[index]
            batch_labels[i] = categorical_labels[index]
        yield batch_features, batch_labels

In [62]:
# Run the model with the data being generated by the generator with a batch size of 64
# and number of epochs to be set to 15
hist = model.fit_generator(generator(training_data, categorical_labels, 64), samples_per_epoch=30, nb_epoch=15)

Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [63]:
# Save the model so that we can retrieve it later
# model.save('./embedding_model.h5')

In [64]:
# Get the `citation_embedding` layer and get the weights for each character
citation_layer = model.get_layer('citation_embedding')
citation_weights = citation_layer.get_weights()[0]
citation_weights.shape

(95, 300)

In [65]:
# An example of the first element of an embedding
citation_weights[0][:100]

array([ 0.01189123,  0.04370288,  0.00031026, -0.04214857, -0.00431195,
        0.00619524, -0.0269199 ,  0.01572377, -0.02040187, -0.01892583,
       -0.01155559, -0.01423857,  0.01058181,  0.05017758,  0.0124469 ,
        0.05356099, -0.02875755, -0.00841217,  0.02235851,  0.03364012,
        0.04286821,  0.0133015 ,  0.00080163, -0.04185148,  0.02404357,
       -0.03019644, -0.01477948,  0.03001074,  0.03345497, -0.02667047,
        0.04956942,  0.00269952,  0.01288244,  0.04655018, -0.00432052,
        0.0125154 , -0.02914265, -0.00929398, -0.03683035, -0.03522526,
       -0.01796045,  0.01024454, -0.01842368, -0.01174657,  0.0652336 ,
        0.01618789,  0.04744877,  0.00754394, -0.04456269,  0.0070715 ,
       -0.01405442, -0.0543743 ,  0.01465258,  0.04732801, -0.01740676,
        0.03404899,  0.00033237, -0.02001428,  0.00364981, -0.00930269,
       -0.02321461, -0.0186593 , -0.01450481, -0.02492204,  0.03149597,
        0.02592646, -0.028067  ,  0.03134247,  0.02240428, -0.00

In [66]:
# Map the embedding of each character to the character in each corresponding citation and aggregate (sum)
citation_text_features['embedding'] = citation_text_features['characters'].progress_apply(
    lambda x: sum([citation_weights[char2ind[c]] for c in x])
)

100%|██████████| 2453971/2453971 [12:38<00:00, 3234.61it/s]


In [67]:
# Normalize the citation embeddings so that we can check for their similarity later
citation_text_features['embedding'] = citation_text_features['embedding'].progress_apply(
    lambda x: x/ np.linalg.norm(x, axis=0).reshape((-1, 1))
)

100%|██████████| 2453971/2453971 [00:37<00:00, 65439.79it/s]


In [68]:
# Make the sum of the embedding to be summed up to 1
np.sum(np.square(citation_text_features['embedding'].iloc[0]))

1.0

### Similarity Graph for citation text embeddings

In [69]:
# Just considering 20 since otherwise it will be computationally extensive
# citation_text_and_embeddings = citation_text_features[['citation', 'embedding']][:500]

In [70]:
# citation_text_and_embeddings['embedding'] = citation_text_and_embeddings['embedding'].progress_apply(
#     lambda x: x[0].tolist()
# )

In [71]:
# def tsne_embedding_plot():
#     labels = []
#     tokens = []

#     index = 0
#     for row in citation_text_and_embeddings:
#         tokens.append(row['embedding'])
#         labels.append(str(index))
#         index += 1
    
#     # Perplexity takes into account the global and local features
#     # We are using dimensionality reduciton for 2 features and taking 2500 iterations into account
#     tsne_model = TSNE(perplexity=40, n_components=2, n_iter=2500, random_state=0)
#     new_values = tsne_model.fit_transform(tokens)

#     x = []
#     y = []
#     for value in new_values:
#         x.append(value[0])
#         y.append(value[1])
        
#     plt.figure(figsize=(10, 10)) 
#     for i in range(len(x)):
#         plt.scatter(x[i],y[i])
#         plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2),
#                      textcoords='offset points', ha='right', va='bottom')
#     plt.show()

In [72]:
# tsne_embedding_plot()

In [73]:
# an example of citation embeddings which is close to each other
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([14, 477])] # (51, 243), (0, 13)

In [74]:
# # Similiarity of 2 citations which are very similar
# result_similar = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[14]['embedding'],
#     citation_text_and_embeddings.iloc[477]['embedding']
# )
# result_similar

In [75]:
# an example of citation embeddings which is NOT close to each other and are different
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([42, 124])] # (6, 42)

In [76]:
# Similiarity of 2 citations which are not similar
# result_different = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[42]['embedding'],
#     citation_text_and_embeddings.iloc[124]['embedding']
# )
# result_different

### FastText embeddings for neighboring words

In [77]:
# Load the pretrained embedding model on wikipedia
model = FastText.load_fasttext_format('/dlabdata1/harshdee/wiki.en.bin')

In [78]:
# Create a separate dataframe for preprocessing citation words
citation_word_features = dataset_with_features[['id', 'citation', 'neighboring_words', 'label_category']]

In [79]:
# Lowercase all the neighboring words for each of the citations
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i.lower() for i in x]
)

100%|██████████| 2453971/2453971 [01:34<00:00, 26005.98it/s]


Get the total unique words with their respective counts in the total dataset. This is done in order to remove words which are of low frequency and will potentially act as noise to the model.

In [80]:
word_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words)))

In [81]:
threshold = 4

x = len(word_counts)
y = len(word_counts[word_counts <= threshold])
print('Total words: {}\nTotal number of words whose occurence is less than 4: {}\nDifference: {}'.format(x, y, x-y))
words_less_than_threshold = word_counts[word_counts <= threshold]

Total words: 4213281
Total number of words whose occurence is less than 4: 3858368
Difference: 354913


In [82]:
# Remove the words which have a count of less than 4 and replace them with the unique <UNK> symbol
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i if i not in words_less_than_threshold else '<UNK>' for i in x]
)

100%|██████████| 2453971/2453971 [08:19<00:00, 4913.03it/s]  


In [83]:
# creating a mapping between word and index or vice versa
words = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words))).index
word2ind = {w: i for i, w in enumerate(words)}
ind2words = {i: w for i, w in enumerate(words)}

In [84]:
word_embedding_matrix = np.zeros((len(word2ind), 300))
for w in tqdm(word2ind):
    index = word2ind[w]
    word_embedding_matrix[index] = model.wv[w]

100%|██████████| 354914/354914 [00:23<00:00, 15190.71it/s]


Once we have the word embedding for each word in the neighboring words, we sum the embeddings for each word together in neighboring words to get an embedding which represents the past 40 words.

In [85]:
citation_word_features['words_embedding'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: sum([word_embedding_matrix[word2ind[w]] for w in x])
)

100%|██████████| 2453971/2453971 [09:35<00:00, 4261.63it/s] 


Now we have the `citation_word_features` and `citation_tag_features`, so we can join them together to form `time_sequence_features` which would be fed later into the LSTM..

In [86]:
# Join time sequence features with the citations dataset
time_sequence_features = pd.concat([citation_tag_features, citation_word_features], keys=['id', 'citation'], axis=1)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [87]:
print('Total number of samples in time features are: {}'.format(time_sequence_features.shape))

Total number of samples in time features are: (2453971, 42)


In [89]:
# citation_text = auxiliary_features.iloc[:,0]
# auxiliary_features['citation_text'] = citation_text
# auxiliary_features.drop('citation', axis=1, inplace=True)
# auxiliary_features.rename({'citation_text': 'citation'}, axis=1, inplace=True)

In [95]:
# Join auxiliary features with the citations dataset
citation_text_features.reset_index(drop=True, inplace=True)
auxiliary_features.reset_index(drop=True, inplace=True)

auxiliary_features = pd.concat([auxiliary_features, citation_text_features], keys=['id', 'citation'], axis=1)
auxiliary_features = pd.concat([auxiliary_features['citation'], auxiliary_features['id']], axis=1)
auxiliary_features = auxiliary_features.loc[:, ~auxiliary_features.columns.duplicated()]
auxiliary_features.shape

(2453971, 185)

In [105]:
# Drop columns with are duplicates
auxiliary_features.drop(['neighboring_tags', 'characters'], axis=1, inplace=True)

In [106]:
del word_embedding_matrix
del citation_word_features
del citation_text_features

gc.collect()

2090

## Making sets for `auxiliary` and `time sequence` features

In [111]:
data = dataset_with_features[['id', 'citation', 'label_category']]

In [116]:
# Join the time sequence features for the data
time_sequence_features = pd.concat([time_sequence_features['id'], time_sequence_features['citation']], axis=1)
time_sequence_features = pd.concat([time_sequence_features, data], keys=['id', 'citation'], axis=1)
time_sequence_features.columns = time_sequence_features.columns.droplevel(0)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [124]:
time_sequence_features['words_embedding'] = time_sequence_features['words_embedding'].apply(lambda x: x.tolist())

In [125]:
auxiliary_features['embedding'] = auxiliary_features['embedding'].apply(lambda x: x.tolist())

In [126]:
len(time_sequence_features), len(auxiliary_features)

(2453971, 2453971)

## Splitting the dataset into training, testing and validation 

The split is done into 80-10-10 ratio so that we have more training data to train on and have validation dataset to make sure that the model is working as anticipated.

In [150]:
TOTAL_SAMPLES = time_sequence_features.shape[0]
indices = np.arange(TOTAL_SAMPLES)
np.random.shuffle(indices)

In [151]:
# Split indices into training and other sets (validation, testing)
training_end_index = int((TOTAL_SAMPLES * 80) / 100)
training_indices = indices[:training_end_index]
other_indices = indices[training_end_index:]

In [152]:
len(training_indices), len(other_indices)

(1963176, 490795)

In [153]:
# Get validation and testing indices
half_threshold = int(len(other_indices)/2)

validation_indices = other_indices[:half_threshold]
test_indices = other_indices[half_threshold:]

In [154]:
len(validation_indices), len(test_indices)

(245397, 245398)

In [155]:
# Get the labels for training, validation and testing since they are going to be the same
y_train = auxiliary_features.loc[training_indices, 'label_category'].astype(int).tolist()
y_val = auxiliary_features.loc[validation_indices, 'label_category'].astype(int).tolist()
y_test = auxiliary_features.loc[test_indices, 'label_category'].astype(int).tolist()

In [156]:
# Make a mask for auxiliary dataset to get all features except the one below
column_mask_aux = ~auxiliary_features.columns.isin(['id', 'citation', 'label_category'])

In [157]:
# # Get the columns of those auxiliary features and covert them into a list
training_auxiliary = auxiliary_features.loc[training_indices, column_mask_aux].values.tolist()
validation_auxiliary = auxiliary_features.loc[validation_indices, column_mask_aux].values.tolist()
testing_auxiliary = auxiliary_features.loc[test_indices, column_mask_aux].values.tolist()

In [181]:
# # Convert them into numpy array (for Keras) and stack them (if needed) as suited for the model's format
training_auxiliary = [np.hstack(training_auxiliary[i][0] + training_auxiliary[i][1:]) for i in range(len(training_auxiliary))]

In [182]:
validation_auxiliary = [
    np.hstack(validation_auxiliary[i][0] + validation_auxiliary[i][1:]) for i in range(len(validation_auxiliary))]
testing_auxiliary = [
    np.hstack(testing_auxiliary[i][0] + testing_auxiliary[i][1:]) for i in range(len(testing_auxiliary))]

In [196]:
# # Make a mask for time sequences features dataset to get all features except the one below
cols = [col for col in time_sequence_features.columns if col not in ['id', 'citation', 'label_category', 'neighboring_words']]
stripped_tsf = time_sequence_features[cols]

In [202]:
training_time = stripped_tsf.iloc[training_indices].values.tolist()
validation_time = stripped_tsf.iloc[validation_indices].values.tolist()
testing_time = stripped_tsf.iloc[test_indices].values.tolist()

In [203]:
def make_structure_time_features(time_features):
    """
    Concatenate features which are numbers and lists together by checking the type:
    
    param: time_features: the features which are considered time sequence.
    """
    feature_one = np.array([i for i in time_features if isinstance(i, int)])
    feature_two = np.array([i for i in time_features if isinstance(i, list)][0])
    return np.array([feature_one, feature_two])

In [205]:
training_time = [make_structure_time_features(i) for i in training_time]
validation_time = [make_structure_time_features(i) for i in validation_time]
testing_time = [make_structure_time_features(i) for i in testing_time]

## LSTM/Neural Network Model

In [212]:
# Instantiating PCA to 35 components since it should be equal to the size of the vector of the tags
pca = PCA(n_components=35)

In [213]:
def get_reduced_words_dimension(data):
    """
    Get the aggregated dataset of words and tags which has the
    same dimensionality using PCA.
    
    :param: data: data which needs to be aggregated.
    """
    tags = [i for i, _ in data]
    word_embeddings = [j for _,j in data]
    pca.fit(word_embeddings)
    
    word_embeddings_pca = pca.transform(word_embeddings)
    tags = np.array(tags)
    return np.dstack((word_embeddings_pca, tags))

In [214]:
test_pca = get_reduced_words_dimension(testing_time)

In [215]:
# Apply PCA on all the sets of data to have the dimensions of the data to be the same
trained_pca = get_reduced_words_dimension(training_time)
val_pca = get_reduced_words_dimension(validation_time)


In [216]:
y_train_oh = np.eye(3)[y_train]
y_val_oh = np.eye(3)[y_val]
y_test_oh = np.eye(3)[y_test]

In [228]:
def classification_model():
    """
    Model for classifying whether a citation is scientific or not.
    """
    main_input = Input(shape=(35, 2), name='time_input')
    lstm_out = LSTM(32)(main_input)

    auxiliary_input = Input(shape=(476,), name='aux_input')
    # Converging the auxiliary input with the LSTM output
    x = keras.layers.concatenate([lstm_out, auxiliary_input])

    # 4 fully connected layer
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(32, activation='relu')(x)

    main_output = Dense(3, activation='softmax', name='main_output')(x)
    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])
    
    model.compile(
        optimizer='adam', loss={'main_output': 'categorical_crossentropy'},
        loss_weights={'main_output': 1.}, metrics=['acc']
    )
    return model

In [229]:
# Instantiating the classification model
model = classification_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
time_input (InputLayer)         (None, 35, 2)        0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 32)           4480        time_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 476)          0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 508)          0           lstm_4[0][0]                     
                                                                 aux_input[0][0]                  
__________

We use `ReduceLRonPlateau` so that the model does not overshoot the optimal minimum point and hence by default we start with a learning rate of 0.01 but as soon as the accuracy stop increasing the learning rate does not change which helps us converge better.

In [230]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.001)

In [231]:
# Running the model with the training, testing and validation set and for 10 epocsh and size 256
model.fit({'time_input': trained_pca,
           'aux_input': np.array(training_auxiliary)
    }, {'main_output': y_train_oh }, 
          validation_data=([val_pca, np.array(validation_auxiliary)], [y_val_oh]),
          epochs=5, batch_size=256, callbacks=[reduce_lr], shuffle=True
)

Train on 1963176 samples, validate on 245397 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f53a54707b8>

In [260]:
## Saving the model and its architecture, so that it can be loaded in later
model.save('/dlabdata1/harshdee/citation_model.h5')
json_string = model.to_json()
with open("/dlabdata1/harshdee/model_num.json", "w") as json_file:
    json_file.write(json_string)

In [232]:
prediction = model.predict([test_pca, np.array(testing_auxiliary)])

In [234]:
prediction.shape

(245398, 3)

In [235]:
# np.amax(prediction, axis=1)
y_pred = np.argmax(prediction, axis=1)

In [254]:
print("Accuracy of the Neural network model:", accuracy_score(y_test, y_pred))
# confusion matrix of the results/testing set
res = pd.DataFrame(confusion_matrix(y_test, y_pred))
res.index = ['book', 'journal', 'newspapers/rest']
res.columns = ['book', 'journal', 'newspapers/rest']

res

Accuracy of the Neural network model: 0.9696248543182912


Unnamed: 0,book,journal,newspapers/rest
book,53891,2136,1728
journal,2747,78774,816
newspapers/rest,25,2,105279


In [244]:
# Make a dataframe which contains the predicted and true labels for the testing set
predict_val = pd.DataFrame(
    list(zip(test_indices, y_pred.ravel().astype(int).tolist(), y_test)),
    columns=['index_number', 'predicted_label', 'actual_label'])
predict_val = predict_val.set_index(['index_number'])
predict_val.head()

Unnamed: 0_level_0,predicted_label,actual_label
index_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1796644,2,2
2424867,2,2
941168,1,1
408993,1,1
1097016,1,1


In [246]:
# Get labels with the predicted label and actual label and merge it with the citation which we have trained it on
labels_with_info = pd.merge(predict_val, ids_and_citations, left_index=True, right_index=True)

In [264]:
labels_with_info[((labels_with_info['predicted_label'] == 1) & (labels_with_info['actual_label'] == 2))]

Unnamed: 0,predicted_label,actual_label,id,citation
1690888,1,2,37288587,{{Cite book | title = Quattroruote: Tutte le A...


In [263]:
labels_with_info[((labels_with_info['predicted_label'] == 0) & (labels_with_info['actual_label'] == 2))]

Unnamed: 0,predicted_label,actual_label,id,citation
1664001,0,2,12916885,"{{Cite journal | author = Ashton, P. | title =..."
1730809,0,2,22332593,{{cite book| title=East Fishkill| page=109| fi...
1762490,0,2,164968,{{Cite book|title=Steno on Muscles|author=Troe...
1791742,0,2,3162850,{{cite book |last=|first=|authorlink=|editor=S...
1842133,0,2,5689694,"{{cite journal|author1=Salkin, Alf |author2=Ha..."
1851799,0,2,33277,{{cite book |title=William Morris and Edward B...
1855144,0,2,10074626,{{cite book |last=Espelid |first=Harald |title...


In [266]:
labels_with_info[((labels_with_info['predicted_label'] == 0) & (labels_with_info['actual_label'] == 1))].head(5)

Unnamed: 0,predicted_label,actual_label,id,citation
92,0,1,1833304,"{{cite conference|authors=Potter, R., Weldon, ..."
92,0,1,46923213,{{Cite news |url= https://www.washingtonpost.c...
371,0,1,1840387,{{Cite journal|last=Silk|first=Joan B.|last2=A...
371,0,1,49433544,{{Cite news|url=https://www.nytimes.com/2003/0...
385,0,1,1840387,{{Cite journal|last=Kemp|first=Andrew H.|last2...


In [267]:
labels_with_info[((labels_with_info['predicted_label'] == 1) & (labels_with_info['actual_label'] == 0))].head(5)

Unnamed: 0,predicted_label,actual_label,id,citation
127,1,0,1834251,{{citation|title=Islam in World Cultures: Comp...
127,1,0,22754875,{{Cite news| issn = 0261-3077| last = Monbiot|...
1156,1,0,1855357,"{{cite journal|last=Jones|author2=Lawton, and ..."
1156,1,0,45391560,{{cite news |first=Clive |last=Lindsay |title=...
1215,1,0,1856816,{{cite journal |last1=Crepineau |first1=Floren...


## Training/Testing split different - 80/20 ratio

This is done for consistency in results and comparing it with the Random Forest model we generated in the other notebook.

In [None]:
# Instantiating the model with a different split
# model_split_different = classification_model()
# model_split_different.summary()

In [None]:
# Appending validation and testing set together to make testing set of 20%
# new_test_pca = np.concatenate((val_pca,test_pca))
# new_testing_auxiliary = np.concatenate((validation_auxiliary, testing_auxiliary))
# new_y_test = y_val + y_test

In [None]:
# Fitting the model for 10 epochs and a batch size of 256
# hist = model_split_different.fit({'time_input': trained_pca,
#            'aux_input': np.array(training_auxiliary)
#     }, {'main_output': np.array(y_train) }, 
#           epochs=10, batch_size=256, callbacks=[reduce_lr], shuffle=True
# )

In [None]:
# Do predictions on the testing set
# prediction2 = model2.predict([new_test_pca, np.array(new_testing_auxiliary)])
# y_pred2 = prediction2 > 0.5

In [None]:
# print("Accuracy of the Neural network model:", metrics.accuracy_score(new_y_test, y_pred2))
# # confusion matrix of the results/testing set
# pd.DataFrame(confusion_matrix(new_y_test, y_pred2))