In [1]:
import re
import json
import glob
import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from itertools import chain
from keras.models import Model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.manifold import TSNE
from gensim.models import FastText
from sklearn.decomposition import PCA
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split 
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Initializing tqdm for pandas
tqdm.pandas()

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos if x.device_type == 'GPU'])

[]


In [3]:
np.random.seed(0)

In [5]:
citations_features = pd.read_parquet('/dlabdata1/harshdee/citations_features.parquet/', engine='pyarrow')
dataset = pd.read_csv('dataset.csv')

In [6]:
# Attach the extracted features to every citation. Both frames use the same key
# names, so an inner join on (`id`, `citation`) keeps only citations that have features.
book_journal_features = pd.merge(
    dataset, citations_features, how='inner', on=['id', 'citation']
)
# Drop two columns that are not used further down: `page_title_y` (the `_y` suffix
# is what pandas gives the right-hand copy of a column present in both frames) and
# `Unnamed: 0` (presumably an index column written by an earlier CSV export - confirm).
book_journal_features = book_journal_features.drop(columns=['page_title_y', 'Unnamed: 0'])

In [7]:
book_journal_features.shape

(1754055, 16)

In [8]:
# Keep only the first row for every (`id`, `citation`) pair so that the dataset
# is more varied. The pair is moved into the index, duplicated index entries are
# masked out, and the pair is then restored as ordinary columns.
book_journal_features = (
    book_journal_features
    .set_index(['id', 'citation'])
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
    .reset_index()
)

## Get auxiliary features and divide them into labels

1. `ref_index`
2. `total_words`
3. `tags`
4. `type_of_citation`

#### can include `section` of the page in which the citation belongs to

In [9]:
book_journal_features['actual_label'] = 'rest'

In [10]:
# A citation that carries a PMC or a PMID identifier is labelled as a journal article.
has_pubmed_identifier = (
    book_journal_features['PMC'].notna() | book_journal_features['PMID'].notna()
)
book_journal_features.loc[has_pubmed_identifier, 'actual_label'] = 'journal'

In [11]:
# A DOI with none of PMC / PMID / ISBN present is labelled as a journal article.
only_doi = (
    book_journal_features['DOI'].notna()
    & book_journal_features[['PMC', 'PMID', 'ISBN']].isna().all(axis=1)
)
book_journal_features.loc[only_doi, 'actual_label'] = 'journal'

In [12]:
# An ISBN with none of PMC / PMID / DOI present is labelled as a book.
only_book = (
    book_journal_features['ISBN'].notna()
    & book_journal_features[['PMC', 'PMID', 'DOI']].isna().all(axis=1)
)
book_journal_features.loc[only_book, 'actual_label'] = 'book'

In [13]:
# ISBN and DOI both present (and no PubMed identifiers): the identifiers alone are
# ambiguous, so the citation template decides. Journal / conference templates -> journal.
journal_templates = ['cite journal', 'cite conference']
both_book_and_doi_journal = (
    book_journal_features[['ISBN', 'DOI']].notna().all(axis=1)
    & book_journal_features[['PMID', 'PMC']].isna().all(axis=1)
    & book_journal_features['citation_type'].isin(journal_templates)
)
book_journal_features.loc[both_book_and_doi_journal, 'actual_label'] = 'journal'

In [14]:
# ISBN and DOI both present (and no PubMed identifiers): the identifiers alone are
# ambiguous, so the citation template decides. Book / encyclopedia templates -> book.
book_templates = ['cite book', 'cite encyclopedia']
both_book_and_doi_book = (
    book_journal_features[['ISBN', 'DOI']].notna().all(axis=1)
    & book_journal_features[['PMID', 'PMC']].isna().all(axis=1)
    & book_journal_features['citation_type'].isin(book_templates)
)
book_journal_features.loc[both_book_and_doi_book, 'actual_label'] = 'book'

In [15]:
## Keep only the citations that were labelled as book or journal, and only the
## columns that are needed as features / labels later on
labelled_classes = ['book', 'journal']
kept_columns = [
    'sections', 'citation_type', 'citation', 'id', 'ref_index',
    'total_words', 'neighboring_tags', 'actual_label', 'neighboring_words'
]
is_labelled = book_journal_features['actual_label'].isin(labelled_classes)
book_journal_features = book_journal_features.loc[is_labelled, kept_columns]
book_journal_features.shape

(1402511, 9)

In [16]:
book_journal_features.iloc[0]['citation']

'{{cite journal | author= Kenneth Cornetta | author2 = W.French Anderson | title = Protamine sulfate as an effective alternative to polybrene in retroviral-mediated gene-transfer: implications for human gene therapy | journal = Journal of Virological Methods | year= 1989 | volume= 23 | issue= 2 | pages= 187\\u2013194 | url=http://www.sciencedirect.com/science/article/pii/0166093489901328 | doi=10.1016/0166-0934(89)90132-8 | pmid= 2786000}}'

In [17]:
# Blank out the value of the template fields that reveal the source of a
# citation (`url`, `work`, `newspaper`, `website`), keeping only `<field> = `.
# The four substitutions are applied one after the other, in this order.
#
# The patterns are raw strings so that `\s` is a regex class rather than an
# invalid string escape.
# NOTE(review): the patterns are unanchored, so `url` also matches the tail of
# longer parameter names such as `archiveurl`, and `[^|]+` runs up to the next
# `|`, which for the last field of a template includes the closing `}}`.
# Left unchanged here so the cleaned text stays the same - confirm it is intended.
for field_name in ('url', 'work', 'newspaper', 'website'):
    field_pattern = re.compile(field_name + r'\s{0,10}=\s{0,10}([^|]+)')
    blanked_field = field_name + ' = '
    book_journal_features['citation'] = book_journal_features['citation'].progress_apply(
        lambda x: field_pattern.sub(blanked_field, x))

100%|██████████| 1402511/1402511 [00:03<00:00, 391859.57it/s]
100%|██████████| 1402511/1402511 [00:03<00:00, 383214.35it/s]
100%|██████████| 1402511/1402511 [00:04<00:00, 326432.12it/s]
100%|██████████| 1402511/1402511 [00:03<00:00, 355127.63it/s]


In [18]:
book_journal_features.iloc[0]['citation']

'{{cite journal | author= Kenneth Cornetta | author2 = W.French Anderson | title = Protamine sulfate as an effective alternative to polybrene in retroviral-mediated gene-transfer: implications for human gene therapy | journal = Journal of Virological Methods | year= 1989 | volume= 23 | issue= 2 | pages= 187\\u2013194 | url = | doi=10.1016/0166-0934(89)90132-8 | pmid= 2786000}}'

In [19]:
## Load the newspaper citations (generated from the citations_separated dataset).
## The export is a directory of headerless, tab-separated part files that are
## read one by one and stacked into a single frame with a fresh 0..n-1 index.
all_files = glob.glob('/dlabdata1/harshdee/newspapers_citations_features.csv/' + "/*.csv")
li = [pd.read_csv(filename, header=None, sep='\t') for filename in all_files]

newspaper_data = pd.concat(li, axis=0, ignore_index=True)
newspaper_data.shape

(1388908, 35)

In [20]:
# The part files have no header, so columns are addressed by position. Keep the
# eight that are needed and give them the names used by the other datasets.
newspaper_column_names = {
    0: 'citation', 1: 'ref_index', 2: 'total_words',
    3: 'neighboring_words', 4: 'neighboring_tags',
    28: 'id', 32: 'sections', 33: 'citation_type'}

# Built as one non-mutating chain: renaming / assigning in place on a column
# slice of the original frame is chained assignment (SettingWithCopyWarning).
newspaper_data = (
    newspaper_data[list(newspaper_column_names)]
    .rename(columns=newspaper_column_names)
    .assign(actual_label='newspaper')
)

In [21]:
newspaper_data.iloc[819218]['citation'] ## Example before removing the fields

'{{cite news | accessdate=January 30, 2016|work= New York Times| first = Henry | last = Raymont | url = https://timesmachine.nytimes.com/timesmachine/1971/11/14/79406266.pdf | title = U.S. Shift on Cuba in 1960 Detailed | date = November 14, 1971}}'

In [22]:
# Blank out the value of the template fields that reveal the source of a
# citation (`url`, `work`, `newspaper`, `website`), keeping only `<field> = `.
# The four substitutions are applied one after the other, in this order.
#
# The patterns are raw strings so that `\s` is a regex class rather than an
# invalid string escape.
# NOTE(review): the patterns are unanchored (e.g. `url` also matches the tail of
# `archiveurl`) and `[^|]+` runs up to the next `|`, which for the last field of
# a template includes the closing `}}`. Left unchanged - confirm it is intended.
for field_name in ('url', 'work', 'newspaper', 'website'):
    field_pattern = re.compile(field_name + r'\s{0,10}=\s{0,10}([^|]+)')
    blanked_field = field_name + ' = '
    newspaper_data['citation'] = newspaper_data['citation'].progress_apply(
        lambda x: field_pattern.sub(blanked_field, x))

100%|██████████| 1388908/1388908 [00:04<00:00, 280428.82it/s]
100%|██████████| 1388908/1388908 [00:03<00:00, 366677.25it/s]
100%|██████████| 1388908/1388908 [00:03<00:00, 424954.28it/s]
100%|██████████| 1388908/1388908 [00:03<00:00, 447678.22it/s]


In [23]:
newspaper_data.iloc[819218]['citation'] ## Example after removing the fields

'{{cite news | accessdate=January 30, 2016|work = | first = Henry | last = Raymont | url = | title = U.S. Shift on Cuba in 1960 Detailed | date = November 14, 1971}}'

In [24]:
entertainment_features = pd.read_parquet(
    '/dlabdata1/harshdee/entertainment_citations_complete.parquet/', engine='pyarrow')

In [25]:
# Keep the feature columns shared with the other datasets and rename `citations`
# to `citation` so that all frames line up when they are concatenated.
# Done as one non-mutating chain: an in-place rename on a column slice of the
# original frame is chained assignment (SettingWithCopyWarning).
entertainment_features = (
    entertainment_features[[
        'ref_index', 'total_words', 'neighboring_words', 'neighboring_tags',
        'id', 'sections', 'citations']]
    .rename(columns={'citations': 'citation'})
)

In [26]:
entertainment_features.iloc[23787]['citation'] ## Example before removing the fields

'{{cite web|url=http://www.billboard.com/charts/2010-03-06/latin-albums|title=Latin Albums: Week of March 06, 2010|date=March 6, 2010|work=Billboard|publisher=Prometheus Global Media|accessdate=March 15, 2012}}'

In [27]:
# Every row of this frame belongs to the `entertainment` class.
entertainment_features['actual_label'] = 'entertainment'

# `citation_type` is not kept as a feature (the entertainment frame never had it),
# so remove it from the two frames that still carry it.
newspaper_data = newspaper_data.drop(columns='citation_type')
book_journal_features = book_journal_features.drop(columns='citation_type')

In [28]:
# Blank out the value of the template fields that reveal the source of a
# citation (`url`, `work`, `newspaper`, `website`), keeping only `<field> = `.
# The four substitutions are applied one after the other, in this order.
#
# The patterns are raw strings so that `\s` is a regex class rather than an
# invalid string escape.
# NOTE(review): the patterns are unanchored (e.g. `url` also matches the tail of
# `archiveurl`) and `[^|]+` runs up to the next `|`, which for the last field of
# a template includes the closing `}}`. Left unchanged - confirm it is intended.
for field_name in ('url', 'work', 'newspaper', 'website'):
    field_pattern = re.compile(field_name + r'\s{0,10}=\s{0,10}([^|]+)')
    blanked_field = field_name + ' = '
    entertainment_features['citation'] = entertainment_features['citation'].progress_apply(
        lambda x: field_pattern.sub(blanked_field, x))

100%|██████████| 609218/609218 [00:01<00:00, 354227.78it/s]
100%|██████████| 609218/609218 [00:01<00:00, 354009.94it/s]
100%|██████████| 609218/609218 [00:01<00:00, 354752.91it/s]
100%|██████████| 609218/609218 [00:01<00:00, 384490.74it/s]


In [29]:
entertainment_features.iloc[23787]['citation'] ## Example after removing the fields

'{{cite web|url = |title=Latin Albums: Week of March 06, 2010|date=March 6, 2010|work = |publisher=Prometheus Global Media|accessdate=March 15, 2012}}'

In [30]:
# Count how often each template name (the text between `{{` and the first `|`)
# occurs among the entertainment citations.
template_name_pattern = re.compile(r'{{\s{0,10}([^|]+)')
entertainment_features['citation'].progress_apply(
    lambda x: template_name_pattern.findall(x)[0].strip()
).value_counts()

100%|██████████| 609218/609218 [00:01<00:00, 409267.60it/s]


cite web               489586
cite news               39991
Cite web                33099
Citation                18787
Cite news               13013
cite journal             5078
cite AV media            4489
citation                 1074
cite episode              703
Cite AV media notes       577
Cite AV media             532
cite press release        455
cite interview            451
Cite episode              410
cite AV media notes       324
cite book                 266
Cite journal              175
cite podcast               51
cite speech                49
cite conference            28
Cite book                  20
Cite press release         19
Cite interview             16
cite serial                 8
Cite speech                 4
Cite conference             3
cite report                 2
cite encyclopedia           2
cite DVD notes              2
Cite report                 2
Cite encyclopedia           1
Cite podcast                1
Name: citation, dtype: int64

In [31]:
dataset_with_features = pd.concat([book_journal_features, newspaper_data, entertainment_features])
dataset_with_features.shape

(3400637, 8)

In [32]:
# Encode the class names as integer codes. The sample rows printed right below
# show the resulting mapping: book -> 0, entertainment -> 1, journal -> 2, newspaper -> 3.
le = preprocessing.LabelEncoder()
dataset_with_features['label_category'] = le.fit_transform(dataset_with_features['actual_label'])

In [33]:
dataset_with_features[dataset_with_features['actual_label'] == 'entertainment'].head(1)

Unnamed: 0,actual_label,citation,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,label_category
0,entertainment,{{Citation | title = YouTube | url = | contrib...,15511,"[JJ, NNP, VBD, RB, JJ, NN, :, NN, :, JJ, NN, :...","[last, Horsley, deadurl, yes, archiveurl, http...",1625,Initial Section,8158,1


In [34]:
dataset_with_features[dataset_with_features['actual_label'] == 'newspaper'].head(1)

Unnamed: 0,actual_label,citation,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,label_category
0,newspaper,{{Citation | date = 10 December 2012 | url = |...,469553,"[CD,NN,NN,CD,NN,NN,NN,'',NNP,NNP,'',NN,NN,CD,N...","[99,issue,page,463,ref,ref,name=,'',Lovelace,G...",1427,Initial Section,2322,3


In [35]:
dataset_with_features[dataset_with_features['actual_label'] == 'book'].head(1)

Unnamed: 0,actual_label,citation,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,label_category
34,book,{{cite book|author1=David John Cole|author2=Ev...,1831574,"[NN, NNP, NNP, NNP, NNP, NNP, NNP, IN, NNP, NN...","[bookauthor1David, John, Coleauthor2Eve, Brown...",110,Initial Section,1242,0


In [36]:
dataset_with_features[dataset_with_features['actual_label'] == 'journal'].head(1)

Unnamed: 0,actual_label,citation,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,label_category
0,journal,{{cite journal | author= Kenneth Cornetta | au...,1831220,"[NNP, NN, CD, NN, CD, NN, NNS, CD, JJ, VBD, JJ...","[Methods, year, 1989, volume, 23, issue, pages...",941,Initial Section,1661,2


In [37]:
## Free memory: the per-source frames are no longer needed once they have been
## concatenated into `dataset_with_features`
import gc

del citations_features, dataset
del book_journal_features, newspaper_data, entertainment_features

gc.collect()

56

In [38]:
## Rows that share both ID and citation are the same example, so only the first
## occurrence is kept; the index is then rebuilt as 0..n-1
dataset_with_features = (
    dataset_with_features
    .drop_duplicates(subset=['id', 'citation'])
    .reset_index(drop=True)
)

In [39]:
dataset_with_features.shape

(3053972, 9)

In [40]:
## Please save this file and use it - as an intermediate file if you want to use it somewhere else
## dataset_with_features.to_csv('dataset_with_features.csv', index=False)

### Taking the unique `sections` and one hot encoding it to get a vector

In [41]:
# Only processing auxiliary features which are going to be used in the neural network
auxiliary_features = dataset_with_features[
    ['sections', 'citation', 'id', 'ref_index',
     'total_words', 'neighboring_tags', 'label_category']]

In [42]:
# Turn the comma-separated section string of every citation into a list of
# section names (values are cast to `str` first, so missing values become a string too).
auxiliary_features['sections'] = (
    auxiliary_features['sections'].astype(str).str.split(', ')
)

In [43]:
# Number of citations per section name, and the 150 most frequent sections.
all_section_names = chain.from_iterable(auxiliary_features['sections'])
section_counts = pd.Series(Counter(all_section_names))
largest_sections = section_counts.nlargest(150)

In [44]:
# Replace every section that is not among the 150 most frequent ones by `Others`
# and de-duplicate the names of each citation.
frequent_section_names = set(largest_sections.index)
auxiliary_features['sections'] = auxiliary_features['sections'].progress_apply(
    lambda names: list({name if name in frequent_section_names else 'Others' for name in names})
)

100%|██████████| 3053972/3053972 [00:09<00:00, 306834.44it/s]


In [45]:
auxiliary_features.head()

Unnamed: 0,sections,citation,id,ref_index,total_words,neighboring_tags,label_category
0,[Initial Section],{{cite journal | author= Kenneth Cornetta | au...,1831220,941,1661,"[NNP, NN, CD, NN, CD, NN, NNS, CD, JJ, VBD, JJ...",2
1,[Initial Section],{{cite journal|last=Sorgi|first=FL|author2=Bha...,1831220,1025,1661,"[NN, ,, NNP, ,, NNP, NN, NNS, JJ, NN, NN, VBD,...",2
2,[Initial Section],{{cite journal|last=Walker|first=WS|author2=Re...,1831220,1187,1661,"[,, NNP, ., JJ, JJ, NN, IN, NNS, IN, JJ, NNS, ...",2
3,[Initial Section],{{cite journal|last=Campbell|first=FW|author2=...,1831220,1267,1661,"[,, NNP, ,, NN, ., NN, IN, DT, NN, IN, JJ, NN,...",2
4,[Initial Section],{{cite journal|last=Welsby|first=IJ|author2=Ne...,1831220,1364,1661,"[JJ, NNS, IN, JJ, NN, NN, IN, NN, IN, JJ, NN, ...",2


In [46]:
section_dummies = pd.get_dummies(auxiliary_features.sections.apply(pd.Series).stack())

In [47]:
# `section_dummies` has one row per (citation row, position in its section list).
# Summing over the outer index level gives one multi-hot row per citation, which
# is joined back on the row index. `groupby(level=0).sum()` is used instead of
# `sum(level=0)`, whose `level` argument is deprecated in newer pandas releases.
auxiliary_features = auxiliary_features.join(section_dummies.groupby(level=0).sum())

In [48]:
auxiliary_features.drop('sections', axis=1, inplace=True)
auxiliary_features.head()

Unnamed: 0,citation,id,ref_index,total_words,neighboring_tags,label_category,20th century,21st century,Accolades,Activities,...,Taxonomy,Terminology,Timeline,Track listing,Transfers,Treatment,Types,Uses,Work,Works
0,{{cite journal | author= Kenneth Cornetta | au...,1831220,941,1661,"[NNP, NN, CD, NN, CD, NN, NNS, CD, JJ, VBD, JJ...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,{{cite journal|last=Sorgi|first=FL|author2=Bha...,1831220,1025,1661,"[NN, ,, NNP, ,, NNP, NN, NNS, JJ, NN, NN, VBD,...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,{{cite journal|last=Walker|first=WS|author2=Re...,1831220,1187,1661,"[,, NNP, ., JJ, JJ, NN, IN, NNS, IN, JJ, NNS, ...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,{{cite journal|last=Campbell|first=FW|author2=...,1831220,1267,1661,"[,, NNP, ,, NN, ., NN, IN, DT, NN, IN, JJ, NN,...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,{{cite journal|last=Welsby|first=IJ|author2=Ne...,1831220,1364,1661,"[JJ, NNS, IN, JJ, NN, NN, IN, NN, IN, JJ, NN, ...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Taking the `type of citations` and one hot encoding it to get a vector

In [49]:
## Get one hot encoding of citation_type column
# citation_type_encoding = pd.get_dummies(auxiliary_features['citation_type'])

In [50]:
## Drop column citation_type as it is now encoded and join it
# auxiliary_features = auxiliary_features.drop('citation_type', axis=1)

In [51]:
## Concat columns of the dummies along the axis with the matching index
# auxiliary_features = pd.concat([auxiliary_features, citation_type_encoding], axis=1)
# auxiliary_features.head()

As we can see for the feature `total_words`, the mean and the median **(the median being the more robust of the two)** are considerably higher for articles whose citations are `not` journals or books.

In [52]:
# `label_category` 1 is the `entertainment` class (see the sample entertainment
# row printed earlier, whose label_category is 1), so the messages say so.
entertainment_total_words = auxiliary_features.loc[
    auxiliary_features['label_category'] == 1, 'total_words']
print('Total mean length of entertainment articles: {}'.format(
    entertainment_total_words.mean()))
print('Total median length of entertainment articles: {}'.format(
    entertainment_total_words.median()))

Total mean length of journal articles: 11569.929999268748
Total median length of journal articles: 5371.0


In [53]:
# `label_category` 2 is the `journal` class (see the sample journal row printed
# earlier, whose label_category is 2), so the messages say so.
journal_total_words = auxiliary_features.loc[
    auxiliary_features['label_category'] == 2, 'total_words']
print('Total mean length of journal articles: {}'.format(
    journal_total_words.mean()))
print('Total median length of journal articles: {}'.format(
    journal_total_words.median()))

Total mean length of book articles: 6761.968974319627
Total median length of book articles: 3097.0


In [54]:
# `label_category` 0 is the `book` class: articles citing books are shorter.
book_total_words = auxiliary_features.loc[
    auxiliary_features['label_category'] == 0, 'total_words']
print('Total mean length of book articles: {}'.format(book_total_words.mean()))
print('Total median length of book articles: {}'.format(book_total_words.median()))

Total mean length of book articles: 6899.136715744908
Total median length of book articles: 2782.0


### Taking the `neighboring_tags` and making an encoder dictionary for it

For more information about what each tag means, see: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [55]:
citation_tag_features = dataset_with_features[['id', 'citation', 'neighboring_tags']]

In [56]:
citation_tag_features['neighboring_tags'].iloc[0]

array(['NNP', 'NN', 'CD', 'NN', 'CD', 'NN', 'NNS', 'CD', 'JJ', 'VBD',
       'JJ', 'CD', 'JJ', 'NN', 'CD', 'NN', 'NN', 'NN', 'NNP', 'CD'],
      dtype=object)

In [57]:
# Get the count for each POS tag so that we have an estimation as to how many are there
tag_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_tag_features.neighboring_tags)))

In [58]:
# Considering the 10 smallest tags and checking which one does not have resemblance
tag_counts.nsmallest(10) 

L         2
LS        3
``      321
`       678
UH     1258
Y      1463
U      1502
H      1502
WP$    1543
PDT    3661
dtype: int64

We are going to replace `LS`, the two backquotes (` `` `) and the dollar symbol (`$`) with a single `Others` tag, since they are rarely useful and give little information about the context of the text neighboring the citation.

In [59]:
# Tags listed here are merged into a single `Others` tag; all other tags are kept.
OTHER_TAGS = ['LS', '``', '$']
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda tags: ['Others' if tag in OTHER_TAGS else tag for tag in tags]
)

100%|██████████| 3053972/3053972 [00:49<00:00, 62247.91it/s] 


Now, we can use the `count vectorizer` to represent the `POS tags` as a vector where each element of the vector represents the count of that tag in that particular citation.

In [60]:
# Bag-of-tags vectorizer built with scikit-learn's default settings.
# NOTE(review): the defaults presumably lowercase the input and only keep
# word-like tokens of two or more characters - the count columns shown further
# down are lowercase (`cc`, `cd`, ...). If so, punctuation tags, single-letter
# tags and the `$` in `WP$` / `PRP$` never become features. Confirm against the
# CountVectorizer documentation whether that is intended.
cv = CountVectorizer() # Instantiate the vectorizer

In [61]:
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 3053972/3053972 [00:05<00:00, 573030.54it/s]


In [62]:
transformed_neighboring_tags = cv.fit_transform(citation_tag_features['neighboring_tags'])
transformed_neighboring_tags = pd.DataFrame(transformed_neighboring_tags.toarray(), columns=cv.get_feature_names())

In [63]:
citation_tag_features.head()

Unnamed: 0,id,citation,neighboring_tags
0,1831220,{{cite journal | author= Kenneth Cornetta | au...,NNP NN CD NN CD NN NNS CD JJ VBD JJ CD JJ NN C...
1,1831220,{{cite journal|last=Sorgi|first=FL|author2=Bha...,"NN , NNP , NNP NN NNS JJ NN NN VBD CD JJ JJ JJ..."
2,1831220,{{cite journal|last=Walker|first=WS|author2=Re...,", NNP . JJ JJ NN IN NNS IN JJ NNS TO VB NNP NN..."
3,1831220,{{cite journal|last=Campbell|first=FW|author2=...,", NNP , NN . NN IN DT NN IN JJ NN IN JJ JJ NN ..."
4,1831220,{{cite journal|last=Welsby|first=IJ|author2=Ne...,JJ NNS IN JJ NN NN IN NN IN JJ NN NN NN VBZ JJ...


In [64]:
citation_tag_features = pd.concat([citation_tag_features, transformed_neighboring_tags], join='inner', axis=1)

In [65]:
citation_tag_features.drop('neighboring_tags', axis=1, inplace=True)
citation_tag_features.head()

Unnamed: 0,id,citation,cc,cd,dt,ex,fw,in,jj,jjr,...,vb,vbd,vbg,vbn,vbp,vbz,wdt,wikicode,wp,wrb
0,1831220,{{cite journal | author= Kenneth Cornetta | au...,0,6,0,0,0,0,3,0,...,0,1,0,0,0,0,0,0,0,0
1,1831220,{{cite journal|last=Sorgi|first=FL|author2=Bha...,0,2,0,0,0,0,4,0,...,0,1,0,0,0,0,0,0,0,0
2,1831220,{{cite journal|last=Walker|first=WS|author2=Re...,0,1,0,0,0,2,3,0,...,1,0,0,0,0,1,0,0,0,0
3,1831220,{{cite journal|last=Campbell|first=FW|author2=...,0,1,1,0,0,3,3,0,...,0,0,0,0,0,0,0,0,0,0
4,1831220,{{cite journal|last=Welsby|first=IJ|author2=Ne...,0,1,0,0,0,3,4,0,...,0,0,0,0,0,1,0,0,0,0


## Features for the LSTM - more time sequence related

### Citation's original text features

In [66]:
# Create a separate dataframe for preprocessing citation text
citation_text_features = dataset_with_features[['id', 'citation', 'label_category']]

In [67]:
# Convert the citation into a list by breaking it down into characters
citation_text_features['characters'] = citation_text_features['citation'].progress_apply(lambda x: list(x))

100%|██████████| 3053972/3053972 [00:24<00:00, 122249.48it/s]


In [68]:
# Get the character counts for each unique character
char_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_text_features.characters)))
char_counts.index

Index(['{', 'c', 'i', 't', 'e', ' ', 'j', 'o', 'u', 'r', 'n', 'a', 'l', '|',
       'h', '=', 'K', 'C', '2', 'W', '.', 'F', 'A', 'd', 's', 'P', 'm', 'f',
       'v', 'p', 'y', 'b', '-', 'g', ':', 'J', 'V', 'M', '1', '9', '8', '3',
       '7', '\', '0', '4', '6', '/', '(', ')', '}', 'S', 'L', 'B', ',', 'H',
       'G', 'k', 'R', 'D', 'I', '5', 'E', 'w', 'N', 'z', 'T', 'x', 'O', '&',
       'Z', '?', 'Y', 'q', 'U', 'Q', 'X', ';', '[', ']', '_', '!', ''', '"',
       '<', '>', '+', '%', '#', '~', '*', '`', '^', '@', '$'],
      dtype='object')

In [69]:
# Length (in characters) of every citation, computed once and reused for all
# three statistics. The mean and the median describe all citations, not "the
# longest" one, so the messages are worded accordingly.
citation_lengths = citation_text_features.characters.apply(len)

print('The max length of a citation in terms of characters is: {}'.format(
    citation_lengths.max()))

print('The mean length of a citation in terms of characters is: {}'.format(
    citation_lengths.mean()))

print('The median length of a citation in terms of characters is: {}'.format(
    citation_lengths.median()))

The max length of the longest citation in terms of characters is: 40755
The mean length of the longest citation in terms of characters is: 221.33374929436158
The median length of the longest citation in terms of characters is: 185.0


In [70]:
# Two lookup tables between every distinct character and its integer index
# (its position in `char_counts.index`, starting at 0).
# NOTE(review): index 0 is therefore a real character, while the padding applied
# later also uses 0 - confirm this overlap is acceptable.
char2ind = dict(zip(char_counts.index, range(len(char_counts.index))))
ind2char = dict(enumerate(char_counts.index))

In [71]:
# Encode every citation as the list of indices of its characters; `X_char` ends
# up with one (variable-length) list per citation, in row order.
X_char = [
    [char2ind[character] for character in citation]
    for citation in citation_text_features.citation
]

Since the median length of a citation is 185 characters, we pad (or truncate) every input to 400 characters so that extra information is available to the character embedding neural network.

In [72]:
# Bring every encoded citation to exactly 400 indices.
# `pad_sequences` builds a NumPy array and creates no TensorFlow ops, so the
# former `tf.device('/gpu:0')` wrapper had no effect (and the device listing at
# the top of the notebook found no GPU); it has been removed.
# NOTE(review): with the Keras defaults, shorter citations are zero-padded at the
# front and longer ones lose their *first* characters (`padding='pre'`,
# `truncating='pre'`, `value=0`) - confirm against the pad_sequences docs that
# dropping the template name of long citations is intended.
X_char = pad_sequences(X_char, maxlen=400)

In [73]:
# Pair every padded character sequence with the integer class of its citation to
# build the dataset used for learning the character embeddings.
# The labels are pulled out of the frame once as a plain list of Python ints;
# looking each one up with `.iloc[i][...]` inside the loop was the slow part.
label_categories = citation_text_features['label_category'].astype(int).tolist()
data = list(zip(X_char, label_categories))

100%|██████████| 3053972/3053972 [10:57<00:00, 4642.70it/s]


In [74]:
# Split the (sequence, label) pairs back into two parallel lists.
training_data = [sequence for sequence, _ in data]
# Dummy binary target - identifier vs non identifier: classes 0 and 2 become 0,
# every other class becomes 1.
training_labels = [0 if label in [0, 2] else 1 for _, label in data]

We are going to feed in the 400-character input, since the median citation length is approximately 185 characters, and train the network on a dummy task (whether the citation is scientific, i.e. a book or a journal, or not) in order to obtain an embedding layer that contains a representation for each character.

In [81]:
from keras.utils import to_categorical

categorical_labels = to_categorical(training_labels, num_classes=2)

In [86]:
def citation_embedding_model(vocab_size=95, embedding_dim=300, sequence_length=400, lstm_units=20):
    """
    Build and compile the character-embedding model for citations.

    Architecture: character indices -> `citation_embedding` Embedding layer ->
    bidirectional LSTM -> 2-way softmax. The model is compiled with categorical
    cross-entropy, the Adam optimizer and accuracy as metric.

    The defaults reproduce the original hard-coded configuration (note that the
    embedding dimension is 300, not 50 as an earlier docstring stated).

    :param: vocab_size: number of distinct character indices (Embedding `input_dim`).
    :param: embedding_dim: size of the vector learnt for each character.
    :param: sequence_length: number of character indices per (padded) citation.
    :param: lstm_units: hidden units of each LSTM direction.
    :return: the compiled Keras model.
    """
    main_input = Input(shape=(sequence_length, ), name='characters')
    # input dim is basically the vocab size; the layer is named so that its
    # weights can be fetched with `model.get_layer('citation_embedding')`
    emb = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='citation_embedding')(main_input)
    rnn = Bidirectional(LSTM(lstm_units))
    x = rnn(emb)
    de = Dense(2, activation='softmax')(x)
    model = Model(inputs=main_input, outputs=de)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [87]:
# Instantiate the model and generate the summary
model = citation_embedding_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
characters (InputLayer)      (None, 400)               0         
_________________________________________________________________
citation_embedding (Embeddin (None, 400, 300)          28500     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 82        
Total params: 79,942
Trainable params: 79,942
Non-trainable params: 0
_________________________________________________________________


In [88]:
def generator(features, labels, batch_size):
    """
    Endless generator of random batches, sampled with replacement.

    :param: features: indexable collection of equally-shaped feature vectors.
    :param: labels: indexable collection of label vectors aligned with `features`.
    :param: batch_size: the size of the batch
    :return: yields `(batch_features, batch_labels)` tuples forever.
    :raises ValueError: if `features` is empty.
    """
    if len(features) == 0:
        raise ValueError('Cannot generate batches from an empty set of features')
    # Infer the per-sample shapes from the data instead of hard-coding them
    feature_shape = np.asarray(features[0]).shape
    label_shape = np.asarray(labels[0]).shape
    while True:
        # Allocate fresh arrays for every batch: the consumer may still hold
        # (or have queued) the previous batch, which must not be overwritten
        batch_features = np.zeros((batch_size,) + feature_shape)
        batch_labels = np.zeros((batch_size,) + label_shape)
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features), 1)[0]
            batch_features[i] = features[index]
            # Read from the `labels` argument, not from a notebook global
            batch_labels[i] = labels[index]
        yield batch_features, batch_labels

In [89]:
# Train the embedding model on randomly sampled batches of 64 citations
# for 15 epochs of 30 batches each.
# `steps_per_epoch` / `epochs` are the Keras 2 argument names (the same ones
# used for the classification model further down); the legacy
# `samples_per_epoch` / `nb_epoch` spellings are only accepted through a
# deprecated compatibility layer.
hist = model.fit_generator(
    generator(training_data, categorical_labels, 64),
    steps_per_epoch=30,
    epochs=15
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [90]:
# Save the trained embedding model so that it can be reloaded later without retraining
model.save('/dlabdata1/harshdee/embedding_model.h5')
# To restore the saved model instead of retraining:
# from keras.models import load_model
# model = load_model('/dlabdata1/harshdee/embedding_model.h5')

In [91]:
# Look up the `citation_embedding` layer by name and take its first weight
# array: a matrix with one row per character index
citation_layer = model.get_layer('citation_embedding')
citation_weights = citation_layer.get_weights()[0]
# Displayed as (number of character indices, embedding size)
citation_weights.shape

(95, 300)

In [92]:
# Example: the first 100 components of the embedding vector of character index 0
citation_weights[0][:100]

array([-6.3487798e-02, -2.2857914e-02,  7.2305627e-02,  7.5458631e-02,
        3.5729304e-02, -4.7548639e-04, -6.9378056e-02, -2.3916585e-02,
        5.3427406e-03,  2.3297900e-03,  1.5415391e-02,  6.4162754e-02,
       -1.7194109e-02, -8.7533649e-03,  9.1980938e-03,  2.2650823e-02,
        2.9702580e-02, -1.0351910e-02, -8.3165988e-03,  4.4419640e-03,
        5.4581347e-03,  3.7447099e-02,  8.4146438e-03,  1.7995673e-05,
        6.7927120e-03, -3.2515638e-02, -2.5981069e-02, -1.0615817e-02,
       -3.4218505e-02, -1.2037923e-02, -1.9652506e-02, -4.9414057e-03,
       -5.4916978e-02, -4.0766921e-02,  4.5642130e-02, -4.0086053e-02,
       -5.7412632e-02,  7.2119152e-03, -3.6836632e-02, -2.9153578e-02,
       -6.2424974e-03, -8.4446585e-03, -4.7941878e-02, -2.1400314e-03,
       -2.7946520e-03, -5.1635080e-03, -2.4884988e-02,  6.3802965e-02,
       -4.4594160e-03,  1.6389739e-02,  1.5342609e-02, -1.7282961e-02,
       -1.2013974e-02, -4.5360099e-03, -1.9862046e-02,  1.7599603e-02,
      

In [93]:
# Represent every citation by the element-wise sum of the embedding vectors
# of its characters (looked up through `char2ind`)
citation_text_features['embedding'] = citation_text_features['characters'].progress_apply(
    lambda chars: sum(citation_weights[char2ind[ch]] for ch in chars)
)

100%|██████████| 3053972/3053972 [11:29<00:00, 4429.95it/s]


In [94]:
# Scale every citation embedding by its L2 norm so that similarities can be
# compared later; the norm is reshaped to (1, 1), so dividing a 1-D vector
# by it produces a (1, dim) array
citation_text_features['embedding'] = citation_text_features['embedding'].progress_apply(
    lambda vec: vec / np.reshape(np.linalg.norm(vec, axis=0), (-1, 1))
)

100%|██████████| 3053972/3053972 [00:48<00:00, 63107.32it/s]


In [95]:
# Sanity check: the squared components of a normalized embedding should sum to 1
np.sum(np.square(citation_text_features['embedding'].iloc[0]))

1.0000001

### Similarity Graph for citation text embeddings

In [85]:
# Just considering the first 500 citations since otherwise it will be computationally expensive
# citation_text_and_embeddings = citation_text_features[['citation', 'embedding']][:500]

In [86]:
# citation_text_and_embeddings['embedding'] = citation_text_and_embeddings['embedding'].progress_apply(
#     lambda x: x[0].tolist()
# )

In [87]:
# def tsne_embedding_plot():
#     labels = []
#     tokens = []

#     index = 0
#     for row in citation_text_and_embeddings:
#         tokens.append(row['embedding'])
#         labels.append(str(index))
#         index += 1
    
#     # Perplexity takes into account the global and local features
#     # We are using dimensionality reduciton for 2 features and taking 2500 iterations into account
#     tsne_model = TSNE(perplexity=40, n_components=2, n_iter=2500, random_state=0)
#     new_values = tsne_model.fit_transform(tokens)

#     x = []
#     y = []
#     for value in new_values:
#         x.append(value[0])
#         y.append(value[1])
        
#     plt.figure(figsize=(10, 10)) 
#     for i in range(len(x)):
#         plt.scatter(x[i],y[i])
#         plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2),
#                      textcoords='offset points', ha='right', va='bottom')
#     plt.show()

In [88]:
# tsne_embedding_plot()

In [89]:
# an example of citation embeddings which is close to each other
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([14, 477])] # (51, 243), (0, 13)

In [90]:
# # Similiarity of 2 citations which are very similar
# result_similar = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[14]['embedding'],
#     citation_text_and_embeddings.iloc[477]['embedding']
# )
# result_similar

In [91]:
# an example of citation embeddings which is NOT close to each other and are different
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([42, 124])] # (6, 42)

In [92]:
# Similiarity of 2 citations which are not similar
# result_different = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[42]['embedding'],
#     citation_text_and_embeddings.iloc[124]['embedding']
# )
# result_different

### FastText embeddings for neighboring words

In [93]:
# Load the FastText model pretrained on Wikipedia.
# NOTE: this rebinds `model`, which previously referred to the Keras
# character-embedding model.
model = FastText.load_fasttext_format('/dlabdata1/harshdee/wiki.en.bin')

In [94]:
# Create a separate dataframe for preprocessing citation words.
# `.copy()` makes it independent of `dataset_with_features`, so the column
# assignments in the following cells modify this frame only and do not
# trigger pandas' SettingWithCopyWarning.
citation_word_features = dataset_with_features[
    ['id', 'citation', 'neighboring_words', 'label_category']
].copy()

In [95]:
# Lowercase every neighboring word of every citation
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda words: [word.lower() for word in words]
)

100%|██████████| 3053972/3053972 [01:13<00:00, 41763.78it/s] 


Get the total unique words with their respective counts in the total dataset. This is done in order to remove words which are of low frequency and will potentially act as noise to the model.

In [96]:
# Number of occurrences of every word over all the neighboring-word lists
word_counts = pd.Series(
    Counter(chain.from_iterable(citation_word_features.neighboring_words))
)

In [97]:
# Words occurring at most `threshold` times are treated as rare
threshold = 4

words_less_than_threshold = word_counts[word_counts <= threshold]
total_words = len(word_counts)
rare_words = len(words_less_than_threshold)
# The message reports the actual threshold and comparison (<=) used above
print(
    'Total words: {}\nTotal number of words whose occurrence is at most {}: {}\nDifference: {}'.format(
        total_words, threshold, rare_words, total_words - rare_words
    )
)

Total words: 3853631
Total number of words whose occurence is less than 4: 3568808
Difference: 284823


In [98]:
# Replace every rare word (one that is a label of `words_less_than_threshold`)
# with the unique <UNK> symbol and keep all the other words unchanged
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda words: ['<UNK>' if word in words_less_than_threshold else word for word in words]
)

100%|██████████| 3053972/3053972 [07:28<00:00, 6808.36it/s] 


In [99]:
# Vocabulary of the remaining words, with lookups in both directions
words = pd.Series(
    Counter(chain.from_iterable(citation_word_features.neighboring_words))
).index
word2ind = dict(zip(words, range(len(words))))
ind2words = dict(enumerate(words))

In [100]:
# One 300-dimensional row per vocabulary word, filled from the loaded FastText model
word_embedding_matrix = np.zeros((len(word2ind), 300))
for word, row in tqdm(word2ind.items()):
    word_embedding_matrix[row] = model.wv[word]

100%|██████████| 284824/284824 [00:18<00:00, 15644.88it/s]


Once we have the word embedding for each word in the neighboring words, we sum the embeddings for each word together in neighboring words to get an embedding which represents the past 40 words.

In [101]:
# Sum the embedding rows of all the neighboring words of a citation
citation_word_features['words_embedding'] = citation_word_features['neighboring_words'].progress_apply(
    lambda words: sum(word_embedding_matrix[word2ind[word]] for word in words)
)

100%|██████████| 3053972/3053972 [08:17<00:00, 6139.90it/s] 


Now we have the `citation_word_features` and `citation_tag_features`, so we can join them together to form `time_sequence_features`, which will later be fed into the LSTM.

In [102]:
# Put the tag features and the word features side by side (axis=1).
# `keys` only labels the two column groups with an outer column level:
# 'id' for the columns of `citation_tag_features` and 'citation' for the
# columns of `citation_word_features`. Rows are aligned on the index; nothing
# is joined on columns named 'id' or 'citation'.
time_sequence_features = pd.concat([citation_tag_features, citation_word_features], keys=['id', 'citation'], axis=1)
# Keep only the first occurrence of every repeated column label
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [103]:
# `.shape` is (rows, columns), so both counts are printed
print('Total number of samples in time features are: {}'.format(time_sequence_features.shape))

Total number of samples in time features are: (3053972, 42)


In [104]:
# citation_text = auxiliary_features.iloc[:,0]
# auxiliary_features['citation_text'] = citation_text
# auxiliary_features.drop('citation', axis=1, inplace=True)
# auxiliary_features.rename({'citation_text': 'citation'}, axis=1, inplace=True)

In [105]:
# Join auxiliary features with the citations dataset.
# Both frames get a fresh 0..n-1 index so that concat aligns them row by row.
citation_text_features.reset_index(drop=True, inplace=True)
auxiliary_features.reset_index(drop=True, inplace=True)

# `keys` labels the column groups: 'id' -> columns of `auxiliary_features`,
# 'citation' -> columns of `citation_text_features` (it is not a join key)
auxiliary_features = pd.concat([auxiliary_features, citation_text_features], keys=['id', 'citation'], axis=1)
# Flatten the two groups again, placing the citation-text columns first
auxiliary_features = pd.concat([auxiliary_features['citation'], auxiliary_features['id']], axis=1)
# Keep only the first occurrence of every repeated column name
auxiliary_features = auxiliary_features.loc[:, ~auxiliary_features.columns.duplicated()]
auxiliary_features.shape

(3053972, 159)

In [106]:
# Drop the `neighboring_tags` and `characters` columns from the auxiliary features
auxiliary_features.drop(columns=['neighboring_tags', 'characters'], inplace=True)

In [107]:
# Release the large intermediates that are no longer needed
del model, word_embedding_matrix, citation_word_features, citation_text_features

gc.collect()

7

## Making sets for `auxiliary` and `time sequence` features

In [108]:
# Identifier columns plus the target label of every citation.
# NOTE(review): this rebinds `data`, which earlier held the training elements
# of the character-embedding model.
data = dataset_with_features[['id', 'citation', 'label_category']]

In [109]:
# Join the time sequence features for the data.
# First flatten the two column groups created by the earlier keyed concat
time_sequence_features = pd.concat([time_sequence_features['id'], time_sequence_features['citation']], axis=1)
# Then append the id/citation/label columns of `data`; `keys` again only
# labels the two column groups, rows are aligned on the index
time_sequence_features = pd.concat([time_sequence_features, data], keys=['id', 'citation'], axis=1)
# Remove the outer group level so that plain column names remain
time_sequence_features.columns = time_sequence_features.columns.droplevel(0)
# Keep only the first occurrence of every repeated column name
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [110]:
# Turn every word embedding into a plain Python list
time_sequence_features['words_embedding'] = time_sequence_features['words_embedding'].progress_apply(
    lambda embedding: embedding.tolist()
)

100%|██████████| 3053972/3053972 [01:01<00:00, 49823.03it/s]


In [111]:
# Turn every citation embedding into a plain (nested) Python list
auxiliary_features['embedding'] = auxiliary_features['embedding'].progress_apply(
    lambda embedding: embedding.tolist()
)

100%|██████████| 3053972/3053972 [01:56<00:00, 26212.19it/s]


In [112]:
# Both feature tables are later indexed with the same row positions,
# so they are expected to have the same number of rows
len(time_sequence_features), len(auxiliary_features)

(3053972, 3053972)

## Splitting the dataset into training, testing and validation 

The data is split 90-10 into a training set and a test set (`train_size=0.9`, stratified by label) so that we have more training data to train on and a held-out set to make sure that the model is working as anticipated.

In [113]:
# Free the label frame now that it has been concatenated into the time sequence features
del data
# del word_embedding_matrix
gc.collect()

0

In [114]:
# Integer class id of every citation; split together with the features later
y = auxiliary_features['label_category'].astype(int).tolist()

In [115]:
# Boolean mask over the auxiliary columns: False for the identifier and label
# columns listed below, True for every other (feature) column
column_mask_aux = ~auxiliary_features.columns.isin(['id', 'citation', 'label_category'])

In [116]:
# Select the auxiliary feature columns and convert them into a list of rows
auxiliary = auxiliary_features.loc[:, column_mask_aux].values.tolist()

In [117]:
# Flatten every row into one numpy vector (the format Keras expects): the
# first cell holds a nested list whose inner list is concatenated with the
# remaining cells of the row
auxiliary = [np.array(row[0][0] + row[1:]) for row in tqdm(auxiliary)]

100%|██████████| 3053972/3053972 [01:55<00:00, 26360.71it/s]


In [118]:
# Keep every time sequence column except the identifiers, the label and the raw words
cols = [
    name for name in time_sequence_features.columns
    if name not in ('id', 'citation', 'label_category', 'neighboring_words')
]
stripped_tsf = time_sequence_features[cols]

In [119]:
# Convert the remaining time sequence columns into a list of rows
time = stripped_tsf.values.tolist()

In [120]:
def make_structure_time_features(time_features):
    """
    Split one row of time sequence features into its integer values and its
    list-valued feature, selecting each by checking the type.

    param: time_features: one row holding integer features and at least one list.
    return: a length-2 object array `[integer_features, first_list]`, both
        stored as numpy arrays, which can be unpacked as a pair.
    raises IndexError: if the row does not contain any list.
    """
    feature_one = np.array([i for i in time_features if isinstance(i, int)])
    feature_two = np.array([i for i in time_features if isinstance(i, list)][0])
    # The two parts generally have different lengths. Building the container
    # explicitly as an object array avoids `np.array([a, b])` on ragged input,
    # which raises ValueError on NumPy >= 1.24.
    structured = np.empty(2, dtype=object)
    structured[0] = feature_one
    structured[1] = feature_two
    return structured

In [121]:
# Structure every row as an (integer features, list feature) pair
time = [make_structure_time_features(row) for row in tqdm(time)]

100%|██████████| 3053972/3053972 [01:31<00:00, 33558.31it/s]


In [122]:
# PCA with 35 components, so that the reduced word embedding has the same
# size as the vector of the tags
pca = PCA(n_components=35)

def get_reduced_words_dimension(data):
    """
    Reduce the word embeddings with PCA and stack them with the tags.

    The module-level `pca` is fitted on the word embeddings of `data` and is
    then used to transform those same embeddings.

    :param: data: sequence of (tags, word_embedding) pairs.
    :return: `np.dstack` of the reduced word embeddings and the tags.
    """
    tags, word_embeddings = [], []
    for tag_vector, embedding in data:
        tags.append(tag_vector)
        word_embeddings.append(embedding)

    pca.fit(word_embeddings)
    reduced_embeddings = pca.transform(word_embeddings)
    return np.dstack((reduced_embeddings, np.array(tags)))

In [123]:
# Reduce the word embeddings of the whole dataset with PCA and stack them
# with the tags.
# NOTE(review): PCA is fitted here on all rows, i.e. before the train/test
# split further down - confirm that this is intended.
time_pca = get_reduced_words_dimension(time)

In [124]:
# The feature frames are no longer needed once the arrays have been built
del time_sequence_features, auxiliary_features

## LSTM/Neural Network Model

In [125]:
def generator_nn(features_aux, features_time, labels, batch_size):
    """
    Endless generator of random batches (sampled with replacement) for the
    two-input classification model.

    :param: features_aux: indexable collection of auxiliary feature vectors.
    :param: features_time: indexable collection of time sequence features,
        aligned with `features_aux`.
    :param: labels: indexable collection of label vectors, aligned with the features.
    :param: batch_size: the size of the batch
    :return: yields `([batch_time, batch_aux], batch_labels)` forever.
    :raises ValueError: if `features_aux` is empty.
    """
    if len(features_aux) == 0:
        raise ValueError('Cannot generate batches from an empty set of features')
    # Infer the per-sample shapes from the data instead of hard-coding them
    aux_shape = np.asarray(features_aux[0]).shape
    time_shape = np.asarray(features_time[0]).shape
    label_shape = np.asarray(labels[0]).shape
    while True:
        # Allocate fresh arrays for every batch: Keras may queue several
        # batches before consuming them, so a yielded batch must never be
        # overwritten while the next one is being filled
        batch_features_aux = np.zeros((batch_size,) + aux_shape)
        batch_features_time = np.zeros((batch_size,) + time_shape)
        batch_labels = np.zeros((batch_size,) + label_shape)
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features_aux), 1)[0]
            batch_features_aux[i] = features_aux[index]
            batch_features_time[i] = features_time[index]
            batch_labels[i] = labels[index]
        # Input order matches the model: time sequence input first, auxiliary input second
        yield [batch_features_time, batch_features_aux], batch_labels

In [126]:
from keras.optimizers import Adam

In [127]:
def classification_model():
    """
    Build and compile the citation classification model.

    The time sequence input (35 steps x 2 features) goes through an LSTM whose
    output is concatenated with the 453 auxiliary features; four fully
    connected ReLU layers and a 4-way softmax output follow.
    """
    time_input = Input(shape=(35, 2), name='time_input')
    lstm_out = LSTM(32)(time_input)

    # Original note: 454 without citation type, 476 with citation type.
    # NOTE(review): the width used here is 453 - confirm which figure is current.
    aux_input = Input(shape=(453,), name='aux_input')
    # Merge the LSTM output with the auxiliary input
    hidden = keras.layers.concatenate([lstm_out, aux_input])

    # 4 fully connected layers
    for units in (128, 64, 64, 32):
        hidden = Dense(units, activation='relu')(hidden)

    main_output = Dense(4, activation='softmax', name='main_output')(hidden)
    model = Model(inputs=[time_input, aux_input], outputs=[main_output])

    model.compile(
        optimizer=Adam(0.001, decay=1e-2/5),
        loss={'main_output': 'categorical_crossentropy'},
        loss_weights={'main_output': 1.},
        metrics=['acc']
    )
    return model

In [128]:
# Build the classification model and print its layer-by-layer summary
model = classification_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
time_input (InputLayer)         (None, 35, 2)        0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 32)           4480        time_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 453)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 485)          0           lstm_2[0][0]                     
                                                                 aux_input[0][0]                  
__________

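`ReduceLROnPlateau` (imported at the top of the notebook) is meant to keep the model from overshooting the optimum by lowering the learning rate as soon as the monitored metric stops improving. Note that the training call below does not pass this callback: the learning rate starts at 0.001 and is reduced through the `decay` argument of the Adam optimizer in `classification_model`, which helps the model converge better.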

In [129]:
## Convert the auxiliary features and the labels into numpy arrays so that
## they can be indexed with the lists of train/test positions
auxiliary = np.asarray(auxiliary)
y = np.asarray(y)

In [130]:
# Number of training epochs of the classification model; also used in the
# names of the result files
EPOCHS = 5

In [131]:
# Stratified 90/10 split of the row positions. The same kind of range is
# passed for both arrays, so (assuming `auxiliary` and `y` have equal length)
# the x_* and y_* index lists are identical; only x_* is used below.
# NOTE(review): no `random_state` is passed to the split.
x_train_indices, x_test_indices, y_train_indices, y_test_indices = train_test_split(
    range(auxiliary.shape[0]), range(y.shape[0]), train_size=0.9, stratify=y, shuffle=True
)

In [132]:
# Training rows of both inputs, selected with the same positions
aux_train = auxiliary[x_train_indices]
time_train = time_pca[x_train_indices]
# One-hot encode the integer class ids by picking rows of a 4x4 identity matrix
y_train = np.eye(4)[y[x_train_indices]]

In [133]:
# Held-out rows of both inputs, selected with the same positions
aux_test = auxiliary[x_test_indices]
time_test = time_pca[x_test_indices]
# Test labels stay integer class ids: they are compared with argmax predictions
y_test = y[x_test_indices]

In [134]:
# predictions = []
# for index, (train_indices, val_indices) in enumerate(skf.split(auxiliary, y)):
#     aux_train, aux_val = auxiliary[train_indices], auxiliary[val_indices]
#     time_train, time_val = time_pca[train_indices], time_pca[val_indices]
#     y_train = np.eye(4)[y[train_indices]]
#     y_val = y[val_indices]

BATCH_SIZE = 256
print('Running model with epochs: {}'.format(EPOCHS))

# Drop the reference to any previous model before building a fresh one
model = None
model = classification_model()
training_generator = generator_nn(aux_train, time_train, y_train, BATCH_SIZE)

# The number of steps is derived from BATCH_SIZE so that one epoch draws about
# as many samples as there are training rows, whatever the batch size is
history_callback = model.fit_generator(
    training_generator,
    steps_per_epoch=len(x_train_indices) // BATCH_SIZE,
    epochs=EPOCHS, verbose=1, shuffle=True
)
history_dict = history_callback.history
# Write the training history through a context manager so that the file is
# flushed and closed deterministically
with open('/dlabdata1/harshdee/results/citation_model_loss_{}.json'.format(EPOCHS), 'w') as history_file:
    json.dump(history_dict, history_file)
print('\n\nDoing prediction with training done for epochs: {}\n'.format(EPOCHS))

Running model with epochs: 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Doing prediction with training done for epochs: 5



In [135]:
# Predict the class probabilities of the held-out rows and keep the most likely class
prediction_for_folds = model.predict([time_test, aux_test])
y_pred = prediction_for_folds.argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Neural network model for epochs {}: {}".format(EPOCHS, accuracy))

# Confusion matrix labelled with the class names on both axes; the overall
# accuracy is repeated in an extra column so that it is saved in the same file
class_names = ['book', 'entertainment', 'journal', 'newspapers']
res = pd.DataFrame(confusion_matrix(y_test, y_pred), index=class_names, columns=class_names)
res['accuracy'] = accuracy
res.to_csv('/dlabdata1/harshdee/results/citation_model_result_{}.csv'.format(EPOCHS))
print(res)

Accuracy of the Neural network model for epochs 5: 0.9832120708059646
                book  entertainment  journal  newspapers  accuracy
book           55316             24     2764           1  0.983212
entertainment     65          60074       21          11  0.983212
journal         2155             72    79879           3  0.983212
newspapers         1             10        0      105002  0.983212


In [136]:
# Save the trained model to an HDF5 file named after the number of epochs
model.save('/dlabdata1/harshdee/results/citation_model_epochs_{}.h5'.format(EPOCHS))
# Additionally write the JSON string returned by `model.to_json()` next to it
json_string = model.to_json()
with open("/dlabdata1/harshdee/results/citation_model_epochs_{}.json".format(EPOCHS), "w") as json_file:
    json_file.write(json_string)

print('\n\nDone with the prediction and saving model with epochs: {}\n'.format(EPOCHS))



Done with the prediction and saving model with epochs: 5

