In [1]:
import re
import json
import glob
import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from itertools import chain
from keras.models import Model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.manifold import TSNE
from gensim.models import FastText
from sklearn.decomposition import PCA
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split 
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Initializing tqdm for pandas
tqdm.pandas()

Using TensorFlow backend.


In [2]:
keras.backend.backend()

u'tensorflow'

In [3]:
from tensorflow.python.client import device_lib

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos if x.device_type == 'GPU'])

[]


In [4]:
np.random.seed(0)

In [5]:
# from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("wiki qss").getOrCreate()

## Get auxiliary features and divide them into labels

1. `ref_index`
2. `total_words`
3. `tags`
4. `type_of_citation`

#### can include `section` of the page in which the citation belongs to

In [6]:
book_journal_features = pd.read_parquet('/dlabdata1/harshdee/book_journal_features.parquet', engine='pyarrow')

In [7]:
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('doi\s{0,10}=\s{0,10}([^|]+)', 'doi = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('isbn\s{0,10}=\s{0,10}([^|]+)', 'isbn = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('pmc\s{0,10}=\s{0,10}([^|]+)', 'pmc = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('pmid\s{0,10}=\s{0,10}([^|]+)', 'pmid = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('url\s{0,10}=\s{0,10}([^|]+)', 'url = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('work\s{0,10}=\s{0,10}([^|]+)', 'work = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('newspaper\s{0,10}=\s{0,10}([^|]+)', 'newspaper = ', x))
book_journal_features['citations'] = book_journal_features['citations'].progress_apply(
    lambda x: re.sub('website\s{0,10}=\s{0,10}([^|]+)', 'website = ', x))

100%|██████████| 1700000/1700000 [00:09<00:00, 177310.70it/s]
100%|██████████| 1700000/1700000 [00:08<00:00, 203001.12it/s]
100%|██████████| 1700000/1700000 [00:06<00:00, 281456.58it/s]
100%|██████████| 1700000/1700000 [00:06<00:00, 246838.58it/s]
100%|██████████| 1700000/1700000 [00:07<00:00, 214028.83it/s]
100%|██████████| 1700000/1700000 [00:05<00:00, 298426.77it/s]
100%|██████████| 1700000/1700000 [00:06<00:00, 278950.14it/s]
100%|██████████| 1700000/1700000 [00:05<00:00, 284484.88it/s]


In [8]:
book_journal_features['actual_label'].value_counts()

book       951991
journal    748009
Name: actual_label, dtype: int64

In [9]:
journal_features = book_journal_features[book_journal_features['actual_label'] == 'journal']
book_features = book_journal_features[book_journal_features['actual_label'] == 'book']

In [10]:
newspaper_data = pd.read_parquet('/dlabdata1/harshdee/newspapers_citations_features.parquet', engine='pyarrow')

In [11]:
print('The total number of newspapers: {}'.format(newspaper_data.shape))

The total number of newspapers: (1945390, 34)


In [12]:
newspaper_data.head(2)

Unnamed: 0,ref_index,total_words,neighboring_words,neighboring_tags,AccessDate,Authors,Chapter,Chron,City,Date,...,URL,Volume,citations,id,page_title,r_id,r_parentid,sections,type_of_citation,tld
0,1432,2326,"[99, issue, page, 463, ref, ref, name=, '', Lo...","[CD, NN, NN, CD, NN, NN, NN, '', NNP, NNP, '',...",,,,,,2012-12-10,...,https://www.theguardian.com/technology/2012/de...,,{{Citation | date = 10 December 2012 | url = h...,469553,Earl of Lovelace,938238885,936441641.0,Initial Section,citation,theguardian
1,3923,8571,"[Keely, had, gone, ., Edey, said, he, would, s...","[RB, VBD, VBN, ., NNP, VBD, PRP, MD, VB, IN, P...",,,,,,1884-09-24,...,https://timesmachine.nytimes.com/timesmachine/...,,{{Citation | last = | first = | author-link ...,566081,John Ernst Worrell Keely,939390025,938972992.0,Initial Section,citation,nytimes


In [13]:
newspaper_data = newspaper_data[[
    'citations', 'ref_index', 'total_words', 'neighboring_words',
    'neighboring_tags', 'id', 'sections', 'type_of_citation'
]]
newspaper_data['actual_label'] = 'web'

In [14]:
newspaper_data.iloc[819218]['citations'] ## Example before removing the fields

u'{{cite web |last1=Rosen |first1=Jody |title=Here Are Hundreds More Artists Whose Tapes Were Destroyed in the UMG Fire |url=https://www.nytimes.com/2019/06/25/magazine/universal-music-fire-bands-list-umg.html |website=The New York Times |accessdate=28 June 2019 |date=25 June 2019}}'

In [15]:
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('doi\s{0,10}=\s{0,10}([^|]+)', 'doi = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('isbn\s{0,10}=\s{0,10}([^|]+)', 'isbn = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('pmc\s{0,10}=\s{0,10}([^|]+)', 'pmc = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('pmid\s{0,10}=\s{0,10}([^|]+)', 'pmid = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('url\s{0,10}=\s{0,10}([^|]+)', 'url = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('work\s{0,10}=\s{0,10}([^|]+)', 'work = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('newspaper\s{0,10}=\s{0,10}([^|]+)', 'newspaper = ', x))
newspaper_data['citations'] = newspaper_data['citations'].progress_apply(
    lambda x: re.sub('website\s{0,10}=\s{0,10}([^|]+)', 'website = ', x))

100%|██████████| 1945390/1945390 [00:07<00:00, 270738.64it/s]
100%|██████████| 1945390/1945390 [00:06<00:00, 292178.38it/s]
100%|██████████| 1945390/1945390 [00:06<00:00, 304236.05it/s]
100%|██████████| 1945390/1945390 [00:06<00:00, 303546.06it/s]
100%|██████████| 1945390/1945390 [00:10<00:00, 188816.65it/s]
100%|██████████| 1945390/1945390 [00:07<00:00, 262286.86it/s]
100%|██████████| 1945390/1945390 [00:07<00:00, 275739.52it/s]
100%|██████████| 1945390/1945390 [00:07<00:00, 268074.45it/s]


In [16]:
newspaper_data.iloc[819218]['citations'] ## Example after removing the fields

u'{{cite web |last1=Rosen |first1=Jody |title=Here Are Hundreds More Artists Whose Tapes Were Destroyed in the UMG Fire |url = |website = |accessdate=28 June 2019 |date=25 June 2019}}'

In [17]:
entertainment_features = pd.read_parquet(
    '/dlabdata1/harshdee/entertainment_citations_features.parquet', engine='pyarrow')

In [18]:
entertainment_features = entertainment_features[[
    'ref_index', 'total_words', 'neighboring_words', 'neighboring_tags', 'id', 'sections', 'citations']]

In [19]:
entertainment_features.iloc[23787]['citations'] ## Example before removing the fields

u'{{cite web|url=https://www.billboard.com/charts/year-end/2018/top-billboard-200-albums|title=Billboard 200 Albums \\u2013 Year-End 2018|work=Billboard|accessdate=December 5, 2018}}'

In [20]:
entertainment_features['actual_label'] = 'web'
newspaper_data.drop('type_of_citation', axis=1, inplace=True)
book_journal_features.drop('type_of_citation', axis=1, inplace=True)

In [21]:
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('doi\s{0,10}=\s{0,10}([^|]+)', 'doi = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('isbn\s{0,10}=\s{0,10}([^|]+)', 'isbn = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('pmc\s{0,10}=\s{0,10}([^|]+)', 'pmc = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('pmid\s{0,10}=\s{0,10}([^|]+)', 'pmid = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('url\s{0,10}=\s{0,10}([^|]+)', 'url = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('work\s{0,10}=\s{0,10}([^|]+)', 'work = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('newspaper\s{0,10}=\s{0,10}([^|]+)', 'newspaper = ', x))
entertainment_features['citations'] = entertainment_features['citations'].progress_apply(
    lambda x: re.sub('website\s{0,10}=\s{0,10}([^|]+)', 'website = ', x))

100%|██████████| 1463652/1463652 [00:04<00:00, 304915.31it/s]
100%|██████████| 1463652/1463652 [00:04<00:00, 299104.44it/s]
100%|██████████| 1463652/1463652 [00:04<00:00, 307030.15it/s]
100%|██████████| 1463652/1463652 [00:04<00:00, 309150.31it/s]
100%|██████████| 1463652/1463652 [00:07<00:00, 198594.02it/s]
100%|██████████| 1463652/1463652 [00:05<00:00, 257045.71it/s]
100%|██████████| 1463652/1463652 [00:04<00:00, 314958.81it/s]
100%|██████████| 1463652/1463652 [00:04<00:00, 294243.45it/s]


In [22]:
entertainment_features.iloc[23787]['citations'] ## Example after removing the fields

u'{{cite web|url = |title=Billboard 200 Albums \\u2013 Year-End 2018|work = |accessdate=December 5, 2018}}'

In [23]:
entertainment_features['citations'].progress_apply(
    lambda x: re.findall('{{\s{0,10}([^|]+)', x)[0].strip()).value_counts()

100%|██████████| 1463652/1463652 [00:04<00:00, 330150.29it/s]


cite web               1124902
Cite web                194956
cite news                64134
Citation                 34857
Cite news                21032
cite AV media             9288
cite journal              6328
Cite AV media             1499
citation                  1418
cite episode              1120
cite press release         826
Cite episode               660
cite interview             637
cite AV media notes        480
Cite AV media notes        407
cite book                  401
Cite journal               268
cite serial                 98
cite podcast                72
cite speech                 58
Cite interview              41
cite conference             40
Cite press release          32
Cite book                   30
cite DVD notes              15
cite report                 13
cite encyclopedia           11
Cite podcast                 9
Cite speech                  8
Cite conference              7
Cite report                  4
Cite encyclopedia            1
Name: ci

In [24]:
newspaper_data = newspaper_data.sample(n=550000)
entertainment_features = entertainment_features.sample(n=550000)

In [25]:
dataset_with_features = pd.concat([journal_features, book_features, newspaper_data, entertainment_features])
dataset_with_features.shape

(2800000, 10)

In [26]:
le = preprocessing.LabelEncoder()
le.fit(dataset_with_features['actual_label'])
dataset_with_features['label_category'] = le.transform(dataset_with_features['actual_label'])

In [27]:
dataset_with_features[dataset_with_features['actual_label'] == 'entertainment'].head(1)

Unnamed: 0,actual_label,actual_prob,citations,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,type_of_citation,label_category


In [28]:
dataset_with_features[dataset_with_features['actual_label'] == 'web'].head(1)

Unnamed: 0,actual_label,actual_prob,citations,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,type_of_citation,label_category
401438,web,,{{cite news|url = |title=Honeybee mobs overpow...,1002559,"[NN, NNP, VBZ, NNP, NN, NNP, NN, NNP, CD, NN, ...","[first1, Michio, last2, Sakamoto, first2, Fumi...",5179,Predation,6474,,2


In [29]:
dataset_with_features[dataset_with_features['actual_label'] == 'book'].head(1)

Unnamed: 0,actual_label,actual_prob,citations,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,type_of_citation,label_category
450150,book,0.45,{{cite book |title=Barris TV and Movie Cars |p...,68485,"[NNP, CD, CD, NN, DT, NNPS, VBD, VBN, IN, NNP,...","[February, 18, 2016, ref, The, Explorers, were...",5580,Initial Section,20807,cite book,0


In [30]:
dataset_with_features[dataset_with_features['actual_label'] == 'journal'].head(1)

Unnamed: 0,actual_label,actual_prob,citations,id,neighboring_tags,neighboring_words,ref_index,sections,total_words,type_of_citation,label_category
1847498,journal,0.55,{{Cite journal|last=Tauzin|first=Tibor|last2=G...,488083,"[CC, IN, PRP, MD, VB, JJ, NNS, VBG, IN, DT, JJ...","[and, therefore, it, can, have, different, mea...",3860,Initial Section,18454,cite journal,1


In [31]:
## Convert citations' text to UTF-8
dataset_with_features['citations'] = dataset_with_features['citations'].progress_apply(lambda x: x.encode("utf-8"))

100%|██████████| 2800000/2800000 [00:07<00:00, 394320.76it/s]


In [32]:
dataset_with_features['actual_label'].value_counts()

web        1100000
book        951991
journal     748009
Name: actual_label, dtype: int64

In [33]:
## clearing up memory
del book_journal_features
del newspaper_data
del entertainment_features

import gc
gc.collect()

117

In [34]:
## Remove rows which have duplicate ID and citations since they are just the same examples
# dataset_with_features = dataset_with_features.drop_duplicates(subset=['id', 'citations']) ## keeps first row
# dataset_with_features = dataset_with_features.reset_index(drop=True)
# dataset_with_features.shape

In [35]:
## Please save this file and use it - as an intermediate file if you want to use it somewhere else
## dataset_with_features.to_csv('dataset_with_features.csv', index=False)

### Taking the unique `sections` and one hot encoding it to get a vector

In [36]:
# Only processing auxiliary features which are going to be used in the neural network
auxiliary_features = dataset_with_features[
    ['sections', 'citations', 'id', 'ref_index',
     'total_words', 'neighboring_tags', 'label_category']]

In [37]:
auxiliary_features['sections'] = auxiliary_features['sections'].apply(
    lambda x: x.encode('utf-8') if isinstance(x, unicode) else str(x))
auxiliary_features['sections'] = auxiliary_features['sections'].astype(str)
auxiliary_features['sections'] = auxiliary_features['sections'].apply(lambda x: x.split(', '))

In [38]:
section_counts = pd.Series(Counter(chain.from_iterable(x for x in auxiliary_features.sections)))
largest_sections = section_counts.nlargest(150)

In [39]:
# largest_sections.to_csv('/dlabdata1/harshdee/largest_sections.csv', header=None)

In [40]:
# Change section to `OTHERS` if occurence of the section is not in the 150 largest sections
auxiliary_features['sections'] = auxiliary_features['sections'].progress_apply(
    lambda x: list(set(['Others' if i not in largest_sections else i for i in x]))
)

100%|██████████| 2800000/2800000 [00:11<00:00, 251219.47it/s]


In [41]:
auxiliary_features.head()

Unnamed: 0,sections,citations,id,ref_index,total_words,neighboring_tags,label_category
1847498,[Initial Section],{{Cite journal|last=Tauzin|first=Tibor|last2=G...,488083,3860,18454,"[CC, IN, PRP, MD, VB, JJ, NNS, VBG, IN, DT, JJ...",1
3219190,[Initial Section],"{{cite journal |vauthors=Byerley JS, Gable K |...",46431158,911,2000,"[NN, '', JJ, NN, NNP, NNP, NNP, NNP, NNS, JJ, ...",1
2180322,[Initial Section],{{cite journal |last1=Almaslamani |first1=Muna...,41089873,1952,2628,"[NNP, NNP, NNP, VBG, NN, IN, JJ, NN, VBN, IN, ...",1
2119748,[Function],"{{cite journal | vauthors = Rash BG, Lim HD, B...",5662589,1860,3336,"[VB, DT, NN, IN, JJ, NN, CC, DT, NN, IN, NN, W...",1
921257,[Initial Section],{{cite journal|last=Clark|first=Herbert H.|aut...,6470064,2011,4751,"[NNS, IN, NN, NN, TO, VB, DT, JJ, NN, VBN, IN,...",1


In [42]:
section_dummies = pd.get_dummies(auxiliary_features.sections.apply(pd.Series).stack())

In [43]:
auxiliary_features = auxiliary_features.join(section_dummies.sum(level=0))

In [44]:
auxiliary_features.drop('sections', axis=1, inplace=True)
auxiliary_features.head()

Unnamed: 0,citations,id,ref_index,total_words,neighboring_tags,label_category,20th century,Accolades,Adverse effects,Aftermath,...,Taxonomy,Terminology,Timeline,Track listing,Treatment,Types,Usage in media,Uses,Work,Works
0,{{Citation | date = 10 December 2012 | url = |...,469553,1432,2326,"[CD, NN, NN, CD, NN, NN, NN, '', NNP, NNP, '',...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"{{Citation | url = | title=Donny Osmond, Moon ...",17321557,1624,2013,"[:,JJ,NN,:,NN,WRB,NN,VB,JJ,JJ,NN,:,NN,:,JJ,NN,...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,{{cite journal|last=Stadler |first=L. J. |auth...,627,15433,26710,"[JJ, NN, NNS, CC, NN, IN, NNP, NNP, ., JJ, NN,...",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,{{Citation | last = ''[[The New York Times]]''...,28571997,546,1518,"['', DT, NNP, NNP, NNP, '', NNP, RB, VBP, JJ, ...",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,{{cite book|author1=Steven L. Small|author2=Ga...,677,605,5244,"[DT, NN, VBZ, ., VB, JJ, '', NNP, '', NN, RB, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Taking the `type of citations` and one hot encoding it to get a vector

In [45]:
## Get one hot encoding of citation_type column
# citation_type_encoding = pd.get_dummies(auxiliary_features['citation_type'])

In [46]:
## Drop column citation_type as it is now encoded and join it
# auxiliary_features = auxiliary_features.drop('citation_type', axis=1)

In [47]:
## Concat columns of the dummies along the axis with the matching index
# auxiliary_features = pd.concat([auxiliary_features, citation_type_encoding], axis=1)
# auxiliary_features.head()

As we can see for the feature `total_number_of_words`, the mean and median **(since it is more robust in nature!)** are pretty high for articles which are `not` journal or books

In [48]:
print('Total mean length of entertainment articles: {}'.format( ## Journal - length is less
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].mean()))
print('Total median length of entertainment articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].median()))

Total mean length of entertainment articles: 7388.19177443
Total median length of entertainment articles: 3215.0


In [49]:
print('Total mean length of journal articles: {}'.format( ## Rest of the article have larger length
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].mean()))
print('Total median length of journal articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].median()))

Total mean length of journal articles: 10300.7014891
Total median length of journal articles: 4819.0


In [50]:
print('Total mean length of book articles: {}'.format( ## Books - length is less
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].mean()))
print('Total median length of book articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].median()))

Total mean length of book articles: 7080.08309953
Total median length of book articles: 2893.0


### Taking the `neighboring_tags` and making an encoder dictionary for it

To have more info about how what tag mean what: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [51]:
citation_tag_features = dataset_with_features[['id', 'citations', 'neighboring_tags']]

In [52]:
# citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
#     lambda x: x.replace("'", "").replace('[', '').replace(']', '').replace('\n', '').split(' ')
# )

In [53]:
citation_tag_features.iloc[1]['neighboring_tags'][:10]

array([u'NN', u"''", u'JJ', u'NN', u'NNP', u'NNP', u'NNP', u'NNP', u'NNS',
       u'JJ'], dtype=object)

In [54]:
# Get the count for each POS tag so that we have an estimation as to how many are there
tag_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_tag_features.neighboring_tags)))

In [55]:
# Considering the 10 smallest tags and checking which one does not have resemblance
tag_counts.nsmallest(10) 

LS        3
`       278
``      433
H       603
U       603
UH     1427
Y      1618
SYM    1956
X      2258
WP$    3590
dtype: int64

In [56]:
# tag_counts.to_csv('/dlabdata1/harshdee/tag_counts.csv', header=None)

We are going to replace `LS`, `the 2 backquotes` and the `the dollar symbol` since they do not have too much use case and do not give too much information about the context of the neighboring citation text.

In [57]:
OTHER_TAGS = ['LS', '``', '$']
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: [i if i not in OTHER_TAGS else 'Others' for i in x]
)

100%|██████████| 2800000/2800000 [01:26<00:00, 32348.72it/s]


Now, we can use the `count vectorizer` to represent the `POS tags` as a vector where each element of the vector represents the count of that tag in that particular citation.

In [58]:
cv = CountVectorizer() # Instantiate the vectorizer

In [59]:
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 2800000/2800000 [00:09<00:00, 300476.03it/s]


In [60]:
transformed_neighboring_tags = cv.fit_transform(citation_tag_features['neighboring_tags'])
transformed_neighboring_tags = pd.DataFrame(transformed_neighboring_tags.toarray(), columns=cv.get_feature_names())

In [61]:
citation_tag_features.head()

Unnamed: 0,id,citations,neighboring_tags
1847498,488083,{{Cite journal|last=Tauzin|first=Tibor|last2=G...,CC IN PRP MD VB JJ NNS VBG IN DT JJ NN NN IN N...
3219190,46431158,"{{cite journal |vauthors=Byerley JS, Gable K |...",NN '' JJ NN NNP NNP NNP NNP NNS JJ NN NN NN IN...
2180322,41089873,{{cite journal |last1=Almaslamani |first1=Muna...,NNP NNP NNP VBG NN IN JJ NN VBN IN NNP VBP RB ...
2119748,5662589,"{{cite journal | vauthors = Rash BG, Lim HD, B...",VB DT NN IN JJ NN CC DT NN IN NN WDT VBZ DT NN...
921257,6470064,{{cite journal|last=Clark|first=Herbert H.|aut...,NNS IN NN NN TO VB DT JJ NN VBN IN DT NN DT IN...


In [62]:
transformed_neighboring_tags.shape, citation_tag_features.shape

((2800000, 35), (2800000, 3))

In [63]:
citation_tag_features = citation_tag_features.reset_index(drop=True)
citation_tag_features = pd.concat([citation_tag_features, transformed_neighboring_tags], axis=1)

In [64]:
citation_tag_features.drop('neighboring_tags', axis=1, inplace=True)
citation_tag_features.head()

Unnamed: 0,id,citations,cc,cd,dt,ex,fw,in,jj,jjr,...,vb,vbd,vbg,vbn,vbp,vbz,wdt,wikicode,wp,wrb
0,488083,{{Cite journal|last=Tauzin|first=Tibor|last2=G...,3,0,4,0,0,6,6,0,...,2,1,3,0,0,0,0,0,0,0
1,46431158,"{{cite journal |vauthors=Byerley JS, Gable K |...",0,1,1,0,0,2,7,0,...,0,0,0,0,0,0,0,0,0,0
2,41089873,{{cite journal |last1=Almaslamani |first1=Muna...,1,1,0,0,0,4,6,0,...,0,0,1,1,1,0,0,0,0,0
3,5662589,"{{cite journal | vauthors = Rash BG, Lim HD, B...",2,0,4,0,0,4,3,0,...,3,0,0,1,1,2,1,0,0,0
4,6470064,{{cite journal|last=Clark|first=Herbert H.|aut...,2,0,7,0,0,4,2,1,...,3,0,1,2,0,1,0,0,0,0


## Features for the LSTM - more time sequence related

### Citation's original text features

In [65]:
# Create a separate dataframe for preprocessing citation text
citation_text_features = dataset_with_features[['id', 'citations', 'label_category']]

In [66]:
# Convert the citation into a list by breaking it down into characters
citation_text_features['characters'] = citation_text_features['citations'].progress_apply(lambda x: list(x))

100%|██████████| 2800000/2800000 [00:28<00:00, 99473.56it/s] 


In [67]:
# Get the character counts for each unique character
char_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_text_features.characters)))
char_counts.index

Index([u' ', u'!', u'"', u'#', u'$', u'%', u'&', u''', u'(', u')', u'*', u'+',
       u',', u'-', u'.', u'/', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7',
       u'8', u'9', u':', u';', u'<', u'=', u'>', u'?', u'@', u'A', u'B', u'C',
       u'D', u'E', u'F', u'G', u'H', u'I', u'J', u'K', u'L', u'M', u'N', u'O',
       u'P', u'Q', u'R', u'S', u'T', u'U', u'V', u'W', u'X', u'Y', u'Z', u'[',
       u'\', u']', u'^', u'_', u'`', u'a', u'b', u'c', u'd', u'e', u'f', u'g',
       u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's',
       u't', u'u', u'v', u'w', u'x', u'y', u'z', u'{', u'|', u'}', u'~'],
      dtype='object')

In [68]:
print('The max length of the longest citation in terms of characters is: {}'.format(
    max(citation_text_features.characters.apply(lambda x: len(x)))))

print('The mean length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).mean()))

print('The median length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).median()))

The max length of the longest citation in terms of characters is: 146206
The mean length of the longest citation in terms of characters is: 216.880559286
The median length of the longest citation in terms of characters is: 191.0


In [69]:
# Make a dictionary for creating a mapping between the char and the corresponding index
char2ind = {char: i for i, char in enumerate(char_counts.index)}
ind2char = {i: char for i, char in enumerate(char_counts.index)}

In [70]:
# Map each character into the citation to its corresponding index and store it in a list
X_char = []
for citation in citation_text_features.citations:
    citation_chars = []
    for character in citation:
        citation_chars.append(char2ind[character])
        
    X_char.append(citation_chars)

Since the median length of the citation is 282, we have padded the input till 400 to get extra information which would be fed into the character embedding neural network.

In [71]:
# X_char = pad_sequences(X_char, maxlen=400)

In [72]:
# # Append the citation character list with their corresponding lists for making a dataset
# # for getting the character embeddings
# data = []
# for i in tqdm(range(len(X_char))):
#     data.append((X_char[i], int(citation_text_features.iloc[i]['label_category'])))

In [73]:
# # Separate out the training data and labels for further verification use
# features = [i[0] for i in data]
# labels = [i[1] for i in data]
# ## Changing it to dummy labels - identifier vs non identifier
# labels = [i for i in labels]

In [74]:
# from collections import Counter
# Counter(labels) ## 1401521, 1651833

In [75]:
# ## Splitting the data into training and testing
# training_data, testing_data, training_labels, testing_labels = train_test_split(
#     features, labels, train_size=0.9, shuffle=True
# )

We are going to feed in the 400 character input since our median length comes out to be approximately 282 and train it on a dummy task - if the citation is scientific or not and get the embedding layer which would contain the representation for each character.

In [76]:
# from keras.utils import to_categorical

# categorical_labels = to_categorical(training_labels, num_classes=3)
# categorical_test_labels = to_categorical(testing_labels, num_classes=3)

In [77]:
def citation_embedding_model():
    """
    Citation embedding generator model where the dimension of the embedding is 50.
    """
    main_input = Input(shape=(400, ), name='characters')
    # input dim is basically the vocab size
    emb = Embedding(input_dim=95, output_dim = 300, name='citation_embedding')(main_input)
    rnn = Bidirectional(LSTM(20))
    x = rnn(emb)
    de = Dense(3, activation='softmax')(x)
    model = Model(inputs = main_input, outputs = de)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [78]:
# Instantiate the model and generate the summary
model = citation_embedding_model()

In [79]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
characters (InputLayer)      (None, 400)               0         
_________________________________________________________________
citation_embedding (Embeddin (None, 400, 300)          28500     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 123       
Total params: 79,983
Trainable params: 79,983
Non-trainable params: 0
_________________________________________________________________


In [80]:
def generator(features, labels, batch_size):
    """
    Generator to create batches of data so that processing is easy.
    
    :param: features: the features of the model.
    :param: labels: the labels of the model.
    :param: batch_size: the size of the batch
    """
    # Create empty arrays to contain batch of features and labels
    batch_features = np.zeros((batch_size, 400))
    batch_labels = np.zeros((batch_size, 3))
    while True:
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features), 1)[0]
            batch_features[i] = features[index]
            batch_labels[i] = categorical_labels[index]
        yield batch_features, batch_labels

In [81]:
# Run the model with the data being generated by the generator with a batch size of 64
# and number of epochs to be set to 15
# hist = model.fit_generator(
#     generator(training_data, categorical_labels, 512), steps_per_epoch=4000, nb_epoch=2)

In [82]:
## Evaluation of embedding model
# y_predicted_proba = model.predict(np.array(testing_data))
# predicted_class = np.argmax(y_predicted_proba, axis=1)
# accuracy_score(testing_labels, predicted_class)

In [83]:
# Save the model so that we can retrieve it later
# model.save('/dlabdata1/harshdee/embedding_model.h5')
from keras.models import load_model
model = load_model('/dlabdata1/harshdee/embedding_model.h5')




In [84]:
# Get the `citation_embedding` layer and get the weights for each character
citation_layer = model.get_layer('citation_embedding')
citation_weights = citation_layer.get_weights()[0]
citation_weights.shape

(95, 300)

In [85]:
# An example of the first element of an embedding
citation_weights[0][:100]

array([-0.03709625,  0.04958485,  0.01190331, -0.01536058, -0.02582742,
       -0.00217483,  0.08311249,  0.06200508, -0.08459523, -0.04255919,
        0.0143231 , -0.01487149, -0.03491046, -0.04084954,  0.00435642,
       -0.03313474, -0.01999018, -0.05731481, -0.01956906,  0.0226227 ,
       -0.03271348,  0.00799032,  0.0249096 , -0.00482269, -0.03595483,
        0.09026573, -0.02702151,  0.0100947 , -0.05867961,  0.05779125,
        0.05369785,  0.10009865,  0.06406695,  0.02678208, -0.00046256,
        0.00182868, -0.06260174,  0.09839807, -0.03899828,  0.0603345 ,
        0.05612418,  0.14331605, -0.05818555,  0.09527376, -0.03321271,
       -0.00525861,  0.06318327, -0.01527527,  0.03295157, -0.01681761,
        0.08902843,  0.13985208,  0.035852  ,  0.0349258 , -0.03233176,
        0.00341616,  0.00032616,  0.09980483,  0.02019689, -0.02741743,
        0.01315569,  0.08160345,  0.05272087,  0.04602867,  0.04264879,
        0.03988118, -0.01963122, -0.02536223, -0.01573488, -0.05

In [86]:
# Map the embedding of each character to the character in each corresponding citation and aggregate (sum)
citation_text_features['embedding'] = citation_text_features['characters'].progress_apply(
    lambda x: sum([citation_weights[char2ind[c]] for c in x])
)

100%|██████████| 2800000/2800000 [13:26<00:00, 3469.95it/s]


In [87]:
# Normalize the citation embeddings so that we can check for their similarity later
citation_text_features['embedding'] = citation_text_features['embedding'].progress_apply(
    lambda x: x/ np.linalg.norm(x, axis=0).reshape((-1, 1))
)

100%|██████████| 2800000/2800000 [00:48<00:00, 58094.25it/s]


In [88]:
# Make the sum of the embedding to be summed up to 1
np.sum(np.square(citation_text_features['embedding'].iloc[0]))

0.99999994

### Similarity Graph for citation text embeddings

In [89]:
# Just considering 20 since otherwise it will be computationally extensive
# citation_text_and_embeddings = citation_text_features[['citation', 'embedding']][:500]

In [90]:
# citation_text_and_embeddings['embedding'] = citation_text_and_embeddings['embedding'].progress_apply(
#     lambda x: x[0].tolist()
# )

In [91]:
# def tsne_embedding_plot():
#     labels = []
#     tokens = []

#     index = 0
#     for row in citation_text_and_embeddings:
#         tokens.append(row['embedding'])
#         labels.append(str(index))
#         index += 1
    
#     # Perplexity takes into account the global and local features
#     # We are using dimensionality reduciton for 2 features and taking 2500 iterations into account
#     tsne_model = TSNE(perplexity=40, n_components=2, n_iter=2500, random_state=0)
#     new_values = tsne_model.fit_transform(tokens)

#     x = []
#     y = []
#     for value in new_values:
#         x.append(value[0])
#         y.append(value[1])
        
#     plt.figure(figsize=(10, 10)) 
#     for i in range(len(x)):
#         plt.scatter(x[i],y[i])
#         plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2),
#                      textcoords='offset points', ha='right', va='bottom')
#     plt.show()

In [92]:
# tsne_embedding_plot()

In [93]:
# an example of citation embeddings which is close to each other
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([14, 477])] # (51, 243), (0, 13)

In [94]:
# # Similiarity of 2 citations which are very similar
# result_similar = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[14]['embedding'],
#     citation_text_and_embeddings.iloc[477]['embedding']
# )
# result_similar

In [95]:
# an example of citation embeddings which is NOT close to each other and are different
# citation_text_and_embeddings[citation_text_and_embeddings.index.isin([42, 124])] # (6, 42)

In [96]:
# Similiarity of 2 citations which are not similar
# result_different = 1 - spatial.distance.cosine(
#     citation_text_and_embeddings.iloc[42]['embedding'],
#     citation_text_and_embeddings.iloc[124]['embedding']
# )
# result_different

### FastText embeddings for neighboring words

In [97]:
# Load the pretrained embedding model on wikipedia
model = FastText.load_fasttext_format('/dlabdata1/harshdee/wiki.en.bin')

In [98]:
# Create a separate dataframe for preprocessing citation words
citation_word_features = dataset_with_features[['id', 'citations', 'neighboring_words', 'label_category']]

In [99]:
# Lowercase all the neighboring words for each of the citations
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i.lower() for i in x]
)

100%|██████████| 2800000/2800000 [01:29<00:00, 31308.55it/s]


Get the total unique words with their respective counts in the total dataset. This is done in order to remove words which are of low frequency and will potentially act as noise to the model.

In [100]:
word_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words)))

In [101]:
threshold = 4

x = len(word_counts)
y = len(word_counts[word_counts <= threshold])
print('Total words: {}\nTotal number of words whose occurence is less than 4: {}\nDifference: {}'.format(x, y, x-y))
words_less_than_threshold = word_counts[word_counts <= threshold]

Total words: 5730137
Total number of words whose occurence is less than 4: 5287233
Difference: 442904


In [102]:
# Remove the words which have a count of less than 4 and replace them with the unique <UNK> symbol
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i if i not in words_less_than_threshold else '<UNK>' for i in x]
)

100%|██████████| 2800000/2800000 [08:15<00:00, 5646.11it/s] 


In [103]:
# creating a mapping between word and index or vice versa
words = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words))).index
word2ind = {w: i for i, w in enumerate(words)}
ind2words = {i: w for i, w in enumerate(words)}

In [104]:
word_embedding_matrix = np.zeros((len(word2ind), 300))
for w in tqdm(word2ind):
    index = word2ind[w]
    word_embedding_matrix[index] = model.wv[w]

100%|██████████| 442905/442905 [00:35<00:00, 12404.84it/s]


Once we have the word embedding for each word in the neighboring words, we sum the embeddings for each word together in neighboring words to get an embedding which represents the past 40 words.

In [105]:
citation_word_features['words_embedding'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: sum([word_embedding_matrix[word2ind[w]] for w in x])
)

100%|██████████| 2800000/2800000 [08:39<00:00, 5390.87it/s] 


Now we have the `citation_word_features` and `citation_tag_features`, so we can join them together to form `time_sequence_features` which would be fed later into the LSTM..

In [106]:
# Join time sequence features with the citations dataset
time_sequence_features = pd.concat([citation_tag_features, citation_word_features.reset_index(drop=True)], keys=['id', 'citations'], axis=1)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [107]:
print('Total number of samples in time features are: {}'.format(time_sequence_features.shape))

Total number of samples in time features are: (2800000, 42)


In [108]:
# citation_text = auxiliary_features.iloc[:,0]
# auxiliary_features['citation_text'] = citation_text
# auxiliary_features.drop('citation', axis=1, inplace=True)
# auxiliary_features.rename({'citation_text': 'citation'}, axis=1, inplace=True)

In [109]:
# Join auxiliary features with the citations dataset
citation_text_features.reset_index(drop=True, inplace=True)
auxiliary_features.reset_index(drop=True, inplace=True)

auxiliary_features = pd.concat([auxiliary_features, citation_text_features], keys=['id', 'citations'], axis=1)
auxiliary_features = pd.concat([auxiliary_features['citations'], auxiliary_features['id']], axis=1)
auxiliary_features = auxiliary_features.loc[:, ~auxiliary_features.columns.duplicated()]
auxiliary_features.shape

(2800000, 159)

In [110]:
# Drop columns with are duplicates
auxiliary_features.drop(['neighboring_tags', 'characters'], axis=1, inplace=True)

In [111]:
del model
del word_embedding_matrix
del citation_word_features
del citation_text_features

gc.collect()

7

## Making sets for `auxiliary` and `time sequence` features

In [112]:
data = dataset_with_features[['id', 'citations', 'label_category']]

In [113]:
# Join the time sequence features for the data
time_sequence_features = pd.concat([time_sequence_features['id'], time_sequence_features['citations']], axis=1)
time_sequence_features = pd.concat([time_sequence_features, data.reset_index(drop=True)], keys=['id', 'citations'], axis=1)
time_sequence_features.columns = time_sequence_features.columns.droplevel(0)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [114]:
time_sequence_features['words_embedding'] = time_sequence_features['words_embedding'].progress_apply(
    lambda x: x.tolist())

100%|██████████| 2800000/2800000 [01:21<00:00, 34148.91it/s]


In [115]:
auxiliary_features['embedding'] = auxiliary_features['embedding'].progress_apply(lambda x: x.tolist())

100%|██████████| 2800000/2800000 [01:36<00:00, 29060.21it/s]


In [116]:
len(time_sequence_features), len(auxiliary_features)

(2800000, 2800000)

In [117]:
del book_features
del journal_features
gc.collect()

7

## Splitting the dataset into training, testing and validation 

The split is done into 80-10-10 ratio so that we have more training data to train on and have validation dataset to make sure that the model is working as anticipated.

In [118]:
type(auxiliary_features)

pandas.core.frame.DataFrame

In [119]:
# Get the labels which will be split later
y = auxiliary_features.loc[:, 'label_category'].astype(int).tolist()

In [120]:
# Make a mask for auxiliary dataset to get all features except the one below
column_mask_aux = ~auxiliary_features.columns.isin(['id', 'citations', 'label_category'])

In [121]:
# # Get the columns of those auxiliary features and covert them into a list
auxiliary = auxiliary_features.loc[:, column_mask_aux].values.tolist()

In [122]:
# # Convert them into numpy array (for Keras) and stack them (if needed) as suited for the model's format
auxiliary = [np.array(auxiliary[i][0][0] + auxiliary[i][1:]) for i in tqdm(range(len(auxiliary)))]

100%|██████████| 2800000/2800000 [02:00<00:00, 23243.97it/s]


In [123]:
# # Make a mask for time sequences features dataset to get all features except the one below
cols = [col for col in time_sequence_features.columns if col not in ['id', 'citations', 'label_category', 'neighboring_words']]
stripped_tsf = time_sequence_features[cols]

In [124]:
time = stripped_tsf.values.tolist()

In [125]:
def make_structure_time_features(time_features):
    """
    Concatenate features which are numbers and lists together by checking the type:
    
    param: time_features: the features which are considered time sequence.
    """
    feature_one = np.array([int(i) for i in time_features if isinstance(i, long)])
    feature_two = np.array([i for i in time_features if isinstance(i, list)][0])
    return np.array([feature_one, feature_two])

In [126]:
time = [make_structure_time_features(time[i]) for i in tqdm(range(len(time)))]

100%|██████████| 2800000/2800000 [02:04<00:00, 22419.89it/s]


In [127]:
# Instantiating PCA to 35 components since it should be equal to the size of the vector of the tags
pca = PCA(n_components=35)

def get_reduced_words_dimension(data):
    """
    Get the aggregated dataset of words and tags which has the
    same dimensionality using PCA.
    
    :param: data: data which needs to be aggregated.
    """
    tags = [i for i, _ in data]
    word_embeddings = [j for _,j in data]
    pca.fit(word_embeddings)
    
    word_embeddings_pca = pca.transform(word_embeddings)
    tags = np.array(tags)
    return word_embeddings_pca, tags

In [128]:
# Apply PCA on all the sets of data to have the dimensions of the data to be the same
word_embeddings_pca, tags = get_reduced_words_dimension(time)

In [129]:
time_pca = np.dstack((word_embeddings_pca, tags))

In [130]:
word_embeddings_pca.shape, tags.shape, time_pca.shape

((2800000, 35), (2800000, 35), (2800000, 35, 2))

In [131]:
del time_sequence_features
del auxiliary_features

In [132]:
# del data
del word_embeddings_pca
del tags
del stripped_tsf
del column_mask_aux
gc.collect()

14

## LSTM/Neural Network Model

In [133]:
def generator_nn(features_aux, features_time, labels, batch_size):
    """
    Generator to create batches of data so that processing is easy.
    
    :param: features: the features of the model.
    :param: labels: the labels of the model.
    :param: batch_size: the size of the batch
    """
    # Create empty arrays to contain batch of features and labels
    batch_features_aux = np.zeros((batch_size, 453))
    batch_features_time =  np.zeros((batch_size, 35, 2))
    batch_labels = np.zeros((batch_size, 3))
    while True:
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features_aux), 1)[0]
            batch_features_aux[i] = features_aux[index]
            batch_features_time[i] = features_time[index]
            batch_labels[i] = labels[index]
        yield [batch_features_time, np.asarray(batch_features_aux)], batch_labels

In [134]:
from keras.optimizers import Adam

In [135]:
def scheduler(epoch, lr):
    import math
    if epoch < 10:
        return lr
    else:
        return lr * math.exp(-0.1)

In [136]:
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [137]:
def classification_model():
    """
    Model for classifying whether a citation is scientific or not.
    """
    main_input = Input(shape=(35, 2), name='time_input')
    lstm_out = LSTM(64)(main_input)

    auxiliary_input = Input(shape=(453,), name='aux_input') ## 454 without citation type, 476 with citation type
    # Converging the auxiliary input with the LSTM output
    x = keras.layers.concatenate([lstm_out, auxiliary_input])

    # 4 fully connected layer
    x = Dense(256, activation='selu')(x)
    x = Dense(128, activation='selu')(x)
    x = Dense(128, activation='selu')(x)
    x = Dense(64, activation='selu')(x)

    main_output = Dense(3, activation='softmax', name='main_output')(x)
    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])
    
    opt = Adam(0.001)
    model.compile(
        optimizer=opt, loss={'main_output': 'categorical_crossentropy'},
        loss_weights={'main_output': 1.}, metrics=['acc']
    )
    return model

In [138]:
# Instantiating the classification model
model = classification_model()
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
time_input (InputLayer)         (None, 35, 2)        0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 64)           17152       time_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 453)          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 517)          0           lstm_2[0][0]                    

We use `ReduceLRonPlateau` so that the model does not overshoot the optimal minimum point and hence by default we start with a learning rate of 0.01 but as soon as the accuracy stop increasing the learning rate does not change which helps us converge better.

In [139]:
## Convert auxiliary into numpy array for indexing
auxiliary = np.asarray(auxiliary)
y = np.asarray(y)

In [140]:
EPOCHS = 30

In [141]:
x_train_indices, x_test_indices, y_train_indices, y_test_indices = train_test_split(
    range(auxiliary.shape[0]), range(y.shape[0]), train_size=0.9, stratify=y, shuffle=True
)

In [142]:
aux_train = auxiliary[x_train_indices]
time_train = time_pca[x_train_indices]
y_train = np.eye(3)[y[x_train_indices]]

In [143]:
aux_test = auxiliary[x_test_indices]
time_test = time_pca[x_test_indices]
y_test = y[x_test_indices]

In [None]:
# predictions = []
# for index, (train_indices, val_indices) in enumerate(skf.split(auxiliary, y)):
#     aux_train, aux_val = auxiliary[train_indices], auxiliary[val_indices]
#     time_train, time_val = time_pca[train_indices], time_pca[val_indices]
#     y_train = np.eye(4)[y[train_indices]]
#     y_val = y[val_indices]
    
BATCH_SIZE = 256
print('Running model with epochs: {}'.format(EPOCHS))

model = None
model = classification_model()
training_generator = generator_nn(aux_train, time_train, y_train, BATCH_SIZE)

history_callback = model.fit_generator(
    training_generator,
    steps_per_epoch=len(x_train_indices) // 256,
    epochs=EPOCHS, verbose=1, shuffle=True, callbacks=[callback]
)

Running model with epochs: 30
Epoch 1/30

In [None]:
# Running model with epochs: 5
# Epoch 1/5
# 11601/11601 [==============================] - 421s 36ms/step - loss: 0.7328 - acc: 0.7955
# Epoch 2/5
# 11601/11601 [==============================] - 413s 36ms/step - loss: 0.3311 - acc: 0.8823
# Epoch 3/5
# 11601/11601 [==============================] - 413s 36ms/step - loss: 0.2957 - acc: 0.8946
# Epoch 4/5
# 11601/11601 [==============================] - 406s 35ms/step - loss: 0.2777 - acc: 0.9024
# Epoch 5/5
# 11601/11601 [==============================] - 410s 35ms/step - loss: 0.2617 - acc: 0.9085
# ---------------------------------------------------------------------------

In [None]:
history_dict = history_callback.history

In [None]:
f = open('/dlabdata1/harshdee/results/citation_model_loss_{}.json'.format(EPOCHS), 'w')
f.write(str(history_dict))
f.close()

In [None]:
prediction_for_folds = model.predict([time_test, aux_test])
y_pred = np.argmax(prediction_for_folds, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Neural network model for epochs {}: {}".format(EPOCHS, accuracy))

res = pd.DataFrame(confusion_matrix(y_test, y_pred))
res.index = ['book', 'journal', 'web']
res.columns = ['book', 'journal', 'web']
res['accuracy'] = accuracy
res.to_csv('/dlabdata1/harshdee/results/citation_model_result_{}.csv'.format(EPOCHS))
print(res)

In [None]:
model.save('/dlabdata1/harshdee/results/citation_model_epochs_{}.h5'.format(EPOCHS))
json_string = model.to_json()
with open("/dlabdata1/harshdee/results/citation_model_epochs_{}.json".format(EPOCHS), "w") as json_file:
    json_file.write(json_string)

print('\n\nDone with the prediction and saving model with epochs: {}\n'.format(EPOCHS))