In [1]:
import re
import json
import glob
import keras
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from itertools import chain
from keras.models import Model
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.manifold import TSNE
from gensim.models import FastText
from sklearn.decomposition import PCA
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split 
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Initializing tqdm for pandas
tqdm.pandas()

In [2]:
keras.backend.backend()

'tensorflow'

In [3]:
from tensorflow.python.client import device_lib

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos if x.device_type == 'GPU'])

[]


In [4]:
np.random.seed(0)

In [6]:
## Only for Google colab
# from google.colab import drive
# drive.mount('/content/drive')

## Get auxiliary features and divide them into labels

1. `ref_index`
2. `total_words`
3. `tags`
4. `type_of_citation`

#### can include `section` of the page in which the citation belongs to

In [7]:
citations_bias_features = pd.read_parquet('./mini_citations_bias_features.parquet/', engine='pyarrow')

In [8]:
citations_bias_features['label'].value_counts()

LIBR    224058
MODR    223476
CONS    219993
Name: label, dtype: int64

In [9]:
cons_citations = citations_bias_features[citations_bias_features['label'] == 'CONS']
modr_citations = citations_bias_features[citations_bias_features['label'] == 'MODR']
libr_citations = citations_bias_features[citations_bias_features['label'] == 'LIBR']

In [10]:
cons_citations = cons_citations.sample(n=210000)
modr_citations = modr_citations.sample(n=210000)
libr_citations = libr_citations.sample(n=210000)

In [11]:
dataset_with_features = pd.concat([cons_citations, modr_citations, libr_citations])
dataset_with_features.shape

(630000, 18)

In [12]:
le = preprocessing.LabelEncoder()
le.fit(dataset_with_features['label'])
dataset_with_features['label_category'] = le.transform(dataset_with_features['label'])

In [13]:
dataset_with_features[dataset_with_features['label'] == 'CONS'].head(1)

Unnamed: 0,URL,tld,citations,Title,sections,type_of_citation,ID_list,id,r_id,r_parentid,page_title,page_id,ref_index,total_words,neighboring_words,neighboring_tags,bias_score,label,label_category
167129,http://www.katv.com/story/22467593/update-scot...,katv,{{cite web|title=Scott County Sheriff drowns d...,"Scott County Sheriff drowns during rescue, 3 o...",Initial Section,cite web,,39534714,953947559,953937444.0,"Tornado outbreak of May 26–31, 2013",39534714,5906,6294,"[ref, {{cite web|title=Verona man killed by fa...","[VB, WIKICODE, NN, NN, NN, NNP, NN, VBN, IN, V...",0.5984,CONS,0


In [14]:
dataset_with_features[dataset_with_features['label'] == 'LIBR'].head(1)

Unnamed: 0,URL,tld,citations,Title,sections,type_of_citation,ID_list,id,r_id,r_parentid,page_title,page_id,ref_index,total_words,neighboring_words,neighboring_tags,bias_score,label,label_category
515909,http://www.liberation.fr/culture/0101328757-or...,liberation,{{cite web| url=http://www.liberation.fr/cultu...,L'Orchestre Andalous d'Isra\xebl r\xe9unit mus...,Initial Section,cite web,,23515089,925772300,917229142.0,Israeli Andalusian Orchestra,23515089,370,629,"[web, url, http, :, www.jpost.com/ArtsAndCultu...","[NN, JJ, NN, :, NN, ., JJ, NN, :, NN, ., JJ, N...",-1.4031,LIBR,1


In [15]:
dataset_with_features[dataset_with_features['label'] == 'MODR'].head(1)

Unnamed: 0,URL,tld,citations,Title,sections,type_of_citation,ID_list,id,r_id,r_parentid,page_title,page_id,ref_index,total_words,neighboring_words,neighboring_tags,bias_score,label,label_category
381685,https://www.manchestereveningnews.co.uk/whats-...,manchestereveningnews,{{Cite news|url=https://www.manchestereveningn...,Denise Welch strips off to star in Gary Barlow...,Early life and education,cite news,,1423334,953871470,953504322.0,Fern Britton,1423334,2152,3967,"[by, Mark, Davenport, called, ''Photoshopping,...","[IN, NNP, NNP, VBD, VBG, '', VBG, JJ, NN, NNP,...",0.0,MODR,2


In [16]:
## Convert citations' text to UTF-8
dataset_with_features['citations'] = dataset_with_features['citations'].progress_apply(lambda x: x.encode("utf-8"))

100%|██████████| 630000/630000 [00:04<00:00, 150288.84it/s]


In [17]:
dataset_with_features['label'].value_counts()

LIBR    210000
CONS    210000
MODR    210000
Name: label, dtype: int64

### Taking the unique `sections` and one hot encoding it to get a vector

In [18]:
# Only processing auxiliary features which are going to be used in the neural network
auxiliary_features = dataset_with_features[
    ['sections', 'citations', 'id', 'ref_index',
     'total_words', 'neighboring_tags', 'label_category']]

In [19]:
auxiliary_features['sections'] = auxiliary_features['sections'].apply(
    lambda x: x.encode('utf-8') if isinstance(x, str) else str(x))
auxiliary_features['sections'] = auxiliary_features['sections'].astype(str)
auxiliary_features['sections'] = auxiliary_features['sections'].apply(lambda x: x.split(', '))

In [20]:
section_counts = pd.Series(Counter(chain.from_iterable(x for x in auxiliary_features.sections)))
largest_sections = section_counts.nlargest(150)

In [21]:
# Change section to `OTHERS` if occurence of the section is not in the 150 largest sections
auxiliary_features['sections'] = auxiliary_features['sections'].progress_apply(
    lambda x: list(set(['Others' if i not in largest_sections else i for i in x]))
)

100%|██████████| 630000/630000 [00:03<00:00, 198912.38it/s]


In [22]:
auxiliary_features.head()

Unnamed: 0,sections,citations,id,ref_index,total_words,neighboring_tags,label_category
167129,[b'Initial Section'],b'{{cite web|title=Scott County Sheriff drowns...,39534714,5906,6294,"[VB, WIKICODE, NN, NN, NN, NNP, NN, VBN, IN, V...",0
189891,[b'Initial Section'],b'{{cite book|last=Villard|first=Erik|title=Un...,21683511,1694,3346,"[DT, NN, VBD, DT, CD, NNP, NN, NN, ., VB, JJ, ...",0
103624,[b'Initial Section'],b'{{cite web|url=http://www.cmt.com/news/17648...,47667920,23546,26675,"[NNP, NNP, CD, NN, IN, DT, CD, JJS, VBN, IN, N...",0
65374,[b'Initial Section'],b'{{cite news|title=Auer secures 2020 BMW driv...,61096497,3391,4723,"[NNP, NNP, NN, NNP, NNP, VBZ, TO, DT, NN, IN, ...",0
187981,[b'Initial Section'],"b""{{cite web|url=http://www.ynetnews.com/artic...",42184312,28319,39825,"[RB, IN, NN, ., VB, WIKICODE, NN, NN, JJ, NN, ...",0


In [23]:
section_dummies = pd.get_dummies(auxiliary_features.sections.apply(pd.Series).stack())

In [24]:
auxiliary_features = auxiliary_features.join(section_dummies.sum(level=0))

In [25]:
auxiliary_features.drop('sections', axis=1, inplace=True)
auxiliary_features.head()

Unnamed: 0,citations,id,ref_index,total_words,neighboring_tags,label_category,Others,b'2000s',b'2010s',b'20th century',...,b'Transactions',b'Transfers',b'U.S. House of Representatives',b'United States',b'Victims',b'Videography',b'Views',b'Winners',b'Work',b'Works'
167129,b'{{cite web|title=Scott County Sheriff drowns...,39534714,5906,6294,"[VB, WIKICODE, NN, NN, NN, NNP, NN, VBN, IN, V...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
189891,b'{{cite book|last=Villard|first=Erik|title=Un...,21683511,1694,3346,"[DT, NN, VBD, DT, CD, NNP, NN, NN, ., VB, JJ, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103624,b'{{cite web|url=http://www.cmt.com/news/17648...,47667920,23546,26675,"[NNP, NNP, CD, NN, IN, DT, CD, JJS, VBN, IN, N...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65374,b'{{cite news|title=Auer secures 2020 BMW driv...,61096497,3391,4723,"[NNP, NNP, NN, NNP, NNP, VBZ, TO, DT, NN, IN, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187981,"b""{{cite web|url=http://www.ynetnews.com/artic...",42184312,28319,39825,"[RB, IN, NN, ., VB, WIKICODE, NN, NN, JJ, NN, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


As we can see for the feature `total_number_of_words`, the mean and median **(since it is more robust in nature!)** are pretty high for articles which are liberal

In [26]:
print('Total mean length of liberal articles: {}'.format( ## liberal articles are longer
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].mean()))
print('Total median length of liberal articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 1]['total_words'].median()))

Total mean length of liberal articles: 11279.110685714286
Total median length of liberal articles: 4996.0


In [27]:
print('Total mean length of moderate articles: {}'.format( ## Moderate articles in general have a shorter length
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].mean()))
print('Total median length of moderate articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 2]['total_words'].median()))

Total mean length of moderate articles: 10091.52959047619
Total median length of moderate articles: 3823.0


In [28]:
print('Total mean length of conservative articles: {}'.format( ## slightly smaller than liberal
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].mean()))
print('Total median length of conservative articles: {}'.format(
    auxiliary_features[auxiliary_features['label_category'] == 0]['total_words'].median()))

Total mean length of conservative articles: 11413.3415
Total median length of conservative articles: 4587.0


### Taking the `neighboring_tags` and making an encoder dictionary for it

To have more info about how what tag mean what: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [29]:
citation_tag_features = dataset_with_features[['id', 'citations', 'neighboring_tags']]

In [30]:
# citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
#     lambda x: x.replace("'", "").replace('[', '').replace(']', '').replace('\n', '').split(' ')
# )

In [31]:
citation_tag_features.iloc[1]['neighboring_tags'][:10]

array(['DT', 'NN', 'VBD', 'DT', 'CD', 'NNP', 'NN', 'NN', '.', 'VB'],
      dtype=object)

In [32]:
# Get the count for each POS tag so that we have an estimation as to how many are there
tag_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_tag_features.neighboring_tags)))

In [33]:
# Considering the 10 smallest tags and checking which one does not have resemblance
tag_counts.nsmallest(10) 

LS        1
``      189
WP$    1006
SYM    1063
UH     1238
PDT    2216
$      2982
RBS    5880
EX     5936
RBR    7639
dtype: int64

In [34]:
# tag_counts.to_csv('/dlabdata1/harshdee/tag_counts.csv', header=None)

We are going to replace `LS`, `the 2 backquotes` and the `the dollar symbol` since they do not have too much use case and do not give too much information about the context of the neighboring citation text.

In [35]:
OTHER_TAGS = ['LS', '``', '$']
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: [i if i not in OTHER_TAGS else 'Others' for i in x]
)

100%|██████████| 630000/630000 [00:07<00:00, 88101.54it/s] 


Now, we can use the `count vectorizer` to represent the `POS tags` as a vector where each element of the vector represents the count of that tag in that particular citation.

In [36]:
cv = CountVectorizer() # Instantiate the vectorizer

In [37]:
citation_tag_features['neighboring_tags'] = citation_tag_features['neighboring_tags'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 630000/630000 [00:01<00:00, 368925.50it/s]


In [38]:
transformed_neighboring_tags = cv.fit_transform(citation_tag_features['neighboring_tags'])
transformed_neighboring_tags = pd.DataFrame(transformed_neighboring_tags.toarray(), columns=cv.get_feature_names())

In [39]:
citation_tag_features.head()

Unnamed: 0,id,citations,neighboring_tags
167129,39534714,b'{{cite web|title=Scott County Sheriff drowns...,VB WIKICODE NN NN NN NNP NN VBN IN VBG NN IN N...
189891,21683511,b'{{cite book|last=Villard|first=Erik|title=Un...,DT NN VBD DT CD NNP NN NN . VB JJ NN VBN NNPS ...
103624,47667920,b'{{cite web|url=http://www.cmt.com/news/17648...,NNP NNP CD NN IN DT CD JJS VBN IN NNP NNP TO N...
65374,61096497,b'{{cite news|title=Auer secures 2020 BMW driv...,NNP NNP NN NNP NNP VBZ TO DT NN IN NNP NNP NNP...
187981,42184312,"b""{{cite web|url=http://www.ynetnews.com/artic...",RB IN NN . VB WIKICODE NN NN JJ NN : JJ NN : J...


In [40]:
transformed_neighboring_tags.shape, citation_tag_features.shape

((630000, 35), (630000, 3))

In [41]:
citation_tag_features = citation_tag_features.reset_index(drop=True)
citation_tag_features = pd.concat([citation_tag_features, transformed_neighboring_tags], axis=1)

In [42]:
citation_tag_features.drop('neighboring_tags', axis=1, inplace=True)
citation_tag_features.head()

Unnamed: 0,id,citations,cc,cd,dt,ex,fw,in,jj,jjr,...,vb,vbd,vbg,vbn,vbp,vbz,wdt,wikicode,wp,wrb
0,39534714,b'{{cite web|title=Scott County Sheriff drowns...,0,2,0,0,0,3,3,0,...,2,1,2,1,0,0,0,1,0,0
1,21683511,b'{{cite book|last=Villard|first=Erik|title=Un...,0,3,3,0,0,2,3,0,...,1,1,1,1,0,0,0,0,0,0
2,47667920,b'{{cite web|url=http://www.cmt.com/news/17648...,1,5,3,0,0,6,1,0,...,1,0,0,2,0,0,0,0,0,0
3,61096497,b'{{cite news|title=Auer secures 2020 BMW driv...,0,1,3,0,0,4,1,0,...,2,1,1,0,0,2,0,0,1,0
4,42184312,"b""{{cite web|url=http://www.ynetnews.com/artic...",0,2,1,0,0,4,3,0,...,2,2,0,0,0,1,0,1,0,0


## Features for the LSTM - more time sequence related

### Citation's original text features

In [43]:
# Create a separate dataframe for preprocessing citation text
citation_text_features = dataset_with_features[['id', 'citations', 'label_category']]

In [44]:
# Convert the citation into a list by breaking it down into characters
citation_text_features['characters'] = citation_text_features['citations'].progress_apply(lambda x: list(x))

100%|██████████| 630000/630000 [00:13<00:00, 47304.01it/s]


In [45]:
# Get the character counts for each unique character
char_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_text_features.characters)))
char_counts.index

Int64Index([123,  99, 105, 116, 101,  32, 119,  98, 124, 108,  61,  83, 111,
             67, 117, 110, 121, 104, 114, 102, 100, 115, 103,  44,  51, 109,
            112,  58,  47,  46, 107,  97, 118,  50,  52,  54,  55,  53,  57,
             45,  75,  65,  84,  86,  74,  49,  48, 125,  69,  85,  79,  56,
             77,  72, 120,  68,  66,  87,  91,  78,  93,  76,  73,  39,  80,
            106,  71,  82,  95,  70,  89,  92, 113,  37,  36,  63, 122,  35,
             38,  34,  59,  90,  40,  41,  33,  88,  43,  62,  81,  60, 126,
             64,  42,  96,  94],
           dtype='int64')

In [46]:
print('The max length of the longest citation in terms of characters is: {}'.format(
    max(citation_text_features.characters.apply(lambda x: len(x)))))

print('The mean length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).mean()))

print('The median length of the longest citation in terms of characters is: {}'.format(
    citation_text_features.characters.apply(lambda x: len(x)).median()))

The max length of the longest citation in terms of characters is: 10057
The mean length of the longest citation in terms of characters is: 276.183353968254
The median length of the longest citation in terms of characters is: 257.0


In [47]:
# Make a dictionary for creating a mapping between the char and the corresponding index
char2ind = {char: i for i, char in enumerate(char_counts.index)}
ind2char = {i: char for i, char in enumerate(char_counts.index)}

In [48]:
# Map each character into the citation to its corresponding index and store it in a list
X_char = []
for citation in citation_text_features.citations:
    citation_chars = []
    for character in citation:
        citation_chars.append(char2ind[character])
        
    X_char.append(citation_chars)

Since the median length of the citation is 276, we have padded the input till 400 to get extra information which would be fed into the character embedding neural network.

In [49]:
X_char = pad_sequences(X_char, maxlen=300)

In [50]:
# Append the citation character list with their corresponding lists for making a dataset
# for getting the character embeddings
data = []
for i in tqdm(range(len(X_char))):
    data.append((X_char[i], int(citation_text_features.iloc[i]['label_category'])))

100%|██████████| 630000/630000 [01:21<00:00, 7734.90it/s]


In [51]:
# # Separate out the training data and labels for further verification use
features = [i[0] for i in data]
labels = [i[1] for i in data]

In [52]:
## Splitting the data into training and testing
training_data, testing_data, training_labels, testing_labels = train_test_split(
    features, labels, train_size=0.9, shuffle=True
)

We are going to feed in the 400 character input since our median length comes out to be approximately 282 and train it on different labels to get the embedding or the text

In [53]:
from keras.utils import to_categorical

categorical_labels = to_categorical(training_labels, num_classes=3)
categorical_test_labels = to_categorical(testing_labels, num_classes=3)

In [54]:
def citation_embedding_model():
    """
    Citation embedding generator model where the dimension of the embedding is 50.
    """
    main_input = Input(shape=(300, ), name='characters')
    # input dim is basically the vocab size
    emb = Embedding(input_dim=95, output_dim = 150, name='citation_embedding')(main_input)
    rnn = Bidirectional(LSTM(40))
    x = rnn(emb)
    y = Dense(20, activation='sigmoid')(x)
    de = Dense(3, activation='softmax')(y)
    model = Model(inputs = main_input, outputs = de)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [55]:
# Instantiate the model and generate the summary
model = citation_embedding_model()

In [56]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
characters (InputLayer)      [(None, 300)]             0         
_________________________________________________________________
citation_embedding (Embeddin (None, 300, 150)          14250     
_________________________________________________________________
bidirectional (Bidirectional (None, 80)                61120     
_________________________________________________________________
dense (Dense)                (None, 20)                1620      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 63        
Total params: 77,053
Trainable params: 77,053
Non-trainable params: 0
_________________________________________________________________


In [57]:
def generator(features, labels, batch_size):
    """
    Generator to create batches of data so that processing is easy.
    
    :param: features: the features of the model.
    :param: labels: the labels of the model.
    :param: batch_size: the size of the batch
    """
    # Create empty arrays to contain batch of features and labels
    batch_features = np.zeros((batch_size, 300))
    batch_labels = np.zeros((batch_size, 3))
    while True:
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features), 1)[0]
            batch_features[i] = features[index]
            batch_labels[i] = categorical_labels[index]
        yield batch_features, batch_labels

In [58]:
# # Run the model with the data being generated by the generator with a batch size of 64
# # and number of epochs to be set to 15
hist = model.fit_generator(
    generator(training_data, categorical_labels, 1024), steps_per_epoch=500, epochs=30)

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [59]:
# ## Evaluation of embedding model
y_predicted_proba = model.predict(np.array(testing_data))
predicted_class = np.argmax(y_predicted_proba, axis=1)
accuracy_score(testing_labels, predicted_class)

0.9430793650793651

In [50]:
# # Save the model so that we can retrieve it later
# model.save('/content/sample_data/embedding_bias_model2.h5')
from keras.models import load_model
model = load_model('./embedding_bias_model2.h5')

In [51]:
# # Get the `citation_embedding` layer and get the weights for each character
citation_layer = model.get_layer('citation_embedding')
citation_weights = citation_layer.get_weights()[0]
citation_weights.shape

(95, 150)

In [52]:
# An example of the first element of an embedding
citation_weights[0][:100]

array([-0.01804133, -0.03245307, -0.12361734,  0.04764726, -0.1037605 ,
       -0.18249515, -0.00202649, -0.02972538, -0.02778249,  0.04539751,
       -0.16687357,  0.00735664,  0.00089882,  0.10340928, -0.06338792,
        0.01505945, -0.0854198 ,  0.00249107, -0.21004422,  0.03546045,
        0.01065212,  0.00940321, -0.13779655, -0.02447194,  0.05968909,
        0.03105203, -0.01691033, -0.05628253, -0.02027912,  0.01662555,
       -0.14528242, -0.11639705, -0.0670182 , -0.07386018,  0.00341635,
       -0.2526719 ,  0.02036618,  0.06978438, -0.00052654,  0.06041639,
       -0.02310811,  0.12577496, -0.0981093 , -0.0460665 ,  0.07024252,
        0.1767161 ,  0.11854591, -0.25253728,  0.1438085 ,  0.00949295,
        0.01013856,  0.02204646,  0.1104747 ,  0.26085848,  0.08808202,
        0.21609005, -0.0105921 , -0.25423908, -0.09452852, -0.21711965,
        0.01737843, -0.17467532, -0.13874163, -0.3061189 ,  0.08532184,
       -0.0952365 ,  0.00962504, -0.03715321, -0.05805329, -0.17

In [53]:
# # Map the embedding of each character to the character in each corresponding citation and aggregate (sum)
citation_text_features['embedding'] = citation_text_features['characters'].progress_apply(
    lambda x: sum([citation_weights[char2ind[c]] for c in x])
)

100%|██████████| 630000/630000 [05:34<00:00, 1880.66it/s]


In [54]:
# # Normalize the citation embeddings so that we can check for their similarity later
citation_text_features['embedding'] = citation_text_features['embedding'].progress_apply(
    lambda x: x/ np.linalg.norm(x, axis=0).reshape((-1, 1))
)

100%|██████████| 630000/630000 [00:29<00:00, 21603.82it/s]


In [55]:
# # Make the sum of the embedding to be summed up to 1
np.sum(np.square(citation_text_features['embedding'].iloc[0]))

1.0

### FastText embeddings for neighboring words

In [57]:
import gc
del citations_bias_features
del section_dummies
gc.collect()

156

In [58]:
# Load the pretrained embedding model on wikipedia
model = FastText.load_fasttext_format('./wiki.en.bin')

In [59]:
# Create a separate dataframe for preprocessing citation words
citation_word_features = dataset_with_features[['id', 'citations', 'neighboring_words', 'label_category']]

In [60]:
# Lowercase all the neighboring words for each of the citations
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i.lower() for i in x]
)

100%|██████████| 630000/630000 [01:11<00:00, 8792.15it/s] 


Get the total unique words with their respective counts in the total dataset. This is done in order to remove words which are of low frequency and will potentially act as noise to the model.

In [61]:
word_counts = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words)))

In [62]:
threshold = 4

x = len(word_counts)
y = len(word_counts[word_counts <= threshold])
print('Total words: {}\nTotal number of words whose occurence is less than 4: {}\nDifference: {}'.format(x, y, x-y))
words_less_than_threshold = word_counts[word_counts <= threshold]

Total words: 1710005
Total number of words whose occurence is less than 4: 1573563
Difference: 136442


In [63]:
# Remove the words which have a count of less than 4 and replace them with the unique <UNK> symbol
citation_word_features['neighboring_words'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: [i if i not in words_less_than_threshold else '<UNK>' for i in x]
)

100%|██████████| 630000/630000 [00:46<00:00, 13530.97it/s]


In [64]:
# creating a mapping between word and index or vice versa
words = pd.Series(Counter(chain.from_iterable(x for x in citation_word_features.neighboring_words))).index
word2ind = {w: i for i, w in enumerate(words)}
ind2words = {i: w for i, w in enumerate(words)}

In [65]:
word_embedding_matrix = np.zeros((len(word2ind), 300))
for w in tqdm(word2ind):
    index = word2ind[w]
    word_embedding_matrix[index] = model.wv[w]

100%|██████████| 136443/136443 [14:44<00:00, 154.26it/s]


In [66]:
del model
gc.collect()

40

Once we have the word embedding for each word in the neighboring words, we sum the embeddings for each word together in neighboring words to get an embedding which represents the past 40 words.

In [67]:
citation_word_features['words_embedding'] = citation_word_features['neighboring_words'].progress_apply(
    lambda x: sum([word_embedding_matrix[word2ind[w]] for w in x])
)

100%|██████████| 630000/630000 [01:18<00:00, 8046.97it/s] 


Now we have the `citation_word_features` and `citation_tag_features`, so we can join them together to form `time_sequence_features` which would be fed later into the LSTM..

In [68]:
# Join time sequence features with the citations dataset
time_sequence_features = pd.concat([citation_tag_features, citation_word_features.reset_index(drop=True)], keys=['id', 'citations'], axis=1)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [69]:
print('Total number of samples in time features are: {}'.format(time_sequence_features.shape))

Total number of samples in time features are: (630000, 42)


In [70]:
# citation_text = auxiliary_features.iloc[:,0]
# auxiliary_features['citation_text'] = citation_text
# auxiliary_features.drop('citation', axis=1, inplace=True)
# auxiliary_features.rename({'citation_text': 'citation'}, axis=1, inplace=True)

In [71]:
# Join auxiliary features with the citations dataset
citation_text_features.reset_index(drop=True, inplace=True)
auxiliary_features.reset_index(drop=True, inplace=True)

auxiliary_features = pd.concat([auxiliary_features, citation_text_features], keys=['id', 'citations'], axis=1)
auxiliary_features = pd.concat([auxiliary_features['citations'], auxiliary_features['id']], axis=1)
auxiliary_features = auxiliary_features.loc[:, ~auxiliary_features.columns.duplicated()]
auxiliary_features.shape

(630000, 159)

In [72]:
# Drop columns with are duplicates
auxiliary_features.drop(['neighboring_tags', 'characters'], axis=1, inplace=True)

In [73]:
del word_embedding_matrix
del citation_word_features
del citation_text_features

gc.collect()

40

## Making sets for `auxiliary` and `time sequence` features

In [74]:
data = dataset_with_features[['id', 'citations', 'label_category']]

In [75]:
# Join the time sequence features for the data
time_sequence_features = pd.concat([time_sequence_features['id'], time_sequence_features['citations']], axis=1)
time_sequence_features = pd.concat([time_sequence_features, data.reset_index(drop=True)], keys=['id', 'citations'], axis=1)
time_sequence_features.columns = time_sequence_features.columns.droplevel(0)
time_sequence_features = time_sequence_features.loc[:, ~time_sequence_features.columns.duplicated()]

In [76]:
time_sequence_features['words_embedding'] = time_sequence_features['words_embedding'].progress_apply(
    lambda x: x.tolist())

100%|██████████| 630000/630000 [02:11<00:00, 4784.79it/s] 


In [77]:
auxiliary_features['embedding'] = auxiliary_features['embedding'].progress_apply(lambda x: x.tolist())

100%|██████████| 630000/630000 [06:58<00:00, 1503.78it/s] 


In [78]:
len(time_sequence_features), len(auxiliary_features)

(630000, 630000)

In [80]:
gc.collect()

68

## Splitting the dataset into training, testing and validation 

The split is done into 80-10-10 ratio so that we have more training data to train on and have validation dataset to make sure that the model is working as anticipated.

In [81]:
type(auxiliary_features)

pandas.core.frame.DataFrame

In [82]:
# Get the labels which will be split later
y = auxiliary_features.loc[:, 'label_category'].astype(int).tolist()

In [83]:
# Make a mask for auxiliary dataset to get all features except the one below
column_mask_aux = ~auxiliary_features.columns.isin(['id', 'citations', 'label_category'])

In [84]:
# # Get the columns of those auxiliary features and covert them into a list
auxiliary = auxiliary_features.loc[:, column_mask_aux].values.tolist()

In [85]:
# # Convert them into numpy array (for Keras) and stack them (if needed) as suited for the model's format
auxiliary = [np.array(auxiliary[i][0][0] + auxiliary[i][1:]) for i in tqdm(range(len(auxiliary)))]

100%|██████████| 630000/630000 [00:40<00:00, 15500.19it/s]


In [86]:
# # Make a mask for time sequences features dataset to get all features except the one below
cols = [col for col in time_sequence_features.columns if col not in ['id', 'citations', 'label_category', 'neighboring_words']]
stripped_tsf = time_sequence_features[cols]

In [87]:
time = stripped_tsf.values.tolist()

In [92]:
def make_structure_time_features(time_features):
    """
    Concatenate features which are numbers and lists together by checking the type:
    
    param: time_features: the features which are considered time sequence.
    """
    feature_one = np.array([int(i) for i in time_features if isinstance(i, int)])
    feature_two = np.array([i for i in time_features if isinstance(i, list)][0])
    return np.array([feature_one, feature_two])

In [94]:
time = [make_structure_time_features(time[i]) for i in tqdm(range(len(time)))]

In [95]:
# Instantiating PCA to 35 components since it should be equal to the size of the vector of the tags
pca = PCA(n_components=35)

def get_reduced_words_dimension(data):
    """
    Get the aggregated dataset of words and tags which has the
    same dimensionality using PCA.
    
    :param: data: data which needs to be aggregated.
    """
    tags = [i for i, _ in data]
    word_embeddings = [j for _,j in data]
    pca.fit(word_embeddings)
    
    word_embeddings_pca = pca.transform(word_embeddings)
    tags = np.array(tags)
    return word_embeddings_pca, tags

In [96]:
# Apply PCA on all the sets of data to have the dimensions of the data to be the same
word_embeddings_pca, tags = get_reduced_words_dimension(time)

In [97]:
time_pca = np.dstack((word_embeddings_pca, tags))

In [98]:
word_embeddings_pca.shape, tags.shape, time_pca.shape

((630000, 35), (630000, 35), (630000, 35, 2))

In [99]:
del time_sequence_features
del auxiliary_features

In [100]:
# del data
del word_embeddings_pca
del tags
del stripped_tsf
del column_mask_aux
gc.collect()

  0%|          | 0/630000 [08:00<?, ?it/s]


270

## LSTM/Neural Network Model

In [181]:
def generator_nn(features_aux, features_time, labels, batch_size):
    """
    Generator to create batches of data so that processing is easy.

    :param: features: the features of the model.
    :param: labels: the labels of the model.
    :param: batch_size: the size of the batch
    """
    # Create empty arrays to contain batch of features and labels
    # batch_features_aux = np.zeros((batch_size, 303))
    batch_features_time =  np.zeros((batch_size, 35, 2))
    batch_labels = np.zeros((batch_size, 3))
  
    while True:
        for i in range(batch_size):
            # choose random index in features
            index = np.random.choice(len(features_aux), 1)[0]
            # batch_features_aux[i] = features_aux[index]
            batch_features_time[i] = features_time[index]
            batch_labels[i] = labels[index]
        # yield [batch_features_time, np.asarray(batch_features_aux)], batch_labels
        yield [batch_features_time], batch_labels

In [182]:
from keras.optimizers import Adam

In [183]:
def scheduler(epoch, lr):
    import math
    if epoch <= 10:
        return lr
    else:
        return lr * math.exp(-0.1)

In [184]:
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [195]:
def classification_model():
    """
    Model for classifying whether a citation is scientific or not.
    """
    main_input = Input(shape=(35, 2), name='time_input')
    lstm_out = LSTM(128)(main_input)

    ## only using words and tags for nwo
    # auxiliary_input = Input(shape=(303,), name='aux_input')
    # Converging the auxiliary input with the LSTM output
    # x = keras.layers.concatenate([lstm_out, auxiliary_input])

    # 6 fully connected layer
    x = Dense(512, activation='relu')(lstm_out)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='sigmoid')(x)

    main_output = Dense(3, activation='softmax', name='main_output')(x)
    # model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])
    model = Model(inputs=[main_input], outputs=[main_output])
    
    opt = Adam(lr=0.001) # SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(
        optimizer=opt, loss={'main_output': 'categorical_crossentropy'},
        loss_weights={'main_output': 1.}, metrics=['acc']
    )
    return model

In [196]:
# Instantiating the classification model
model = classification_model()
model.summary()

Model: "functional_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
time_input (InputLayer)         [(None, 35, 2)]      0                                            
__________________________________________________________________________________________________
lstm_16 (LSTM)                  (None, 128)          67072       time_input[0][0]                 
__________________________________________________________________________________________________
aux_input (InputLayer)          [(None, 303)]        0                                            
__________________________________________________________________________________________________
concatenate_16 (Concatenate)    (None, 431)          0           lstm_16[0][0]                    
                                                                 aux_input[0][0]      

We use `ReduceLRonPlateau` so that the model does not overshoot the optimal minimum point and hence by default we start with a learning rate of 0.01 but as soon as the accuracy stop increasing the learning rate does not change which helps us converge better.

In [212]:
## Saving the data vars
# from numpy import savez_compressed
# savez_compressed('time_pca.npz', time_pca)
# savez_compressed('auxiliary.npz', auxiliary)
# savez_compressed('y_label.npz', y)

In [197]:
## Convert auxiliary into numpy array for indexing
auxiliary = np.asarray(auxiliary)
y = np.asarray(y)

In [198]:
x_train_indices, x_test_indices, y_train_indices, y_test_indices = train_test_split(
    range(auxiliary.shape[0]), range(y.shape[0]), train_size=0.9, stratify=y, shuffle=True
)

In [199]:
aux_train = auxiliary[x_train_indices]
time_train = time_pca[x_train_indices]
y_train = np.eye(3)[y[x_train_indices]]

In [200]:
aux_test = auxiliary[x_test_indices]
time_test = time_pca[x_test_indices]
y_test = y[x_test_indices]

In [201]:
len(x_train_indices) // 512

1107

In [202]:
EPOCHS = 50
BATCH_SIZE = 512
print('Running model with epochs: {}'.format(EPOCHS))

model = None
model = classification_model()
training_generator = generator_nn(aux_train, time_train, y_train, BATCH_SIZE)

Running model with epochs: 50


In [203]:
history_callback = model.fit_generator(
    training_generator,
    steps_per_epoch=len(x_train_indices) // BATCH_SIZE,
    epochs=EPOCHS, verbose=1, shuffle=True, callbacks=[callback]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [204]:
history_dict = history_callback.history

In [205]:
f = open('./citation_model_loss_{}.json'.format(EPOCHS), 'w')
f.write(str(history_dict))
f.close()

In [206]:
model.save('./citation_model_epochs_{}.h5'.format(EPOCHS))

In [208]:
prediction_for_folds = model.predict([time_test, aux_test])
y_pred = np.argmax(prediction_for_folds, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Neural network model for epochs {}: {}".format(EPOCHS, accuracy))

res = pd.DataFrame(confusion_matrix(y_test, y_pred))
res.index = ['CONS', 'LIBR', 'MODR']
res.columns = ['CONS', 'LIBR', 'MODR']
res['accuracy'] = accuracy
res.to_csv('./citation_model_result_{}.csv'.format(EPOCHS))
print(res)

Accuracy of the Neural network model for epochs 50: 0.6293968253968254
       CONS   LIBR   MODR  accuracy
CONS  15017   4029   1954  0.629397
LIBR   5036  13231   2733  0.629397
MODR   4345   5251  11404  0.629397


In [None]:
model.save('./citation_model_epochs_{}.h5'.format(EPOCHS))
json_string = model.to_json()
with open("./citation_model_epochs_{}.json".format(EPOCHS), "w") as json_file:
    json_file.write(json_string)

print('\n\nDone with the prediction and saving model with epochs: {}\n'.format(EPOCHS))