In [1]:
# Get all the imports
import scipy as sp 
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
from itertools import chain
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
tqdm.pandas()
np.random.seed(0)

## Reading and sampling in the data

In [3]:
# Read the files which we have extracted from other notebooks
citations_features = pd.read_parquet('./citations_features.parquet/', engine='pyarrow')
dataset = pd.read_csv('dataset.csv')

In [4]:
# Merge the citations with their corresponding features
dataset_with_features = pd.merge(
    dataset, citations_features, how='inner', left_on=['id','citation'], right_on = ['id','citation']
)
dataset_with_features.drop('page_title_y', axis=1, inplace=True)
dataset_with_features.drop('Unnamed: 0', axis=1, inplace=True)

We are just considering the citations which are unique in nature so that are model cannot be biased.

In [5]:
dataset_with_features  = dataset_with_features.set_index(['id', 'citation'])
dataset_with_features = dataset_with_features[~dataset_with_features.index.duplicated(keep='first')]
dataset_with_features = dataset_with_features.reset_index()

In [6]:
# Get 760,000 samples of scientific data
scientific_data = dataset_with_features[dataset_with_features['scientific_or_not'] == True]
scientific_samples = scientific_data.sample(n=760000)

In [7]:
# Get 760,000 samples of NON-scientific data
non_scientific_data = dataset_with_features[dataset_with_features['scientific_or_not'] == False]
non_scientific_samples = non_scientific_data.sample(n=760000)

In [8]:
# Concat the scientific and non scientific samples and get unique 1.52 million data samples
dataset_with_features = pd.concat([scientific_samples, non_scientific_samples])

In [10]:
dataset_with_features.reset_index(drop=True, inplace=True)

In [9]:
len(dataset_with_features)

1520000

## Random Forest features

We will use the `sklearn` library to vectorize features such as `words` and `tags` and use the Bag of Words model and check for the accuracy.

In [11]:
# Instantiating the vectorizer
cv = CountVectorizer()

In [12]:
# Getting the required columns for words and tags
shortened_time_features = dataset_with_features[
    ['neighboring_words', 'neighboring_tags', 'citation', 'id', 'scientific_or_not']]

### Processing TAGs

Since a Random Forest takes a lot of time to train and construct the trees, we have decided to consider instead of the last 40 words just the last 20 words so that we don't have any memory problems. Also, this is because of the fact that the model is being trained locally.

In [14]:
shortened_time_features['neighboring_words'] = shortened_time_features['neighboring_words'].progress_apply(
    lambda x: x[-20:])
shortened_time_features['neighboring_tags'] = shortened_time_features['neighboring_tags'].progress_apply(
    lambda x: x[-20:])

100%|██████████| 1520000/1520000 [00:03<00:00, 448486.36it/s]
100%|██████████| 1520000/1520000 [00:03<00:00, 413814.18it/s]


In [15]:
# Get the shortened tag counts for the last 20 words
shortened_tag_counts = pd.Series(Counter(chain.from_iterable(x for x in shortened_time_features.neighboring_tags)))

In [74]:
shortened_tag_counts.nsmallest(10)

``       125
WP$      951
UH      1117
SYM     1535
PDT     1860
$       5502
EX      6100
RBS     6117
RBR    10730
RP     12564
dtype: int64

In [73]:
shortened_tag_counts.nlargest(10) 

NNP    7720415
NN     7172442
JJ     2825144
CD     2709039
IN     1763863
NNS    1124781
.      1095751
DT      799749
:       789523
VB      618022
dtype: int64

We will remove the `TAGs` which we think are not that useful such as `backquotes` and the `dollar` sing and replace it with the `Others` tag.

In [17]:
OTHER_TAGS = ['LS', '``', '$']
shortened_time_features['neighboring_tags'] = shortened_time_features['neighboring_tags'].progress_apply(
    lambda x: [i if i not in OTHER_TAGS else 'Others' for i in x]
)

100%|██████████| 1520000/1520000 [00:13<00:00, 111149.88it/s]


In [18]:
# Making a string out of the tags since that is the format of the input for `CountVectorizer`
shortened_time_features['neighboring_tags'] = shortened_time_features['neighboring_tags'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 1520000/1520000 [00:03<00:00, 480622.82it/s]


Now, we get the tags and one hot encode them depending on if they are present in the list of neighboring tags for the corresponding citation.

In [19]:
shortened_transformed_tags = cv.fit_transform(shortened_time_features['neighboring_tags'])

In [20]:
shortened_transformed_tags = pd.DataFrame(shortened_transformed_tags.toarray(), columns=cv.get_feature_names())

In [21]:
# Merge the results with the features
shortened_time_features = pd.concat([shortened_time_features, shortened_transformed_tags], axis=1)

### Processing WORDs

Now, we need to process the `words` - because of resource constraints we have already decreased the number of number of neighboring words we are considering.

Since the size of the vocabulary is too high - which is nearly half a million, it will create another memory constraint since the dataframe will be too big. So we use the Hash Vectorizer which hashes the one hot encoding and hence the issue of space is resolved.

In [22]:
# Lowercasing all the neighboring words for each of the citation
shortened_time_features['neighboring_words'] = shortened_time_features['neighboring_words'].progress_apply(
    lambda x: [i.lower() for i in x]
)

100%|██████████| 1520000/1520000 [00:19<00:00, 78795.83it/s] 


In [23]:
# Now need to preprocess it to use Count Vectorizer from SKLearn
shortened_time_features['neighboring_words'] = shortened_time_features['neighboring_words'].progress_apply(
    lambda x: " ".join(x))

100%|██████████| 1520000/1520000 [00:03<00:00, 395592.05it/s]


In [24]:
# instantiate the Hash Vectorizer with the number of components for the vectors to be set as 500
vectorizer = HashingVectorizer(n_features=500)
corpus = shortened_time_features['neighboring_words'].tolist()

In [27]:
# Transform the corpus and we get a matrix of the size of the sample set with 500 components each
X = vectorizer.fit_transform(corpus)
X.shape

(1520000, 500)

## Splitting the data into training and testing set

For making the training and testing sets, we take all the total indices of the sampled set and shuffle them.

Then, we take 80% of it to be the training data and the other 20% to be the testing set. We get the corresponding `words/tags` features we create and stack/merge them together.

In [29]:
# Shuffle the indices
TOTAL_SAMPLES = X.shape[0]
indices = np.arange(TOTAL_SAMPLES)
np.random.shuffle(indices)

In [30]:
# Split the indices for training and testing sets
training_end_index = int(((TOTAL_SAMPLES * 80) / 100))
training_indices = indices[:training_end_index]
testing_indices = indices[training_end_index:]

In [31]:
# Get the labels for the corresponding indices from the features data
y_train = shortened_time_features.loc[training_indices, 'scientific_or_not'].astype(int).tolist()
y_test = shortened_time_features.loc[testing_indices, 'scientific_or_not'].astype(int).tolist()

In [32]:
# Get the tags for the corresponding set
training_tags = shortened_transformed_tags.loc[training_indices, :].astype(int).values.tolist()
testing_tags = shortened_transformed_tags.loc[testing_indices, :].astype(int).values.tolist()

In [33]:
# Get the words for the set
training_words = X[training_indices, :]
testing_words = X[testing_indices, :]

Since a `HashVectorizer` returns a `csr.matrix` which is Compressed Sparse Matrix as the output, we need to stack the columns together so that we have an accumualated dataset for the features for each citation.

In [34]:
training_features = sp.sparse.hstack((training_tags, training_words))
testing_features = sp.sparse.hstack((testing_tags, testing_words))

## Model time - just with 2 features `words` and `tags`

In [35]:
# Instantiate an ensemble random forest classifier with no of estimator with 2 jobs and train
clf = RandomForestClassifier(n_estimators=150, verbose=100, n_jobs=2)
clf.fit(training_features, y_train)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
building tree 1 of 150
building tree 2 of 150
building tree 3 of 150[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  3.8min

building tree 4 of 150[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:  3.9min

building tree 5 of 150[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:  7.6min

building tree 6 of 150[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  7.7min

building tree 7 of 150[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed: 11.2min

building tree 8 of 150[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed: 11.3min

building tree 9 of 150[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed: 15.1min

building tree 10 of 150[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed: 15.3min

building tree 11 of 150[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 18.6min

building tree 12 of 150[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed: 19.1min

building tree 13 of 150[Paralle

building tree 98 of 150[Parallel(n_jobs=2)]: Done  96 tasks      | elapsed: 180.1min

building tree 99 of 150[Parallel(n_jobs=2)]: Done  97 tasks      | elapsed: 183.6min

building tree 100 of 150[Parallel(n_jobs=2)]: Done  98 tasks      | elapsed: 184.0min

building tree 101 of 150[Parallel(n_jobs=2)]: Done  99 tasks      | elapsed: 187.4min

building tree 102 of 150[Parallel(n_jobs=2)]: Done 100 tasks      | elapsed: 187.7min

building tree 103 of 150[Parallel(n_jobs=2)]: Done 101 tasks      | elapsed: 191.1min

building tree 104 of 150[Parallel(n_jobs=2)]: Done 102 tasks      | elapsed: 191.4min

building tree 105 of 150[Parallel(n_jobs=2)]: Done 103 tasks      | elapsed: 194.5min

building tree 106 of 150[Parallel(n_jobs=2)]: Done 104 tasks      | elapsed: 195.2min

building tree 107 of 150[Parallel(n_jobs=2)]: Done 105 tasks      | elapsed: 198.2min

building tree 108 of 150[Parallel(n_jobs=2)]: Done 106 tasks      | elapsed: 199.1min

building tree 109 of 150[Parallel(n_jobs=2)]:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=2,
                       oob_score=False, random_state=None, verbose=100,
                       warm_start=False)

In [36]:
# Predict given the testing features
y_pred = clf.predict(testing_features)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done  11 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done  12 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done  13 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done  15 tasks      | elapsed:    2.4s
[Para

[Parallel(n_jobs=2)]: Done 135 tasks      | elapsed:   20.0s
[Parallel(n_jobs=2)]: Done 136 tasks      | elapsed:   20.0s
[Parallel(n_jobs=2)]: Done 137 tasks      | elapsed:   20.3s
[Parallel(n_jobs=2)]: Done 138 tasks      | elapsed:   20.3s
[Parallel(n_jobs=2)]: Done 139 tasks      | elapsed:   20.5s
[Parallel(n_jobs=2)]: Done 140 tasks      | elapsed:   20.5s
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed:   20.8s
[Parallel(n_jobs=2)]: Done 142 tasks      | elapsed:   20.8s
[Parallel(n_jobs=2)]: Done 143 tasks      | elapsed:   21.1s
[Parallel(n_jobs=2)]: Done 144 tasks      | elapsed:   21.1s
[Parallel(n_jobs=2)]: Done 145 tasks      | elapsed:   21.4s
[Parallel(n_jobs=2)]: Done 146 tasks      | elapsed:   21.4s
[Parallel(n_jobs=2)]: Done 147 tasks      | elapsed:   21.7s
[Parallel(n_jobs=2)]: Done 150 out of 150 | elapsed:   22.0s finished


In [37]:
# Model Accuracy, how much is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8641875


Sometimes accuracy could be a bit misleading and hence we calculate the confusion matrix which gives us a better sense as to how many were classified correctly and how many were not..

In [39]:
# Get the confusion matrix
confusion_matrix(y_test, y_pred)

array([[134611,  17304],
       [ 23983, 128102]])

In [40]:
# Convert the confusion matrix into pandas dataframe
y_true = pd.Series(y_test)
y_predicted = pd.Series(y_pred)

pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,134611,17304,151915
1,23983,128102,152085
All,158594,145406,304000


Now in this section, we calculate the importance of the features. We are **not particularly interested** in the `neghboring_words` since they are just bag of words and we have used hashing so it is hard to trace. But, we could check the importance of the `neighboring_tags` since these were the first 35 columns in the stacked feature set.

In [42]:
clf.feature_importances_.shape

(535,)

In [67]:
# Get the importances and sort them according to decreasing order
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

In [65]:
# Get the index of the only the neighboring_tags feature which we are interested in
interested_indices = list(range(35))

get_ranking_tags = [(i,j) for i, j in enumerate(indices) if j in interested_indices]
print(get_ranking_tags)

[(4, 11), (9, 13), (10, 1), (11, 26), (15, 10), (19, 6), (24, 5), (28, 0), (36, 25), (38, 2), (44, 17), (47, 30), (58, 28), (60, 18), (62, 29), (78, 23), (89, 32), (115, 27), (122, 16), (139, 4), (184, 12), (277, 9), (406, 31), (512, 7), (524, 8), (525, 19), (526, 33), (527, 34), (528, 21), (529, 20), (530, 3), (531, 15), (532, 14), (533, 22), (534, 24)]


As we can see that for `neighboring_tags` the highest importance is hold by tags such as `NNP (Proper nouns)`, `NNS (Plurals)`, `CD (Cardinal Number)` and `VBD (Past tense)` which is correct with the hypothesis since most of the times when there are scientific citation something is either being claimed for a particular time or for a particular person, so these tags are really important and show how POS tags can help.

In [66]:
for i, j in get_ranking_tags:
    print(cv.get_feature_names()[j])

nnp
nns
cd
vbd
nn
jj
in
cc
vb
dt
prp
vbz
vbn
rb
vbp
to
wikicode
vbg
pos
fw
nnps
md
wdt
jjr
jjs
rbr
wp
wrb
rp
rbs
ex
pdt
others
sym
uh
