In [1]:
import os
import glob
import json
import gensim
import sklearn
import numpy as np
import pandas as pd

from gensim.models import word2vec

from bert_serving.client import BertClient

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer



In [2]:
ROOT = os.getcwd().split("src")[0]

In [3]:
# papers, ref_submissions, industrial_strategy
document_type = "papers"

### Required Functions

In [4]:
def write_json_file(data, file):
    '''
    Write data to JSON file
    '''

    with open(f"{file}", "w") as f:
        json.dump(data, f)

In [5]:
def read_json_file(file):
    '''
    Read data from JSON file
    '''

    with open(f"{file}", "r") as f:
        data = json.load(f)
    
    return data

### Import Preprocessed JSON files

In [8]:
corpus = []
json_documents = glob.glob(f"{ROOT}/data/{document_type}/json/preprocessed/**/*.json",
                          recursive=True)
for json_document in json_documents:
    data = read_json_file(f"{json_document}")
    if "body" in data.keys():
        document_string = " ".join(data["body"])
    elif "chapters" in data.keys():
        document_string = ""
        for chapter in data["chapters"].keys():
            document_string += f" {data['chapters'][chapter]}"
    corpus.append(document_string)
print(f"Loaded {len(corpus)} documents")

Loaded 1641 documents


### CountVectorizer

In [20]:
count_vectorizer = CountVectorizer()
count_X = count_vectorizer.fit_transform(corpus)

In [23]:
count_vectorised = {}
for json_document in json_documents:
    file_name = json_document.split("/")[-1].replace(".json", "")
    data = read_json_file(f"{json_document}")
    if "body" in data.keys():
        document_string = " ".join(data["body"])
    elif "chapters" in data.keys():
        document_string = ""
        for chapter in data["chapters"].keys():
            document_string += f" {data['chapters'][chapter]}"
    tokens = count_vectorizer.transform([document_string]).toarray().tolist()
    count_vectorised[file_name] = {}
    if document_type == "papers":
        count_vectorised[file_name]["tokens"] = tokens
        if "arxiv" in json_document:
            count_vectorised[file_name]["database"] = "arxiv"
        elif "scopus" in json_document:
            count_vectorised[file_name]["database"] = "scopus"
        else:
            count_vectorised[file_name]["database"] = "unknown"
    else:
        count_vectorised[file_name] = tokens
print("Done!")

Done!


In [24]:
write_json_file(count_vectorised, f"{ROOT}/data/{document_type}/json/vectorised/count_vectors.json")

### TF-IDF

In [25]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_X = tfidf_vectorizer.fit_transform(corpus)

In [27]:
tfidf_vectorised = {}
for json_document in json_documents:
    file_name = json_document.split("/")[-1].replace(".json", "")
    data = read_json_file(f"{json_document}")
    if "body" in data.keys():
        document_string = " ".join(data["body"])
    elif "chapters" in data.keys():
        document_string = ""
        for chapter in data["chapters"].keys():
            document_string += f" {data['chapters'][chapter]}"
    tokens = tfidf_vectorizer.transform([document_string]).toarray().tolist()
    tfidf_vectorised[file_name] = {}
    if document_type == "papers":
        tfidf_vectorised[file_name]["tokens"] = tokens
        if "arxiv" in json_document:
            tfidf_vectorised[file_name]["database"] = "arxiv"
        elif "scopus" in json_document:
            tfidf_vectorised[file_name]["database"] = "scopus"
        else:
            tfidf_vectorised[file_name]["database"] = "unknown"
    else:
        tfidf_vectorised[file_name] = tokens
print("Done!")

Done!


In [28]:
write_json_file(tfidf_vectorised, f"{ROOT}/data/{document_type}/json/vectorised/tfidf_vectors.json")

### word2vec

In [7]:
import gensim.downloader as api
corpus = api.load('text8')
model = api.load("glove-wiki-gigaword-50")

In [8]:
def vectorise_sentence(model, sentence):
    sentence_vectors = []
    tokens = sentence.split(" ")
    num_tokens = 0
    for token in tokens:
        try:
            if num_tokens == 0:
                sentence_vectors = model[token]
            else:
                sentence_vectors = np.add(sentence_vectors,
                                          model[token])
            num_tokens += 1
        except:
            pass
    if num_tokens == 0:
        return np.array([])
    return sentence_vectors / num_tokens

In [9]:
word2vec_vectorised = {}
for i, json_document in enumerate(json_documents):
    if i % 100 == 0:
        print(f"{i / len(json_documents) * 100} complete")
    file_name = json_document.split("/")[-1].replace(".json", "")
    data = read_json_file(f"{json_document}")
    tokenized_sentences = []
    num_sentences = 0
    if "body" in data.keys():
        for sentence in data["body"]:
            try:
                if num_sentences == 0:
                    tokenized_sentences = vectorise_sentence(model, sentence)
                else:
                    tokenized_sentences = np.add(tokenized_sentences,
                                                 vectorise_sentence(model, sentence))
                num_sentences += 1
            except:
                pass
    elif "chapters" in data.keys():
        for chapter in data["chapters"]:
            for sentence in data["chapters"][chapter]:
                try:
                    if num_sentences == 0:
                        tokenized_sentences = vectorise_sentence(model, sentence)
                    else:
                        tokenized_sentences = np.add(tokenized_sentences,
                                                     vectorise_sentence(model, sentence))
                    num_sentences += 1
                except:
                    pass
    if num_sentences == 0:
        continue
    tokenized_sentences = tokenized_sentences / num_sentences
    word2vec_vectorised[file_name] = {}
    if document_type == "papers":
        word2vec_vectorised[file_name]["tokens"] = tokenized_sentences.tolist()
        if "arxiv" in json_document:
            word2vec_vectorised[file_name]["database"] = "arxiv"
        elif "scopus" in json_document:
            word2vec_vectorised[file_name]["database"] = "scopus"
        else:
            word2vec_vectorised[file_name]["database"] = "unknown"
    else:
        word2vec_vectorised[file_name] = tokenized_sentences.tolist()
print("Done!")

0.0 complete
6.0938452163315056 complete
12.187690432663011 complete
18.281535648994517 complete
24.375380865326022 complete
30.469226081657524 complete
36.56307129798903 complete
42.656916514320535 complete
48.750761730652044 complete
54.844606946983546 complete
60.93845216331505 complete
67.03229737964655 complete
73.12614259597807 complete
79.21998781230957 complete
85.31383302864107 complete
91.40767824497257 complete
97.50152346130409 complete
Done!


In [10]:
write_json_file(word2vec_vectorised, f"{ROOT}/data/{document_type}/json/vectorised/word2vec_vectors.json")

### BERT

In [6]:
bc = BertClient()

In [11]:
bert_vectorised = {}
for i, json_document in enumerate(json_documents):
    if i % 100 == 0:
        print(f"{i / len(json_documents) * 100} complete")
    file_name = json_document.split("/")[-1].replace(".json", "")
    data = read_json_file(f"{json_document}")
    tokenized_sentences = []
    num_sentences = 0
    if "body" in data.keys():
        tokenized_sentences = bc.encode(data["body"])
        num_sentences += 1
    elif "chapters" in data.keys():
        for chapter in data["chapters"]:
            if num_sentences == 0:
                tokenized_sentences = bc.encode(data["body"])
            else:
                tokenized_sentences = np.add(tokenized_sentences,
                                             bc.encode(data["body"]))
                
            num_sentences += 1
    if num_sentences == 0:
        continue
    tokenized_sentences = tokenized_sentences / num_sentences
    bert_vectorised[file_name] = {}
    if document_type == "papers":
        bert_vectorised[file_name]["tokens"] = tokenized_sentences.tolist()
        if "arxiv" in json_document:
            bert_vectorised[file_name]["database"] = "arxiv"
        elif "scopus" in json_document:
            bert_vectorised[file_name]["database"] = "scopus"
        else:
            bert_vectorised[file_name]["database"] = "unknown"
    else:
        bert_vectorised[file_name] = tokenized_sentences.tolist()
print("Done!")

0.0 complete


KeyboardInterrupt: 

In [13]:
write_json_file(bert_vectorised, f"{ROOT}/data/{document_type}/json/vectorised/bert_vectors.json")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\sgmcart3\\Documents\\Projects\\target_app\\/data/papers/json/vectorised/bert_vectors.json'