In [36]:
import os
import glob
import json
import bert
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from bert import run_classifier
from bert import optimization
from bert import tokenization

ModuleNotFoundError: No module named 'tensorflow.contrib'

In [37]:
ROOT = os.getcwd().split("src")[0]

### Required Functions

In [3]:
def write_json_file(data, file):
    """
    Write data to JSON file
    """

    with open(f"{file}", "w") as f:
        json.dump(data, f)

In [4]:
def read_json_file(file):
    """
    Read data from JSON file
    """

    with open(f"{file}", "r") as f:
        data = json.load(f)
    
    return data

### Import REF Submission Strings

In [29]:
ref_paper_dict = {
    "institution_code": [],
    "uoa": [],
    "merge_code": [],
    "body": [],
    "sentences": [],
}
json_documents = glob.glob(f"{ROOT}/data/ref_submissions/json/preprocessed/**/*.json",
                          recursive=True)
for json_document in json_documents:
    data = read_json_file(f"{json_document}")
    file_name = json_document.split("/")[-1]
    file_name = file_name.replace(".json", "")
    institution_code = file_name.split("-")[0]
    uoa = file_name.split("-")[1]
    ref_paper_dict["institution_code"].append(institution_code)
    ref_paper_dict["uoa"].append(uoa)
    ref_paper_dict["merge_code"].append(f"{institution_code}-{uoa}")
    sentences = []
    chapter_strings = []
    for chapter in data["chapters"]:
        for sentence in data["chapters"][chapter]:
            sentences.append(sentence)
        chapter_strings.append(" ".join(data["chapters"][chapter]))
    doc_string = " ".join(chapter_strings)
    ref_paper_dict["body"].append(doc_string)
    ref_paper_dict["sentences"].append(sentences)
ref_paper_df = pd.DataFrame(ref_paper_dict)
ref_paper_df.head()

Unnamed: 0,institution_code,uoa,merge_code,body,sentences
0,10007767,22,10007767-22,patons research health policy politics health ...,[patons research health policy politics health...
1,10003678,32,10003678-32,casestudy ofimpact arise fromresearch peter os...,[casestudy ofimpact arise fromresearch peter o...
2,10007767,3,10007767-3,research prof shaughn obrien team keele lead f...,[research prof shaughn obrien team keele lead ...
3,10007805,12,10007805-12,performance measurement research university st...,[performance measurement research university s...
4,10007790,26,10007790-26,every year million people orldwide experience ...,[every year million people orldwide experience...


### Form REF Submission Dataframe

In [21]:
ref_submission = pd.read_csv(f"{ROOT}/data/ref_submissions/ref_impact_results.csv")

In [22]:
ref_submission = ref_submission.rename(
    columns={
        "Institution name": "institution",
        "Institution code (UKPRN)": "institution_code",
        "Unit of assessment number": "uoa",
    }
)
ref_submission["merge_code"] = ref_submission["institution_code"].apply(str).str.cat(ref_submission["uoa"].apply(str), sep="-")
ref_submission.head()

Unnamed: 0,institution,institution_code,uoa,unclassified,1*,2*,3*,4*,weighted,merge_code
0,Anglia Ruskin University,10000291,3,0.0,0.0,0.0,0.8,0.2,11.3,10000291-3
1,Anglia Ruskin University,10000291,4,0.0,0.8,0.2,0.0,0.0,13.7,10000291-4
2,Anglia Ruskin University,10000291,5,0.0,0.2,0.7,0.1,0.0,7.25,10000291-5
3,Anglia Ruskin University,10000291,15,0.0,0.0,0.3,0.7,0.0,8.0,10000291-15
4,Anglia Ruskin University,10000291,16,0.0,0.5,0.5,0.0,0.0,8.0,10000291-16


In [24]:
ref_vector_scores = pd.merge(ref_paper_df, ref_submission, on="merge_code", how="left")
ref_vector_scores = ref_vector_scores.drop(["institution_code_x", "uoa_x"], axis=1)
ref_vector_scores = ref_vector_scores.rename(
    columns={
        "institution_code_y": "institution_code",
        "uoa_y": "uoa",
    }
)
ref_vector_scores["institution_code"] = pd.to_numeric(ref_vector_scores["institution_code"], errors="coerce")
ref_vector_scores["4*"] = pd.to_numeric(ref_vector_scores["4*"])
ref_vector_scores["3*"] = pd.to_numeric(ref_vector_scores["3*"])
ref_vector_scores["2*"] = pd.to_numeric(ref_vector_scores["2*"])
ref_vector_scores["1*"] = pd.to_numeric(ref_vector_scores["1*"])
ref_vector_scores["unclassified"] = pd.to_numeric(ref_vector_scores["unclassified"])
ref_vector_scores = ref_vector_scores[[
    "institution",
    "institution_code",
    "merge_code",
    "uoa",
    "unclassified",
    "1*",
    "2*",
    "3*",
    "4*",
    "weighted",
    "body",
]
]
ref_vector_scores.head()

Unnamed: 0,institution,institution_code,merge_code,uoa,unclassified,1*,2*,3*,4*,weighted,body
0,Keele University,10007767.0,10007767-22,22.0,0.0,0.0,0.2,0.6,0.2,30.1,calum patons research keele university underta...
1,Kingston University,10003678.0,10003678-32,32.0,0.0,0.0,0.4,0.3,0.3,6.2,research underpin impact case study may divide...
2,Keele University,10007767.0,10007767-3,3.0,0.0,0.0,0.0,0.24,0.76,43.98,quantify menstrual blood loss menstrual pictog...
3,University of Strathclyde,10007805.0,10007805-12,12.0,0.0,0.0,0.2,0.5,0.3,67.2,associate train programme develop context rese...
4,University of Edinburgh,10007790.0,10007790-26,26.0,0.0,0.0,0.1,0.5,0.4,23.8,physical fitness reduce stroke impair fitness ...


### Prepare train and test data

In [25]:
y_class = ref_vector_scores[["unclassified", "1*", "2*", "3*", "4*"]].to_numpy()
print(f"{type(y_class)}, {y_class.shape}")

<class 'numpy.ndarray'>, (4900, 5)


In [26]:
y = ref_vector_scores["weighted"].to_numpy()
print(f"{type(y)}, {y.shape}")

<class 'numpy.ndarray'>, (4900,)


In [13]:
X = ref_vector_scores[[f"X{i}" for i in range(vector_len)]].to_numpy()
print(f"{type(X)}, {X.shape}")

<class 'numpy.ndarray'>, (4900, 50)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

### BERT transfer learning
https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d

In [15]:
model = Sequential()
model.add(Embedding(input_dim=X.shape[1], output_dim=20))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="tanh"))