# TODO
0. REMOVE SPACE AT THE END OF SOME FOUND LABELS FOR NAIVE MODEL
0. IMPORTANT: make submission and evaluation logic the same function, because they are currently separate pieces of logic that do the same thing
0. add levenshtein distance to naive model to get best predictions
0. during the submit process, save all new dataset names in the naive model.
0. after NLP prediction, use levenshtein distance to find previously known dataset. If distance is small enough, use the known dataset
1. do data pre-processing (cleaning etc) IMPORTANT: cleaning the whole text may result in inconsistencies between the dataset name in our cleaned text, and the dataset label in the answers csv
3. add naive baseline (with a buffer of all known datasets) and combine with spacy model
4. add SciBERT as a third model


# settings

In [None]:
# notebook settings
is_submission = True  # True: train on all data and write submission.csv; False: hold out a validation split and evaluate
use_gpu = True  # True: call spacy.require_gpu() after the imports

# nlp model settings
use_nlp = False # whether to use the nlp model at all
train_nlp = False # whether to train the nlp
save_model = False # whether to save the trained model
saved_model_filepath = None # path of a previously trained (pickled) model to load, or None

# other settings
test_size = 0.2  # fraction of publications targeted for the validation split

# fail fast: using the nlp model requires either training it or loading it from a file
if use_nlp and (not train_nlp and not saved_model_filepath):
    raise Exception("ERROR: nlp model must be loaded from file OR trained")

# imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import spacy
import csv
import re
import pickle
import string
import nltk
import os
import networkx as nx
import matplotlib.pyplot as plt
import itertools
from tqdm.notebook import tqdm
from math import floor
from random import shuffle
from time import sleep
from sklearn.model_selection import train_test_split

# Request the GPU for spaCy when configured to do so and report the outcome.
if use_gpu:
    using_gpu = spacy.require_gpu()
    print(f"using gpu: {using_gpu}")
else:
    print("not using gpu\n")

# Print how many files each directory under the Kaggle input folder contains.
for dirname, _, filenames in os.walk('/kaggle/input'):
    print(f"{dirname} contains {len(filenames)} files")

# load data


In [None]:
def load_train_data(
    train_dir='../input/coleridgeinitiative-show-us-the-data/train',
    csv_path='../input/coleridgeinitiative-show-us-the-data/train.csv',
):
    '''
    Loads the training csv and attaches the parsed publication json to every row.

    train_dir - directory containing one `<Id>.json` file per publication
    csv_path - location of the training csv file (does not contain the actual texts)

    Returns the csv as a DataFrame with an extra `file` column that holds the parsed
    json of the publication referenced by the row's `Id`.
    '''
    df = pd.read_csv(csv_path)

    files = []
    for file_id in df['Id']:
        filepath = os.path.join(train_dir, f"{file_id}.json")
        # explicit encoding so the result does not depend on the platform default
        with open(filepath, encoding='utf-8') as json_file:
            files.append(json.load(json_file))
    df['file'] = files

    return df

# Load the training data once; `df` is used by the cells below.
df = load_train_data()
# NOTE: the result of describe() is not assigned or printed here.
df.describe()
df.info()

# Create train and validation set

In [None]:
def fill_graph(ids, labels, use_pseudonyms):
    '''
    Builds an undirected graph that links every publication id to its dataset label.

    ids - sequence of publication ids
    labels - sequence of dataset labels, where the i'th label belongs to the i'th id
    use_pseudonyms - when truthy, nodes are named 'P<n>' (publications) and 'D<n>'
                     (datasets) instead of the raw id / label

    Returns a tuple (graph, id_to_name), where id_to_name maps every raw id / label to
    its pseudonym (empty when use_pseudonyms is falsy).
    '''
    graph = nx.Graph()

    amount_of_publications = 0
    amount_of_datasets = 0
    id_to_name = {}

    # zip pairs the values by position, so this also works for pandas Series whose
    # index is not 0..n-1 (where `ids[i]` would be a label-based lookup)
    for publication_id, label in zip(ids, labels):
        converted_id = publication_id
        converted_label = label

        if use_pseudonyms:
            if publication_id not in id_to_name:
                id_to_name[publication_id] = 'P{}'.format(amount_of_publications)
                amount_of_publications += 1

            if label not in id_to_name:
                id_to_name[label] = 'D{}'.format(amount_of_datasets)
                amount_of_datasets += 1

            converted_id = id_to_name[publication_id]
            converted_label = id_to_name[label]

        # add_edge creates missing end points itself; duplicates are not added twice
        graph.add_edge(converted_id, converted_label)

    undirected_graph = graph.to_undirected()
    return undirected_graph, id_to_name

In [None]:
def visualize_graph(graph):
    '''
    Draws the graph and saves the figure to `subgraph.pdf`.

    Nodes are coloured by the first character of their name: one colour for 'D'
    (dataset pseudonyms) and one for 'P' (publication pseudonyms). Every other node
    gets a neutral fallback colour, so the colour list always has exactly one entry
    per node (a shorter list makes the drawing call fail).
    '''
    palette = {'D': '#8cfffb', 'P': '#c4ff0e'}
    fallback_color = '#cccccc'
    node_color = [palette.get(str(node)[:1], fallback_color) for node in graph.nodes()]

    plt.figure(1, figsize=(40, 40))
    nx.draw(graph, node_size=2500, with_labels=True, font_weight='bold', node_color=node_color)
    plt.savefig("subgraph.pdf")

In [None]:
threshold = 20

def combine_small_graphs(sub_graphs):
    '''
    Merges every sub-graph with fewer than `threshold` nodes into one combined graph.

    sub_graphs - list of graphs; the small ones are removed from this list in place

    Returns a tuple (sub_graphs, combined): the same list without the small graphs,
    and a new graph holding the nodes and edges of all removed graphs.
    '''
    combined = nx.Graph()
    small_graphs = [candidate for candidate in sub_graphs if candidate.number_of_nodes() < threshold]

    for small in small_graphs:
        combined.add_edges_from(small.edges(data=True))
        combined.add_nodes_from(small.nodes(data=True))

    for small in small_graphs:
        sub_graphs.remove(small)

    return sub_graphs, combined

In [None]:
def remove_dataset_nodes(sub_graphs):
    '''
    Returns a copy of every sub-graph with its dataset nodes removed.

    sub_graphs - iterable of graphs; the inputs themselves are not modified

    NOTE(review): a node counts as a dataset when its name contains a capital "D".
    That fits the 'D<n>' / 'P<n>' pseudonyms; for raw ids and labels it looks like an
    approximation - confirm.
    '''
    stripped_graphs = []
    for original in sub_graphs:
        pruned = nx.Graph()
        pruned.add_nodes_from(original.nodes(data=True))
        pruned.add_edges_from(original.edges(data=True))

        dataset_nodes = [name for name in pruned.nodes() if "D" in name]
        pruned.remove_nodes_from(dataset_nodes)

        stripped_graphs.append(pruned)

    return stripped_graphs

In [None]:
def find_nearest_split(amounts, split):
    '''
    Finds the combination of amounts whose sum is closest to `split * sum(amounts)`.

    amounts - sequence of numbers (e.g. the number of nodes per sub-graph)
    split - fraction of the total that the chosen combination should approximate

    Every combination of 1 to len(amounts) - 1 elements is considered, so the search
    is exponential in len(amounts). On ties the first combination wins (smallest
    size first, then original order).

    Returns the chosen amounts as a list.
    Raises ValueError when fewer than two amounts are given (nothing can be split).
    '''
    if len(amounts) < 2:
        raise ValueError("find_nearest_split needs at least two amounts to split")

    target = split * sum(amounts)

    # evaluate the combinations lazily instead of materialising all of them in memory
    candidates = itertools.chain.from_iterable(
        itertools.combinations(amounts, size) for size in range(1, len(amounts))
    )
    nearest = min(candidates, key=lambda combination: abs(sum(combination) - target))
    return list(nearest)

In [None]:
def custom_train_test_split(df, use_pseudonyms = False):
    '''
    Splits the data into a train and a test part along connected components of the
    publication/dataset graph, so publications that are linked through a shared
    dataset label end up on the same side of the split.

    df - DataFrame with at least the columns `Id`, `dataset_label` and `file`
    use_pseudonyms - forwarded to `fill_graph`

    The size of the test part approximates the module-level `test_size`.

    Returns (train files, test files, train labels, test labels) as pandas Series.
    '''
    ids = df["Id"]
    labels = df["dataset_label"]
    undirected_graph, id_to_name = fill_graph(ids, labels, use_pseudonyms)

    # extract subgraphs
    sub_graphs = [undirected_graph.subgraph(c) for c in nx.connected_components(undirected_graph)]

    # merge smaller ones into one bigger graph
    sub_graphs, combined = combine_small_graphs(sub_graphs)
    sub_graphs.append(combined)

    # remove all dataset nodes from the graph
    sub_graphs = remove_dataset_nodes(sub_graphs)

    # find the nearest split
    nodes = [list(sub_graph.nodes()) for sub_graph in sub_graphs]
    amount_of_nodes = [len(group) for group in nodes]
    split = find_nearest_split(amount_of_nodes, split=test_size)

    # Map every chosen size back to a sub-graph that has not been used yet. A plain
    # `amount_of_nodes.index(size)` would select the same sub-graph twice whenever
    # two sub-graphs have the same number of nodes.
    unused_positions = list(range(len(nodes)))
    test_nodes = []
    for size in split:
        position = next(i for i in unused_positions if amount_of_nodes[i] == size)
        unused_positions.remove(position)
        test_nodes.extend(nodes[position])
    train_nodes = list(itertools.chain.from_iterable(nodes[i] for i in unused_positions))

    if use_pseudonyms:
        # convert D and P names to ids again using the (inverted) conversion dictionary
        name_to_id = {name: original for original, name in id_to_name.items()}
        test_nodes = [name_to_id[test_node] for test_node in test_nodes]
        train_nodes = [name_to_id[train_node] for train_node in train_nodes]

    train_df = df[df["Id"].isin(train_nodes)]
    test_df = df[df["Id"].isin(test_nodes)]

    return train_df["file"], test_df["file"], train_df["dataset_label"], test_df["dataset_label"]

In [None]:
def _label_records(indices):
    '''
    Builds one target dictionary (dataset_label, dataset_title, cleaned_label) per
    given index of the global `df`.
    '''
    return [
        {
            "dataset_label": df["dataset_label"][index],
            "dataset_title": df["dataset_title"][index],
            "cleaned_label": df["cleaned_label"][index]
        }
        for index in indices
    ]

if is_submission:
    # submission run: train on everything, there is no validation set
    X_train = df["file"]
    y_train = _label_records(df.index)
else:
    # evaluation run: the split returns label Series, whose index selects the rows
    X_train, X_val, y_train, y_val = custom_train_test_split(df)
    y_train = _label_records(y_train.index)
    y_val = _label_records(y_val.index)

    print(X_train[:5])
    print(y_train[:5])

In [None]:
# some config variables
MIN_LENGTH_SAMPLE = 10
MAX_LENGTH_SAMPLE = 9000

def format_dataframe_for_spacy(xs, ys):
    '''
    Converts publications and their dataset labels to spaCy NER training samples.

    xs - array of samples, where each sample is an array of dictionaries, where each dictionary has a `text` and `section_title` key-value pair
    ys - array of strings, where the i'th index is the dataset label corresponding to the i'th sample in `xs`

    Returns a list of `(text, {'entities': [(start, end, 'DATASET')]})` tuples, one for
    every section that contains the space-padded label and whose length lies between
    MIN_LENGTH_SAMPLE and MAX_LENGTH_SAMPLE.
    '''
    data = []
    for x, y in tqdm(zip(xs, ys), total=len(xs)):
        # !IMPORTANT TODO: Adding padding to the dataset title removes about 1/3rd of the training data. probably not good
        padded_label = f' {y} '
        for section in x:
            # each section contains a 'section_title' and a 'text' key, for now we only use 'text'
            text = section['text']

            # only consider this sample when its length is within the configured bounds
            if not MIN_LENGTH_SAMPLE <= len(text) <= MAX_LENGTH_SAMPLE:
                continue

            # Only use a section as a training sample IF it contains the padded label
            padded_start = text.find(padded_label)
            if padded_start == -1:
                continue

            # Take the offset from the padded match itself (skipping the leading space).
            # Searching for the bare label could hit an earlier occurrence inside a
            # longer word, which gives a span that does not start on a word boundary.
            start_index = padded_start + 1
            end_index = start_index + len(y)
            entity = (start_index, end_index, 'DATASET')
            data.append((text, {'entities': [entity]}))
            # TODO: What if it finds two matches in a file?
    return data

# Build the spaCy training samples from the raw (uncleaned) dataset labels.
spacy_training_data = format_dataframe_for_spacy(X_train, [y["dataset_label"] for y in y_train])
# NOTE: this raises an IndexError when no training sample was produced.
print(spacy_training_data[0])

In [None]:
def load_test_data(test_dir='../input/coleridgeinitiative-show-us-the-data/test'):
    '''
    Loads every `.json` publication found under `test_dir` (searched recursively).

    test_dir - directory that contains the test publications

    Returns a tuple (buffer, ids): the parsed json files and, at the same positions,
    their ids (the file name without the `.json` extension). Files with another
    extension are skipped. File names are sorted per directory so the order is
    reproducible.
    '''
    buffer = []
    ids = []
    for dirname, _, filenames in os.walk(test_dir):
        for filename in sorted(filenames):
            file_id, extension = os.path.splitext(filename)
            if extension != ".json":
                continue
            filepath = os.path.join(dirname, filename)
            # explicit encoding so the result does not depend on the platform default
            with open(filepath, encoding='utf-8') as json_file:
                buffer.append(json.load(json_file))
            ids.append(file_id)
    return buffer, ids

# Load the test publications; `ids` holds the publication id for every entry of X_test.
X_test, ids = load_test_data()
print(ids[:5])

# Spacy NER model

In [None]:
def create_blank_nlp():
    '''
    Returns the `en_core_web_sm` pipeline with its NER component replaced by a fresh
    one that only knows the `DATASET` label.

    Only the NER component is new; everything else is whatever `en_core_web_sm` ships with.
    NOTE(review): `create_pipe` / `add_pipe(component)` looks like the spaCy v2 API -
    confirm the installed version.
    '''
    nlp = spacy.load("en_core_web_sm")
    nlp.remove_pipe("ner")  # drop the existing NER component
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)  # the new NER component runs last in the pipeline
    ner.add_label('DATASET')
    return nlp

In [None]:
import datetime as dt

def load_nlp_from_file(filename):
    '''
    Restores a pickled model from `filename` and returns it.

    NOTE(review): unpickling can execute arbitrary code, only load files you trust.
    '''
    with open(filename, 'rb') as source:
        restored_model = pickle.load(source)
    return restored_model
    
def save_nlp_to_file(model):
    '''
    Pickles `model` to a new file in the working directory and returns the file name.

    The file name is the current timestamp followed by `.sav`.
    NOTE(review): the timestamp contains spaces and colons, which not every file
    system accepts - confirm this is only run on systems where that is fine.
    '''
    filename = f"{dt.datetime.now()}.sav"
    # the with-block closes (and thereby flushes) the file, also when dumping fails
    with open(filename, 'wb') as pickle_file:
        pickle.dump(model, pickle_file)
    return filename

In [None]:
from spacy.util import minibatch, compounding
from tqdm import trange

# Silence warnings whose message matches [W030].
# NOTE(review): presumably spaCy's misaligned-entity warning - confirm.
spacy.warnings.filterwarnings("ignore", message=r"\[W030\]", category=UserWarning)

EPOCHS = 10

def train_nlp_model(nlp, training_data):
    '''
    Trains the pipeline on the given samples for EPOCHS epochs.

    nlp - the spaCy pipeline to train (updated in place)
    training_data - list of `(text, annotations)` tuples, as produced by
                    `format_dataframe_for_spacy`; the list itself is not modified

    Returns the trained pipeline.
    '''
    import random  # local import keeps this block self-contained

    optimizer = nlp.begin_training()
    samples = list(training_data)  # shuffle a copy, the caller's list stays untouched
    for epoch in range(EPOCHS):
        # a new order every epoch, so the batches differ between epochs
        random.shuffle(samples)
        losses = {}
        batch_gen = minibatch(samples, size=compounding(4.0, 32.0, 1.1))
        batches = list(batch_gen)
        pb = tqdm(total=len(batches))
        for batch_index, batch in enumerate(batches):
            pb.set_description(f'epoch: {epoch} | batch {batch_index} of {len(batches)}')
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                sgd=optimizer,  # use the optimizer created by begin_training explicitly
                drop=0.1,  # dropout - make it harder to memorise data
                losses=losses,
            )
            pb.update(1)
        pb.close()
        print(f"Losses at iteration {epoch} - {dt.datetime.now()} {losses}")
    return nlp

# `nlp` stays None unless the settings ask for the nlp model to be trained or loaded.
nlp = None
if use_nlp and train_nlp:
    print("Training NLP model from scratch...")
    nlp = create_blank_nlp()
    nlp = train_nlp_model(nlp, spacy_training_data)
    print("NLP model trained")
    if save_model:
        print("Saving trained NLP model...")
        # NOTE: the file name returned by save_nlp_to_file is not kept
        save_nlp_to_file(nlp)
        print("NLP model saved")
elif use_nlp and saved_model_filepath:
    print(f"Loading NLP model from: {saved_model_filepath}...")
    nlp = load_nlp_from_file(saved_model_filepath)
    print("Loaded NLP model")

In [None]:
# for x in X_val:
#     for section in x:
#         doc = nlp(section['text'])
#         for ent in doc.ents:
#             print(ent.text)

# Functions used by kaggle

In [None]:
def jaccard(str1, str2):
    '''
    Returns the Jaccard similarity of the lower-cased, whitespace-separated tokens of
    both strings: the number of shared tokens divided by the number of distinct tokens.

    Returns 0.0 when neither string contains a token (instead of dividing by zero).
    '''
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return float(len(a & b)) / union_size

def clean_label(txt):
    '''
    Lower-cases the text and replaces every run of characters other than
    a-z / 0-9 with a single space.
    '''
    lowered = str(txt).lower()
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return cleaned

Cleaning functions taken from another notebook (used for cleaning section text, not predictions or labels)

In [None]:
def clean_section(text):
    '''
    Removes ASCII punctuation, converts the text to lower case and replaces every run
    of characters other than a-z / 0-9 with a single space.

    text - Sentence that needs to be cleaned

    The whole text is passed to the WordNet lemmatizer as if it were one word.
    NOTE(review): that probably only changes single-word inputs - confirm it is intended.
    '''
    text = text.translate(str.maketrans('', '', string.punctuation))
    # The raw-string prefix used to be inside the quotes ('r[^\w\s]'), so the pattern
    # matched a literal "r" followed by a symbol and removed that letter from the text.
    text = re.sub(r'[^\w\s]', ' ', str(text).lower()).strip()
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    text = lem.lemmatize(text)
    text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()) # !IMPORTANT this was added by us

    return text

# Naive model
Keeps track of all dataset labels seen in the training data and, for a given sample, returns every known label that occurs in it.

In [None]:
class NaiveModel():
    '''
    Remembers every dataset label seen in the training targets and predicts by plain
    substring search for those labels.
    '''

    def __init__(self, X_train, y_train):
        '''
        X_train - training samples (stored, not used by the model itself)
        y_train - array of dictionaries with the keys `dataset_title`,
                  `dataset_label` and `cleaned_label`
        '''
        self.labels = list()
        self.X_train = X_train
        self.y_train = y_train

    def train(self):
        '''
        Collects the unique labels of all training targets in `self.labels`.

        Labels are stored without surrounding whitespace, so "adni" and "adni " count
        as the same label, and empty labels are skipped.
        '''
        known = set(self.labels)
        for y in self.y_train:
            candidates = (
                clean_section(y["dataset_title"]),
                clean_section(y["dataset_label"]),
                y["cleaned_label"],
            )
            for candidate in candidates:
                if not isinstance(candidate, str):
                    continue  # e.g. a missing value in the csv
                label = candidate.strip()
                if label and label not in known:
                    known.add(label)
                    self.labels.append(label)

    def predict(self, text):
        '''
        Returns the list of known labels that occur in `text` (without duplicates, in
        the order in which they were learned), or None when there is no match.
        '''
        found_labels = []
        for label in self.labels:
            # an empty label is a substring of every text, so it never counts as a match
            if label and label in text and label not in found_labels:
                found_labels.append(label)

        return found_labels or None

Create naive model and train it

In [None]:
# The global `naive` model is used by the prediction cells below.
naive = NaiveModel(X_train, y_train)
naive.train()
print(f'Naive model has collected: {len(naive.labels)} unique labels')
print(list(naive.labels)[:5])

# Test naive model

In [None]:
from tqdm import trange

def f_score(tp, fp, fn, beta=0.5):
    '''
    Returns the F-beta score for the given counts.

    tp, fp, fn - number of true positives, false positives and false negatives
    beta - weight of recall relative to precision (0.5 favours precision)

    Returns 0.0 when there are no true positives; precision and / or recall would
    otherwise require a division by zero.
    '''
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    score = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return score

def evaluate(preds_val, y_val):
    '''
    Scores the predictions against the targets and returns the F-score (see `f_score`).

    preds_val - an array containing n strings, where each string is a prediction, or multiple predictions separated by a '|' character
    y_val - an array containing n dictionaries; the `cleaned_label` of each dictionary is compared with the predictions at the same position

    Counting rules:
    - every prediction with a jaccard similarity above 0.5 is a true positive
    - every other prediction is a false positive
    - a target without any matching prediction (or without predictions at all) is a
      false negative
    '''
    tp = 0
    fp = 0
    fn = 0
    for pred, y in zip(preds_val, y_val):
        target = y['cleaned_label']
        if not pred:
            print("no pred:", pred, "|", target)
            fn += 1
            continue

        matched = False
        for candidate in pred.split("|"):
            if jaccard(candidate, target) > 0.5:
                tp += 1
                matched = True
            else:
                fp += 1
        if not matched:
            # predictions were made, but none of them found this label
            fn += 1
    return f_score(tp, fp, fn)

def predict_one(naive, file, filter_subsets = False):
    '''
    Predicts the dataset labels of one publication.

    naive - model with a `predict(text)` method that returns a list of labels or None
    file - array of sections, where each section is a dictionary with a `text` key
    filter_subsets - when truthy, predictions that are a substring of another
                     prediction are dropped to reduce false positives

    Every section is cleaned and given to the naive model. Only when that finds nothing
    and the global `nlp` model is available, its `DATASET` entities are used instead.

    Returns the unique cleaned predictions joined by '|' (sorted, so the output is
    reproducible); an empty string when nothing was found.
    '''
    preds = []
    for section in file:
        text = clean_label(section['text'])
        text = clean_section(text)
        labels = naive.predict(text)
        if labels:
            preds += labels
        elif nlp is not None:  # nlp stays None when the nlp model is disabled
            doc = nlp(text)
            preds.extend(ent.text for ent in doc.ents if ent.label_ == "DATASET")

    # remove labels that are a subset of other labels to reduce False Positives
    if filter_subsets:
        # build a new list: removing from `preds` while looping over it skips elements
        preds = [
            candidate for candidate in preds
            if not any(candidate in other and candidate != other for other in preds)
        ]

    # strip so "adni" and "adni " are not reported as two different predictions
    cleaned = {clean_label(label).strip() for label in preds}
    cleaned.discard("")
    return "|".join(sorted(cleaned))

def test(model, xs, ys):
    '''
    Predicts every sample in `xs` with `model`, prints the evaluation score and
    returns it.

    model - model that is passed to `predict_one`
    xs - array of publications
    ys - array of target dictionaries, aligned with `xs` (see `evaluate`)
    '''
    predictions = []
    for x in tqdm(xs, total=len(xs)):  # progress bar
        # use the arguments; the global `naive` / `y_val` used to be read here instead
        predictions.append(predict_one(model, x, filter_subsets=False))

    score = evaluate(predictions, ys)
    print(score)
    return score

# Validation data only exists when this is not a submission run.
if not is_submission:
    test(naive, X_val, y_val)

# Generate output file (WORK IN PROGRESS)

Prediction function that runs the model on every test publication and writes the predicted dataset labels to the submission file.

In [None]:
def submit(ids, xs, model=None, output_path='submission.csv'):
    '''
    Predicts the dataset labels of every publication and writes them to a csv file
    with the columns `Id` and `PredictionString`.

    ids - array of publication ids
    xs - array of publications, aligned with `ids`
    model - model passed to `predict_one`; defaults to the global `naive` model
    output_path - location of the csv file that is written

    When an id occurs more than once, only its last prediction is kept.
    '''
    if model is None:
        model = naive

    buffer = {}
    for file_id, file in zip(ids, xs):
        buffer[file_id] = predict_one(model, file)

    # plain lists instead of dict views, which not every pandas version accepts as a column
    data = {"Id": list(buffer.keys()), "PredictionString": list(buffer.values())}
    df = pd.DataFrame(data=data)
    df.to_csv(output_path, index=False)

# Only write the submission file in a submission run.
if is_submission:
    submit(ids, X_test)