# Dataset Creation

In the dataset creation phase, I tested several approaches. I first tried using the GPT-3.5-turbo API to create the [dataset](https://gist.github.com/Hero2323/bff12400cec5ab54467ea35ba89e976f) for me (this didn't work), then I tried using [ScanCode](https://gist.github.com/Hero2323/da410d4f06547ef3b4bdb626bbde868b), and in the end I resorted to manual, model-assisted labeling. Mainly, I used the [Fossology API](https://gist.github.com/Hero2323/7ed99af2e336216860ad74e6002de5db) to extract what it thinks are copyrights, and then used [Google Sheets](https://docs.google.com/spreadsheets/d/132NnbJT4nqb-hxPX-XRFvUWTUg9SW0-ueW2YkpykgSk/edit?usp=sharing) to label the dataset more easily, since conditional formatting gave me better visibility and speed.

# False Positive Detection Training

In this phase, I wrote various functions that simplified the iterative process of testing various models, vectorization methods, parameters, etc., and that improved the formatting of results, among other things. I tested many models, including RF, SVM, NB, BERT, and RNN, as well as many vectorization and embedding methods such as TF-IDF, BoW, GloVe, FastText, and BERT embeddings.

In [None]:
# When testing many datasets at the same time, this function
# can aggregate the results in a cleaner manner

def aggregate_reports(reports, print_aggregates=True):
    """Tabulate per-class precision, recall and F1 across several reports.

    Every element of ``reports`` is read as ``report['0'][metric]`` and
    ``report['1'][metric]`` for ``metric`` in ``'precision'``, ``'recall'``
    and ``'f1-score'``. One table is built per metric: one row per report,
    one column per class, a trailing ``'Mean'`` row holding the column means
    formatted as six-decimal strings, and a ``'Metric'`` column naming the
    metric.

    If ``print_aggregates`` is true the three tables are printed as markdown
    and nothing is returned; otherwise the tuple
    ``(precision_table, recall_table, f1_table)`` is returned.
    """
    class_labels = ['0', '1']
    metric_names = ('precision', 'recall', 'f1-score')

    tables = {}
    for metric_name in metric_names:
        per_report = np.array(
            [[rep[label][metric_name] for label in class_labels] for rep in reports]
        )
        # A no-op for well-formed input; kept so malformed or empty input
        # fails at the same point as before.
        per_report = per_report[:, :2]

        table = pd.DataFrame(per_report, columns=class_labels)
        table.loc['Mean'] = [f"{value:.6f}" for value in np.mean(per_report, axis=0)]
        table['Metric'] = metric_name
        tables[metric_name] = table

    precision_table, recall_table, f1_table = (tables[name] for name in metric_names)

    if not print_aggregates:
        return precision_table, recall_table, f1_table

    headed_tables = (
        ("## Precision", precision_table),
        ("## Recall", recall_table),
        ("## F1-score", f1_table),
    )
    for heading, table in headed_tables:
        print(heading)
        print(table.to_markdown())

# Below is what the output looks like

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.992318 | 0.972816 | precision |
| 1    | 0.977273 | 0.679558 | precision |
| 2    | 0.837209 | 0.783333 | precision |
| 3    | 0.966989 | 0.923404 | precision |
| 4    | 0.992459 | 0.973096 | precision |
| Mean | 0.95325  | 0.866441 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.990244 | 0.978516 | recall   |
| 1    | 0.869663 | 0.931818 | recall   |
| 2    | 0.80597  | 0.817391 | recall   |
| 3    | 0.985234 | 0.841085 | recall   |
| 4    | 0.990422 | 0.978738 | recall   |
| Mean | 0.928307 | 0.90951  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.99128  | 0.975657 | f1-score |
| 1    | 0.920333 | 0.785942 | f1-score |
| 2    | 0.821293 | 0.8      | f1-score |
| 3    | 0.976026 | 0.880325 | f1-score |
| 4    | 0.99144  | 0.975909 | f1-score |
| Mean | 0.940074 | 0.883567 | f1-score |
None


In [None]:
# easily shows me exactly which rows were misclassified by the model
def get_missclassified_rows(X, y_true, y_pred, only_this_class=(0, 1), return_index=False):
    """Return the rows whose predicted label differs from the true label.

    Args:
        X: Samples, positionally aligned with ``y_true`` and ``y_pred``.
        y_true: True labels.
        y_pred: Predicted labels.
        only_this_class: Container of true labels to report on; rows whose
            true label is not in it are skipped.
        return_index: When true, each result also carries the row position.

    Returns:
        A list, in input order, of ``(predicted_label, sample)`` tuples, or
        ``(predicted_label, index, sample)`` tuples when ``return_index`` is
        true.

    ``X``, ``y_true`` and ``y_pred`` may be lists, objects exposing
    ``.tolist()``, or any other iterable such as a tuple.
    """
    def _as_list(values):
        # Lists are used as-is; objects with .tolist() are converted through
        # it; everything else is materialised with list().
        if isinstance(values, list):
            return values
        to_list = getattr(values, 'tolist', None)
        return to_list() if callable(to_list) else list(values)

    X = _as_list(X)
    y_true = _as_list(y_true)
    y_pred = _as_list(y_pred)

    wrong_positions = [
        i for i, truth in enumerate(y_true)
        if truth != y_pred[i] and truth in only_this_class
    ]
    if return_index:
        return [(y_pred[i], i, X[i]) for i in wrong_positions]
    return [(y_pred[i], X[i]) for i in wrong_positions]

In [None]:
# This is the main function I used for training the model. With it I can easily pass
# any number of parameters to the preprocessing function, which at the time could take
# a varying number of parameters as I was still developing it.
# Additionally, I transitioned to a different way of visualizing the results here, where I test
# on all the datasets (including the training one) and simply view what percentage of the
# rows were misclassified.

def train(svm, vectorizer, threshold, preprocess_function, **kwargs):
    """Fit the vectorizer and classifier, then report misclassifications on ``X``.

    Reads the notebook-level globals ``X_train``/``y_train``,
    ``X_test``/``y_test``, ``X_1``/``y_1``, ``X_2``/``y_2``, ``X_3``/``y_3``
    and ``X``/``y``. The vectorizer is fitted on the preprocessed training
    split only; every other dataset is preprocessed and transformed with the
    fitted vectorizer. Prints, for classes ``'0'`` and ``'1'`` of the report
    on ``X``/``y``, how many rows were misclassified.

    Args:
        svm: Classifier exposing ``fit`` and either ``predict_proba`` or
            ``predict``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
        threshold: With ``None`` the predicted class is the arg-max of the
            probabilities. Otherwise the arg-max class is kept only when its
            probability exceeds ``threshold``; rows below it are assigned
            class 0. Ignored when the classifier has no ``predict_proba``.
        preprocess_function: Called as ``preprocess_function(data, **kwargs)``
            on every dataset before vectorization.
        **kwargs: Forwarded unchanged to ``preprocess_function``.

    Returns:
        ``(miss_classified_rows_0, miss_classified_rows_1)``: the rows of the
        preprocessed ``X`` whose true class is 0 (respectively 1) and that
        were misclassified, as ``(predicted_label, index, row)`` tuples.
    """
    def _preprocess(data):
        return preprocess_function(data, **kwargs)

    # Fit on the training split first, then transform the evaluation sets in
    # the fixed order: test, 1, 2, 3, full dataset.
    X_train_features = vectorizer.fit_transform(_preprocess(X_train))
    eval_features = [
        vectorizer.transform(_preprocess(data)) for data in (X_test, X_1, X_2, X_3)
    ]
    preprocessed_X = _preprocess(X)
    eval_features.append(vectorizer.transform(preprocessed_X))
    eval_labels = [y_test, y_1, y_2, y_3, y]

    svm.fit(X_train_features, y_train)

    # Checked after fitting; a classifier that does not expose predict_proba
    # falls back to plain predict instead of raising.
    if hasattr(svm, 'predict_proba'):
        probabilities = [svm.predict_proba(features) for features in eval_features]
        if threshold is None:
            predictions = [np.argmax(proba, axis=1) for proba in probabilities]
        else:
            predictions = [
                [np.argmax(row) if max(row) > threshold else 0 for row in proba]
                for proba in probabilities
            ]
    else:
        predictions = [svm.predict(features) for features in eval_features]

    # One report per evaluation set, in the same order as eval_labels; the
    # whole list can be handed to aggregate_reports for a per-dataset view.
    reports = [
        classification_report(truth, predicted, output_dict=True)
        for truth, predicted in zip(eval_labels, predictions)
    ]
    full_report = reports[-1]
    full_predictions = predictions[-1]

    miss_classified_rows_0 = get_missclassified_rows(
        preprocessed_X, y, full_predictions, only_this_class=[0], return_index=True)
    miss_classified_rows_1 = get_missclassified_rows(
        preprocessed_X, y, full_predictions, only_this_class=[1], return_index=True)

    for label in ('0', '1'):
        stats = full_report[label]
        print(f'Number of missclassifications in class {label}: ',
              stats['support'] - round(stats['recall'] * stats['support']),
              'out of a total sample of: ', stats['support'],
              ' - about ', round((1 - stats['recall']) * 100, 2),
              '% of the class was missclassified')

    return miss_classified_rows_0, miss_classified_rows_1
    
# Here is how I would call it
# One-vs-rest wrapper around an SVC with probability estimates enabled.
clf = OneVsRestClassifier(SVC(probability=True, C=25))
test = train(
    clf,
    TfidfVectorizer(ngram_range=(1, 2), binary=True),
    None,  # threshold: None means "take the arg-max class"
    preprocess_function,
    # Everything below is forwarded to preprocess_function as keyword arguments.
    replace_dates=True,
    remove_numbers=True,
    replace_copyright_symbols=True,
    remove_whitespaces=True,
    lower=True,
    remove_special_characters=True,
    replace_emails=True,
    replace_entities=True,
    spacy_model=spacy.load('../NER_models/train-semi-supervised-dataset-1/model-best'),
)
# And here is what the output looks like


Number of missclassifications in class 0:  25.0 out of a total sample of:  16377.0  - about  0.15 % of the class was missclassified

Number of missclassifications in class 1:  43.0 out of a total sample of:  5393.0  - about  0.8 % of the class was missclassified

# NER Model Training

In this phase, I used [doccano](https://github.com/doccano/doccano) for easier labeling. I tested out several models and settled on SpaCy's en_core_web_sm model for its light weight, performance, and simple training. I wrote several functions that automatically convert between JSONL (needed by doccano), CSV (the default data format I was using), and .spacy (the binary format used by SpaCy for training). Below I showcase the utility functions I wrote.

In [None]:
from tqdm import tqdm
import json
import re
import spacy
from sklearn.model_selection import train_test_split

def text_to_json(sentences):
    """
    Wrap every sentence in a record of the form
    ``{'text': sentence, 'labels': []}`` and return the records as a list.

    Each record gets its own, initially empty, label list. Iteration is
    wrapped in ``tqdm`` to show a progress bar.
    """
    return [{'text': sentence, "labels": []} for sentence in tqdm(sentences)]

def text_to_json_model_assisted(sentences, model):
    """
    Convert a list of sentences to JSON records pre-labelled by a model.

    ``model`` may be either:

    * an object exposing ``.pipe(texts)`` (for example a spaCy ``Language``
      pipeline, whose ``__call__`` takes a single text, not a batch), in
      which case ``model.pipe(sentences)`` is used; or
    * any other callable, which is called once as ``model(sentences)``.

    Either way the result must be an iterable of documents exposing ``.text``
    and ``.ents``, where every entity exposes ``start_char``, ``end_char``
    and ``label_``.

    Returns a list of ``{'text': ..., 'labels': [[start, end, label], ...]}``
    records, one per document.
    """
    # Prefer the batch API when the model has one; otherwise fall back to
    # calling the model on the whole batch.
    annotate = getattr(model, 'pipe', None)
    if not callable(annotate):
        annotate = model

    new_json = []
    for doc in tqdm(annotate(sentences)):  # Progress bar over the annotated documents
        # One [start, end, label] triple per detected entity
        labels = [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
        new_json.append({'text': doc.text, "labels": labels})
    return new_json

def text_to_json_labels_separate(sentences, labels, entity_name):
    """
    Convert parallel lists of sentences and label strings to JSON records.

    For every ``(sentence, label)`` pair the first occurrence of ``label`` in
    ``sentence`` is located and recorded as one entity named ``entity_name``.
    Pairs whose label is ``None``, empty, or not found in the sentence are
    skipped.

    Returns a list of ``{'text': sentence, 'labels': [[start, end,
    entity_name]]}`` records. ``labels`` is a list of triples, the same shape
    that ``text_to_json_model_assisted`` produces and that
    ``convert_jsonl_to_spacy`` reads.
    """
    word_char = re.compile(r"\w")
    new_json = []
    for sentence, label in tqdm(zip(sentences, labels)):  # Progress bar over sentence-label pairs
        if label is None or label == '':
            continue  # Nothing to look for

        # Anchor with \b only on the sides where the label starts/ends with a
        # word character. \b next to punctuation (e.g. the trailing "." of
        # "Foo Inc.") only matches when a word character follows, which would
        # silently drop such labels.
        prefix = r"\b" if word_char.match(label[0]) else ""
        suffix = r"\b" if word_char.match(label[-1]) else ""
        match = re.search(prefix + re.escape(label) + suffix, sentence)
        if match is None:
            continue  # Label does not occur in the sentence

        # A single entity, still wrapped in a list: "labels" is a list of triples
        new_json.append({
            'text': sentence,
            "labels": [[match.start(), match.end(), entity_name]],
        })
    return new_json

def write_json_to_disk(new_json, path):
    """
    Serialise an iterable of JSON-compatible items to ``path`` in JSONL form:
    one ``json.dumps`` result per line, each terminated by a newline.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in new_json)

def convert_jsonl_to_spacy(jsonl_path, spacy_path):
    """
    Load data from a JSONL file and convert it to spaCy's binary training format.

    Every non-blank line of ``jsonl_path`` must be a JSON object with a
    ``'text'`` string and a ``'labels'`` list of ``[start, end, label]``
    triples. Each line becomes one ``Doc`` (tokenised by a blank English
    pipeline) carrying those entities; all docs are stored in a ``DocBin``
    written to ``spacy_path``.

    The file is read as UTF-8 regardless of the platform default, so
    non-ASCII text decodes the same way everywhere.
    """
    nlp = spacy.blank('en')  # Blank English pipeline, used only for tokenisation
    doc_bin = spacy.tokens.DocBin()  # Container for the converted `Doc` objects

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # Tolerate blank lines, e.g. a trailing newline at end of file
            item = json.loads(line)
            # Entities as (start, end, label) tuples
            entities = [(e[0], e[1], e[2]) for e in item['labels']]
            doc = nlp.make_doc(item['text'])
            example = spacy.training.Example.from_dict(doc, {'entities': entities})
            doc_bin.add(example.reference)  # The reference doc carries the annotations

    doc_bin.to_disk(spacy_path)  # Saving the DocBin to disk

def spacy_train_test_split(file_path, split=0.2, random_state=42, shuffle=True):
    """
    Split a ``.spacy`` DocBin file into training and testing DocBin files.

    Args:
        file_path: Path of the source DocBin. A trailing ``.spacy`` suffix is
            stripped to derive the output names.
        split: Passed to ``train_test_split`` as ``test_size``.
        random_state: Passed to ``train_test_split``.
        shuffle: Passed to ``train_test_split``.

    Writes ``<base>-train.spacy`` and ``<base>-test.spacy``, where ``<base>``
    is ``file_path`` without its trailing ``.spacy``. Returns nothing.
    """
    doc_bin = spacy.tokens.DocBin().from_disk(file_path)  # Loading the DocBin from disk
    nlp = spacy.load("en_core_web_sm")  # Only the pipeline's vocab is used below
    docs = list(doc_bin.get_docs(nlp.vocab))  # Rebuilding the `Doc` objects against that vocab
    # Splitting the `Doc` objects into training and testing sets
    train_docs, test_docs = train_test_split(
        docs, test_size=split, random_state=random_state, shuffle=shuffle)

    # Strip only a trailing ".spacy"; splitting on the first occurrence would
    # cut the path short when ".spacy" also appears earlier in it (for
    # example in a directory name).
    suffix = '.spacy'
    base_path = str(file_path)
    if base_path.endswith(suffix):
        base_path = base_path[:-len(suffix)]
    train_path = base_path + '-train.spacy'
    test_path = base_path + '-test.spacy'

    spacy.tokens.DocBin(docs=train_docs).to_disk(train_path)  # Saving the training DocBin
    spacy.tokens.DocBin(docs=test_docs).to_disk(test_path)  # Saving the testing DocBin
