# Model & Evaluation

## 1. Load Packages and Datasets

In [168]:
import json
import os
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

from collections import Counter
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.model_selection import train_test_split

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, PatternRecognizer, Pattern, RecognizerResult
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.predefined_recognizers import EmailRecognizer, UrlRecognizer, PhoneRecognizer

from tqdm.auto import tqdm
from dateutil import parser
from sklearn.metrics import classification_report, accuracy_score
import time
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [169]:
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [170]:
# Load pre-processed train and test set from EDA notebook
train_df = pd.read_json('./../data/preprocess_train.json')
test_df = pd.read_json('./../data/preprocess_test.json')

raw_train_df = pd.read_json('./../data/train.json')
raw_test_df = pd.read_json('./../data/test.json')

## 2. Model Building

In [171]:
# Create function to run classification report and collect performance metrics
def evaluate(y, y_pred):
    accuracy = round(accuracy_score(y, y_pred), 2)
    print('Accuracy Score:', accuracy, '\n')

    report = classification_report(y, y_pred)
    print('Classification Report:')
    print(report)

### 2.1 Baseline - Logistic Regression Model

### 2.2 Random Forest Model

In [172]:
# Convert list of tokens back to strings
train_df['tokens_joined'] = train_df['tokens_processed'].apply(lambda tokens: ' '.join(tokens))

# Extract features and labels
X = train_df['tokens_joined']
y = train_df['labels_processed']

mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y)

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X_tfidf, y_bin, test_size=0.2, random_state=599)

In [173]:
start_time = time.time()

# Initial RF model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=599)
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_rf = rf_classifier.predict(X_valid)

end_time = time.time()

In [174]:
runtime_rf = end_time - start_time
print("Model runtime:", runtime_rf, "seconds")
evaluate(y_valid, y_pred_rf)

Model runtime: 15.567684650421143 seconds
Accuracy Score: 0.86 

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00        12
           2       1.00      0.01      0.01       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       1.00      0.78      0.88      1744
   macro avg       0.15      0.08      0.08      1744
weighted avg       0.89      0.78      0.78   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [175]:
# Hyperparameter Tuning of Random Forest
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
start_time = time.time()

# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1,
                           cv=3, verbose=2, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

Fitting 3 folds for each of 288 candidates, totalling 864 fits


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_l

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [176]:
final_rf_model = RandomForestClassifier(**best_params)
final_rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_rf_param = rf_classifier.predict(X_valid)

end_time = time.time()

In [177]:
# Evaluate the model
runtime_rf_param = end_time - start_time
print("Model runtime:", runtime_rf_param, "seconds")
evaluate(y_valid, y_pred_rf_param)

Model runtime: 274.48314595222473 seconds
Accuracy Score: 0.86 

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00        12
           2       1.00      0.01      0.01       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       1.00      0.78      0.88      1744
   macro avg       0.15      0.08      0.08      1744
weighted avg       0.89      0.78      0.78   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3 K-Nearest Neighbors Model

In [180]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_knn = knn_classifier.predict(X_valid)

# Evaluate the model
print(classification_report(y_valid, y_pred_knn))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      0.08      0.15        12
           2       0.38      0.04      0.08       183
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         0
           8       0.22      0.02      0.04       169
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00      1362

   micro avg       0.98      0.79      0.87      1744
   macro avg       0.20      0.09      0.10      1744
weighted avg       0.85      0.79      0.79      1744
 samples avg       0.99      0.91      0.93      1744



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.4 BERT with MS Presidio Model

In [126]:
#Only using TRAIN Data and splitting in 80/20

train = pd.read_json('./../data/train.json')
#test = pd.read_json('./../data/test.json')

preprocessed_train = pd.read_json('./../data/preprocess_train.json')
#preprocessed_test = pd.read_json('./../data/preprocess_test.json')

In [127]:
# Functions
def token_index(row):
    tokens  = row['tokens']
    start_ind = []
    end_ind = []
    prev_ind = 0
    for tok in tokens:
        start = prev_ind + row['full_text'][prev_ind:].index(tok)
        end = start+len(tok)
        start_ind.append(start)
        end_ind.append(end)
        prev_ind = end
    return start_ind, end_ind

def find_larger(arr, target):
    left, right = 0, len(arr) - 1

    while left <= right:
        mid = (left + right) // 2

        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return left

def count_whitespaces(word):
    return len(word) - len(word.rstrip())

def date_check(text):
    try:
        parsed_date = parser.parse(text)
        return True
    except:
        return False
    

def pii_fbeta_score(pred_df, gt_df,beta=5):
    df = pred_df.merge(gt_df,how='outer',on=['document',"token"],suffixes=('_pred','_gt'))

    df['cm'] = ""

    df.loc[df.label_gt.isna(),'cm'] = "FP"
    df.loc[df.label_pred.isna(),'cm'] = "FN"
    df.loc[(df.label_gt.notna()) & (df.label_gt!=df.label_pred),'cm'] = "FN"

    df.loc[(df.label_pred.notna()) & (df.label_gt.notna()) & (df.label_gt==df.label_pred),'cm'] = "TP"
    
    FP = (df['cm']=="FP").sum()
    FN = (df['cm']=="FN").sum()
    TP = (df['cm']=="TP").sum()
    
    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    f1 = precision * recall / (precision + recall)
    
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1-Score: " + str(f1))

    s_micro = (1+(beta**2))*TP/(((1+(beta**2))*TP) + ((beta**2)*FN) + FP)

    return s_micro

In [128]:
# Modeling
x = pd.DataFrame(preprocessed_train)
y = x['labels']
x = x.drop(columns='labels')
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.25, random_state=0)

In [129]:
ALLOW_LIST = []
DENY_LIST_EMAIL = []
DENY_LIST_ADDRESS = []
DENY_LIST_URL = []
DENY_LIST_NAME = []
DENY_LIST_PHONE = []
DENY_LIST_ID = []

In [130]:
all_stopwords = list(stopwords.words())
words = Counter()
for doc in preprocessed_train.tokens:
    words.update(doc)
#for doc in preprocessed_test.tokens:
#    words.update(doc)
all_stopwords  += [str(w).lower() for w, i in words.items() if i > 55]
all_stopwords = list(sorted(set(all_stopwords)))
del words

ALLOW_LIST.extend(all_stopwords)

In [131]:
PHONE_ALLOW_LIST = ['phone', 'number', 'telephone', 'cell', 'cellphone',
              'mobile', 'call', 'ph', 'tel', 'mobile', 'Email']
URL_DENY_LIST = ["wikipedia", "coursera", ".pdf", ".PDF", "article",
             ".png",".gov", ".work", ".ai", ".firm", ".arts",
             ".store", ".rec", ".biz", ".travel", '.ru', 'designabetterbusiness', '.tools', 'designorate',
                       'designresearchtechniques', 'ec', '.europa', 'forbes', 'google',
                       'ideas', 'trello', '.edu']

In [134]:
#Prepping the list of datasets for PII

tokens = train_x['tokens'].apply(pd.Series).stack().reset_index(drop=True).tolist()
labels = train_y.apply(pd.Series).stack().reset_index(drop=True).tolist()

#tokens = train['tokens'].apply(pd.Series).stack().reset_index(drop=True).tolist()
#labels = train['labels'].apply(pd.Series).stack().reset_index(drop=True).tolist()

for i in set(labels):
    indices = [j for j in range(len(labels)) if labels[j] == i]
    if i == 'O':
        ALLOW_LIST.extend([tokens[i] for i in indices])
    if i == 'B-EMAIL':
        DENY_LIST_EMAIL.extend([tokens[i] for i in indices])
    elif i in ['B-STREET_ADDRESS', 'I-STREET_ADDRESS']:
        DENY_LIST_ADDRESS.extend([tokens[i] for i in indices])
    elif i in ['B-URL_PERSONAL', 'I-URL_PERSONAL']:
        DENY_LIST_URL.extend([tokens[i] for i in indices])
    elif i in ['B-NAME_STUDENT', 'I-NAME_STUDENT']:
    #elif i in ['I-NAME_STUDENT']:
        DENY_LIST_NAME.extend([tokens[i] for i in indices])
    elif i in ['B-PHONE_NUM', 'I-PHONE_NUM']:
        DENY_LIST_PHONE.extend([tokens[i] for i in indices])
    elif i in ['B-ID_NUM', 'I-ID_NUM']:
        DENY_LIST_ID.extend([tokens[i] for i in indices])
    else:
        continue


In [135]:
id_regex = r'([A-Za-z]{2}[.?]:)?\d{12,12}'
id_pattern = Pattern(name="id", regex=id_regex, score = 0.5)
id_recognizer = PatternRecognizer(supported_entity="ID_CUSTOM", patterns = [id_pattern])

address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(cir(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
address_pattern = Pattern(name="address", regex=address_regex, score=0.5)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM", patterns = [address_pattern], context=["st", "Apt"])

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_pattern = Pattern(name="email address", regex=email_regex, score=0.5)
email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM", patterns = [email_pattern])

url_regex = r'((https?)|(http?)|(ftp?))://\S+|www\.\S+'
url_pattern = Pattern(name="url", regex=url_regex, score=0.5)
url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM", patterns = [url_pattern])

phone_regex = r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$'
phone_pattern = Pattern(name='phone', regex=phone_regex, score=0.5)
phone_recognizer = PatternRecognizer(supported_entity='PHONE_CUSTOM', patterns=[phone_pattern])

In [136]:
class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts: NlpArtifacts) -> list[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results

new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

In [137]:
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
Collecting spacy<3.8.0,>=3.7.2
  Downloading spacy-3.7.4-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
Collecting thinc<8.3.0,>=8.2.2
  Downloading thinc-8.2.3-cp39-cp39-macosx_10_9_x86_64.whl (880 kB)
Installing collected packages: thinc, spacy, en-core-web-lg
  Attempting uninstall: thinc
    Found existing installation: thinc 8.2.1
    Uninstalling thinc-8.2.1:
      Successfully uninstalled thinc-8.2.1
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.1
    Uninstalling spacy-3.7.1:
      Successfully uninstalled spacy-3.7.1
Successfully installed en-core-web-lg-3.7.1 spacy-3.7.4 thinc-8.2.3
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')




In [138]:
dictionary = RecognizerRegistry()
dictionary.load_predefined_recognizers()
dictionary.add_recognizer(address_recognizer)
dictionary.add_recognizer(email_recognizer)
dictionary.add_recognizer(url_recognizer)
dictionary.add_recognizer(phone_recognizer)
dictionary.add_recognizer(id_recognizer)

In [139]:
analyzer = AnalyzerEngine(supported_languages=['en'],
                          registry=dictionary,
                          nlp_engine=nlp_engine,
                          context_aware_enhancer=LemmaContextAwareEnhancer(
                              context_similarity_factor=0.6,
                              min_score_with_context_similarity=0.4
                          ))

In [140]:
# Training/Testing
preds = []
#test = preprocessed_test
test = val_x
temp = test.apply(lambda x: token_index(x), axis=1)
test['start'] = temp.apply(lambda x: x[0])
test['end'] = temp.apply(lambda x: x[1])

[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=200; total time=   1.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=1000; total time=   6.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=300; total time=   1.8s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time=   6.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_sampl

In [141]:
for i, d in enumerate(tqdm(test.iterrows())):
    results = analyzer.analyze(text=d[1]['full_text'],
                               entities=["PHONE_CUSTOM", "PERSON", "URL_CUSTOM", "EMAIL_ADDRESS",
                                         "EMAIL_CUSTOM", "ADDRESS_CUSTOM", "US_SSN", "US_ITIN",
                                         "US_PASSPORT", "US_BANK_NUMBER", "USERNAME", "ID_CUSTOM"],
                               allow_list=ALLOW_LIST,
                               language='en', 
                               score_threshold=0.005)
    pre_preds = []
    for r in results:
        s = find_larger(d[1]['start'], r.start)
        end = r.end
        word = d[1]['full_text'][r.start:r.end]
        end = end - count_whitespaces(word)
        temp_preds = [s]
        try:
            while d[1]['end'][s+1] <= end:
                temp_preds.append(s+1)
                s +=1
        except:
            pass
        
        tmp = False
        
        if r.entity_type == 'PHONE_CUSTOM':
            if date_check(word):
                continue
            for w in PHONE_ALLOW_LIST:
                if w in d[1]['full_text'][max(r.start-50, 0):min(r.end+50, len(d[1]['full_text']))]:
                    tmp = False
                    break
                else:
                    tmp = True 
            label =  'PHONE_NUM'
        if r.entity_type == 'PERSON':
            if str(i).upper() in wikipedia:
                tmp = True
                break
            label =  'NAME_STUDENT'
        if r.entity_type == 'URL_CUSTOM':
            for w in URL_DENY_LIST:
                if w in word:
                    tmp = True
                    break
            label = 'URL_PERSONAL'
        if r.entity_type == 'EMAIL_ADDRESS' or r.entity_type == 'EMAIL_CUSTOM':
            label = "EMAIL"
        if r.entity_type == 'ADDRESS_CUSTOM':
            label = 'STREET_ADDRESS'
        if r.entity_type in ['US_SSN', 'US_ITIN', 'US_PASSPORT', 'US_BANK_NUMBER', 'ID_CUSTOM']:
            label = 'ID_NUM'
        if r.entity_type == 'USERNAME':
            label =  'USERNAME'
        if tmp:
            continue
        for p in temp_preds:
            if len(pre_preds) > 0:
                if pre_preds[-1]['rlabel'] == r.entity_type and ((p - pre_preds[-1]['token'])==1):
                    label_f = "I-"+label
                else:
                    label_f = "B-"+label
            else:
                label_f = "B-"+label
            pre_preds.append(({
                    "document":d[1]['document'],
                    "token":p,
                    "label":label_f,
                    "rlabel":r.entity_type
                }))
    preds.extend(pre_preds)

[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=200; total time=   1.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=300; total time=   1.7s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=200; total time=   1.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_sampl

0it [00:00, ?it/s]

[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=1000; total time=   6.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.1s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time=   5.7s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=100; total time=   0.7s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samp

NameError: name 'wikipedia' is not defined

In [None]:
predicted_results = pd.DataFrame(preds).iloc[:,:-1].reset_index()
predicted_results.columns = ['row_id','document', 'token', 'label']

temp = val_x[['document']].join(val_y)
dictionary = temp['labels'].apply(lambda x: {'indx': list(range(len(x))), 'vals': x})
indices = dictionary.apply(lambda x: x['indx']).explode()
values = dictionary.apply(lambda x: x['vals']).explode()

ground_truth = pd.concat([indices, values], axis=1).reset_index()
ground_truth['document'] = ground_truth['index'].apply(lambda x: temp['document'][x])
ground_truth = ground_truth.drop(columns='index')
ground_truth.columns = ['token', 'label', 'document']
ground_truth = ground_truth[ground_truth['label'] != 'O']
ground_truth = ground_truth.reset_index(names=['row_id'])


In [None]:
print("FBeta Score: " + str(pii_fbeta_score(predicted_results, ground_truth, 5)))

In [None]:
evaluate(predicted_results, ground_truth)