Setup

In [31]:
import os
#from data_loader import load_word_dataset
import random
import numpy as np
import sys
import nltk
from sklearn.model_selection import train_test_split
import string
# Importing libraries
import pandas as pd
import pprint, time


import sklearn_crfsuite
from sklearn_crfsuite import metrics

from nltk.corpus import treebank
from nltk.corpus import brown

from nltk.classify import MaxentClassifier
from sklearn.metrics import f1_score
import pycrfsuite
from nltk.tag import hmm
# from nltk.classify import megam
from sklearn.model_selection import GridSearchCV
from nltk.tag import BrillTaggerTrainer
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.metrics import ConfusionMatrix

In [32]:
# Fetch the Treebank sample shipped with NLTK (a no-op when it is already cached).
nltk.download('treebank')

# Materialise the lazy corpus view into a plain list of sentences, each a list
# of (word, tag) pairs, so it can be indexed and split freely later on.
# NOTE(review): no `tagset` argument is passed here, yet the recorded output of
# the example cell below shows universal tags ('ADP', 'DET', ...) -- confirm
# which tagset the experiments are meant to use.
tagged_sentences = list(treebank.tagged_sents())

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\kstap\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Function to create noisy data

In [33]:

def modify_sentence_with_probability_and_count(corpus, join_probability, split_probability):
    """
    Inject tokenisation noise into a tagged corpus and report how much was injected.

    Each sentence is scanned left to right. At every position the current
    token is either glued to its right-hand neighbour, cut into two pieces,
    or copied through unchanged.

    Args:
    - corpus: iterable of tagged sentences; each sentence is an indexable
      sequence of (word, tag) pairs.
    - join_probability: chance (0.0 to 1.0) of merging a token with the next
      one. The merged token keeps one of the two original tags, picked at random.
    - split_probability: chance (0.0 to 1.0) of cutting a token that was not
      merged and is longer than one character. Both halves keep the original tag.

    Returns:
    - The noisy corpus as a list of lists of (word, tag) tuples.
    - The tuple (split_count, join_count) with the number of cuts and merges made.
    """
    noisy_corpus = []
    n_splits = 0
    n_joins = 0

    for sentence in corpus:
        noisy_sentence = []
        last_index = len(sentence) - 1
        pos = 0

        while pos <= last_index:
            # The join draw happens at every position, the last one included,
            # even though the final token has no neighbour to merge with.
            wants_join = random.random() < join_probability

            if wants_join and pos < last_index:
                left_word, left_tag = sentence[pos]
                right_word, right_tag = sentence[pos + 1]
                kept_tag = random.choice([left_tag, right_tag])
                noisy_sentence.append((left_word + right_word, kept_tag))
                n_joins += 1
                pos += 2  # both tokens have been consumed
                continue

            token, label = sentence[pos]
            pos += 1

            # Single-character tokens cannot be cut, so no draw is made for them.
            if len(token) > 1 and random.random() < split_probability:
                cut = random.randint(1, len(token) - 1)  # both pieces are non-empty
                noisy_sentence.extend([(token[:cut], label), (token[cut:], label)])
                n_splits += 1
            else:
                noisy_sentence.append((token, label))

        noisy_corpus.append(noisy_sentence)

    return noisy_corpus, (n_splits, n_joins)

# Example usage: with both probabilities at zero the corpus is copied through
# unchanged and both counters stay at 0. Raise the values to inject noise.
join_probability, split_probability = 0, 0
modified_treebank, (split_count, join_count) = modify_sentence_with_probability_and_count(
    corpus=tagged_sentences,
    join_probability=join_probability,
    split_probability=split_probability,
)

# Report how many corruptions were applied
for _label, _count in (("Splits:", split_count), ("Joins:", join_count)):
    print(_label, _count)

# Show one sentence (index 561) before and after corruption
for _heading, _sentences in (
    ("Original Sentence:", tagged_sentences),
    ("\nModified Sentence:", modified_treebank),
):
    print(_heading)
    print(_sentences[561])


Splits: 0
Joins: 0
Original Sentence:
[('But', 'ADP'), ('in', 'ADP'), ('the', 'DET'), ('three', 'NUM'), ('leading', 'VERB'), ('political', 'ADJ'), ('contests', 'NOUN'), ('of', 'ADP'), ('1989', 'NUM'), (',', '.'), ('the', 'DET'), ('negative', 'ADJ'), ('ads', 'NOUN'), ('have', 'VERB'), ('reached', 'VERB'), ('new', 'ADJ'), ('levels', 'NOUN'), ('of', 'ADP'), ('hostility', 'NOUN'), (',', '.'), ('*-1', 'X'), ('raising', 'VERB'), ('fears', 'NOUN'), ('that', 'ADP'), ('this', 'DET'), ('kind', 'NOUN'), ('of', 'ADP'), ('mudslinging', 'NOUN'), (',', '.'), ('empty', 'ADJ'), ('of', 'ADP'), ('significant', 'ADJ'), ('issues', 'NOUN'), (',', '.'), ('is', 'VERB'), ('ushering', 'VERB'), ('in', 'ADP'), ('a', 'DET'), ('new', 'ADJ'), ('era', 'NOUN'), ('of', 'ADP'), ('campaigns', 'NOUN'), ('without', 'ADP'), ('content', 'NOUN'), ('.', '.')]

Modified Sentence:
[('But', 'ADP'), ('in', 'ADP'), ('the', 'DET'), ('three', 'NUM'), ('leading', 'VERB'), ('political', 'ADJ'), ('contests', 'NOUN'), ('of', 'ADP'), ('19

Brill function

In [5]:
def train_and_evaluate_brill(data, num_repetitions, joinprob, splitprob):

    """
    Train and evaluate a Brill tagger on corrupted training data.

    Each repetition draws a fresh 80/20 train/test split, corrupts only the
    training part, trains a unigram tagger plus Brill transformation rules on
    it, and scores the result on the untouched test part.

    Parameters:
    - data: list of tagged sentences, each a sequence of (word, tag) pairs.
    - num_repetitions: number of times to repeat the experiment.
    - joinprob: probability of randomly joining words
    - splitprob: probability of randomly splitting words

    Returns:
    - f1_scores: list of weighted f1 scores with length equal to num_repetitions.
    - split_counts: list of splits made each run with length equal to num_repetitions.
    - join_counts: list of joins made each run with length equal to num_repetitions.
    """

    f1_scores = []
    split_counts = []
    join_counts = []

    for _ in range(num_repetitions):
        # train test split
        train_set, test_set = train_test_split(data, train_size=0.8)

        # corrupt the training set
        corrupt_train_set, (split_count, join_count) = modify_sentence_with_probability_and_count(
            train_set, join_probability=joinprob, split_probability=splitprob)
        split_counts.append(split_count)
        join_counts.append(join_count)

        # NOTE(review): 'NN' is the fallback tag for words unseen in training;
        # confirm it belongs to the tagset actually present in `data`.
        default_tagger = DefaultTagger('NN')
        unigram_tagger = UnigramTagger(corrupt_train_set, backoff=default_tagger)

        templates = nltk.brill.nltkdemo18()
        trainer = BrillTaggerTrainer(templates=templates, initial_tagger=unigram_tagger)

        # Train the Brill Tagger using the templates
        brill_tagger = trainer.train(corrupt_train_set, max_rules=200)

        # Tag sentence by sentence: Brill rules look at neighbouring words and
        # tags, so feeding the whole test set as one flat token list would let
        # rules fire across sentence boundaries.
        test_untagged_sents = [[word for word, _ in sent] for sent in test_set]
        tagged_sents = brill_tagger.tag_sents(test_untagged_sents)
        brill_preds = [tag for sent in tagged_sents for _, tag in sent]

        test_true_tags = [tag for sent in test_set for _, tag in sent]

        # f1_score expects (y_true, y_pred) in that order
        f1 = f1_score(test_true_tags, brill_preds, average='weighted')
        f1_scores.append(f1)

    return f1_scores, split_counts, join_counts


Unigram function

In [6]:
def train_and_evaluate_unigram(data, num_repetitions, joinprob, splitprob):

    """
    Train and evaluate a Unigram tagger on corrupted training data.

    Every repetition re-splits the data 80/20, corrupts only the training
    part, fits a unigram tagger on it and scores the tagger on the clean test part.

    Parameters:
    - data: list of tagged sentences, each a sequence of (word, tag) pairs.
    - num_repetitions: number of times to repeat the experiment.
    - joinprob: probability of randomly joining words
    - splitprob: probability of randomly splitting words

    Returns:
    - f1_scores: list of weighted f1 scores, one per repetition.
    - split_counts: number of splits made in each repetition.
    - join_counts: number of joins made in each repetition.
    """

    scores, splits_per_run, joins_per_run = [], [], []

    for _ in range(num_repetitions):
        clean_train, clean_test = train_test_split(data, train_size=0.8)

        # Noise is applied to the training portion only
        noisy_train, (n_splits, n_joins) = modify_sentence_with_probability_and_count(
            clean_train, join_probability=joinprob, split_probability=splitprob)
        splits_per_run.append(n_splits)
        joins_per_run.append(n_joins)

        # Words never seen in training fall back to the default tag 'NN'
        tagger = UnigramTagger(noisy_train, backoff=DefaultTagger('NN'))

        # Flatten the test sentences into parallel word / gold-tag lists
        flat_test = [pair for sent in clean_test for pair in sent]
        words = [pair[0] for pair in flat_test]
        gold = [pair[1] for pair in flat_test]

        predicted = [tag for _, tag in tagger.tag(words)]

        scores.append(f1_score(gold, predicted, average='weighted'))

    return scores, splits_per_run, joins_per_run

HMM function

In [7]:
def train_and_evaluate_hmm(data, num_repetitions, joinprob, splitprob):

    """
    Train and evaluate an HMM tagger on corrupted training data.

    Every repetition re-splits the data 80/20, corrupts only the training
    part, trains an HMM tagger on it and scores the tagger, sentence by
    sentence, on the clean test part.

    Parameters:
    - data: list of tagged sentences, each a sequence of (word, tag) pairs.
    - num_repetitions: number of times to repeat the experiment.
    - joinprob: probability of randomly joining words
    - splitprob: probability of randomly splitting words

    Returns:
    - f1_scores: list of weighted f1 scores, one per repetition.
    - split_counts: number of splits made in each repetition.
    - join_counts: number of joins made in each repetition.
    """

    scores, splits_per_run, joins_per_run = [], [], []

    for _ in range(num_repetitions):
        clean_train, clean_test = train_test_split(data, train_size=0.8)

        noisy_train, (n_splits, n_joins) = modify_sentence_with_probability_and_count(
            clean_train, join_probability=joinprob, split_probability=splitprob)
        splits_per_run.append(n_splits)
        joins_per_run.append(n_joins)

        hmm_tagger = nltk.HiddenMarkovModelTagger.train(noisy_train)

        gold = []
        predicted = []
        for sent in clean_test:
            # Strip the gold tags before handing the sentence to the tagger
            words = [word for word, _ in sent]
            predicted.extend(tag for _, tag in hmm_tagger.tag(words))
            gold.extend(tag for _, tag in sent)

        scores.append(f1_score(gold, predicted, average='weighted'))

    return scores, splits_per_run, joins_per_run

Feature extraction function

In [8]:
def word_features(sentence, i):

    """
    Extract features for a given index in a tagged sentence.

    Parameters:
    - sentence: sequence of (word, tag) pairs.
    - i: index of the token to describe.

    Returns:
    - features: a dictionary of features for the token at index i. It covers
      the word itself, its position, casing, prefixes/suffixes of up to three
      characters, the two neighbouring words on each side, and the tags of
      the two preceding tokens.

    Notes:
    - An empty token yields empty prefix/suffix features instead of raising
      IndexError.
    - 'is_capitalized' and 'is_all_caps' compare the text with its upper-cased
      form, so they are also True for tokens without letters (digits,
      punctuation).
    - The previous-tag feature is read from the tags stored in `sentence`.
      NOTE(review): when this is called on held-out sentences those are the
      gold tags -- confirm that is intended.
    """

    word = sentence[i][0]
    last_index = len(sentence) - 1

    # Slices rather than word[0] / word[-1]: identical for non-empty tokens,
    # and they return '' instead of raising IndexError for an empty token.
    first_char = word[:1]

    features = {
        'word': word,
        'is_first': i == 0,  # if the word is the first word
        'is_last': i == last_index,  # if the word is the last word
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,  # word is in uppercase
        'is_all_lower': word.lower() == word,  # word is in lowercase
        # prefix of the word
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # suffix of the word
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # extracting previous word
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        # extracting next word
        'next_word': '' if i == last_index else sentence[i + 1][0],
        'has_hyphen': '-' in word,  # if word has a hyphen
        'is_numeric': word.isdigit(),  # if word is numeric
        'capitals_inside': word[1:].lower() != word[1:]
    }

    # Tags of the two preceding tokens, combined into a single feature
    # ("<tag i-2>_<tag i-1>"); missing positions contribute an empty string.
    prev_tag = '' if i == 0 else sentence[i - 1][1]
    prev_prev_tag = '' if i < 2 else sentence[i - 2][1]
    features['prev_prev_tag'] = f'{prev_prev_tag}_{prev_tag}'

    # Add word after the next word
    features['next_next_word'] = '' if i > len(sentence) - 3 else sentence[i + 2][0]

    # Add word before the previous word
    features['prev_prev_word'] = '' if i < 2 else sentence[i - 2][0]

    return features


MEMM tuning function

In [9]:
# from nltk.classify import MaxentClassifier
# from nltk.classify.util import apply_features
# from nltk.metrics import f1_score
# import numpy as np
from sklearn.model_selection import KFold

def train_and_tune_memm(MEMM_train, max_iter_values, num_folds=5):
    """
    Tune the iteration budget of a maximum-entropy classifier with K-fold
    cross-validation.

    For every candidate in max_iter_values a MaxentClassifier (GIS) is trained
    on each training fold and scored with the weighted F1 on the matching
    validation fold. The candidate with the highest mean F1 is returned.

    Parameters:
    - MEMM_train: List of (feature dict, label) pairs, one per token.
    - max_iter_values: Iterable of max_iter values to try.
    - num_folds: Number of folds for cross-validation.

    Returns:
    - best_max_iter: The max_iter value with the highest mean F1. When several
      candidates tie, the first one wins. None if max_iter_values is empty.
      Only this single value is returned; the F1 score itself is not.
    """

    best_max_iter = None
    # Start below any attainable score so that the first candidate is always
    # accepted, even if its mean F1 is exactly 0.
    best_f1 = float('-inf')

    # Define the number of folds for cross-validation
    kf = KFold(n_splits=num_folds)

    for max_iter in max_iter_values:
        f1_scores = []

        for train_idx, valid_idx in kf.split(MEMM_train):
            train_set = [MEMM_train[i] for i in train_idx]
            valid_set = [MEMM_train[i] for i in valid_idx]

            maxent_classifier = MaxentClassifier.train(train_set, algorithm='gis', max_iter=max_iter)

            valid_features = [features for features, _ in valid_set]
            valid_labels = [pos for _, pos in valid_set]

            predictions = [maxent_classifier.classify(features) for features in valid_features]

            f1 = f1_score(valid_labels, predictions, average='weighted')
            f1_scores.append(f1)

        # Calculate the average F1 score across folds
        avg_f1 = np.mean(f1_scores)

        # Strict comparison: an equal score does not displace an earlier candidate
        if avg_f1 > best_f1:
            best_max_iter = max_iter
            best_f1 = avg_f1

    return best_max_iter

# Example usage (the function returns only the best max_iter value, not its F1 score):
# best_max_iter = train_and_tune_memm(MEMM_train40, max_iter_values=[5, 10, 15, 20])
# print(f"Best max_iter: {best_max_iter}")


CRF tuning function

In [10]:


def train_and_tune_crf_with_cv(X, y, param_grid, n_folds=5):

    """
    Train and tune a Conditional Random Fields (CRF) Model using cross-validation.

    Every combination of 'max_iterations', 'c1' and 'c2' from param_grid is
    trained on each training fold and scored with the weighted F1 on the
    matching validation fold. The combination with the highest mean F1 wins.
    Sentence i belongs to validation fold (i % n_folds).

    Parameters:
    - X: list of sentences, each a list of per-token feature dicts.
    - y: list of sentences, each a list of the corresponding tags.
    - param_grid: dict with the keys 'max_iterations', 'c1' and 'c2', each
      mapping to an iterable of candidate values.
    - n_folds: Number of folds for cross-validation.

    Returns:
    - best_params: A dictionary of the best parameter values found, ready to
      be passed to pycrfsuite.Trainer.set_params. Empty if the grid contains
      no combination at all.
    """
    import os
    import tempfile

    # Start below any attainable score so the first combination is always
    # accepted, even if its mean F1 is exactly 0.
    best_f1 = float('-inf')
    best_params = {}

    # Fold membership depends only on len(X) and n_folds, so build the folds
    # once instead of once per hyper-parameter combination.
    folds = []
    for fold in range(n_folds):
        train_indices = [i for i in range(len(X)) if i % n_folds != fold]
        valid_indices = [i for i in range(len(X)) if i % n_folds == fold]
        folds.append((
            [X[i] for i in train_indices],
            [y[i] for i in train_indices],
            [X[i] for i in valid_indices],
            [y[i] for i in valid_indices],
        ))

    # The intermediate models are throw-away artefacts: keep them in a scratch
    # directory that is removed when the search finishes.
    with tempfile.TemporaryDirectory() as scratch_dir:
        model_path = os.path.join(scratch_dir, 'temp_model.crfsuite')

        for max_iter in param_grid['max_iterations']:
            for c1 in param_grid['c1']:
                for c2 in param_grid['c2']:
                    params = {
                        'max_iterations': max_iter,
                        'c1': c1,
                        'c2': c2,
                        'feature.possible_transitions': True
                    }
                    f1_scores = []

                    for X_train_fold, y_train_fold, X_valid_fold, y_valid_fold in folds:
                        # Train the CRF model
                        trainer = pycrfsuite.Trainer(verbose=False)
                        for x_train, y_train in zip(X_train_fold, y_train_fold):
                            trainer.append(x_train, y_train)
                        trainer.set_params(params)
                        trainer.train(model_path)

                        # Test the CRF model; always release the model file so
                        # the next fold can overwrite it.
                        tagger = pycrfsuite.Tagger()
                        tagger.open(model_path)
                        try:
                            CRF_predictions = [tagger.tag(instance) for instance in X_valid_fold]
                        finally:
                            tagger.close()

                        CRF_flat_predictions = [tag for instance_tags in CRF_predictions for tag in instance_tags]
                        CRF_flat_ground_truth = [tag for instance_tags in y_valid_fold for tag in instance_tags]

                        f1 = f1_score(CRF_flat_ground_truth, CRF_flat_predictions, average='weighted')
                        f1_scores.append(f1)

                    mean_f1 = np.mean(f1_scores)
                    # Strict comparison: ties keep the earlier combination
                    if mean_f1 > best_f1:
                        best_f1 = mean_f1
                        best_params = dict(params)

    return best_params




Validation for MEMM and CRF

In [34]:
# validation

# Seed every random source used in this notebook. `random` drives the corpus
# corruption; scikit-learn's train_test_split does NOT read it -- it uses
# NumPy's global RNG unless random_state is given -- so both are seeded and
# the split below is pinned explicitly.
random.seed(1234)
np.random.seed(1234)

# 10% of the sentences are held out for hyper-parameter tuning (valid_set);
# the remaining 90% (model_set) are used for the actual experiments.
valid_set, model_set = train_test_split(tagged_sentences, train_size=0.1, random_state=1234)

# feature extraction: one list of feature dicts / tags per sentence
X_valid = [
    [word_features(sentence, i) for i in range(len(sentence))]
    for sentence in valid_set
]
y_valid = [
    [sentence[i][1] for i in range(len(sentence))]
    for sentence in valid_set
]

# Collect feature-label pairs for MEMM: the classifier works on individual
# tokens, so the per-sentence structure is flattened.
MEMM_valid = [
    pair
    for sentence_features, sentence_labels in zip(X_valid, y_valid)
    for pair in zip(sentence_features, sentence_labels)
]

best_max_iter = train_and_tune_memm(MEMM_valid, max_iter_values=[10,20,30])



In [12]:

# Candidate values explored by the CRF grid search: the iteration budget and
# the two regularisation coefficients c1 and c2.
param_grid = dict(
    max_iterations=[20, 50, 100],
    c1=[0.01, 0.1, 1.0, 10],
    c2=[1e-4, 1e-3, 1e-2, 1e-1],
)

# Per-sentence feature dicts and tags for the tuning split. The CRF consumes
# whole sentences, so the nesting is kept.
X_valid = [
    [word_features(sentence, i) for i in range(len(sentence))]
    for sentence in valid_set
]
y_valid = [
    [sentence[i][1] for i in range(len(sentence))]
    for sentence in valid_set
]

# Cross-validated search; keeps the best parameter combination
best_param = train_and_tune_crf_with_cv(X_valid, y_valid, param_grid)

MEMM function

In [13]:
def train_and_evaluate_memm(data, num_repetitions, joinprob, splitprob, max_iters):

    """
    Train and evaluate an MEMM tagger on corrupted training data.

    Every repetition re-splits the data 80/20, corrupts only the training
    part, trains a maximum-entropy classifier on token-level features and
    scores it on the clean test part.

    Parameters:
    - data: list of tagged sentences, each a sequence of (word, tag) pairs.
    - num_repetitions: number of times to repeat the experiment.
    - joinprob: probability of randomly joining words
    - splitprob: probability of randomly splitting words
    - max_iters: maximum iterations for the MaxentClassifier

    Returns:
    - f1_scores: list of weighted f1 scores, one per repetition.
    - split_counts: number of splits made in each repetition.
    - join_counts: number of joins made in each repetition.
    """

    def to_labelled_featuresets(sentences):
        # One (feature dict, tag) pair per token, sentences concatenated in order.
        return [
            (word_features(sent, idx), sent[idx][1])
            for sent in sentences
            for idx in range(len(sent))
        ]

    scores, splits_per_run, joins_per_run = [], [], []

    for _ in range(num_repetitions):
        clean_train, clean_test = train_test_split(data, train_size=0.8)

        # Noise is applied to the training portion only
        noisy_train, (n_splits, n_joins) = modify_sentence_with_probability_and_count(
            clean_train, join_probability=joinprob, split_probability=splitprob)
        splits_per_run.append(n_splits)
        joins_per_run.append(n_joins)

        train_pairs = to_labelled_featuresets(noisy_train)
        test_pairs = to_labelled_featuresets(clean_test)

        # Fit with the iteration budget chosen during tuning
        classifier = MaxentClassifier.train(train_pairs, algorithm='gis', max_iter=max_iters)

        test_featuresets = [feats for feats, _ in test_pairs]
        gold = [label for _, label in test_pairs]
        predicted = classifier.classify_many(test_featuresets)

        scores.append(f1_score(gold, predicted, average='weighted'))

    return scores, splits_per_run, joins_per_run


# Example: train_and_evaluate_memm(data=model_set, num_repetitions=2, joinprob=0.1, splitprob=0.05, max_iters=best_max_iter)

CRF function

In [14]:

def train_and_evaluate_crf(data, num_repetitions, joinprob, splitprob, param_grid):

    """
    Train and evaluate a CRF tagger on corrupted training data.

    Every repetition re-splits the data 80/20, corrupts only the training
    part, trains a CRF on per-sentence feature sequences and scores it on the
    clean test part. The trained model is written to 'pos.crfsuite' in the
    current working directory and overwritten on every repetition.

    Parameters:
    - data: list of tagged sentences, each a sequence of (word, tag) pairs.
    - num_repetitions: number of times to repeat the experiment.
    - joinprob: probability of randomly joining words
    - splitprob: probability of randomly splitting words
    - param_grid: dictionary of trainer parameters, passed as-is to
      pycrfsuite.Trainer.set_params. Despite the name this is one concrete
      setting per key, not a grid of candidates.

    Returns:
    - f1_scores: list of weighted f1 scores with length equal to num_repetitions.
    - split_counts: list of splits made each run with length equal to num_repetitions.
    - join_counts: list of joins made each run with length equal to num_repetitions.
    """

    # initialize list to keep track of model performance
    f1_scores = []
    split_counts = []
    join_counts = []

    for _ in range(num_repetitions):

        # train test split and formatting
        train_set, test_set = train_test_split(data, train_size=0.8)

        # Noise is applied to the training portion only
        corrupt_train_set, (split_count, join_count) = modify_sentence_with_probability_and_count(
            train_set, join_probability=joinprob, split_probability=splitprob)
        split_counts.append(split_count)
        join_counts.append(join_count)

        # feature extraction: the CRF consumes whole sentences, so the
        # per-sentence nesting is kept.
        X_train = [
            [word_features(sentence, i) for i in range(len(sentence))]
            for sentence in corrupt_train_set
        ]
        y_train = [
            [sentence[i][1] for i in range(len(sentence))]
            for sentence in corrupt_train_set
        ]
        X_test = [
            [word_features(sentence, i) for i in range(len(sentence))]
            for sentence in test_set
        ]
        y_test = [
            [sentence[i][1] for i in range(len(sentence))]
            for sentence in test_set
        ]

        # training using the tuned value
        trainer = pycrfsuite.Trainer(verbose=False)

        # Add training data
        for x_seq, y_seq in zip(X_train, y_train):
            trainer.append(x_seq, y_seq)

        # Set trainer parameters
        trainer.set_params(param_grid)

        # Train the CRF model
        trainer.train('pos.crfsuite')

        # Testing: always release the model file, so the next repetition can
        # overwrite it.
        tagger = pycrfsuite.Tagger()
        tagger.open('pos.crfsuite')
        try:
            CRF_predictions = [tagger.tag(instance) for instance in X_test]
        finally:
            tagger.close()

        CRF_flat_predictions = [tag for instance_tags in CRF_predictions for tag in instance_tags]
        CRF_flat_ground_truth = [tag for instance_tags in y_test for tag in instance_tags]

        # test score
        f1 = f1_score(CRF_flat_ground_truth, CRF_flat_predictions, average='weighted')
        f1_scores.append(f1)

    return f1_scores, split_counts, join_counts




Collecting all results


In [15]:
# random.seed(1234)

# unigram_clean, unigram_clean_splits, unigram_clean_joins = train_and_evaluate_unigram(data=model_set, num_repetitions=5, joinprob=0,splitprob=0)
# unigram_low, unigram_low_splits, unigram_low_joins = train_and_evaluate_unigram(data=model_set, num_repetitions=5, joinprob=0.1,splitprob=0.05)
# unigram_mid, unigram_mid_splits, unigram_mid_joins = train_and_evaluate_unigram(data=model_set, num_repetitions=5, joinprob=0.3,splitprob=0.11)
# unigram_high, unigram_high_splits, unigram_high_joins = train_and_evaluate_unigram(data=model_set, num_repetitions=5, joinprob=0.6,splitprob=0.33)

# brill_clean, brill_clean_splits, brill_clean_joins = train_and_evaluate_brill(data=model_set, num_repetitions=5, joinprob=0,splitprob=0)
# brill_low, brill_low_splits, brill_low_joins = train_and_evaluate_brill(data=model_set, num_repetitions=5, joinprob=0.1,splitprob=0.05)
# brill_mid, brill_mid_splits, brill_mid_joins = train_and_evaluate_brill(data=model_set, num_repetitions=5, joinprob=0.3,splitprob=0.11)
# brill_high, brill_high_splits, brill_high_joins = train_and_evaluate_brill(data=model_set, num_repetitions=5, joinprob=0.6,splitprob=0.33)


# hmm_clean, hmm_clean_splits, hmm_clean_joins = train_and_evaluate_hmm(data=model_set, num_repetitions=5, joinprob=0,splitprob=0)
# hmm_low, hmm_low_splits, hmm_low_joins = train_and_evaluate_hmm(data=model_set, num_repetitions=5, joinprob=0.1,splitprob=0.05)
# hmm_mid, hmm_mid_splits, hmm_mid_joins = train_and_evaluate_hmm(data=model_set, num_repetitions=5, joinprob=0.3,splitprob=0.11)
# hmm_high, hmm_high_splits, hmm_high_joins = train_and_evaluate_hmm(data=model_set, num_repetitions=5, joinprob=0.6,splitprob=0.33)

# memm_clean, memm_clean_splits, memm_clean_joins = train_and_evaluate_memm(data=model_set, num_repetitions=5, joinprob=0,splitprob=0, max_iters=best_max_iter)
# memm_low, memm_low_splits, memm_low_joins = train_and_evaluate_memm(data=model_set, num_repetitions=5, joinprob=0.1,splitprob=0.05, max_iters=best_max_iter)
# memm_mid, memm_mid_splits, memm_mid_joins = train_and_evaluate_memm(data=model_set, num_repetitions=5, joinprob=0.3,splitprob=0.11, max_iters=best_max_iter)
# memm_high, memm_high_splits, memm_high_joins = train_and_evaluate_memm(data=model_set, num_repetitions=5, joinprob=0.6,splitprob=0.33, max_iters=best_max_iter)

# crf_clean, crf_clean_splits, crf_clean_joins = train_and_evaluate_crf(data=model_set, num_repetitions=5, joinprob=0,splitprob=0, param_grid=best_param)
# crf_low, crf_low_splits, crf_low_joins = train_and_evaluate_crf(data=model_set, num_repetitions=5, joinprob=0.1,splitprob=0.05, param_grid=best_param)
# crf_mid, crf_mid_splits, crf_mid_joins = train_and_evaluate_crf(data=model_set, num_repetitions=5, joinprob=0.3,splitprob=0.11, param_grid=best_param)
# crf_high, crf_high_splits, crf_high_joins = train_and_evaluate_crf(data=model_set, num_repetitions=5, joinprob=0.6,splitprob=0.33, param_grid=best_param)

In [16]:
# f1_scores = {
#     "hmm_clean": hmm_clean,
#     "hmm_low": hmm_low,
#     "hmm_mid": hmm_mid,
#     "hmm_high": hmm_high,
#     "memm_clean": memm_clean,
#     "memm_low": memm_low,
#     "memm_mid": memm_mid,
#     "memm_high": memm_high,
#     "crf_clean": crf_clean,
#     "crf_low": crf_low,
#     "crf_mid": crf_mid,
#     "crf_high": crf_high,
#     "unigram_clean": unigram_clean,
#     "unigram_low": unigram_low,
#     "unigram_mid": unigram_mid,
#     "unigram_high": unigram_high,
#     "brill_clean": brill_clean,
#     "brill_low": brill_low,
#     "brill_mid": brill_mid,
#     "brill_high": brill_high
# }

# with open('insert file path here', "w") as file:
#     # loop variable is `scores`, not `f1_score`, so the imported metric function is not shadowed
#     for model_threshold, scores in f1_scores.items():
#         file.write(f"{model_threshold}: {scores}\n")

In [18]:
# with open('insert file path', "w") as file:
#     for param, paramval in best_param.items():
#         file.write(f"{param}: {paramval}\n")

In [19]:
# max_it = {
#     "max_it": best_max_iter
# }

# with open('insert file path here', "w") as file:
#     for param, paramval in max_it.items():
#         file.write(f"{param}: {paramval}\n")