# Preprocessing

## Import dataset

In [1]:
import pandas as pd

data = pd.read_csv('datasets/training_data.csv')
data.sample(5)

Unnamed: 0,text,label
64298,Disney shareholders to get their day in court ...,2
44825,Bad case of shakes As long as the last ember f...,1
117531,College Notebook: Ole Miss to hire coach Missi...,1
2900,Cycling: Hamilton wins time trial VOULIAGMENI ...,1
80332,Law out a month to seven weeks The Patriots ar...,1


In [2]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

## Spacy

In [3]:
import spacy
nlp = spacy.load("en_core_web_md")

# Customize pipeline
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

# nlp.enable_pipe('senter')

In [4]:
# TODO fix regex to remove instances like '(Reuters) Reuters'
# import re
# from spacy.tokens import Doc
# from spacy.language import Language

# @Language.component("regex_cleanup")
# def regex_cleanup(doc):
#     cleaned_tokens = []
#     for token in doc:
#         if not re.match(r'((\w+))', token.text):
#             cleaned_tokens.append(token)
#     return Doc(doc.vocab, words=[token.text for token in cleaned_tokens])

# nlp.add_pipe("regex_cleanup", first=True)

In [5]:
# TODO: process all data
small_data = data.head(1000)
small_data['tokens'] = small_data['text'].apply(nlp)
small_data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens'] = small_data['text'].apply(nlp)


Unnamed: 0,text,label,tokens
923,South African telephone monopoly to go ahead w...,3,"(South, African, telephone, monopoly, to, go, ..."
434,Infocus: Data Driven Attacks Using HTTP Tunnel...,3,"(Infocus, :, Data, Driven, Attacks, Using, HTT..."
970,Microsoft ships updated Works Version 8 of the...,3,"(Microsoft, ships, updated, Works, Version, 8,..."
441,Apple Introduces Production Suite Production S...,3,"(Apple, Introduces, Production, Suite, Product..."
840,Ex-Chess Champion Fischer to Marry Japanese Wo...,0,"(Ex, -, Chess, Champion, Fischer, to, Marry, J..."


In [6]:
small_data['tokens_count'] = small_data['tokens'].apply(len)
small_data['tokens_count'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens_count'] = small_data['tokens'].apply(len)


count    1000.000000
mean       47.046000
std        20.916486
min        14.000000
25%        32.000000
50%        46.000000
75%        56.000000
max       152.000000
Name: tokens_count, dtype: float64

In [7]:
def filter_tokens(tokens):
    tokens_without_punctuation = [token for token in tokens if not token.is_punct]
    tokens_without_space = [token for token in tokens_without_punctuation if not token.is_space]
    tokens_without_stopwords = [token for token in tokens_without_space if not token.is_stop]
    return tokens_without_stopwords

In [8]:
def filter_text(text):
    tokens_lemmatized = [token.lemma_ for token in text]
    tokens_lower = [token.lower() for token in tokens_lemmatized]
    return ' '.join(tokens_lower)

In [9]:
def text_embeddings(text):
    token_embeddings = [token.vector for token in text]
    return token_embeddings

In [10]:
def text_ner(text):
    return [(token, token.pos_, token.ent_iob_, token.ent_type_) for token in text]

In [11]:
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
small_data['text_embeddings'] = small_data['tokens_filtered'].apply(text_embeddings)
small_data['text_ner'] = small_data['tokens_filtered'].apply(text_ner)
small_data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['text_embeddings'] = small_data['tokens_f

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner
219,BioVeris Settles 2 Lawsuits Against Chief Exec...,3,"(BioVeris, Settles, 2, Lawsuits, Against, Chie...",61,"[BioVeris, Settles, 2, Lawsuits, Chief, Execut...",bioveris settles 2 lawsuit chief executive son...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(BioVeris, PROPN, B, ORG), (Settles, PROPN, I..."
479,"Hungarian GP, Qualifying Fifth and ninth for F...",1,"(Hungarian, GP, ,, Qualifying, Fifth, and, nin...",20,"[Hungarian, GP, Qualifying, Fifth, ninth, Fern...",hungarian gp qualifying fifth ninth fernando j...,"[[-5.9302, -2.4201, 1.0303, 0.72541, 5.4153, 1...","[(Hungarian, ADJ, B, NORP), (GP, NOUN, O, ), (..."
964,No Respite for Microsoft European antitrust re...,2,"(No, Respite, for, Microsoft, European, antitr...",48,"[Respite, Microsoft, European, antitrust, regu...",respite microsoft european antitrust regulator...,"[[-2.2872, -1.9349, 1.0128, -0.95223, 7.1268, ...","[(Respite, PROPN, O, ), (Microsoft, PROPN, B, ..."
428,Google Index Database to be Archived? Google I...,3,"(Google, Index, Database, to, be, Archived, ?,...",62,"[Google, Index, Database, Archived, Google, In...",google index database archive google index dat...,"[[1.7625, -0.28477, 3.955, 4.0928, -1.0044, 2....","[(Google, PROPN, B, ORG), (Index, PROPN, O, ),..."
697,NYMEX Crude Hits Record \$46.76 SINGAPORE (Re...,2,"(NYMEX, Crude, Hits, Record, \$46.76, , SINGA...",53,"[NYMEX, Crude, Hits, Record, \$46.76, SINGAPOR...",nymex crude hits record \$46.76 singapore reut...,"[[0.21921, 2.4813, -4.0682, -1.905, 2.098, 3.0...","[(NYMEX, NOUN, O, ), (Crude, PROPN, O, ), (Hit..."


In [12]:
def process_entities(text_filtered):
    entity_dict = {}
    doc = nlp(text_filtered)
    for ent in doc.ents:
        if str(ent) not in entity_dict:
            entity_dict[ent.lemma_] = (ent.root.pos_, ent.label_)

    # non_entity_strings = [token for token in doc 
    #                     if token.text not in entity_dict 
    #                     and token.ent_iob_ == "O"
    #                     and token.pos_ != 'SPACE']
    # entity_dict.update({token.lemma_: (token.pos_, None) for token in non_entity_strings})

    return entity_dict

small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)


In [13]:
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner,entity_dict
680,"Puerto Rico Stuns Dream Team, 92-73 ATHENS, Gr...",0,"(Puerto, Rico, Stuns, Dream, Team, ,, 92, -, 7...",61,"[Puerto, Rico, Stuns, Dream, Team, 92, 73, ATH...",puerto rico stuns dream team 92 73 athens gree...,"[[-4.6394, 0.89256, -2.2244, -1.8791, 6.3153, ...","[(Puerto, PROPN, B, ORG), (Rico, PROPN, I, ORG...","{'puerto rico': ('PROPN', 'GPE'), '92': ('NUM'..."
502,Kerry leading Bush in key swing states (AFP) A...,0,"(Kerry, leading, Bush, in, key, swing, states,...",47,"[Kerry, leading, Bush, key, swing, states, AFP...",kerry lead bush key swing state afp afp poll p...,"[[-2.0371, -2.4904, 0.5559, 0.77599, 2.3708, -...","[(Kerry, PROPN, B, PERSON), (leading, VERB, O,...","{'kerry': ('PROPN', 'PERSON'), 'bush': ('PROPN..."
671,Lucrative Cash Package Came as Fairchild Repor...,2,"(Lucrative, Cash, Package, Came, as, Fairchild...",36,"[Lucrative, Cash, Package, Came, Fairchild, Re...",lucrative cash package come fairchild reported...,"[[-0.62464, -0.69781, -0.76776, 3.1496, 2.1022...","[(Lucrative, ADJ, O, ), (Cash, PROPN, O, ), (P...","{'\$53.2': ('PROPN', 'ORG'), 'jeffrey j. stein..."
898,News: New Hypoxic Event Found Off Oregon Coast...,3,"(News, :, New, Hypoxic, Event, Found, Off, Ore...",70,"[News, New, Hypoxic, Event, Found, Oregon, Coa...",news new hypoxic event find oregon coast secon...,"[[7.0608, -7.8137, 0.15888, 0.86217, 4.5475, -...","[(News, NOUN, O, ), (New, PROPN, O, ), (Hypoxi...","{'oregon': ('PROPN', 'GPE'), 'second': ('ADJ',..."
728,"DiMarco, Riley Get on Ryder Cup Team (AP) AP -...",1,"(DiMarco, ,, Riley, Get, on, Ryder, Cup, Team,...",44,"[DiMarco, Riley, Ryder, Cup, Team, AP, AP, Hal...",dimarco riley ryder cup team ap ap hal sutton ...,"[[-5.8441, -0.62597, 3.0768, -1.1331, 0.77332,...","[(DiMarco, PROPN, B, PERSON), (Riley, PROPN, B...",{'dimarco riley ryder cup team ap ap hal sutto...


In [14]:
# get the filtered text and tokens in its own dataframe and save to csv
small_data_filtered = small_data[['text_filtered', 'label',]]
small_data_filtered.to_csv('datasets/small_data_filtered.csv', index=False)

# Model Training and predictions

In [15]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix

# split the data into training and testing sets
train, test = train_test_split(small_data, test_size=0.2, random_state=42)

train_features = []
train_features_embeddes = []
train_features_text_and_tokens = []
train_features_entities = []
train_features_no_er = []
train_labels = []
test_features = []
test_features_embeddes = []
test_features_text_and_tokens = []
test_features_entities = []
test_features_no_er = []
test_labels = []

for index, row in train.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    # All features
    features = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    train_features.append(features)
    
    # Only embeddings
    features_embedds = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
    } 
    train_features_embeddes.append(features_embedds)
    
    # Only text and tokens
    features_text_and_tokens = {
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    train_features_text_and_tokens.append(features_text_and_tokens)
    
    # Only entities
    features_entities = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
    }
    train_features_entities.append(features_entities)
    
    # No NER
    features_no_ner = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    } 
    train_features_no_er.append(features_no_ner)
    
    train_labels.append(label)
    
for index, row in test.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    features_test = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    test_features.append(features_test)
    
    # Only embeddings
    features_embedds = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
    } 
    test_features_embeddes.append(features_embedds)
    
    # Only text and tokens
    features_text_and_tokens = {
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    test_features_text_and_tokens.append(features_text_and_tokens)
    
    # Only entities
    features_entities = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
    }
    test_features_entities.append(features_entities)
    
    # No NER
    features_no_ner = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    } 
    test_features_no_er.append(features_no_ner)
    test_labels.append(label)

In [16]:
from sklearn.model_selection import GridSearchCV

def modelFit(model, param_grid, train_features, test_features):
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model)
    ])

    # Add max_iter to param_grid if the model supports it
    try:
        if hasattr(model, 'max_iter'):
            param_grid['classifier__max_iter'] = [6000]  # or whatever values you want
    except Exception as e:
        print(f"An error occurred when setting max_iter: {e}")


    # Create the GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

    # Fit the GridSearchCV object
    grid_search.fit(train_features, train_labels)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict the test data
    preds = best_model.predict(test_features)

    # Calculate the accuracy
    accuracy = accuracy_score(test_labels, preds)
    f1 = f1_score(test_labels, preds, average='weighted')
    precision = precision_score(test_labels, preds, average='weighted')
    recall = recall_score(test_labels, preds, average='weighted')

    results = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': confusion_matrix(test_labels, preds),
        'best_params': grid_search.best_params_
    }

    return results

___
## NB

### All features

In [17]:
results = modelFit(MultinomialNB(), {}, train_features, test_features)
print(results)

{'accuracy': 0.805, 'f1': 0.8008366169234518, 'precision': 0.8027972984394132, 'recall': 0.805, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 22, 15],
       [ 7,  1,  6, 83]]), 'best_params': {}}


### Only text and tokens

In [18]:
results = modelFit(MultinomialNB(), {}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

{'accuracy': 0.78, 'f1': 0.7753124338624338, 'precision': 0.7794217341078741, 'recall': 0.78, 'confusion_matrix': array([[31,  5,  0,  3],
       [ 2, 23,  1,  1],
       [ 0,  0, 20, 17],
       [ 9,  1,  5, 82]]), 'best_params': {}}


### Only entities and PoS

In [19]:
results = modelFit(MultinomialNB(), {}, train_features_entities, test_features_entities)
print(results)

{'accuracy': 0.48, 'f1': 0.3145945945945946, 'precision': 0.23396984924623115, 'recall': 0.48, 'confusion_matrix': array([[ 0,  0,  0, 39],
       [ 0,  0,  0, 27],
       [ 0,  0,  0, 37],
       [ 1,  0,  0, 96]]), 'best_params': {}}


  _warn_prf(average, modifier, msg_start, len(result))


### Only Dense Embeddings

In [20]:
results = modelFit(MultinomialNB(), {}, train_features_embeddes, test_features_embeddes)
print(results)

{'accuracy': 0.825, 'f1': 0.8210554285168502, 'precision': 0.8246121873815838, 'recall': 0.825, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 23, 14],
       [ 5,  1,  5, 86]]), 'best_params': {}}


### No NER

In [21]:
results = modelFit(MultinomialNB(), {}, train_features_no_er, test_features_no_er)
print(results)

{'accuracy': 0.805, 'f1': 0.8008366169234518, 'precision': 0.8027972984394132, 'recall': 0.805, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 22, 15],
       [ 7,  1,  6, 83]]), 'best_params': {}}


___
## LR

### All features

In [22]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)



{'accuracy': 0.79, 'f1': 0.7817663323393873, 'precision': 0.796355376344086, 'recall': 0.79, 'confusion_matrix': array([[27,  3,  2,  7],
       [ 1, 21,  1,  4],
       [ 0,  0, 19, 18],
       [ 3,  0,  3, 91]]), 'best_params': {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}}




### Only entities and PoS

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_entities, test_features_entities)
print(results)



{'accuracy': 0.5, 'f1': 0.3550016705646509, 'precision': 0.4237692307692308, 'recall': 0.5, 'confusion_matrix': array([[ 0,  0,  0, 39],
       [ 0,  0,  0, 27],
       [ 0,  0,  4, 33],
       [ 1,  0,  0, 96]]), 'best_params': {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}}


  _warn_prf(average, modifier, msg_start, len(result))


### Only dense embeddings

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_embeddes, test_features_embeddes)
print(results)



KeyboardInterrupt: 

### No NER

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_no_er, test_features_no_er)
print(results)

___
## DTC

### All features

In [None]:
from sklearn.tree import DecisionTreeClassifier

results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entites and PoS

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_entities, test_features_entities)
print(results)

### Only dense embeddings

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_no_er, test_features_no_er)
print(results)

___
## RF

### All features

In [None]:
from sklean.ensemble import RandomForestClassifier

results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_no_er, test_features_no_er)

___
## SVM

### All features

In [None]:
from sklearn.svm import SVC

results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_no_er, test_features_no_er)

___
## Multi-Layer Perceptron

### All features

In [None]:
from sklearn.neural_network import MLPClassifier

results = modelFit(MLPClassifier(), {
        'classifier__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu']
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(MLPClassifier(), {
        'classifier__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu']
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(MLPClassifier(), {
        'classifier__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu']
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(MLPClassifier(), {
        'classifier__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu']
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(MLPClassifier(), {
        'classifier__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
        'classifier__activation': ['identity', 'logistic', 'tanh', 'relu']
    }, train_features_no_er, test_features_no_er)

___
## XGBoost

### All features

In [None]:
from xgboost import XGBClassifier

results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_no_er, test_features_no_er)

---

# Gradient Boosting

### All features

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_no_er, test_features_no_er)
print(results)

# K-Nearest Neighbors

### All features

In [None]:
from sklearn.neighbors import KNeighborsClassifier

results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_no_er, test_features_no_er)
print(results)

# Neural Networks

### All features

In [None]:
from sklearn.neural_network import MLPClassifier

results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_no_er, test_features_no_er)
print(results)