# Preprocessing

## Import dataset

In [49]:
import pandas as pd

data = pd.read_csv('datasets/training_data.csv')
data.sample(5)

Unnamed: 0,text,label
59375,"Yankees 10, Red Sox 7: Just a typical chapter ...",1
54767,Alpharma Sells Generic of Pfizer Drug NEW YOR...,2
34472,Guerrero Beaned During Angels Game (Reuters) R...,1
11645,"O Romeo, art thou a good Republican? Romance f...",0
71008,"21 die, 2000 injured in Japan #39;s quake Loca...",0


In [50]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

# Cleanup Text

In [51]:
import re

In [52]:
# Not perfect, there's instances like '(Reuters)' that are not removed
def remove_reuters(text):
    pattern = r'\((\w+)\) \1+'
    return re.sub(pattern, '', text)

In [53]:
def remove_links(text):
    # Example
    # &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=BNNY.OB target=/stocks/quickinfo/fullquote""&gt;BNNY.OB&lt;/A&gt;
    pattern = r'&lt;A HREF=.*?&gt;(.*?)&lt;/A&gt;'
    return re.sub(pattern, '', text)

In [54]:
import html

def remove_html_tags(text):
    decoded_text = html.unescape(text)
    pattern = r'<.*?>'
    return re.sub(pattern, '', decoded_text)

In [65]:
def remove_quotes(text):
    pattern = r'quot;'
    return re.sub(pattern, '', text)

In [67]:
data['text'] = data['text'].apply(remove_links)
data['text'] = data['text'].apply(remove_reuters)
data['text'] = data['text'].apply(remove_html_tags)
data['text'] = data['text'].apply(remove_quotes)

"Somewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner.  Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn."

In [57]:
data.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black - Sho...,2
1,Carlyle Looks Toward Commercial Aerospace - P...,2
2,Oil and Economy Cloud Stocks' Outlook - Soari...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


## Spacy

In [56]:
import spacy
nlp = spacy.load("en_core_web_md")

# Customize pipeline
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

# nlp.enable_pipe('senter')

In [57]:
# TODO: process all data
small_data = data.head(1000)
small_data['tokens'] = small_data['text'].apply(nlp)
small_data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens'] = small_data['text'].apply(nlp)


Unnamed: 0,text,label,tokens
436,Columnists: The Panacea of Information Securit...,3,"(Columnists, :, The, Panacea, of, Information,..."
952,U.N. Official Urges Political Independence - ...,0,"(U.N., Official, Urges, Political, Independenc..."
777,Rescue Teams Aid Hurricane Charley Victims PUN...,0,"(Rescue, Teams, Aid, Hurricane, Charley, Victi..."
996,Dollar Rises Vs Euro After Asset Data NEW YOR...,2,"(Dollar, Rises, Vs, Euro, After, Asset, Data, ..."
669,The Region's Highest-Paid Executives Pay for t...,2,"(The, Region, 's, Highest, -, Paid, Executives..."


In [59]:
small_data.iloc[0]['text']

"Wall St. Bears Claw Back Into the Black  - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [42]:
small_data['tokens_count'] = small_data['tokens'].apply(len)
small_data['tokens_count'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens_count'] = small_data['tokens'].apply(len)


count    1000.000000
mean       46.372000
std        20.876864
min        14.000000
25%        31.000000
50%        44.000000
75%        55.000000
max       152.000000
Name: tokens_count, dtype: float64

In [43]:
def filter_tokens(tokens):
    tokens_without_punctuation = [token for token in tokens if not token.is_punct]
    tokens_without_space = [token for token in tokens_without_punctuation if not token.is_space]
    tokens_without_stopwords = [token for token in tokens_without_space if not token.is_stop]
    return tokens_without_stopwords

In [44]:
def filter_text(text):
    tokens_lemmatized = [token.lemma_ for token in text]
    tokens_lower = [token.lower() for token in tokens_lemmatized]
    return ' '.join(tokens_lower)

In [45]:
def text_embeddings(text):
    token_embeddings = [token.vector for token in text]
    return token_embeddings

In [46]:
def text_ner(text):
    return [(token, token.pos_, token.ent_iob_, token.ent_type_) for token in text]

In [11]:
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
small_data['text_embeddings'] = small_data['tokens_filtered'].apply(text_embeddings)
small_data['text_ner'] = small_data['tokens_filtered'].apply(text_ner)
small_data.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['text_embeddings'] = small_data['tokens_f

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner
200,Video games 'good for children' Computer games...,3,"(Video, games, ', good, for, children, ', Comp...",26,"[Video, games, good, children, Computer, games...",video game good child computer game promote pr...,"[[-0.057557, -1.7326, 4.388, 2.1707, -1.1465, ...","[(Video, NOUN, O, ), (games, NOUN, O, ), (good..."
320,"Lenovo revenue grows, but problems persist Chi...",3,"(Lenovo, revenue, grows, ,, but, problems, per...",24,"[Lenovo, revenue, grows, problems, persist, Ch...",lenovo revenue grow problem persist china larg...,"[[0.92553, 2.4457, -0.12281, 3.1267, 0.7986, 2...","[(Lenovo, PROPN, B, ORG), (revenue, NOUN, O, )..."
134,U.S. Barred From Weakening Dolphin Rules (AP) ...,3,"(U.S., Barred, From, Weakening, Dolphin, Rules...",50,"[U.S., Barred, Weakening, Dolphin, Rules, AP, ...",u.s. bar weaken dolphin rules ap ap victory en...,"[[-3.8398, 2.7468, -5.5801, 6.5437, 7.1942, 2....","[(U.S., PROPN, B, GPE), (Barred, VERB, O, ), (..."
679,Swimming showdown Ian Thorpe and Michael Phelp...,0,"(Swimming, showdown, Ian, Thorpe, and, Michael...",24,"[Swimming, showdown, Ian, Thorpe, Michael, Phe...",swimming showdown ian thorpe michael phelps ch...,"[[2.9178, -2.1378, 0.85246, -4.7337, 1.4823, 0...","[(Swimming, NOUN, O, ), (showdown, NOUN, O, ),..."
629,Millions Wait Hours in Venezuela to Vote in Re...,0,"(Millions, Wait, Hours, in, Venezuela, to, Vot...",35,"[Millions, Wait, Hours, Venezuela, Vote, Recal...",million wait hour venezuela vote recall electi...,"[[-2.9262, -2.5402, -0.27415, 4.2282, -0.94817...","[(Millions, NOUN, B, CARDINAL), (Wait, VERB, O..."


In [12]:
def process_entities(text_filtered):
    entity_dict = {}
    doc = nlp(text_filtered)
    for ent in doc.ents:
        if str(ent) not in entity_dict:
            entity_dict[ent.lemma_] = (ent.root.pos_, ent.label_)

    # non_entity_strings = [token for token in doc 
    #                     if token.text not in entity_dict 
    #                     and token.ent_iob_ == "O"
    #                     and token.pos_ != 'SPACE']
    # entity_dict.update({token.lemma_: (token.pos_, None) for token in non_entity_strings})

    return entity_dict

small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)


In [13]:
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner,entity_dict
954,AP: Group Discovers John the Baptist Cave KIBB...,0,"(AP, :, Group, Discovers, John, the, Baptist, ...",90,"[AP, Group, Discovers, John, Baptist, Cave, KI...",ap group discovers john baptist cave kibbutz t...,"[[0.73506, -0.13789, 9.1416, 11.004, 3.2473, 0...","[(AP, NOUN, B, ORG), (Group, PROPN, B, ORG), (...","{'ap': ('PROPN', 'ORG'), 'john baptist': ('PRO..."
869,"Letter, suspicious powder spark anthrax scare ...",0,"(Letter, ,, suspicious, powder, spark, anthrax...",67,"[Letter, suspicious, powder, spark, anthrax, s...",letter suspicious powder spark anthrax scare u...,"[[-2.215, -1.9555, 2.0093, 0.67661, 2.7137, -0...","[(Letter, NOUN, O, ), (suspicious, ADJ, O, ), ...","{'u.s.': ('PROPN', 'GPE'), 'malaysia': ('PROPN..."
199,3D holograms to crack forgeries A 3D hologram ...,3,"(3D, holograms, to, crack, forgeries, A, 3D, h...",21,"[3D, holograms, crack, forgeries, 3D, hologram...",3d hologram crack forgery 3d hologram techniqu...,"[[-6.4571, -2.9551, -6.4594, -1.8931, -2.8373,...","[(3D, ADJ, O, ), (holograms, NOUN, O, ), (crac...",{}
154,Experts Downplay Texas Shark Attacks (AP) AP -...,3,"(Experts, Downplay, Texas, Shark, Attacks, (, ...",51,"[Experts, Downplay, Texas, Shark, Attacks, AP,...",expert downplay texas shark attacks ap ap shar...,"[[2.9159, -0.21193, 0.47963, 2.3368, -0.6856, ...","[(Experts, NOUN, O, ), (Downplay, PROPN, O, ),...","{'texas': ('PROPN', 'GPE'), 'ap ap': ('PROPN',..."
783,Mass. Republicans Eye Kerry's Senate Seat (AP)...,0,"(Mass., Republicans, Eye, Kerry, 's, Senate, S...",67,"[Mass., Republicans, Eye, Kerry, Senate, Seat,...",mass. republicans eye kerry senate seat ap ap ...,"[[-0.3604, -0.72662, 1.7799, 5.8378, 1.6046, -...","[(Mass., PROPN, B, GPE), (Republicans, PROPN, ...","{'republicans': ('PROPN', 'NORP'), 'kerry sena..."


In [14]:
# get the filtered text and tokens in its own dataframe and save to csv
small_data_filtered = small_data[['text_filtered', 'label',]]
small_data_filtered.to_csv('datasets/small_data_filtered.csv', index=False)

# Model Training and predictions

In [15]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix

# split the data into training and testing sets
train, test = train_test_split(small_data, test_size=0.2, random_state=42)

train_features = []
train_features_embeddes = []
train_features_text_and_tokens = []
train_features_entities = []
train_features_no_er = []
train_labels = []
test_features = []
test_features_embeddes = []
test_features_text_and_tokens = []
test_features_entities = []
test_features_no_er = []
test_labels = []

for index, row in train.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    # All features
    features = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    train_features.append(features)
    
    # Only embeddings
    features_embedds = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
    } 
    train_features_embeddes.append(features_embedds)
    
    # Only text and tokens
    features_text_and_tokens = {
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    train_features_text_and_tokens.append(features_text_and_tokens)
    
    # Only entities
    features_entities = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
    }
    train_features_entities.append(features_entities)
    
    # No NER
    features_no_ner = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    } 
    train_features_no_er.append(features_no_ner)
    
    train_labels.append(label)
    
for index, row in test.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    features_test = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    test_features.append(features_test)
    
    # Only embeddings
    features_embedds = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
    } 
    test_features_embeddes.append(features_embedds)
    
    # Only text and tokens
    features_text_and_tokens = {
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    test_features_text_and_tokens.append(features_text_and_tokens)
    
    # Only entities
    features_entities = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
    }
    test_features_entities.append(features_entities)
    
    # No NER
    features_no_ner = {
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    } 
    test_features_no_er.append(features_no_ner)
    test_labels.append(label)

In [16]:
from sklearn.model_selection import GridSearchCV

def modelFit(model, param_grid, train_features, test_features):
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model)
    ])

    # Add max_iter to param_grid if the model supports it
    try:
        if hasattr(model, 'max_iter'):
            param_grid['classifier__max_iter'] = [6000]  # or whatever values you want
    except Exception as e:
        print(f"An error occurred when setting max_iter: {e}")


    # Create the GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

    # Fit the GridSearchCV object
    grid_search.fit(train_features, train_labels)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict the test data
    preds = best_model.predict(test_features)

    # Calculate the accuracy
    accuracy = accuracy_score(test_labels, preds)
    f1 = f1_score(test_labels, preds, average='weighted')
    precision = precision_score(test_labels, preds, average='weighted')
    recall = recall_score(test_labels, preds, average='weighted')

    results = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': confusion_matrix(test_labels, preds),
        'best_params': grid_search.best_params_
    }

    return results

___
## NB

### All features

In [17]:
results = modelFit(MultinomialNB(), {}, train_features, test_features)
print(results)

{'accuracy': 0.805, 'f1': 0.8008366169234516, 'precision': 0.8027972984394132, 'recall': 0.805, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 22, 15],
       [ 7,  1,  6, 83]], dtype=int64), 'best_params': {}}


### Only text and tokens

In [18]:
results = modelFit(MultinomialNB(), {}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

{'accuracy': 0.78, 'f1': 0.7753124338624338, 'precision': 0.7794217341078741, 'recall': 0.78, 'confusion_matrix': array([[31,  5,  0,  3],
       [ 2, 23,  1,  1],
       [ 0,  0, 20, 17],
       [ 9,  1,  5, 82]], dtype=int64), 'best_params': {}}


### Only entities and PoS

In [19]:
results = modelFit(MultinomialNB(), {}, train_features_entities, test_features_entities)
print(results)

{'accuracy': 0.48, 'f1': 0.3145945945945946, 'precision': 0.23396984924623115, 'recall': 0.48, 'confusion_matrix': array([[ 0,  0,  0, 39],
       [ 0,  0,  0, 27],
       [ 0,  0,  0, 37],
       [ 1,  0,  0, 96]], dtype=int64), 'best_params': {}}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Only Dense Embeddings

In [20]:
results = modelFit(MultinomialNB(), {}, train_features_embeddes, test_features_embeddes)
print(results)

{'accuracy': 0.825, 'f1': 0.8210554285168502, 'precision': 0.8246121873815838, 'recall': 0.825, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 23, 14],
       [ 5,  1,  5, 86]], dtype=int64), 'best_params': {}}


### No NER

In [21]:
results = modelFit(MultinomialNB(), {}, train_features_no_er, test_features_no_er)
print(results)

{'accuracy': 0.805, 'f1': 0.8008366169234516, 'precision': 0.8027972984394132, 'recall': 0.805, 'confusion_matrix': array([[30,  5,  0,  4],
       [ 0, 26,  1,  0],
       [ 0,  0, 22, 15],
       [ 7,  1,  6, 83]], dtype=int64), 'best_params': {}}


___
## LR

### All features

In [22]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features, test_features)
print(results)



### Only text and tokens

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)



{'accuracy': 0.79, 'f1': 0.7817663323393873, 'precision': 0.796355376344086, 'recall': 0.79, 'confusion_matrix': array([[27,  3,  2,  7],
       [ 1, 21,  1,  4],
       [ 0,  0, 19, 18],
       [ 3,  0,  3, 91]]), 'best_params': {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}}




### Only entities and PoS

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_entities, test_features_entities)
print(results)



{'accuracy': 0.5, 'f1': 0.3550016705646509, 'precision': 0.4237692307692308, 'recall': 0.5, 'confusion_matrix': array([[ 0,  0,  0, 39],
       [ 0,  0,  0, 27],
       [ 0,  0,  4, 33],
       [ 1,  0,  0, 96]]), 'best_params': {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}}


  _warn_prf(average, modifier, msg_start, len(result))


### Only dense embeddings

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_embeddes, test_features_embeddes)
print(results)



KeyboardInterrupt: 

### No NER

In [None]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}, train_features_no_er, test_features_no_er)
print(results)

___
## DTC

### All features

In [None]:
from sklearn.tree import DecisionTreeClassifier

results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entites and PoS

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_entities, test_features_entities)
print(results)

### Only dense embeddings

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    }, train_features_no_er, test_features_no_er)
print(results)

___
## RF

### All features

In [None]:
from sklean.ensemble import RandomForestClassifier

results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_no_er, test_features_no_er)

___
## SVM

### All features

In [None]:
from sklearn.svm import SVC

results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_no_er, test_features_no_er)

___
## XGBoost

### All features

In [None]:
from xgboost import XGBClassifier

results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features, test_features)

### Only text and tokens

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_text_and_tokens, test_features_text_and_tokens)

### Only entities and PoS

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_entities, test_features_entities)

### Only dense embeddings

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_embeddes, test_features_embeddes)

### No NER

In [None]:
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_no_er, test_features_no_er)

---

## Gradient Boosting

### All features

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(GradientBoostingClassifier(), {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7]
}, train_features_no_er, test_features_no_er)
print(results)

## K-Nearest Neighbors

### All features

In [None]:
from sklearn.neighbors import KNeighborsClassifier

results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(KNeighborsClassifier(), {
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}, train_features_no_er, test_features_no_er)
print(results)

## Multi-Layer Perc

### All features

In [None]:
from sklearn.neural_network import MLPClassifier

results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features, test_features)
print(results)

### Only text and tokens

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_text_and_tokens, test_features_text_and_tokens)
print(results)

### Only entities and PoS

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_entities, test_features_entities)
print(results)

### Only Dense Embeddings

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_embeddes, test_features_embeddes)
print(results)

### No NER

In [None]:
results = modelFit(MLPClassifier(), {
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate': ['constant', 'adaptive']
}, train_features_no_er, test_features_no_er)
print(results)