# Preprocessing

## Import dataset

In [49]:
import pandas as pd

# Load the labelled training set from the repository's datasets folder.
data = pd.read_csv('datasets/training_data.csv')
# Preview five random rows (no seed is passed, so the rows differ on every run).
data.sample(5)

Unnamed: 0,text,label
3050,Stocks Are Up Despite Rising Oil Prices NEW YO...,0
7642,Mexico Arrests Major Drug Trafficker (AP) AP -...,0
52527,US Files Grievance Over Airbus With WTO The Bu...,2
103697,Steelers defense lives up to ranking The Washi...,1
24301,9/11: NY port authority sues Saudi Arabia The ...,0


In [50]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

# Cleanup Text

In [51]:
import re

In [52]:
def remove_reuters(text):
    """Strip a parenthesised agency tag that is immediately repeated, e.g. '(AP) AP'.

    Known limitation: a lone tag such as '(Reuters)' that is not followed by
    the same word again is left in place.
    """
    repeated_tag = re.compile(r'\((\w+)\) \1+')
    return repeated_tag.sub('', text)

In [53]:
def remove_links(text):
    """Delete HTML-escaped anchor elements from ``text``, link text included.

    Example of a span that is removed:
    &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=BNNY.OB target=/stocks/quickinfo/fullquote""&gt;BNNY.OB&lt;/A&gt;
    """
    escaped_anchor = re.compile(r'&lt;A HREF=.*?&gt;(.*?)&lt;/A&gt;')
    return escaped_anchor.sub('', text)

In [54]:
import html

def remove_html_tags(text):
    """Decode HTML entities in ``text`` and then drop every ``<...>`` span."""
    unescaped = html.unescape(text)
    return re.compile(r'<.*?>').sub('', unescaped)

In [65]:
def remove_quotes(text):
    """Delete every literal ``quot;`` fragment (entity residue) from ``text``."""
    return re.sub(pattern=r'quot;', repl='', string=text)

In [67]:
# Run the cleaners in a fixed order: escaped anchors are removed first, while
# they are still in their `&lt;...&gt;` form, before the entities get decoded.
for cleaner in (remove_links, remove_reuters, remove_html_tags, remove_quotes):
    data['text'] = data['text'].apply(cleaner)

"Somewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner.  Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn."

In [57]:
# Preview the first rows of the text/label frame.
data.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black - Sho...,2
1,Carlyle Looks Toward Commercial Aerospace - P...,2
2,Oil and Economy Cloud Stocks' Outlook - Soari...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


## Spacy

In [56]:
import spacy
# Load the `en_core_web_md` pipeline with every component left enabled
# (the pruning experiments below are commented out).
nlp = spacy.load("en_core_web_md")

# Customize pipeline
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

# nlp.enable_pipe('senter')

In [57]:
# TODO: process all data (swap the sampling below for `data.copy()`)

# Draw a class-balanced subset: SAMPLES_PER_LABEL rows from every label.
# Note this is NOT a sample that follows the original label distribution, and
# the total size is SAMPLES_PER_LABEL * number_of_labels.
SAMPLES_PER_LABEL = 10000
SAMPLE_SEED = 42  # fixed so the subset, and every metric below, is reproducible

small_data = (
    data.groupby('label')
    .sample(n=SAMPLES_PER_LABEL, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# nlp.pipe processes the texts in batches, yielding one Doc per text in order;
# this is considerably faster than calling nlp(...) once per row.
small_data['tokens'] = list(nlp.pipe(small_data['text']))
small_data.sample(5)

Unnamed: 0,text,label,tokens
21015,A break with tradition If you think of your bu...,2,"(A, break, with, tradition, If, you, think, of..."
29258,UCB up after deal to sell unit to Cytec for \$...,2,"(UCB, up, after, deal, to, sell, unit, to, Cyt..."
16959,Yankees low key after clinching post-season sp...,1,"(Yankees, low, key, after, clinching, post, -,..."
22759,Future looks bleak for Yukos HAMISH ROBERTSON:...,2,"(Future, looks, bleak, for, Yukos, HAMISH, ROB..."
12916,"Heeling power During every break in play, ever...",1,"(Heeling, power, During, every, break, in, pla..."


In [59]:
# Inspect the text of the first row of the sampled frame.
small_data.iloc[0]['text']

"Wall St. Bears Claw Back Into the Black  - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [42]:
# Store the number of spaCy tokens per document, then summarise the distribution.
small_data['tokens_count'] = small_data['tokens'].map(len)
small_data['tokens_count'].describe()

count    40000.000000
mean        44.057950
std         12.376145
min         12.000000
25%         36.000000
50%         43.000000
75%         50.000000
max        203.000000
Name: tokens_count, dtype: float64

In [43]:
def filter_tokens(tokens):
    """Return the tokens that are neither punctuation, whitespace nor stop words."""
    kept = []
    for tok in tokens:
        if tok.is_punct or tok.is_space or tok.is_stop:
            continue
        kept.append(tok)
    return kept

In [44]:
def filter_text(text):
    """Join the lower-cased lemmas of the tokens in ``text`` with single spaces."""
    return ' '.join(token.lemma_.lower() for token in text)

In [45]:
def text_embeddings(text):
    """Collect the ``vector`` attribute of each token in ``text``, in order."""
    vectors = []
    for token in text:
        vectors.append(token.vector)
    return vectors

In [46]:
def text_ner(text):
    """Return ``(token, pos_, ent_iob_, ent_type_)`` for every token in ``text``."""
    annotations = []
    for token in text:
        annotations.append((token, token.pos_, token.ent_iob_, token.ent_type_))
    return annotations

In [11]:
# Filter the spaCy tokens once, then derive every other column from that filtered list.
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
for column, extractor in (
    ('text_filtered', filter_text),
    ('text_embeddings', text_embeddings),
    ('text_ner', text_ner),
):
    small_data[column] = small_data['tokens_filtered'].apply(extractor)
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner
9305,Greenspan: Oil Price Surge Not Big Issue WASHI...,0,"(Greenspan, :, Oil, Price, Surge, Not, Big, Is...",82,"[Greenspan, Oil, Price, Surge, Big, Issue, WAS...",greenspan oil price surge big issue washington...,"[[-1.4591, -0.059439, 1.4654, 1.8356, -0.20327...","[(Greenspan, PROPN, B, PERSON), (Oil, PROPN, O..."
27505,Pay Up for Growth Legg Mason's Mary Chris Gay ...,2,"(Pay, Up, for, Growth, Legg, Mason, 's, Mary, ...",23,"[Pay, Growth, Legg, Mason, Mary, Chris, Gay, k...",pay growth legg mason mary chris gay know pay ...,"[[1.5095, 4.3979, -0.40606, 0.75999, 1.7007, -...","[(Pay, VERB, O, ), (Growth, PROPN, B, ORG), (L..."
30671,Hubble Space Telescope Spies a Bright Supernov...,3,"(Hubble, Space, Telescope, Spies, a, Bright, S...",48,"[Hubble, Space, Telescope, Spies, Bright, Supe...",hubble space telescope spies bright supernova ...,"[[0.17506, 0.6873, -4.2802, -1.1594, -0.017178...","[(Hubble, PROPN, B, FAC), (Space, PROPN, I, FA..."
7612,Dems Challenge Bush in New Campaign Ad (AP) AP...,0,"(Dems, Challenge, Bush, in, New, Campaign, Ad,...",52,"[Dems, Challenge, Bush, New, Campaign, Ad, AP,...",dems challenge bush new campaign ad ap ap demo...,"[[-1.608, 7.8588, -4.869, 5.3426, 2.8102, 2.08...","[(Dems, PROPN, B, ORG), (Challenge, PROPN, I, ..."
648,Thousands riot after Pakistan funeral SIALKOT...,0,"(Thousands, riot, after, Pakistan, funeral, ,...",54,"[Thousands, riot, Pakistan, funeral, SIALKOT, ...",thousand riot pakistan funeral sialkot pakista...,"[[-1.4431, -1.7198, -0.84067, 2.7954, 0.33977,...","[(Thousands, NOUN, B, CARDINAL), (riot, NOUN, ..."


In [12]:
def process_entities(text_filtered, pipeline=None):
    """Map each named entity found in ``text_filtered`` to its POS and entity label.

    The string is parsed from scratch, so entities are detected on the text
    passed in here rather than taken from an existing Doc.

    Args:
        text_filtered: String to run through the pipeline.
        pipeline: Callable that returns an object exposing ``.ents``. Defaults
            to the notebook-level ``nlp``.

    Returns:
        dict: ``{entity lemma: (POS of the span's root token, entity label)}``.
        When the same lemma occurs more than once, the first occurrence wins.
    """
    parse = nlp if pipeline is None else pipeline
    entity_dict = {}
    for ent in parse(text_filtered).ents:
        # The membership test must use the same key that is stored (the lemma);
        # testing the raw span text let later entities overwrite earlier ones.
        entity_dict.setdefault(ent.lemma_, (ent.root.pos_, ent.label_))
    return entity_dict

# process_entities runs the spaCy pipeline again on every filtered text,
# so expect this cell to take a while on the full sample.
small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

In [13]:
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner,entity_dict
30680,Cisco warns of flaws in ACS product BOSTON - N...,3,"(Cisco, warns, of, flaws, in, ACS, product, BO...",39,"[Cisco, warns, flaws, ACS, product, BOSTON, Ne...",cisco warn flaw acs product boston networking ...,"[[-1.9552, 0.29589, 4.3212, 1.4282, -0.63587, ...","[(Cisco, PROPN, B, ORG), (warns, VERB, O, ), (...","{'cisco': ('PROPN', 'GPE'), 'flaw acs product'..."
19237,"Atlanta Braves, Smoltz Agree to New Deal ATLA...",1,"(Atlanta, Braves, ,, Smoltz, Agree, to, New, D...",62,"[Atlanta, Braves, Smoltz, Agree, New, Deal, AT...",atlanta braves smoltz agree new deal atlanta s...,"[[-1.9546, 2.2159, -0.65238, 2.7719, 3.6104, 1...","[(Atlanta, PROPN, B, ORG), (Braves, PROPN, I, ...","{'atlanta': ('PROPN', 'GPE'), 'atlanta sports ..."
21550,Oracle plots course for growth Oracle #39;s ho...,2,"(Oracle, plots, course, for, growth, Oracle, #...",45,"[Oracle, plots, course, growth, Oracle, 39;s, ...",oracle plot course growth oracle 39;s hostile ...,"[[0.20333, -2.2937, 4.0433, 1.9336, 1.5848, 0....","[(Oracle, NOUN, B, ORG), (plots, NOUN, O, ), (...","{'oracle': ('PROPN', 'ORG'), 'oracle 39;s': ('..."
47,Telstra for sale at \$5 a share TREASURER Pete...,0,"(Telstra, for, sale, at, \$5, a, share, TREASU...",37,"[Telstra, sale, \$5, share, TREASURER, Peter, ...",telstra sale \$5 share treasurer peter costell...,"[[-0.32288, 4.5599, -0.17286, -1.8951, 2.1669,...","[(Telstra, PROPN, B, ORG), (sale, NOUN, O, ), ...","{'telstra': ('PROPN', 'NORP'), '\$5': ('PROPN'..."
555,Kashmir solution still far off despite Musharr...,0,"(Kashmir, solution, still, far, off, despite, ...",54,"[Kashmir, solution, far, despite, Musharraf, g...",kashmir solution far despite musharraf gambit ...,"[[0.20171, 1.6834, -2.7667, 3.8716, 5.7647, 0....","[(Kashmir, PROPN, B, LOC), (solution, NOUN, O,...","{'kashmir': ('PROPN', 'LOC'), 'afp afp': ('PRO..."


In [14]:
# get the filtered text and tokens in its own dataframe and save to csv
#small_data_filtered = small_data[['text_filtered', 'label',]]
#small_data_filtered.to_csv('datasets/small_data_filtered.csv', index=False)

# Model Training and predictions

In [15]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix

# split the data into training and testing sets
train, test = train_test_split(small_data, test_size=0.2, random_state=42)


def _build_feature_sets(frame):
    """Turn every row of ``frame`` into the feature dicts used by the experiments.

    Reads the ``entity_dict``, ``text_embeddings``, ``text_filtered``,
    ``tokens_filtered`` and ``label`` columns of each row.

    Returns:
        tuple: ``(all_features, embeddings_only, text_and_tokens,
        entities_only, no_ner, labels)`` - six lists with one item per row,
        in row order.
    """
    all_features = []
    embeddings_only = []
    text_and_tokens = []
    entities_only = []
    no_ner = []
    labels = []

    for _, row in frame.iterrows():
        # Every value is stringified before it reaches DictVectorizer.
        entity_repr = str(row['entity_dict'])
        # np.array(...).tolist() turns each token vector into a list of Python
        # floats; the string form of that list is the feature value.
        # NOTE(review): this makes each embedding a categorical string rather
        # than a numeric vector - confirm this is intended for the
        # "dense embeddings" experiments.
        embedding_reprs = [str(vector) for vector in np.array(row['text_embeddings']).tolist()]
        text_repr = str(row['text_filtered'])
        token_reprs = [str(token) for token in row['tokens_filtered']]

        all_features.append({
            'entity_dict': entity_repr,
            'word_embeddings': embedding_reprs,
            'text_filtered': text_repr,
            'tokens_filtered': token_reprs,
        })
        embeddings_only.append({
            'word_embeddings': embedding_reprs,
        })
        text_and_tokens.append({
            'text_filtered': text_repr,
            'tokens_filtered': token_reprs,
        })
        entities_only.append({
            'entity_dict': entity_repr,
        })
        no_ner.append({
            'word_embeddings': embedding_reprs,
            'text_filtered': text_repr,
            'tokens_filtered': token_reprs,
        })
        labels.append(row['label'])

    return all_features, embeddings_only, text_and_tokens, entities_only, no_ner, labels


# The variable names (including their historical spellings) are kept because
# the model cells below refer to them.
(train_features,
 train_features_embeddes,
 train_features_text_and_tokens,
 train_features_entities,
 train_features_no_er,
 train_labels) = _build_feature_sets(train)

(test_features,
 test_features_embeddes,
 test_features_text_and_tokens,
 test_features_entities,
 test_features_no_er,
 test_labels) = _build_feature_sets(test)

In [16]:
from sklearn.model_selection import GridSearchCV

def modelFit(model, param_grid, train_features, test_features,
             y_train=None, y_test=None, cv=10, scoring='accuracy', n_jobs=None):
    """Grid-search a DictVectorizer + classifier pipeline and score it on the test set.

    Args:
        model: Unfitted classifier placed in the pipeline's ``classifier`` step.
        param_grid: GridSearchCV grid; keys use the ``classifier__`` prefix.
        train_features: List of feature dicts used for the grid search.
        test_features: List of feature dicts used for the final evaluation.
        y_train: Training labels. Defaults to the notebook-level ``train_labels``.
        y_test: Test labels. Defaults to the notebook-level ``test_labels``.
        cv: Cross-validation setting passed to GridSearchCV.
        scoring: Metric GridSearchCV uses to pick the best parameters.
        n_jobs: Parallelism passed to GridSearchCV.

    Returns:
        dict: ``accuracy``, weighted ``f1`` / ``precision`` / ``recall``,
        ``confusion_matrix`` and ``best_params`` for the best estimator.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if y_train is None:
        y_train = train_labels
    if y_test is None:
        y_test = test_labels

    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model),
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(train_features, y_train)

    # best_estimator_ is the pipeline refitted with the winning parameters.
    preds = grid_search.best_estimator_.predict(test_features)

    return {
        'accuracy': accuracy_score(y_test, preds),
        'f1': f1_score(y_test, preds, average='weighted'),
        'precision': precision_score(y_test, preds, average='weighted'),
        'recall': recall_score(y_test, preds, average='weighted'),
        'confusion_matrix': confusion_matrix(y_test, preds),
        'best_params': grid_search.best_params_,
    }

___
## NB

### All features

In [17]:
# Naive Bayes on all features; the empty grid means only the defaults are scored.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features,
    test_features=test_features,
)
print(results)

{'accuracy': 0.904625, 'f1': 0.9044251932117491, 'precision': 0.9045102173212911, 'recall': 0.904625, 'confusion_matrix': array([[1817,   67,  111,   52],
       [  21, 1924,   12,   10],
       [  59,   17, 1740,  170],
       [  77,   21,  146, 1756]]), 'best_params': {}}


### Only text and tokens

In [18]:
# Naive Bayes on the text and token features only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

{'accuracy': 0.90625, 'f1': 0.9060497362386052, 'precision': 0.9060131772958145, 'recall': 0.90625, 'confusion_matrix': array([[1825,   65,  109,   48],
       [  24, 1923,    9,   11],
       [  62,   13, 1745,  166],
       [  87,   18,  138, 1757]]), 'best_params': {}}


### Only entities and PoS

In [19]:
# Naive Bayes on the entity dictionary feature only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

{'accuracy': 0.301625, 'f1': 0.20236541079088105, 'precision': 0.5700431432380205, 'recall': 0.301625, 'confusion_matrix': array([[  33, 1972,   10,   32],
       [   1, 1931,    8,   27],
       [   4, 1807,   79,   96],
       [   7, 1581,   42,  370]]), 'best_params': {}}


### Only Dense Embeddings

In [20]:
# Naive Bayes on the word-embedding features only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

{'accuracy': 0.900375, 'f1': 0.9001754048431471, 'precision': 0.9002720823557113, 'recall': 0.900375, 'confusion_matrix': array([[1814,   68,  111,   54],
       [  20, 1922,   12,   13],
       [  64,   20, 1721,  181],
       [  73,   21,  160, 1746]]), 'best_params': {}}


### No NER

In [21]:
# Naive Bayes on everything except the entity dictionary.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

{'accuracy': 0.903875, 'f1': 0.9036569269056173, 'precision': 0.9037067168247837, 'recall': 0.903875, 'confusion_matrix': array([[1816,   68,  111,   52],
       [  21, 1924,   12,   10],
       [  61,   17, 1737,  171],
       [  80,   21,  145, 1754]]), 'best_params': {}}


___
## LR

### All features

In [22]:
# Logistic regression on all features, searching C, penalty and solver.
results = modelFit(
    model=LogisticRegression(),
    param_grid=dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['l1', 'l2'],
        classifier__solver=['liblinear', 'saga'],
    ),
    train_features=train_features,
    test_features=test_features,
)
print(results)



### Only text and tokens

In [None]:
# Logistic regression on the text and token features only.
results = modelFit(
    model=LogisticRegression(),
    param_grid=dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['l1', 'l2'],
        classifier__solver=['liblinear', 'saga'],
    ),
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# Logistic regression on the entity dictionary feature only.
results = modelFit(
    model=LogisticRegression(),
    param_grid=dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['l1', 'l2'],
        classifier__solver=['liblinear', 'saga'],
    ),
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only dense embeddings

In [None]:
# Logistic regression on the word-embedding features only.
results = modelFit(
    model=LogisticRegression(),
    param_grid=dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['l1', 'l2'],
        classifier__solver=['liblinear', 'saga'],
    ),
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# Logistic regression on everything except the entity dictionary.
results = modelFit(
    model=LogisticRegression(),
    param_grid=dict(
        classifier__C=[0.1, 1.0, 10.0],
        classifier__penalty=['l1', 'l2'],
        classifier__solver=['liblinear', 'saga'],
    ),
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

___
## DTC

### All features

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on all features, searching depth and split size.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid=dict(
        classifier__max_depth=[None, 5, 10, 15],
        classifier__min_samples_split=[2, 5, 10],
    ),
    train_features=train_features,
    test_features=test_features,
)
print(results)

### Only text and tokens

In [None]:
# Decision tree on the text and token features only.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid=dict(
        classifier__max_depth=[None, 5, 10, 15],
        classifier__min_samples_split=[2, 5, 10],
    ),
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# Decision tree on the entity dictionary feature only.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid=dict(
        classifier__max_depth=[None, 5, 10, 15],
        classifier__min_samples_split=[2, 5, 10],
    ),
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only dense embeddings

In [None]:
# Decision tree on the word-embedding features only.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid=dict(
        classifier__max_depth=[None, 5, 10, 15],
        classifier__min_samples_split=[2, 5, 10],
    ),
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# Decision tree on everything except the entity dictionary.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid=dict(
        classifier__max_depth=[None, 5, 10, 15],
        classifier__min_samples_split=[2, 5, 10],
    ),
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

___
## RF

### All features

In [None]:
# Package name fixed: `sklean` does not exist and made this cell fail on import.
from sklearn.ensemble import RandomForestClassifier

# Random forest on all features, searching tree count, depth and split size.
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features, test_features)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only text and tokens

In [None]:
# Random forest on the text and token features only.
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_text_and_tokens, test_features_text_and_tokens)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# Random forest on the entity dictionary feature only.
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_entities, test_features_entities)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# Random forest on the word-embedding features only.
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_embeddes, test_features_embeddes)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### No NER

In [None]:
# Random forest on everything except the entity dictionary.
results = modelFit(RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30]
    }, train_features_no_er, test_features_no_er)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

___
## SVM

### All features

In [None]:
from sklearn.svm import SVC

# SVM on all features, searching C and the kernel type.
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features, test_features)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only text and tokens

In [None]:
# SVM on the text and token features only.
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_text_and_tokens, test_features_text_and_tokens)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# SVM on the entity dictionary feature only.
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_entities, test_features_entities)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# SVM on the word-embedding features only.
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_embeddes, test_features_embeddes)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### No NER

In [None]:
# SVM on everything except the entity dictionary.
results = modelFit(SVC(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }, train_features_no_er, test_features_no_er)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

___
## XGBoost

### All features

In [None]:
from xgboost import XGBClassifier

# XGBoost on all features, searching tree count and depth.
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features, test_features)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only text and tokens

In [None]:
# XGBoost on the text and token features only.
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_text_and_tokens, test_features_text_and_tokens)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# XGBoost on the entity dictionary feature only.
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_entities, test_features_entities)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# XGBoost on the word-embedding features only.
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_embeddes, test_features_embeddes)
# Show the metrics; otherwise they are lost when the next cell reassigns `results`.
print(results)

### No NER

In [None]:
# XGBoost on everything except the entity dictionary.
results = modelFit(XGBClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9]
    }, train_features_no_er, test_features_no_er)
# Show the metrics; otherwise they are lost when the notebook moves on.
print(results)

---

## Gradient Boosting

### All features

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on all features, searching tree count, learning rate and depth.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 0.5],
        classifier__max_depth=[3, 5, 7],
    ),
    train_features=train_features,
    test_features=test_features,
)
print(results)

### Only text and tokens

In [None]:
# Gradient boosting on the text and token features only.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 0.5],
        classifier__max_depth=[3, 5, 7],
    ),
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# Gradient boosting on the entity dictionary feature only.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 0.5],
        classifier__max_depth=[3, 5, 7],
    ),
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# Gradient boosting on the word-embedding features only.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 0.5],
        classifier__max_depth=[3, 5, 7],
    ),
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# Gradient boosting on everything except the entity dictionary.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__learning_rate=[0.01, 0.1, 0.5],
        classifier__max_depth=[3, 5, 7],
    ),
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

## K-Nearest Neighbors

### All features

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN on all features; p=1 is Manhattan distance, p=2 is Euclidean distance.
results = modelFit(
    model=KNeighborsClassifier(),
    param_grid=dict(
        classifier__n_neighbors=[3, 5, 7, 10],
        classifier__weights=['uniform', 'distance'],
        classifier__p=[1, 2],
    ),
    train_features=train_features,
    test_features=test_features,
)
print(results)

### Only text and tokens

In [None]:
# k-NN on the text and token features only; p=1 is Manhattan, p=2 is Euclidean.
results = modelFit(
    model=KNeighborsClassifier(),
    param_grid=dict(
        classifier__n_neighbors=[3, 5, 7, 10],
        classifier__weights=['uniform', 'distance'],
        classifier__p=[1, 2],
    ),
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# k-NN on the entity dictionary feature only; p=1 is Manhattan, p=2 is Euclidean.
results = modelFit(
    model=KNeighborsClassifier(),
    param_grid=dict(
        classifier__n_neighbors=[3, 5, 7, 10],
        classifier__weights=['uniform', 'distance'],
        classifier__p=[1, 2],
    ),
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# k-NN on the word-embedding features only; p=1 is Manhattan, p=2 is Euclidean.
results = modelFit(
    model=KNeighborsClassifier(),
    param_grid=dict(
        classifier__n_neighbors=[3, 5, 7, 10],
        classifier__weights=['uniform', 'distance'],
        classifier__p=[1, 2],
    ),
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# k-NN on everything except the entity dictionary; p=1 is Manhattan, p=2 is Euclidean.
results = modelFit(
    model=KNeighborsClassifier(),
    param_grid=dict(
        classifier__n_neighbors=[3, 5, 7, 10],
        classifier__weights=['uniform', 'distance'],
        classifier__p=[1, 2],
    ),
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

## Multi-Layer Perceptron

### All features

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP on all features, searching architecture, activation, solver, alpha and learning-rate schedule.
results = modelFit(
    model=MLPClassifier(),
    param_grid=dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50), (50, 100, 50)],
        classifier__activation=['logistic', 'relu'],
        classifier__solver=['adam', 'sgd'],
        classifier__alpha=[0.0001, 0.001, 0.01],
        classifier__learning_rate=['constant', 'adaptive'],
    ),
    train_features=train_features,
    test_features=test_features,
)
print(results)

### Only text and tokens

In [None]:
# MLP on the text and token features only.
results = modelFit(
    model=MLPClassifier(),
    param_grid=dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50), (50, 100, 50)],
        classifier__activation=['logistic', 'relu'],
        classifier__solver=['adam', 'sgd'],
        classifier__alpha=[0.0001, 0.001, 0.01],
        classifier__learning_rate=['constant', 'adaptive'],
    ),
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# MLP on the entity dictionary feature only.
results = modelFit(
    model=MLPClassifier(),
    param_grid=dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50), (50, 100, 50)],
        classifier__activation=['logistic', 'relu'],
        classifier__solver=['adam', 'sgd'],
        classifier__alpha=[0.0001, 0.001, 0.01],
        classifier__learning_rate=['constant', 'adaptive'],
    ),
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# MLP on the word-embedding features only.
results = modelFit(
    model=MLPClassifier(),
    param_grid=dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50), (50, 100, 50)],
        classifier__activation=['logistic', 'relu'],
        classifier__solver=['adam', 'sgd'],
        classifier__alpha=[0.0001, 0.001, 0.01],
        classifier__learning_rate=['constant', 'adaptive'],
    ),
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# MLP on everything except the entity dictionary.
results = modelFit(
    model=MLPClassifier(),
    param_grid=dict(
        classifier__hidden_layer_sizes=[(100,), (50, 50), (50, 100, 50)],
        classifier__activation=['logistic', 'relu'],
        classifier__solver=['adam', 'sgd'],
        classifier__alpha=[0.0001, 0.001, 0.01],
        classifier__learning_rate=['constant', 'adaptive'],
    ),
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)