# Preprocessing

## Imports


In [2]:
import html
import os
import pickle
import re
import time

import numpy as np
import pandas as pd
import spacy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


## Import dataset

In [3]:


# Load the labelled training set from disk into `data`.
data = pd.read_csv('datasets/training_data.csv')
# Show five random rows as a quick sanity check of the columns.
data.sample(5)

Unnamed: 0,text,label
21727,"U.S. Military Deaths in Iraq Pass 1,000 BAGHDA...",0
104515,LSU Going Miles to Fill Coaching Vacancy BATO...,1
3759,Silver lining melts foul mood US shot-putter A...,1
59586,FASB Delays Options Expensing Rule Bowing to c...,2
86314,US team wins Tommy Bahama Challenge in playoff...,1


In [4]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

# Cleanup Text

In [5]:
# Not perfect: a bare '(Reuters)' that is not followed by the agency name again is left in place
def remove_reuters(text):
    """Strip agency bylines of the form ``(Name) Name`` from ``text``.

    The regex matches a parenthesised word that is followed by a single space
    and the same word again (for example ``(Reuters) Reuters``). A
    parenthesised tag that is not followed by its own name is left untouched,
    and the whitespace around a removed byline is not collapsed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matching byline replaced by the empty string.
    """
    byline_re = re.compile(r'\((\w+)\) \1+')
    return byline_re.sub('', text)

In [6]:
def remove_links(text):
    """Delete HTML-escaped anchor elements from ``text``, link text included.

    Only anchors written with escaped angle brackets and an upper-case
    ``A HREF=`` are matched; the whole element, from the opening ``&lt;A`` to
    the closing ``&lt;/A&gt;``, is replaced by the empty string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the matched anchor elements.
    """
    # Example of a span that gets removed:
    # &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=BNNY.OB target=/stocks/quickinfo/fullquote""&gt;BNNY.OB&lt;/A&gt;
    anchor_re = re.compile(r'&lt;A HREF=.*?&gt;(.*?)&lt;/A&gt;')
    return anchor_re.sub('', text)

In [7]:

def remove_html_tags(text):
    """Unescape HTML entities in ``text`` and then drop anything shaped like a tag.

    Entities are decoded first, so escaped markup such as ``&lt;b&gt;`` becomes
    a real tag and is removed as well. Any ``<...>`` span on a single line is
    treated as a tag.

    Args:
        text: The string to clean.

    Returns:
        The unescaped string with all ``<...>`` spans removed.
    """
    return re.sub(r'<.*?>', '', html.unescape(text))

In [8]:
def remove_quotes(text):
    """Remove every literal ``quot;`` fragment from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all occurrences of ``quot;`` deleted; a preceding ``&``
        or any other surrounding character is kept.
    """
    leftover_quote_re = re.compile(r'quot;')
    return leftover_quote_re.sub('', text)

In [9]:
# Run the cleaning steps over the text column in a fixed order: escaped anchors
# first (while they are still HTML-escaped), then agency bylines, then entity
# unescaping plus tag stripping, and finally leftover 'quot;' fragments.
for clean_step in (remove_links, remove_reuters, remove_html_tags, remove_quotes):
    data['text'] = data['text'].apply(clean_step)

In [10]:
# Preview the first rows of `data` after the cleaning steps.
data.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black - Sho...,2
1,Carlyle Looks Toward Commercial Aerospace - P...,2
2,Oil and Economy Cloud Stocks' Outlook - Soari...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


## Spacy

In [11]:
# Load spaCy's `en_core_web_md` pipeline once; the cells below use this
# module-level `nlp` object for tokenisation and entity recognition.
nlp = spacy.load("en_core_web_md")

# Customize pipeline (currently disabled: no component is removed or enabled)
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

# nlp.enable_pipe('senter')

In [12]:
# TODO: process all data
#small_data = data.sample(30000)

# Seed NumPy's global RNG. sample() below is called without random_state, so
# reproducibility of the draw presumably relies on this seed.
np.random.seed(42)

# Draw 10,000 rows per label, i.e. a class-balanced subset (40,000 rows for the
# four labels seen in the outputs) rather than one that mirrors the original
# label distribution.
small_data = data.groupby('label').apply(lambda x: x.sample(10000)).reset_index(drop=True)

# Persist the sample so later cells can reload it without resampling.
small_data.to_csv('datasets/small_data_sample.csv', index=False)



In [13]:
# Reload the persisted sample and run every text through the spaCy pipeline;
# the resulting Doc objects are stored in the new `tokens` column.
small_data = pd.read_csv('datasets/small_data_sample.csv').assign(
    tokens=lambda frame: frame['text'].apply(nlp)
)
small_data.sample(5)

Unnamed: 0,text,label,tokens
30717,Copernic unveils Desktop Search Tool Copernic ...,3,"(Copernic, unveils, Desktop, Search, Tool, Cop..."
1084,Hundreds held over Apec protests Chile's gover...,0,"(Hundreds, held, over, Apec, protests, Chile, ..."
5491,Japan Considers Sanctions Vs. N. Korea - Japa...,0,"(Japan, Considers, Sanctions, Vs, ., N., Korea..."
35480,New Bird Species Found in Philippines - Filip...,3,"(New, Bird, Species, Found, in, Philippines, ..."
6978,"Sudanese Militiamen Disarm, Say They Not Janja...",0,"(Sudanese, Militiamen, Disarm, ,, Say, They, N..."


In [14]:
# Show the text of the first row of the sample.
small_data.iloc[0]['text']

'Explosion Rocks Baghdad Neighborhood BAGHDAD, Iraq, August 24 -- A car bomb exploded near the gate of a US-funded Iraqi television network in Baghdad on Tuesday, killing at least two people and wounding two others, authorities and witnesses said.'

In [15]:
# Number of tokens per document (the length of each entry in `tokens`),
# followed by summary statistics of that count.
small_data['tokens_count'] = small_data['tokens'].map(len)
small_data['tokens_count'].describe()

count    40000.000000
mean        43.277425
std         12.164849
min          9.000000
25%         36.000000
50%         42.000000
75%         49.000000
max        196.000000
Name: tokens_count, dtype: float64

In [16]:
def filter_tokens(tokens):
    """Keep only the tokens that are neither punctuation, whitespace nor stop words.

    Args:
        tokens: Iterable of token objects exposing the boolean attributes
            ``is_punct``, ``is_space`` and ``is_stop``.

    Returns:
        A list with the surviving token objects in their original order.
    """
    kept = []
    for tok in tokens:
        if tok.is_punct or tok.is_space or tok.is_stop:
            continue
        kept.append(tok)
    return kept

In [17]:
def filter_text(text):
    """Join the lower-cased lemmas of the given tokens with single spaces.

    Args:
        text: Iterable of token objects exposing a string ``lemma_`` attribute.

    Returns:
        One string containing every lemma in lower case, separated by spaces.
    """
    return ' '.join(token.lemma_.lower() for token in text)

In [18]:
def text_embeddings(text):
    """Collect the ``vector`` attribute of every token, in order.

    Args:
        text: Iterable of token objects exposing a ``vector`` attribute.

    Returns:
        A list holding one vector per token.
    """
    vectors = []
    for token in text:
        vectors.append(token.vector)
    return vectors

In [19]:
def text_ner(text):
    """Describe each token by its part of speech and entity annotations.

    Args:
        text: Iterable of token objects exposing ``pos_``, ``ent_iob_`` and
            ``ent_type_``.

    Returns:
        A list of ``(token, pos_, ent_iob_, ent_type_)`` tuples, one per token.
    """
    annotations = []
    for token in text:
        annotations.append((token, token.pos_, token.ent_iob_, token.ent_type_))
    return annotations

In [20]:
# Filter the tokens first, then derive the text, vector and NER columns from
# the filtered tokens.
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
for column, builder in (
    ('text_filtered', filter_text),
    ('text_embeddings', text_embeddings),
    ('text_ner', text_ner),
):
    small_data[column] = small_data['tokens_filtered'].apply(builder)
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner
27417,Adelphia Wants Rigas Family to Repay Billions ...,2,"(Adelphia, Wants, Rigas, Family, to, Repay, Bi...",35,"[Adelphia, Wants, Rigas, Family, Repay, Billio...",adelphia want rigas family repay billions wash...,"[[-3.6483, 0.84923, -0.83937, 2.7171, 3.0337, ...","[(Adelphia, PROPN, B, ORG), (Wants, VERB, I, O..."
37932,Open source industry challenges Gartner on Lin...,3,"(Open, source, industry, challenges, Gartner, ...",47,"[Open, source, industry, challenges, Gartner, ...",open source industry challenge gartner linux r...,"[[5.5906, -3.7454, 13.984, 3.6313, -1.241, 5.2...","[(Open, ADJ, O, ), (source, NOUN, O, ), (indus..."
12047,Netherlands beats Finland in World Cup qualifi...,1,"(Netherlands, beats, Finland, in, World, Cup, ...",38,"[Netherlands, beats, Finland, World, Cup, qual...",netherlands beat finland world cup qualifier n...,"[[-3.5619, -0.94442, 1.1237, 1.2517, 3.7817, -...","[(Netherlands, PROPN, B, GPE), (beats, VERB, O..."
17120,Notebook: Murray out another week or so LOS AN...,1,"(Notebook, :, Murray, out, another, week, or, ...",49,"[Notebook, Murray, week, LOS, ANGELES, worst, ...",notebook murray week los angeles bad opening n...,"[[-0.048501, -0.40296, 0.50047, 0.6931, 0.0141...","[(Notebook, NOUN, O, ), (Murray, PROPN, B, PER..."
25390,"Oil Flat After Drop, More Losses Seen Oil pric...",2,"(Oil, Flat, After, Drop, ,, More, Losses, Seen...",43,"[Oil, Flat, Drop, Losses, Seen, Oil, prices, h...",oil flat drop loss see oil price hold steady t...,"[[0.81622, 3.3474, -0.74592, 5.1131, -5.3648, ...","[(Oil, PROPN, O, ), (Flat, PROPN, O, ), (Drop,..."


In [21]:
def process_entities(text_filtered):
    """Map each named entity found in ``text_filtered`` to a ``(POS, label)`` pair.

    The text is parsed again with the module-level ``nlp`` pipeline and every
    entity span is stored under its lemma. When the same lemma occurs more than
    once, the first occurrence is kept.

    NOTE(review): the input is the lower-cased, lemmatised text produced by
    ``filter_text``; entity recognition on such text may be weaker than on the
    original casing - confirm this is intended.

    Args:
        text_filtered: The string to analyse.

    Returns:
        dict mapping ``ent.lemma_`` to ``(ent.root.pos_, ent.label_)``.
    """
    entity_dict = {}
    doc = nlp(text_filtered)
    for ent in doc.ents:
        key = ent.lemma_
        # Test membership with the same key that is stored, so that the
        # "first occurrence wins" rule holds even when lemma and surface text differ.
        if key not in entity_dict:
            entity_dict[key] = (ent.root.pos_, ent.label_)

    return entity_dict

# Build the per-document entity dictionary from the filtered text of each row.
small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

In [22]:
# Inspect five random rows, now including the `entity_dict` column.
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner,entity_dict
7455,UN nuclear chief: patience wearing thin over l...,0,"(UN, nuclear, chief, :, patience, wearing, thi...",70,"[UN, nuclear, chief, patience, wearing, thin, ...",un nuclear chief patience wear thin lack progr...,"[[5.0715, 5.5179, -2.8, -0.24576, 5.5012, -5.3...","[(UN, PROPN, B, ORG), (nuclear, ADJ, O, ), (ch...","{'un': ('PROPN', 'ORG'), 'north korea': ('PROP..."
16832,"Miller takes weekly honors with fifth, sixth w...",1,"(Miller, takes, weekly, honors, with, fifth, ,...",46,"[Miller, takes, weekly, honors, fifth, sixth, ...",miller take weekly honor fifth sixth win com h...,"[[-1.8241, 1.0959, 1.2405, 2.3382, 0.18408, -3...","[(Miller, PROPN, B, ORG), (takes, VERB, O, ), ...","{'miller': ('PROPN', 'ORG'), 'weekly': ('ADJ',..."
21023,Unions may block Alitalia rescue The Italian a...,2,"(Unions, may, block, Alitalia, rescue, The, It...",24,"[Unions, block, Alitalia, rescue, Italian, air...",union block alitalia rescue italian airline fa...,"[[-1.9135, -2.6366, -5.631, 3.2933, 5.6485, 4....","[(Unions, NOUN, O, ), (block, VERB, O, ), (Ali...","{'alitalia rescue': ('PROPN', 'ORG'), 'italian..."
5290,Kerry Questions Bush's Judgment on Iraq NEW YO...,0,"(Kerry, Questions, Bush, 's, Judgment, on, Ira...",61,"[Kerry, Questions, Bush, Judgment, Iraq, NEW, ...",kerry questions bush judgment iraq new york se...,"[[-2.0371, -2.4904, 0.5559, 0.77599, 2.3708, -...","[(Kerry, PROPN, B, PERSON), (Questions, PROPN,...","{'bush': ('PROPN', 'PERSON'), 'iraq': ('PROPN'..."
32129,Enter your e-mail: Archos has turned up the he...,3,"(Enter, your, e, -, mail, :, Archos, has, turn...",44,"[Enter, e, mail, Archos, turned, heat, portabl...",enter e mail archos turn heat portable enterta...,"[[4.2648, -0.15802, 5.8268, 2.0154, 1.0183, 1....","[(Enter, VERB, O, ), (e, NOUN, O, ), (mail, NO...","{'archos': ('PROPN', 'LOC'), '20': ('NUM', 'CA..."


In [23]:
# get the filtered text and tokens in its own dataframe and save to csv
#small_data_filtered = small_data[['text_filtered', 'label',]]
#small_data_filtered.to_csv('datasets/small_data_filtered.csv', index=False)

# Model Training and predictions

In [24]:
# split the data into training and testing sets
train, test = train_test_split(small_data, test_size=0.2, random_state=42)


def _build_feature_sets(frame):
    """Turn every row of ``frame`` into the five feature-dict variants plus its label.

    Args:
        frame: DataFrame with the columns ``entity_dict``, ``text_embeddings``,
            ``text_filtered``, ``tokens_filtered`` and ``label``.

    Returns:
        dict with the keys ``'all'``, ``'embeddings'``, ``'text_and_tokens'``,
        ``'entities'`` and ``'no_ner'`` (each a list with one feature dict per
        row, in row order) and ``'labels'`` (the list of row labels).
    """
    feature_sets = {
        'all': [],
        'embeddings': [],
        'text_and_tokens': [],
        'entities': [],
        'no_ner': [],
        'labels': [],
    }
    row_values = zip(
        frame['entity_dict'],
        frame['text_embeddings'],
        frame['text_filtered'],
        frame['tokens_filtered'],
        frame['label'],
    )
    for entity_dict, token_vectors, text_filtered, tokens_filtered, label in row_values:
        # NOTE(review): the whole dictionary becomes ONE string value, so
        # DictVectorizer presumably sees a single categorical feature that is
        # almost unique per document - this would explain the weak scores of
        # the "Only entities" runs. Confirm before changing the encoding.
        entity_dict_str = str(entity_dict)
        # Each token vector is converted to the string form of its list of
        # floats. NOTE(review): as strings these act as categorical values
        # rather than dense numeric features - confirm this is intended.
        word_embeddings_str = [str(vector) for vector in np.array(token_vectors).tolist()]
        text_filtered_str = str(text_filtered)
        tokens_filtered_str = [str(token) for token in tokens_filtered]

        # All features
        feature_sets['all'].append({
            'entity_dict': entity_dict_str,
            'word_embeddings': word_embeddings_str,
            'text_filtered': text_filtered_str,
            'tokens_filtered': tokens_filtered_str,
        })
        # Only embeddings
        feature_sets['embeddings'].append({
            'word_embeddings': word_embeddings_str,
        })
        # Only text and tokens
        feature_sets['text_and_tokens'].append({
            'text_filtered': text_filtered_str,
            'tokens_filtered': tokens_filtered_str,
        })
        # Only entities
        feature_sets['entities'].append({
            'entity_dict': entity_dict_str,
        })
        # No NER
        feature_sets['no_ner'].append({
            'word_embeddings': word_embeddings_str,
            'text_filtered': text_filtered_str,
            'tokens_filtered': tokens_filtered_str,
        })
        feature_sets['labels'].append(label)
    return feature_sets


# Build both splits with the same helper so that they cannot drift apart.
_train_sets = _build_feature_sets(train)
train_features = _train_sets['all']
train_features_embeddes = _train_sets['embeddings']
train_features_text_and_tokens = _train_sets['text_and_tokens']
train_features_entities = _train_sets['entities']
train_features_no_er = _train_sets['no_ner']
train_labels = _train_sets['labels']

_test_sets = _build_feature_sets(test)
test_features = _test_sets['all']
test_features_embeddes = _test_sets['embeddings']
test_features_text_and_tokens = _test_sets['text_and_tokens']
test_features_entities = _test_sets['entities']
test_features_no_er = _test_sets['no_ner']
test_labels = _test_sets['labels']

### Pickle save the features and Labels

In [26]:
# pickle the data: every feature set and label list goes into its own file
# under datasets/pickle/, named after the variable it holds.

# open() does not create missing folders, so make sure the target exists.
os.makedirs('datasets/pickle', exist_ok=True)

pickle_payloads = {
    'train_features': train_features,
    'train_features_embeddes': train_features_embeddes,
    'train_features_text_and_tokens': train_features_text_and_tokens,
    'train_features_entities': train_features_entities,
    'train_features_no_er': train_features_no_er,
    'train_labels': train_labels,
    'test_features': test_features,
    'test_features_embeddes': test_features_embeddes,
    'test_features_text_and_tokens': test_features_text_and_tokens,
    'test_features_entities': test_features_entities,
    'test_features_no_er': test_features_no_er,
    'test_labels': test_labels,
}

for pickle_name, payload in pickle_payloads.items():
    with open(f'datasets/pickle/{pickle_name}.pkl', 'wb') as f:
        pickle.dump(payload, f)

### Pickle load the features and labels

In [27]:
# import pickled data
def _load_pickle(name):
    """Load ``datasets/pickle/<name>.pkl`` and return the unpickled object.

    Args:
        name: File name without folder and without the ``.pkl`` extension.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE: pickle.load can execute arbitrary code contained in the file, so
    # only load files that were written by this notebook.
    with open(f'datasets/pickle/{name}.pkl', 'rb') as f:
        return pickle.load(f)


train_features = _load_pickle('train_features')
train_features_embeddes = _load_pickle('train_features_embeddes')
train_features_text_and_tokens = _load_pickle('train_features_text_and_tokens')
train_features_entities = _load_pickle('train_features_entities')
train_features_no_er = _load_pickle('train_features_no_er')
train_labels = _load_pickle('train_labels')

test_features = _load_pickle('test_features')
test_features_embeddes = _load_pickle('test_features_embeddes')
test_features_text_and_tokens = _load_pickle('test_features_text_and_tokens')
test_features_entities = _load_pickle('test_features_entities')
test_features_no_er = _load_pickle('test_features_no_er')
test_labels = _load_pickle('test_labels')

In [34]:


def modelFit(model, param_grid, train_features, test_features,
             y_train=None, y_test=None, cv=10, scoring='accuracy'):
    """Grid-search ``model`` on dict features and score the best fit on the test set.

    The features are vectorised with ``DictVectorizer`` inside a pipeline, so
    the ``param_grid`` keys that tune the model must use the ``classifier__``
    prefix.

    Args:
        model: Unfitted estimator used as the ``classifier`` pipeline step.
        param_grid: Parameter grid handed to ``GridSearchCV``; ``{}`` evaluates
            the estimator with its current settings only.
        train_features: List of feature dicts used for the grid search.
        test_features: List of feature dicts used for the final evaluation.
        y_train: Labels for ``train_features``. Defaults to the notebook-level
            ``train_labels``.
        y_test: Labels for ``test_features``. Defaults to the notebook-level
            ``test_labels``.
        cv: Cross-validation setting passed to ``GridSearchCV``.
        scoring: Scoring setting passed to ``GridSearchCV``.

    Returns:
        dict with ``accuracy``, weighted ``f1``, ``precision`` and ``recall``,
        the ``confusion_matrix``, the ``best_params`` found by the search and
        ``time`` (seconds spent in the grid-search fit).
    """
    # Fall back to the notebook-level label lists so that the existing
    # four-argument calls keep working unchanged.
    if y_train is None:
        y_train = train_labels
    if y_test is None:
        y_test = test_labels

    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model)
    ])

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring)

    # Only the grid-search fit is timed; prediction and scoring are excluded.
    start = time.time()
    grid_search.fit(train_features, y_train)
    elapsed = time.time() - start

    # best_estimator_ is the winning pipeline refitted by the grid search.
    best_model = grid_search.best_estimator_
    preds = best_model.predict(test_features)

    results = {
        'accuracy': accuracy_score(y_test, preds),
        'f1': f1_score(y_test, preds, average='weighted'),
        'precision': precision_score(y_test, preds, average='weighted'),
        'recall': recall_score(y_test, preds, average='weighted'),
        'confusion_matrix': confusion_matrix(y_test, preds),
        'best_params': grid_search.best_params_,
        'time': elapsed
    }

    return results

___
## NB

### All features

In [35]:
# Multinomial Naive Bayes on all feature groups. The empty grid leaves nothing
# to tune, so the search only cross-validates the default estimator.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features,
    test_features=test_features,
)
print(results)

{'accuracy': 0.894625, 'f1': 0.8943349932664348, 'precision': 0.894309369030074, 'recall': 0.894625, 'confusion_matrix': array([[1819,   83,  100,   45],
       [  22, 1920,   13,   12],
       [  63,   18, 1699,  206],
       [  84,   19,  178, 1719]]), 'best_params': {}, 'time': 34.89687442779541}


### Only text and tokens

In [36]:
# Multinomial Naive Bayes on the filtered text and token features only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

{'accuracy': 0.89675, 'f1': 0.8964542874845637, 'precision': 0.8964006023298475, 'recall': 0.89675, 'confusion_matrix': array([[1820,   82,  101,   44],
       [  22, 1920,   13,   12],
       [  66,   17, 1716,  187],
       [  94,   16,  172, 1718]]), 'best_params': {}, 'time': 6.520312070846558}


### Only entities and PoS

In [37]:
# Multinomial Naive Bayes on the stringified entity dictionary only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

{'accuracy': 0.298375, 'f1': 0.19730812003999837, 'precision': 0.5184803674947036, 'recall': 0.298375, 'confusion_matrix': array([[  38, 1961,   16,   32],
       [   4, 1922,    7,   34],
       [   4, 1820,   52,  110],
       [   8, 1567,   50,  375]]), 'best_params': {}, 'time': 0.7184696197509766}


### Only Dense Embeddings

In [38]:
# Multinomial Naive Bayes on the stringified token vectors only.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

{'accuracy': 0.89175, 'f1': 0.8914349845519445, 'precision': 0.891388269185095, 'recall': 0.89175, 'confusion_matrix': array([[1813,   84,  101,   49],
       [  26, 1916,   16,    9],
       [  66,   19, 1683,  218],
       [  83,   23,  172, 1722]]), 'best_params': {}, 'time': 26.22289800643921}


### No NER

In [39]:
# Multinomial Naive Bayes on every feature group except the entity dictionary.
results = modelFit(
    model=MultinomialNB(),
    param_grid={},
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

{'accuracy': 0.895, 'f1': 0.894707443184342, 'precision': 0.8946807730938828, 'recall': 0.895, 'confusion_matrix': array([[1819,   83,  101,   44],
       [  23, 1920,   14,   10],
       [  63,   18, 1702,  203],
       [  85,   19,  177, 1719]]), 'best_params': {}, 'time': 33.66146731376648}


___
## LR

### All features

In [40]:
# Logistic regression on all feature groups; the grid covers
# 3 C values x 2 penalties x 2 solvers = 12 candidates.
results = modelFit(
    model=LogisticRegression(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    train_features=train_features,
    test_features=test_features,
)
print(results)



{'accuracy': 0.89675, 'f1': 0.896433635274754, 'precision': 0.8964773276887752, 'recall': 0.89675, 'confusion_matrix': array([[1809,   81,   95,   62],
       [  21, 1921,   12,   13],
       [  73,   17, 1690,  206],
       [  79,   21,  146, 1754]]), 'best_params': {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}, 'time': 17979.420502901077}


### Only text and tokens

In [41]:
# Logistic regression on the filtered text and token features only
# (same 12-candidate grid over C, penalty and solver).
results = modelFit(
    model=LogisticRegression(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)



{'accuracy': 0.892875, 'f1': 0.8925242560296233, 'precision': 0.8926486116935399, 'recall': 0.892875, 'confusion_matrix': array([[1806,   82,   97,   62],
       [  22, 1918,   13,   14],
       [  69,   23, 1675,  219],
       [  76,   29,  151, 1744]]), 'best_params': {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}, 'time': 13111.028271913528}


### Only entities and PoS

In [42]:
# Logistic regression on the stringified entity dictionary only
# (same 12-candidate grid over C, penalty and solver).
results = modelFit(
    model=LogisticRegression(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)



{'accuracy': 0.29925, 'f1': 0.19864595577564048, 'precision': 0.5014962276186938, 'recall': 0.29925, 'confusion_matrix': array([[  41, 1961,   13,   32],
       [   4, 1920,    7,   36],
       [   9, 1820,   47,  110],
       [  14, 1562,   38,  386]]), 'best_params': {'classifier__C': 10.0, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}, 'time': 1789.4881656169891}


### Only dense embeddings

In [43]:
# Logistic regression on the stringified token vectors only
# (same 12-candidate grid over C, penalty and solver).
results = modelFit(
    model=LogisticRegression(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)



{'accuracy': 0.89525, 'f1': 0.894914149916059, 'precision': 0.8950304127240184, 'recall': 0.89525, 'confusion_matrix': array([[1802,   85,   95,   65],
       [  19, 1923,   11,   14],
       [  72,   17, 1686,  211],
       [  78,   22,  149, 1751]]), 'best_params': {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}, 'time': 11989.5702521801}


### No NER

In [44]:
# Logistic regression on every feature group except the entity dictionary
# (same 12-candidate grid over C, penalty and solver).
results = modelFit(
    model=LogisticRegression(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)



{'accuracy': 0.89675, 'f1': 0.8964505816391909, 'precision': 0.8965493900613543, 'recall': 0.89675, 'confusion_matrix': array([[1804,   80,   94,   69],
       [  21, 1923,   11,   12],
       [  72,   16, 1691,  207],
       [  78,   19,  147, 1756]]), 'best_params': {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}, 'time': 17553.760464668274}


___
## DTC

### All features

In [45]:

# Decision tree on all feature groups; the grid covers
# 4 depth limits x 3 split thresholds = 12 candidates.
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid={
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
    },
    train_features=train_features,
    test_features=test_features,
)
print(results)

{'accuracy': 0.772125, 'f1': 0.7719130635750178, 'precision': 0.7717283971404898, 'recall': 0.772125, 'confusion_matrix': array([[1610,  120,  180,  137],
       [ 120, 1705,   62,   80],
       [ 165,   73, 1419,  329],
       [ 144,   97,  316, 1443]]), 'best_params': {'classifier__max_depth': None, 'classifier__min_samples_split': 2}, 'time': 937.1630198955536}


### Only text and tokens

In [46]:
# Decision tree on the filtered text and token features only
# (same 12-candidate grid over depth and split threshold).
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid={
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

{'accuracy': 0.76475, 'f1': 0.76415846431765, 'precision': 0.7638631278529681, 'recall': 0.76475, 'confusion_matrix': array([[1591,  139,  155,  162],
       [ 107, 1717,   47,   96],
       [ 183,   84, 1382,  337],
       [ 156,   96,  320, 1428]]), 'best_params': {'classifier__max_depth': None, 'classifier__min_samples_split': 5}, 'time': 398.36433386802673}


### Only entities and PoS

In [47]:
# Decision tree on the stringified entity dictionary only
# (same 12-candidate grid over depth and split threshold).
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid={
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

{'accuracy': 0.297875, 'f1': 0.19664970603438572, 'precision': 0.48558346253784396, 'recall': 0.297875, 'confusion_matrix': array([[  41, 1961,   13,   32],
       [   4, 1922,    7,   34],
       [  10, 1820,   48,  108],
       [  17, 1565,   46,  372]]), 'best_params': {'classifier__max_depth': None, 'classifier__min_samples_split': 2}, 'time': 1450.9381716251373}


### Only dense embeddings

In [48]:
# Decision tree on the stringified token vectors only
# (same 12-candidate grid over depth and split threshold).
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid={
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

{'accuracy': 0.77, 'f1': 0.7701868109659573, 'precision': 0.7704857385740805, 'recall': 0.77, 'confusion_matrix': array([[1573,  126,  181,  167],
       [ 106, 1683,   89,   89],
       [ 184,   62, 1421,  319],
       [ 148,   84,  285, 1483]]), 'best_params': {'classifier__max_depth': None, 'classifier__min_samples_split': 2}, 'time': 480.0716848373413}


### No NER

In [49]:
# Decision tree on every feature group except the entity dictionary
# (same 12-candidate grid over depth and split threshold).
results = modelFit(
    model=DecisionTreeClassifier(),
    param_grid={
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10],
    },
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
print(results)

{'accuracy': 0.77, 'f1': 0.7698793496446737, 'precision': 0.7698128318506936, 'recall': 0.77, 'confusion_matrix': array([[1590,  124,  175,  158],
       [ 105, 1713,   67,   82],
       [ 184,   80, 1404,  318],
       [ 133,   78,  336, 1453]]), 'best_params': {'classifier__max_depth': None, 'classifier__min_samples_split': 2}, 'time': 882.4388146400452}


___
## RF

### All features

In [50]:

# Random forest on all feature groups; the grid covers
# 3 forest sizes x 4 depth limits x 4 split thresholds = 48 candidates.
results = modelFit(
    model=RandomForestClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30],
    },
    train_features=train_features,
    test_features=test_features,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

KeyboardInterrupt: 

### Only text and tokens

In [None]:
# Random forest on the filtered text and token features only
# (48-candidate grid over forest size, depth and split threshold).
results = modelFit(
    model=RandomForestClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# Random forest on the stringified entity dictionary only
# (48-candidate grid over forest size, depth and split threshold).
results = modelFit(
    model=RandomForestClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# Random forest on the stringified token vectors only
# (48-candidate grid over forest size, depth and split threshold).
results = modelFit(
    model=RandomForestClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### No NER

In [None]:
# Random forest on every feature group except the entity dictionary
# (48-candidate grid over forest size, depth and split threshold).
results = modelFit(
    model=RandomForestClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [5, 10, 20, 30],
    },
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

___
## SVM

### All features

In [None]:

# Support vector classifier on all feature groups; the grid covers
# 3 C values x 4 kernels = 12 candidates.
results = modelFit(
    model=SVC(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    train_features=train_features,
    test_features=test_features,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only text and tokens

In [None]:
# Support vector classifier on the filtered text and token features only
# (12-candidate grid over C and kernel).
results = modelFit(
    model=SVC(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# Support vector classifier on the stringified entity dictionary only
# (12-candidate grid over C and kernel).
results = modelFit(
    model=SVC(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# Support vector classifier on the stringified token vectors only
# (12-candidate grid over C and kernel).
results = modelFit(
    model=SVC(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### No NER

In [None]:
# Support vector classifier on every feature group except the entity dictionary
# (12-candidate grid over C and kernel).
results = modelFit(
    model=SVC(),
    param_grid={
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

___
## XGBoost

### All features

In [None]:

# XGBoost on all feature groups; the grid covers
# 3 ensemble sizes x 4 tree depths = 12 candidates.
results = modelFit(
    model=XGBClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9],
    },
    train_features=train_features,
    test_features=test_features,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only text and tokens

In [None]:
# XGBoost on the filtered text and token features only
# (12-candidate grid over ensemble size and tree depth).
results = modelFit(
    model=XGBClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only entities and PoS

In [None]:
# XGBoost on the stringified entity dictionary only
# (12-candidate grid over ensemble size and tree depth).
results = modelFit(
    model=XGBClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### Only dense embeddings

In [None]:
# XGBoost on the stringified token vectors only
# (12-candidate grid over ensemble size and tree depth).
results = modelFit(
    model=XGBClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

### No NER

In [None]:
# XGBoost on every feature group except the entity dictionary
# (12-candidate grid over ensemble size and tree depth).
results = modelFit(
    model=XGBClassifier(),
    param_grid={
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7, 9],
    },
    train_features=train_features_no_er,
    test_features=test_features_no_er,
)
# Print the metrics so they are not lost when a later cell reassigns `results`.
print(results)

---

## Gradient Boosting

### All features

In [None]:

# Gradient boosting on all feature groups; the grid covers
# 3 ensemble sizes x 3 learning rates x 3 tree depths = 27 candidates.
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid={
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__max_depth': [3, 5, 7],
    },
    train_features=train_features,
    test_features=test_features,
)
print(results)

### Only text and tokens

In [None]:
# Gradient boosting on the filtered text and token features only
# (27-candidate grid over ensemble size, learning rate and tree depth).
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid={
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__max_depth': [3, 5, 7],
    },
    train_features=train_features_text_and_tokens,
    test_features=test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# Gradient boosting on the stringified entity dictionary only
# (27-candidate grid over ensemble size, learning rate and tree depth).
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid={
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__max_depth': [3, 5, 7],
    },
    train_features=train_features_entities,
    test_features=test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# Gradient boosting on the stringified token vectors only
# (27-candidate grid over ensemble size, learning rate and tree depth).
results = modelFit(
    model=GradientBoostingClassifier(),
    param_grid={
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__max_depth': [3, 5, 7],
    },
    train_features=train_features_embeddes,
    test_features=test_features_embeddes,
)
print(results)

### No NER

In [None]:
# Gradient Boosting on the feature set without NER
# (the `*_no_er` variables are created earlier in the notebook).
gb_estimator = GradientBoostingClassifier()
gb_search_space = {
    # 3 x 3 x 3 = 27 candidate combinations in this grid.
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.5],
    'classifier__max_depth': [3, 5, 7],
}
results = modelFit(
    gb_estimator,
    gb_search_space,
    train_features_no_er,
    test_features_no_er,
)
print(results)

## K-Nearest Neighbors

### All features

In [None]:

# K-Nearest Neighbors on the full feature set.
# modelFit is defined elsewhere in this notebook; it is handed the estimator,
# the hyper-parameter candidates and the train/test features, in that order.
knn_estimator = KNeighborsClassifier()
knn_search_space = {
    # 4 x 2 x 2 = 16 candidate combinations in this grid.
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    # Distance metric exponent: p=1 is Manhattan, p=2 is Euclidean.
    'classifier__p': [1, 2],
}
results = modelFit(
    knn_estimator,
    knn_search_space,
    train_features,
    test_features,
)
print(results)

### Only text and tokens

In [None]:
# K-Nearest Neighbors, restricted to the text + token feature set.
knn_estimator = KNeighborsClassifier()
knn_search_space = {
    # 4 x 2 x 2 = 16 candidate combinations in this grid.
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    # Distance metric exponent: p=1 is Manhattan, p=2 is Euclidean.
    'classifier__p': [1, 2],
}
results = modelFit(
    knn_estimator,
    knn_search_space,
    train_features_text_and_tokens,
    test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# K-Nearest Neighbors, restricted to the entity / PoS feature set.
knn_estimator = KNeighborsClassifier()
knn_search_space = {
    # 4 x 2 x 2 = 16 candidate combinations in this grid.
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    # Distance metric exponent: p=1 is Manhattan, p=2 is Euclidean.
    'classifier__p': [1, 2],
}
results = modelFit(
    knn_estimator,
    knn_search_space,
    train_features_entities,
    test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# K-Nearest Neighbors, restricted to the dense-embedding feature set
# (the `*_embeddes` variables are created earlier in the notebook).
knn_estimator = KNeighborsClassifier()
knn_search_space = {
    # 4 x 2 x 2 = 16 candidate combinations in this grid.
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    # Distance metric exponent: p=1 is Manhattan, p=2 is Euclidean.
    'classifier__p': [1, 2],
}
results = modelFit(
    knn_estimator,
    knn_search_space,
    train_features_embeddes,
    test_features_embeddes,
)
print(results)

### No NER

In [None]:
# K-Nearest Neighbors on the feature set without NER
# (the `*_no_er` variables are created earlier in the notebook).
knn_estimator = KNeighborsClassifier()
knn_search_space = {
    # 4 x 2 x 2 = 16 candidate combinations in this grid.
    'classifier__n_neighbors': [3, 5, 7, 10],
    'classifier__weights': ['uniform', 'distance'],
    # Distance metric exponent: p=1 is Manhattan, p=2 is Euclidean.
    'classifier__p': [1, 2],
}
results = modelFit(
    knn_estimator,
    knn_search_space,
    train_features_no_er,
    test_features_no_er,
)
print(results)

## Multi-Layer Perceptron

### All features

In [None]:

# Multi-layer perceptron on the full feature set.
# modelFit is defined elsewhere in this notebook; it is handed the estimator,
# the hyper-parameter candidates and the train/test features, in that order.
mlp_estimator = MLPClassifier()
mlp_search_space = {
    # 3 x 2 x 2 x 3 x 2 = 72 candidate combinations in this grid.
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd', so the 'adam' combinations are likely duplicated work.
    'classifier__learning_rate': ['constant', 'adaptive'],
}
results = modelFit(
    mlp_estimator,
    mlp_search_space,
    train_features,
    test_features,
)
print(results)

### Only text and tokens

In [None]:
# Multi-layer perceptron, restricted to the text + token feature set.
mlp_estimator = MLPClassifier()
mlp_search_space = {
    # 3 x 2 x 2 x 3 x 2 = 72 candidate combinations in this grid.
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd', so the 'adam' combinations are likely duplicated work.
    'classifier__learning_rate': ['constant', 'adaptive'],
}
results = modelFit(
    mlp_estimator,
    mlp_search_space,
    train_features_text_and_tokens,
    test_features_text_and_tokens,
)
print(results)

### Only entities and PoS

In [None]:
# Multi-layer perceptron, restricted to the entity / PoS feature set.
mlp_estimator = MLPClassifier()
mlp_search_space = {
    # 3 x 2 x 2 x 3 x 2 = 72 candidate combinations in this grid.
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd', so the 'adam' combinations are likely duplicated work.
    'classifier__learning_rate': ['constant', 'adaptive'],
}
results = modelFit(
    mlp_estimator,
    mlp_search_space,
    train_features_entities,
    test_features_entities,
)
print(results)

### Only Dense Embeddings

In [None]:
# Multi-layer perceptron, restricted to the dense-embedding feature set
# (the `*_embeddes` variables are created earlier in the notebook).
mlp_estimator = MLPClassifier()
mlp_search_space = {
    # 3 x 2 x 2 x 3 x 2 = 72 candidate combinations in this grid.
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd', so the 'adam' combinations are likely duplicated work.
    'classifier__learning_rate': ['constant', 'adaptive'],
}
results = modelFit(
    mlp_estimator,
    mlp_search_space,
    train_features_embeddes,
    test_features_embeddes,
)
print(results)

### No NER

In [None]:
# Multi-layer perceptron on the feature set without NER
# (the `*_no_er` variables are created earlier in the notebook).
mlp_estimator = MLPClassifier()
mlp_search_space = {
    # 3 x 2 x 2 x 3 x 2 = 72 candidate combinations in this grid.
    'classifier__hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
    'classifier__activation': ['logistic', 'relu'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    # NOTE(review): scikit-learn documents `learning_rate` as only used when
    # solver='sgd', so the 'adam' combinations are likely duplicated work.
    'classifier__learning_rate': ['constant', 'adaptive'],
}
results = modelFit(
    mlp_estimator,
    mlp_search_space,
    train_features_no_er,
    test_features_no_er,
)
print(results)