## Import dataset

In [66]:
import pandas as pd

data = pd.read_csv('datasets/training_data.csv')
data.sample(5)

Unnamed: 0,text,label
11104,Chiron Delays Flu Vaccine Shipments NEW YORK ...,2
92576,Sheffield tops list of AL MVP favourites CBC S...,1
5531,Red Sox: Burly effort CHICAGO -- A year ago la...,1
14492,Hewitt serves notice heading to Open When you ...,1
96578,Dollar declines following Greenspan remarks (A...,0


## Construct corpus

In [67]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

Number of unique tokens in corpus

In [68]:
# from nltk.tokenize import wordpunct_tokenize
# from collections import Counter

# tokens = wordpunct_tokenize(corpus)
# word_counts = Counter(tokens)

# print("Number of unique tokens:", len(word_counts))

## Tokenization

In [69]:
# from nltk import word_tokenize, sent_tokenize
# from nltk.tokenize import wordpunct_tokenize

# data['word_tokens'] = data['text'].apply(word_tokenize) # slower than wordpunct_tokenize
# data['word_tokens'] = data['text'].apply(wordpunct_tokenize)
# data['word_counts'] = data['word_tokens'].apply(lambda x: len(x))
# data['unique_word_counts'] = data['word_tokens'].apply(lambda x: len(set(x)))

# data['sentences'] = data['text'].apply(sent_tokenize)
# data['sentence_counts'] = data['sentences'].apply(len)

# data.sample(5)

# Spacy

In [70]:
import spacy
nlp = spacy.load("en_core_web_md")

# Customize pipeline
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

nlp.enable_pipe('senter')

In [71]:
# import re
# from spacy.tokens import Doc
# from spacy.language import Language

# @Language.component("regex_cleanup")
# def regex_cleanup(doc):
#     cleaned_tokens = []
#     for token in doc:
#         if not re.match(r'((\w+))', token.text):
#             cleaned_tokens.append(token)
#     return Doc(doc.vocab, words=[token.text for token in cleaned_tokens])

# nlp.add_pipe("regex_cleanup", first=True)

In [72]:
# TODO: process all data
small_data = data.sample(1000)
small_data['tokens'] = small_data['text'].apply(nlp)
small_data.sample(5)

Unnamed: 0,text,label,tokens
53573,"Labor Costs, Hurricane Hit Alcoa Profit Alcoa ...",2,"(Labor, Costs, ,, Hurricane, Hit, Alcoa, Profi..."
113311,Free agent Pavano chooses Yanks Carl Pavano ha...,1,"(Free, agent, Pavano, chooses, Yanks, Carl, Pa..."
17593,Second quarter sees rise in storage revenue Wo...,3,"(Second, quarter, sees, rise, in, storage, rev..."
80083,Hewitt holds off French talent Monfils in Pari...,1,"(Hewitt, holds, off, French, talent, Monfils, ..."
44861,Baghdad hit by several bomb blasts A suicide b...,0,"(Baghdad, hit, by, several, bomb, blasts, A, s..."


In [73]:
small_data['tokens_count'] = small_data['tokens'].apply(len)
small_data['tokens_count'].describe()

count    1000.000000
mean       44.316000
std        11.614346
min        16.000000
25%        37.000000
50%        44.000000
75%        50.000000
max       132.000000
Name: tokens_count, dtype: float64

In [74]:
def filter_tokens(tokens):
    tokens_without_punctuation = [token for token in tokens if not token.is_punct]
    tokens_without_space = [token for token in tokens_without_punctuation if not token.is_space]
    tokens_without_stopwords = [token for token in tokens_without_space if not token.is_stop]
    return tokens_without_stopwords

In [75]:
def filter_text(text):
    tokens_lemmatized = [token.lemma_ for token in text]
    tokens_lower = [token.lower() for token in tokens_lemmatized]
    return ' '.join(tokens_lower)

In [76]:
def text_embeddings(text):
    token_embeddings = [token.vector for token in text]
    return token_embeddings

In [77]:
def text_ner(text):
    return [(token, token.pos_, token.ent_iob_, token.ent_type_) for token in text]

In [78]:
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
small_data['text_embeddings'] = small_data['tokens_filtered'].apply(text_embeddings)
small_data['text_ner'] = small_data['tokens_filtered'].apply(text_ner)
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner
92274,"NFL : McNabb, Owens torch Cowboys While it was...",1,"(NFL, :, McNabb, ,, Owens, torch, Cowboys, Whi...",44,"[NFL, McNabb, Owens, torch, Cowboys, sure, hit...",nfl mcnabb owens torch cowboys sure hit east c...,"[[-4.2266, 8.2051, -1.2351, 4.3285, 6.7409, 11...","[(NFL, NOUN, B, ORG), (McNabb, PROPN, B, PERSO..."
44181,Bittersweet day in Montreal as Expos officiall...,0,"(Bittersweet, day, in, Montreal, as, Expos, of...",35,"[Bittersweet, day, Montreal, Expos, officially...",bittersweet day montreal expos officially play...,"[[0.73147, -2.604, -3.4392, 0.035654, -2.0559,...","[(Bittersweet, PROPN, O, ), (day, NOUN, O, ), ..."
106304,Snake Eater a Slithering Success Despite all t...,3,"(Snake, Eater, a, Slithering, Success, Despite...",38,"[Snake, Eater, Slithering, Success, Despite, B...",snake eater slither success despite bond touch...,"[[-2.0681, 1.4007, -0.4877, -3.2356, 0.53422, ...","[(Snake, PROPN, O, ), (Eater, PROPN, O, ), (Sl..."
44583,IBM claims BlueGene/L now world #39;s most pow...,3,"(IBM, claims, BlueGene, /, L, now, world, #, 3...",41,"[IBM, claims, BlueGene, L, world, 39;s, powerf...",ibm claim bluegene l world 39;s powerful super...,"[[-6.2718, -2.105, 8.7398, 4.6458, 2.0883, 1.5...","[(IBM, PROPN, B, ORG), (claims, VERB, O, ), (B..."
73653,"HP, QLogic Unveil SAN-in-a-Box Storage powerho...",3,"(HP, ,, QLogic, Unveil, SAN, -, in, -, a, -, B...",43,"[HP, QLogic, Unveil, SAN, Box, Storage, powerh...",hp qlogic unveil san box storage powerhouse he...,"[[-0.79661, 8.0083, -4.6256, 7.6264, 0.02344, ...","[(HP, PROPN, B, ORG), (QLogic, PROPN, B, ORG),..."


In [79]:
def process_entities(text_filtered):
    entity_dict = {}
    doc = nlp(text_filtered)
    for ent in doc.ents:
        if str(ent) not in entity_dict:
            entity_dict[ent.lemma_] = (ent.root.pos_, ent.label_)

    non_entity_strings = [token for token in doc 
                        if token.text not in entity_dict 
                        and token.ent_iob_ == "O"
                        and token.pos_ != 'SPACE']
    entity_dict.update({token.lemma_: (token.pos_, None) for token in non_entity_strings})

    return entity_dict

small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

In [80]:
small_data.sample(5)

Unnamed: 0,text,label,tokens,tokens_count,tokens_filtered,text_filtered,text_embeddings,text_ner,entity_dict
80934,Bomber Hits Near Baghdad Airport A car bomb ex...,0,"(Bomber, Hits, Near, Baghdad, Airport, A, car,...",37,"[Bomber, Hits, Near, Baghdad, Airport, car, bo...",bomber hits near baghdad airport car bomb expl...,"[[-0.1339, 2.3994, -0.88813, 2.1828, -0.51505,...","[(Bomber, PROPN, O, ), (Hits, PROPN, O, ), (Ne...","{'baghdad airport': ('PROPN', 'FAC'), 'lebanes..."
101886,Dollar Hits New Low; Gold Heads for #36;455 (...,2,"(Dollar, Hits, New, Low, ;, Gold, Heads, for, ...",57,"[Dollar, Hits, New, Low, Gold, Heads, 36;455, ...",dollar hits new low gold heads 36;455 reuters ...,"[[-1.8113, 0.44784, -1.846, 0.52974, 0.78695, ...","[(Dollar, NOUN, O, ), (Hits, PROPN, O, ), (New...","{'36;455': ('CCONJ', 'CARDINAL'), 'reuter reut..."
14941,Okocha Tests Liverpool Super Eagles captain Au...,1,"(Okocha, Tests, Liverpool, Super, Eagles, capt...",36,"[Okocha, Tests, Liverpool, Super, Eagles, capt...",okocha tests liverpool super eagles captain au...,"[[1.5999, 3.4158, 1.0194, -0.1812, 0.40594, -1...","[(Okocha, PROPN, B, ORG), (Tests, PROPN, I, OR...","{'okocha': ('PROPN', 'NORP'), 'liverpool super..."
116709,Sorry Anelka still on the trading block Anelka...,1,"(Sorry, Anelka, still, on, the, trading, block...",42,"[Sorry, Anelka, trading, block, Anelka, expres...",sorry anelka trading block anelka express wish...,"[[4.1412, 1.4775, 1.7142, -1.2661, -2.878, -1....","[(Sorry, INTJ, O, ), (Anelka, PROPN, B, PRODUC...","{'anelka': ('PROPN', 'NORP'), 'quot;big': ('PR..."
105818,Dollar Down But Seen in Ranges (Reuters) Reute...,2,"(Dollar, Down, But, Seen, in, Ranges, (, Reute...",44,"[Dollar, Seen, Ranges, Reuters, Reuters, dolla...",dollar see ranges reuters reuters dollar fall ...,"[[-1.8113, 0.44784, -1.846, 0.52974, 0.78695, ...","[(Dollar, NOUN, O, ), (Seen, VERB, O, ), (Rang...","{'reuters reuters': ('PROPN', 'ORG'), 'dollar ..."


In [81]:
# get the filtered text and tokens in its own dataframe and save to csv
small_data_filtered = small_data[['text_filtered', 'label',]]
small_data_filtered.to_csv('datasets/small_data_filtered.csv', index=False)

In [82]:
# %store small_data

# Model Training and predictions

In [102]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import confusion_matrix

# split the data into training and testing sets
train, test = train_test_split(small_data, test_size=0.2, random_state=42)

train_features = []
train_labels = []
test_features = []
test_labels = []

for index, row in train.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    features = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    train_features.append(features)
    train_labels.append(label)
    
for index, row in test.iterrows():
    entity_dict = row['entity_dict']
    word_embeddings = np.array(row['text_embeddings'])  # Convert list to numpy array
    word_embeddings_doc = np.mean(word_embeddings, axis=0)  # Average the word embeddings
    text_filtered = row['text_filtered']
    tokens_filtered = row['tokens_filtered']
    label = row['label']
    
    features = {
        'entity_dict': str(entity_dict),  # Convert dictionary to string
        'word_embeddings': [str(embedding) for embedding in word_embeddings.tolist()],  # Convert numpy array to list of strings
        'text_filtered': str(text_filtered),  # Convert token to string
        'tokens_filtered': [str(token) for token in tokens_filtered]  # Convert tokens to strings
    }
    test_features.append(features)
    test_labels.append(label)

Accuracy: 0.8600
F1 Score: 0.8596
Precision: 0.8635
Recall: 0.8600
[[36  0  4  2]
 [ 1 44  1  0]
 [ 2  0 58  5]
 [ 1  1 11 34]]


In [110]:
from sklearn.model_selection import GridSearchCV

def modelFit(model, param_grid):
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', model)
    ])

    # Create the GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

    # Fit the GridSearchCV object
    grid_search.fit(train_features, train_labels)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict the test data
    preds = best_model.predict(test_features)

    # Calculate the accuracy
    accuracy = accuracy_score(test_labels, preds)
    f1 = f1_score(test_labels, preds, average='weighted')
    precision = precision_score(test_labels, preds, average='weighted')
    recall = recall_score(test_labels, preds, average='weighted')

    results = {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': confusion_matrix(test_labels, preds),
        'best_params': grid_search.best_params_
    }

    return results

## NB

In [111]:
results = modelFit(MultinomialNB(), {})
print(results)

{'accuracy': 0.86, 'f1': 0.8596217569936643, 'precision': 0.8634966673991064, 'recall': 0.86, 'confusion_matrix': array([[36,  0,  4,  2],
       [ 1, 44,  1,  0],
       [ 2,  0, 58,  5],
       [ 1,  1, 11, 34]]), 'best_params': {}}


## LR

In [113]:
results = modelFit(LogisticRegression(), {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
})
print(results)



{'accuracy': 0.805, 'f1': 0.8041813832253208, 'precision': 0.8125639499939896, 'recall': 0.805, 'confusion_matrix': array([[32,  0,  4,  6],
       [ 0, 43,  2,  1],
       [ 7,  2, 45, 11],
       [ 1,  2,  3, 41]]), 'best_params': {'classifier__C': 10.0, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}}


## DTC

In [106]:
from sklearn.tree import DecisionTreeClassifier

results = modelFit(DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 5, 10, 15],
        'classifier__min_samples_split': [2, 5, 10]
    })
print(results)

{'accuracy': 0.615, 'f1': 0.6243559264176284, 'precision': 0.6582870313752668, 'recall': 0.615, 'confusion_matrix': array([[24,  1,  3, 14],
       [ 2, 29,  2, 13],
       [ 8,  4, 38, 15],
       [ 5,  2,  8, 32]])}
