In [1]:
import pandas as pd
import spacy
from spacy.tokens import Doc
from nltk import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pickle

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score as accuracy,
    recall_score as recall,
    precision_score as precision,
    f1_score
)

def display_metrics(y_true, y_pred):
    print(f'Confusion Matrix: \n{confusion_matrix(y_true, y_pred)}')
    print('Accuracy: {:.3f}'.format(accuracy(y_true, y_pred)))
    print('Recall: {:.3f}'.format(recall(y_true, y_pred)))
    print('Precision: {:.3f}'.format(precision(y_true, y_pred)))
    print('F1 Score: {:.3f}'.format(f1_score(y_true, y_pred)))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ner_df = pd.read_csv('../../datasets/extended_df.csv')
ner_df.drop(columns=['Unnamed: 0'], inplace=True)
ner_df['Sentence #'] = ner_df['Sentence #'].str.replace('Sentence: ','')
ner_df['Sentence #'].fillna(method='ffill', inplace=True)
ner_df['Sentence #'] = ner_df['Sentence #'].astype('int64')

In [3]:
ner_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,WordLength,Capital,Non-Punctuation,StopWord,IsNER
0,1,Thousands,NNS,O,9,True,True,False,0
1,1,of,IN,O,2,False,True,True,0
2,1,demonstrators,NNS,O,13,False,True,False,0
3,1,have,VBP,O,4,False,True,True,0
4,1,marched,VBN,O,7,False,True,False,0


In [4]:
sentence_df = ner_df.groupby('Sentence #', as_index=False)['Word'].apply(lambda x:x.str.cat(sep=' '))
sentence_df.head()

Unnamed: 0,Sentence #,Word
0,1,Thousands of demonstrators have marched throug...
1,2,Families of soldiers killed in the conflict jo...
2,3,They marched from the Houses of Parliament to ...
3,4,"Police put the number of marchers at 10,000 wh..."
4,5,The protest comes on the eve of the annual con...


In [5]:
nlp = spacy.load('en_core_web_sm')
def sentence_preprocessor(df):
    return_list = []
    # for index, sentence in df.iterrows():
    #     words = nlp(sentence['Word'])
    for sentence in range(df['Sentence #'].max()):
        words = nlp(Doc(nlp.vocab, df[df['Sentence #'] == sentence + 1].Word.values))
        for word in words:
            # print(word)
            word_base = word
            word_lemma = word.lemma_
            word_pos = word.pos_
            word_tag = word.tag_
            word_dep = word.dep_
            word_length = len(word)
            word_capitalization = str(word)[0].isupper()
            word_punctiation = str(word).isalnum()
            word_stop = word.is_stop
            is_ner = str(word) in set(ent.text for ent in words.ents)
            return_list.append((word_base, word_lemma, word_pos, word_tag, word_dep, word_length, word_capitalization, word_punctiation, word_stop, is_ner))
    return return_list

df_list = sentence_preprocessor(ner_df)
intermediate_df = pd.DataFrame(df_list, columns=['WordBase', 'WordLemma', 'WordPOS', 'WordTag', 
    'WordDep', 'WordLength', 'IsCapitalized', 'NonPunctuation', 'IsStop', 'PossibleNER'])

In [6]:
X_train = intermediate_df[:835700]
X_test = intermediate_df[835700:]
y_train = ner_df.IsNER[:835700]
y_test = ner_df.IsNER[835700:]

In [7]:
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)
models = [xgb_model, lr_model]
model_names = ['XGB', 'Logistic Regression']

categorical_cols = ['WordLemma', 'WordPOS', 'WordTag', 'WordDep']
numerical_cols = ['IsCapitalized', 'NonPunctuation', 'IsStop', 'PossibleNER', 'WordLength']

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

for model, model_name in zip(models, model_names):


    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

    pipeline.fit(X_train, y_train)  

    preds = pipeline.predict(X_test)

    print('Display metrics for {} with one-hot encoding:'.format(model_name))
    display_metrics(y_test, preds) 

Display metrics for XGB with one-hot encoding:
Confusion Matrix: 
[[186677   3372]
 [  2521  20305]]
Accuracy: 0.972
Recall: 0.890
Precision: 0.858
F1 Score: 0.873


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Display metrics for Logistic Regression with one-hot encoding:
Confusion Matrix: 
[[187391   2658]
 [  2952  19874]]
Accuracy: 0.974
Recall: 0.871
Precision: 0.882
F1 Score: 0.876


In [8]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('xgb_model', xgb_model)
                            ])

pickle.dump(pipeline, open('../../flask_app/ner_model.pkl', 'wb'))