# NER Baseline
This notebook has the following goals:
- To establish baseline accuracy for the NER portion of GLG1's workflow
- To test two different embeddings:
    - One-hot
    - TF-IDF
- To test two different traditional ML models
    - Logistic regression
    - XGB
- To experiment with preprocessing techniques to see how it impacts accuracy
- To determine how to convert text into the format our model is expecting

## Basic imports and setup
### Imports

In [1]:
import pandas as pd
import spacy
from nltk import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score as accuracy,
    recall_score as recall,
    precision_score as precision,
    f1_score
)


  from pandas import MultiIndex, Int64Index


### Read in dataframe built based on previous EDA

In [2]:
ner_df = pd.read_csv('../datasets/extended_df.csv')
ner_df.drop(columns=['Unnamed: 0'], inplace=True)
ner_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,WordLength,Capital,Non-Punctuation,StopWord,IsNER
0,Sentence: 1,Thousands,NNS,O,9,True,True,False,0
1,,of,IN,O,2,False,True,True,0
2,,demonstrators,NNS,O,13,False,True,False,0
3,,have,VBP,O,4,False,True,True,0
4,,marched,VBN,O,7,False,True,False,0


### Make some updates to the "Sentence #" column
Ensuring every row has a sentence number, and changing that column to an int for use as a numerical feature later.

In [3]:
ner_df['Sentence #'] = ner_df['Sentence #'].str.replace('Sentence: ','')
ner_df['Sentence #'].fillna(method='ffill', inplace=True)
ner_df['Sentence #'] = ner_df['Sentence #'].astype('int64')
ner_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,WordLength,Capital,Non-Punctuation,StopWord,IsNER
0,1,Thousands,NNS,O,9,True,True,False,0
1,1,of,IN,O,2,False,True,True,0
2,1,demonstrators,NNS,O,13,False,True,False,0
3,1,have,VBP,O,4,False,True,True,0
4,1,marched,VBN,O,7,False,True,False,0


### Initial splits
Establishing X and y DataFrames, and splitting prior to any engineering, so as to avoid leakage.

In [4]:
X = ner_df.drop(columns=['IsNER', 'Tag'])
y = ner_df['IsNER']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Establishing metrics 
Based on Electronics Purchase Prediction notebook from class.

In [5]:
def display_metrics(y_true, y_pred):
    print(f'Confusion Matrix: \n{confusion_matrix(y_true, y_pred)}')
    print('Accuracy: {:.3f}'.format(accuracy(y_true, y_pred)))
    print('Recall: {:.3f}'.format(recall(y_true, y_pred)))
    print('Precision: {:.3f}'.format(precision(y_true, y_pred)))
    print('F1 Score: {:.3f}'.format(f1_score(y_true, y_pred)))

## Model Testing
### Testing models with one-hot encoding

In [6]:
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)
models = [xgb_model, lr_model]
model_names = ['XGB', 'Logistic Regression']

categorical_cols = ['Word', 'POS']

numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'bool']]

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

for model, model_name in zip(models, model_names):


    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

    pipeline.fit(X_train, y_train)  

    preds = pipeline.predict(X_test)

    print('Display metrics for {} with one-hot encoding:'.format(model_name))
    display_metrics(y_test, preds)



Display metrics for XGB with one-hot encoding:
Confusion Matrix: 
[[225932   8223]
 [  2632  25357]]
Accuracy: 0.959
Recall: 0.906
Precision: 0.755
F1 Score: 0.824
Display metrics for Logistic Regression with one-hot encoding:
Confusion Matrix: 
[[223103  11052]
 [  6677  21312]]
Accuracy: 0.932
Recall: 0.761
Precision: 0.659
F1 Score: 0.706


### Testing models with TF-IDF encoding

In [7]:
X_train_tfidf= X_train.copy()
X_test_tfidf = X_test.copy()

model_names = ['XGB', 'Logistic Regression']


numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'bool']]

numerical_transformer = SimpleImputer(strategy='constant')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
    ])

tfidf = TfidfVectorizer()

tfidf_vectors = tfidf.fit_transform(X_train_tfidf['Word'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_train_tfidf.index)
X_train_tfidf.drop(columns=['Word'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.fit_transform(X_train_tfidf['POS'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_train_tfidf.index)
X_train_tfidf.drop(columns=['POS'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.transform(X_test_tfidf['Word'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_test_tfidf.index)
X_test_tfidf.drop(columns=['Word'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.transform(X_test_tfidf['POS'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_test_tfidf.index)
X_test_tfidf.drop(columns=['POS'], inplace=True)
X_test_tfidf.join(tfidf_vectors_df)

for model, model_name in zip(models, model_names):


    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

    pipeline.fit(X_train_tfidf, y_train)  

    preds = pipeline.predict(X_test_tfidf)

    print('Display metrics for {} with TF-IDF:'.format(model_name))
    display_metrics(y_test, preds)



Display metrics for XGB with TF-IDF:
Confusion Matrix: 
[[220549  13606]
 [  3333  24656]]
Accuracy: 0.935
Recall: 0.881
Precision: 0.644
F1 Score: 0.744
Display metrics for Logistic Regression with TF-IDF:
Confusion Matrix: 
[[224413   9742]
 [ 10848  17141]]
Accuracy: 0.921
Recall: 0.612
Precision: 0.638
F1 Score: 0.625


## Experiementation
### Establishing baseline preprocessing

In [8]:
def text_preprossesor(word_array):
    return_list = []
    lemmatizer = WordNetLemmatizer()
    for word in word_array:
        word = word.lower()
        word = lemmatizer.lemmatize(word)
        return_list.append(word)
    return return_list
    

### Building new DF to test

In [9]:
preprocessed_df = ner_df.copy()
preprocessed_df.Word = text_preprossesor(preprocessed_df.Word)
preprocessed_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,WordLength,Capital,Non-Punctuation,StopWord,IsNER
0,1,thousand,NNS,O,9,True,True,False,0
1,1,of,IN,O,2,False,True,True,0
2,1,demonstrator,NNS,O,13,False,True,False,0
3,1,have,VBP,O,4,False,True,True,0
4,1,marched,VBN,O,7,False,True,False,0


### Resplitting data

In [10]:
X = preprocessed_df.drop(columns=['IsNER', 'Tag'])
y = preprocessed_df['IsNER']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### One-hot encoding with preprocessed data

In [11]:
xgb_model = XGBClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)
models = [xgb_model, lr_model]
model_names = ['XGB', 'Logistic Regression']

categorical_cols = ['Word', 'POS']

numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'bool']]

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

for model, model_name in zip(models, model_names):


    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

    pipeline.fit(X_train, y_train)  

    preds = pipeline.predict(X_test)

    print('Display metrics for {} with one-hot encoding:'.format(model_name))
    display_metrics(y_test, preds)



Display metrics for XGB with one-hot encoding:
Confusion Matrix: 
[[226065   8090]
 [  2633  25356]]
Accuracy: 0.959
Recall: 0.906
Precision: 0.758
F1 Score: 0.825
Display metrics for Logistic Regression with one-hot encoding:
Confusion Matrix: 
[[223103  11052]
 [  6677  21312]]
Accuracy: 0.932
Recall: 0.761
Precision: 0.659
F1 Score: 0.706


### TD-IDF with preprocessed data

In [12]:
X_train_tfidf= X_train.copy()
X_test_tfidf = X_test.copy()

model_names = ['XGB', 'Logistic Regression']


numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64', 'bool']]

numerical_transformer = SimpleImputer(strategy='constant')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
    ])

tfidf = TfidfVectorizer()

tfidf_vectors = tfidf.fit_transform(X_train_tfidf['Word'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_train_tfidf.index)
X_train_tfidf.drop(columns=['Word'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.fit_transform(X_train_tfidf['POS'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_train_tfidf.index)
X_train_tfidf.drop(columns=['POS'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.transform(X_test_tfidf['Word'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_test_tfidf.index)
X_test_tfidf.drop(columns=['Word'], inplace=True)
X_train_tfidf.join(tfidf_vectors_df)

tfidf_vectors = tfidf.transform(X_test_tfidf['POS'])
tfidf_vectors_df = pd.DataFrame(tfidf_vectors, index=X_test_tfidf.index)
X_test_tfidf.drop(columns=['POS'], inplace=True)
X_test_tfidf.join(tfidf_vectors_df)

for model, model_name in zip(models, model_names):


    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                                ])

    pipeline.fit(X_train_tfidf, y_train)  

    preds = pipeline.predict(X_test_tfidf)

    print('Display metrics for {} with TF-IDF:'.format(model_name))
    display_metrics(y_test, preds)



Display metrics for XGB with TF-IDF:
Confusion Matrix: 
[[220549  13606]
 [  3333  24656]]
Accuracy: 0.935
Recall: 0.881
Precision: 0.644
F1 Score: 0.744
Display metrics for Logistic Regression with TF-IDF:
Confusion Matrix: 
[[224413   9742]
 [ 10848  17141]]
Accuracy: 0.921
Recall: 0.612
Precision: 0.638
F1 Score: 0.625


### Changing sentence format
Putting sentences in format they'll be in in app.

In [13]:
sentence_df = ner_df.groupby('Sentence #')['Word'].apply(lambda x:x.str.cat(sep=' '))
sentence_df.head()

Sentence #
1    Thousands of demonstrators have marched throug...
2    Families of soldiers killed in the conflict jo...
3    They marched from the Houses of Parliament to ...
4    Police put the number of marchers at 10,000 wh...
5    The protest comes on the eve of the annual con...
Name: Word, dtype: object

### Converting app format to model format
Testing on first 100 sentences.

In [14]:
nlp = spacy.load('en_core_web_sm')
def sentence_preprocessor(df):
    return_list = []
    for i, sentence in enumerate(df):
        words = nlp(sentence)
        # print(set(words.ents))
        for word in words:
            word_base = word
            word_lemma = word.lemma_
            word_pos = word.pos_
            word_tag = word.tag_
            word_length = len(word)
            word_capitalization = str(word)[0].isupper()
            word_punctiation = str(word).isalnum()
            word_stop = word.is_stop
            is_ner = str(word) in set(ent.text for ent in words.ents)
            return_list.append((i, word_base, word_lemma, word_pos, word_tag, word_length, word_capitalization, word_punctiation, word_stop, is_ner))
    return return_list

df_list = sentence_preprocessor(sentence_df[:100])
intermediate_df = pd.DataFrame(df_list, columns=['SentenceNumber', 'WordBase', 'WordLemma', 'WordPOS', 'WordTag', 
    'WordLength', 'IsCapitalized', 'NonPunctuation', 'IsStop', 'PossibleNER'])
intermediate_df.head(25)



Unnamed: 0,SentenceNumber,WordBase,WordLemma,WordPOS,WordTag,WordLength,IsCapitalized,NonPunctuation,IsStop,PossibleNER
0,0,Thousands,thousand,NOUN,NNS,9,True,True,False,True
1,0,of,of,ADP,IN,2,False,True,True,False
2,0,demonstrators,demonstrator,NOUN,NNS,13,False,True,False,False
3,0,have,have,AUX,VBP,4,False,True,True,False
4,0,marched,march,VERB,VBN,7,False,True,False,False
5,0,through,through,ADP,IN,7,False,True,True,False
6,0,London,London,PROPN,NNP,6,True,True,False,True
7,0,to,to,PART,TO,2,False,True,True,False
8,0,protest,protest,VERB,VB,7,False,True,False,False
9,0,the,the,DET,DT,3,False,True,True,False
