In [None]:
import pandas as pd
import spacy
from spacy.tokens import Doc
from nltk import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pickle

In [None]:
# Load the token-level dataset and discard the index column written out by a
# previous DataFrame.to_csv call.
ner_df = pd.read_csv('../../datasets/extended_df.csv').drop(columns=['Unnamed: 0'])

# 'Sentence #' is cleaned in three steps: strip the 'Sentence: ' prefix,
# forward-fill the gaps so every row carries a sentence value, then cast to int.
# The result is assigned back rather than mutated through
# `fillna(method='ffill', inplace=True)` on a column selection: that form is
# deprecated and does not update the parent frame under pandas Copy-on-Write.
ner_df['Sentence #'] = (
    ner_df['Sentence #']
    .str.replace('Sentence: ', '', regex=False)
    .ffill()
    .astype('int64')
)

In [None]:
# Rebuild one text string per sentence by joining its words with single spaces.
sentence_df = (
    ner_df
    .groupby('Sentence #', as_index=False)['Word']
    .apply(lambda sentence_words: sentence_words.str.cat(sep=' '))
)

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline once at module level;
# sentence_preprocessor below reads `nlp` as a global.
nlp = spacy.load('en_core_web_sm')
def sentence_preprocessor(df):
    """Build one tuple of spaCy-derived features per token of ``df``.

    Rows are grouped by their ``'Sentence #'`` value. Each group's ``Word``
    values are wrapped in a pre-tokenised ``Doc`` and passed through the
    module-level ``nlp`` pipeline, and every resulting token contributes one
    feature tuple.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Sentence #'`` and ``'Word'``.

    Returns
    -------
    list of tuple
        One 10-tuple per token, sentences in ascending ``'Sentence #'`` order
        and tokens in row order within a sentence:
        ``(token, lemma, pos, tag, dep, length, is_capitalized, is_alnum,
        is_stop, is_ner)``. An empty ``df`` yields an empty list.

    Notes
    -----
    * The frame is split with a single ``groupby`` instead of being
      re-filtered with a boolean mask for every sentence number. That filter
      made the run time grow with rows x sentences.
    * Every sentence value present in ``df`` is processed, including values
      below 1 that a ``range(max)`` loop would skip.
    * NOTE(review): the first tuple element is still the spaCy ``Token``
      itself, not its text, so the result keeps every ``Doc`` alive.
      Presumably ``token.text`` would be enough; confirm with consumers of the
      ``WordBase`` column before changing it.
    * NOTE(review): ``is_ner`` compares the token text with whole entity
      texts, so tokens inside multi-word entities are not flagged. This is
      kept unchanged so the feature matches what existing models were
      trained on.
    """
    feature_rows = []
    # groupby iterates the keys in sorted order, so sentences are visited in
    # ascending order.
    for _, sentence_words in df.groupby('Sentence #')['Word']:
        doc = nlp(Doc(nlp.vocab, words=sentence_words.tolist()))
        # Build the entity-text lookup once per sentence, not once per token.
        entity_texts = {ent.text for ent in doc.ents}
        for token in doc:
            token_text = str(token)
            feature_rows.append((
                token,
                token.lemma_,
                token.pos_,
                token.tag_,
                token.dep_,
                len(token),
                # [:1] instead of [0]: an empty token gives False, not IndexError.
                token_text[:1].isupper(),
                token_text.isalnum(),
                token.is_stop,
                token_text in entity_texts,
            ))
    return feature_rows

# Compute the per-token features for the whole dataset and tabulate them.
df_list = sentence_preprocessor(ner_df)
intermediate_df = pd.DataFrame(df_list, columns=['WordBase', 'WordLemma', 'WordPOS', 'WordTag', 
    'WordDep', 'WordLength', 'IsCapitalized', 'NonPunctuation', 'IsStop', 'PossibleNER'])

# The train/test cell pairs these feature rows with ner_df.IsNER purely by
# position, so a row-count mismatch would silently mislabel the training data.
assert len(intermediate_df) == len(ner_df), (
    f"feature rows ({len(intermediate_df)}) do not match ner_df rows ({len(ner_df)})"
)

In [None]:
# Positional hold-out split: the first SPLIT_AT rows train the model and the
# remainder is kept for testing. Features and labels are cut at the same row.
SPLIT_AT = 835700
train_rows, test_rows = slice(None, SPLIT_AT), slice(SPLIT_AT, None)

X_train, X_test = intermediate_df[train_rows], intermediate_df[test_rows]
y_train, y_test = ner_df.IsNER[train_rows], ner_df.IsNER[test_rows]

In [None]:
# Feature columns grouped by the preprocessing they receive.
categorical_cols = ['WordLemma', 'WordPOS', 'WordTag', 'WordDep']
numerical_cols = ['IsCapitalized', 'NonPunctuation', 'IsStop', 'PossibleNER', 'WordLength']

# Numeric and boolean columns: constant-value imputation only.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Classifier with a fixed seed.
xgb_model = XGBClassifier(random_state=42)

# Preprocessing and classifier chained into one estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb_model', xgb_model),
])

pipeline.fit(X_train, y_train)

In [None]:
# Persist the fitted pipeline. The context manager closes (and flushes) the
# file handle even if pickling fails; the bare open() call never closed it.
with open('ner_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)