In [1]:
import io
import os
from copy import deepcopy

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imblearn_pipeline
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline as sklearn_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from constants import LOGISTIC_REGRESSION_MODEL, PASSIVE_AGGRESSIVE_MODEL, DECISION_TREE_MODEL, SVM_MODEL, \
    RANDOM_FOREST_MODEL, NAIVE_BAYES_MODEL

number_of_found_word_vecs = 0
number_of_not_found_word_vecs = 0


### Intro 

We are going to introduce to you a machine learning classifier with performance of 95% on hebrew corpus of 66K train instances. 

We've done profound feature engineering, feature & model selection, and we'll present to you the best we've found. 

First - let's load our data

In [2]:
def get_data():
    data_path = 'resources' + os.sep + 'dataset_biluo.csv'
    df = pd.read_csv(data_path)
    y = df['BILUO']
    if str(y.iloc[len(y)-1]) == 'nan':
        y.iloc[len(y)-1] = 'O'
    df.drop(columns=['BILUO', 'Bio'], inplace=True)
    X = df
    return X, y

X,y = get_data()

X.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Gender,Lemma,Number,Person,Pos,Prefix,Status,Suffix,Tense,Token,TokenOrder,Word
0,unspecified,DOCSTART,unspecified,unspecified,foreign,no_pref,unspecified,no_suffix,unspecified,DOCSTART,99,DOCSTART
1,masculine,CARD1,singular,unspecified,numeral,no_pref,absolute,no_suffix,unspecified,אחד,102,אחד
2,unspecified,כול,unspecified,unspecified,quantifier,מ,construct,no_suffix,unspecified,מכל,103,כל
3,masculine,שני,singular,unspecified,noun,no_pref,absolute,no_suffix,unspecified,שני,104,שני
4,masculine,ישראלי,plural,unspecified,noun,no_pref,absolute,no_suffix,unspecified,ישראלים,105,ישראלים


We see we have quiet good information in the csv to start with,  
but we add some context features including word embeddings, gazzet features, and relate to stop_words.

Our features:  

First, few basic features we have in the CSV. 

| Pos | Person | Prefix | Suffix |
|-----| ------ | ------ | ------ |

Second, we want some 'context' features:

| Prev_Pos | Next_Pos |
|----------| -------- |

What about prev & next word/token/lemma?  
Because we have a lot of unique words and we don't want to many 'is_word_X' features, we use the power of trained word embeddings in hebrew:
https://fasttext.cc/docs/en/crawl-vectors.html

We tried taking the vectors of all of the possibilities: Lemma, Token, Word.  
And - the Token vector gives the best result.  
It makes sense because the token captures more significance. For example "לחיפה" has more information then "חיפה". 

| PrevTokenVector | TokenVector | NextTokenVector |
| --------------- | ----------- | --------------- |

Third, we have some 'Gazzet' features. Known set or Locations, Persons, etc, that we made a feature for each type. 

| In_LOC_Gazzet | In_PERS_Gazzet | In_PERCENT_Gazzet | In_MONEY_Gazzet | In_ORG_Gazzet |
| ------------- | -------------- | ----------------- | --------------- | ------------- |

Lastly, we know that stop words tend to of tag 'O', so we'll add 'is_stop_word' feature

| is_stop_word |
|--------------|

So we have 15 features, which part of then will be 'dummy features':

> dummies_cols = ['Person', 'Pos', 'Prev_Pos', 'Next_Pos', 'Suffix', 'Prefix']

In additions, the word vectors will be features as well. 

Our trained model have Vword for each word he has in it's vocabulary. 
This vector is of length - 300. 

For a sequence of prev_word, curr_word, next_word, we'll make a vector of size 900 which is the concatenation of the 3 vectors:  

$$ V_(prev-word) * V_(curr-word) * V_(next-word) $$

And what about words that doesn't have vector representation?  
Well, luckily for us, we will see that 97% of the words to have vector representation we we give the rest of the 3% zero vectors. It's negligible.

And this is our feature exctractor code:

In [3]:
class FeatureExtractor(TransformerMixin):
    def __init__(self):
        self.gazzet_sets = self.load_gazzets()
        self.stop_words = self.load_stop_words()
        print("Loading Word Embeddings... Please wait...")
        model_path = 'resources' + os.sep + 'cc.he.300.vec'
        self.trained_model = self.load_word_embeddings(model_path)
        self.VECTOR_SIZE = 300
        print("FeatureExtractor initialized!")

    def load_word_embeddings(self, fname):
        fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = [float(x) for x in tokens[1:]]
        return data

    def load_stop_words(self):
        f = open("resources" + os.sep + "all_heb_stop_words.txt", 'r', encoding='utf-8')
        stop_words = [w.strip("\n") for w in f.readlines()]
        f.close()
        return stop_words

    def load_gazzets(self):
        f = open("resources" + os.sep + "naama_gazzets" + os.sep + "Dictionary.txt", 'r', encoding='utf-8')
        all_gazzets = f.readlines()
        f.close()
        gazzet_sets = {'LOC': set(), 'PERS': set(), 'ORG': set(), 'MONEY': set(), 'PERCENT': set()}
        for item in all_gazzets:
            for possibility in gazzet_sets.keys():
                if possibility in item:
                    item = item.replace(possibility, "").strip("\n").strip(" ")
                    if item != "":
                        gazzet_sets[possibility].add(item)
        print("Done loading gazzets")
        return gazzet_sets

    def transform(self, data):
        X = []
        all_features = {'Gender', 'Lemma', 'Number', 'Person', 'Pos', 'Status', 'Tense', 'Token', 'TokenOrder', 'Word', 'Prev_Prev_Pos', 'Prev_Pos', 'Next_Pos', 'Next_Next_Pos', 'Prev_Prev_Number', 'Prev_Number', 'Next_Number', 'Next_Next_Number', 'Prev_Prev_Gender', 'Prev_Gender', 'Next_Gender', 'Next_Next_Gender', 'Prev_Word', 'Next_Word', 'Prev_Token', 'Next_Token', 'In_LOC_Gazzet', 'In_PERS_Gazzet', 'In_ORG_Gazzet', 'In_MONEY_Gazzet',  'In_PERCENT_Gazzet', 'Suffix'}

        wanted_features = {'Prev_Pos', 'Next_Pos', 'Suffix', 'Prefix', 'TokenVector', 'NextTokenVector',
                           'PrevTokenVector', 'Person', 'Pos', 'In_MONEY_Gazzet', 'In_ORG_Gazzet', 'In_LOC_Gazzet',
                           'In_PERS_Gazzet', 'In_PERCENT_Gazzet', 'is_stop_word'}

        for i in range(0, len(data)):
            prev_prev_row_data, prev_row_data, curr_row_data, next_row_data, next_next_row_data = \
                self.get_prev_curr_next_row_data(data, i)

            if 'TokenVector' in wanted_features:
                    self.add_word_vectors(curr_row_data)

            if 'NextTokenVector' in wanted_features and 'PrevTokenVector' in wanted_features:
                    self.add_word_vectors(prev_row_data)
                    self.add_word_vectors(next_row_data)

            self.add_context_features(curr_row_data, next_next_row_data, next_row_data, prev_prev_row_data,
                                      prev_row_data, wanted_features)
            self.add_gazzet_features(curr_row_data)

            curr_row_data['is_stop_word'] = curr_row_data['Token'] in self.stop_words

            for feat in all_features.difference(wanted_features):
                del curr_row_data[feat]

            if 'TokenVector' in wanted_features:
                if 'NextTokenVector' in wanted_features and 'PrevTokenVector' in wanted_features:
                    self.convert_vectors_to_features(prev_row_data, curr_row_data, next_row_data, include_contex=True)
                else:
                    self.convert_vectors_to_features(prev_row_data, curr_row_data, next_row_data, include_contex=False)

            X.append(curr_row_data)

        print("wanted_features")
        print(wanted_features)

        print(f"number_of_found_word_vecs: {number_of_found_word_vecs}")
        print(f"number_of_not_found_word_vecs: {number_of_not_found_word_vecs}")
        print(f"percent of words without vects: "
              f"{number_of_not_found_word_vecs / (number_of_found_word_vecs + number_of_not_found_word_vecs)}")

        df = pd.DataFrame(X)
        return df

    def add_word_vectors(self, curr_row_data):
        global number_of_found_word_vecs, number_of_not_found_word_vecs
        if curr_row_data['Token'] in self.trained_model:
            curr_row_data['TokenVector'] = self.trained_model[curr_row_data['Token']]
            number_of_found_word_vecs += 1
        else:
            curr_row_data['TokenVector'] = [float(0)] * 300
            number_of_not_found_word_vecs += 1

    def convert_vectors_to_features(self, prev_row_data, curr_row_data, next_row_data, include_contex):
            for i in range(self.VECTOR_SIZE):  # vector size
                curr_row_data['wordvec_' + str(i)] = curr_row_data['TokenVector'][i]
                if include_contex:
                    curr_row_data['next_wordvec_' + str(i)] = next_row_data['TokenVector'][i]
                    curr_row_data['prev_wordvec_' + str(i)] = prev_row_data['TokenVector'][i]

            del curr_row_data['TokenVector']
            if include_contex:
                del prev_row_data['TokenVector']
                del next_row_data['TokenVector']

    def add_gazzet_features(self, curr_row_data):
        for gazzet_key, gazzet_set in self.gazzet_sets.items():
            if curr_row_data['Word'] in gazzet_set or curr_row_data['Token'] in gazzet_set:
                curr_row_data['In_' + gazzet_key + '_Gazzet'] = True
                # print(curr_row_data['Word'], curr_row_data['Token'], " in gazzet: ", gazzet_key)
            else:
                curr_row_data['In_' + gazzet_key + '_Gazzet'] = False

    def add_context_features(self, curr_row_data, next_next_row_data, next_row_data, prev_prev_row_data, prev_row_data, wanted_features):
        curr_row_data['Prev_Prev_Pos'] = prev_prev_row_data['Pos']
        curr_row_data['Prev_Pos'] = prev_row_data['Pos']
        curr_row_data['Next_Pos'] = next_row_data['Pos']
        curr_row_data['Next_Next_Pos'] = next_next_row_data['Pos']

        curr_row_data['Prev_Prev_Number'] = prev_prev_row_data['Number']
        curr_row_data['Prev_Number'] = prev_row_data['Number']
        curr_row_data['Next_Number'] = next_row_data['Number']
        curr_row_data['Next_Next_Number'] = next_next_row_data['Number']

        curr_row_data['Prev_Prev_Gender'] = prev_prev_row_data['Gender']
        curr_row_data['Prev_Gender'] = prev_row_data['Gender']
        curr_row_data['Next_Gender'] = next_row_data['Gender']
        curr_row_data['Next_Next_Gender'] = next_next_row_data['Gender']

        curr_row_data['Prev_Word'] = prev_row_data['Word']
        curr_row_data['Next_Word'] = next_row_data['Word']

        curr_row_data['Prev_Token'] = prev_row_data['Token']
        curr_row_data['Next_Token'] = next_row_data['Token']

    def get_prev_curr_next_row_data(self, data, i):
        if i % 1000 == 0:
            print(i)
        if i <= 1:
            prev_prev_row = data.iloc[i]
            prev_row = data.iloc[i]
        else:
            prev_prev_row = data.iloc[i - 2]
            prev_row = data.iloc[i - 1]
        curr_row = data.iloc[i]
        if i >= len(data) - 2:
            next_row = data.iloc[i]
            next_next_row = data.iloc[i]
        else:
            next_row = data.iloc[i + 1]
            next_next_row = data.iloc[i + 2]
        prev_prev_row_data = dict(prev_prev_row)
        prev_row_data = dict(prev_row)
        curr_row_data = dict(curr_row)
        next_row_data = dict(next_row)
        next_next_row_data = dict(next_next_row)
        return prev_prev_row_data, prev_row_data, curr_row_data, next_row_data, next_next_row_data

    def fit(self, X, y=None):
        return self


And this is the 'dummy maker' code:

In [4]:
def make_dataset_with_dummies(X_transformed, dummies_cols):
    print(f"shape before dummies: {X_transformed.shape}")
    X_dummies = pd.get_dummies(X_transformed[dummies_cols])
    X_transformed = X_transformed.drop(columns=dummies_cols)
    X_final = pd.concat([X_transformed, X_dummies], axis=1)
    print(f"X_dummies.shape: {X_dummies.shape}, X_transformed.shape: {X_transformed.shape}, X_final.shape: {X_final.shape}")
    return X_final

Let's activate the feature exctractor on our dataset

In [None]:
dummies_cols = ['Person', 'Pos', 'Prev_Pos', 'Next_Pos', 'Suffix', 'Prefix']
feature_extractor = FeatureExtractor()
X_transformed = feature_extractor.transform(X)
X_final = make_dataset_with_dummies(X_transformed, dummies_cols)

Done loading gazzets
Loading Word Embeddings... Please wait...


Now, let's split to train & test.  
The train part will be used also as development because we'll make grid search with cross validation.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.20, shuffle=True)

Now, we want that the frequencies of each tag will be quiet close in the general set, train, and test.  
This is a function which returns a 'compare_df',  
which can be used in order to check that the frequencies are similar.  
If the frequencies aren't good enough,  
we can execute again the last cell and we'll get another frequencies (shuffle=True 👌)

In [None]:
def get_freqs(y_data):
    y_without_o = y_data[y_data.values != ['O']]
    y_freqs = y_without_o.value_counts().apply(lambda x: x / y_without_o.value_counts().sum())
    return y_freqs

def check_frequencies_of_labels_in_data(y, y_train, y_test):
    y_freqs = get_freqs(y)
    y_train_freqs = get_freqs(y_train)
    y_test_freqs = get_freqs(y_test)
    y_train_freqs = add_missing_columns(y_freqs, y_train_freqs)
    y_test_freqs = add_missing_columns(y_freqs, y_test_freqs)
    print("We got frequencies of labels in y, y_train, y_test :-) ")
    y_freqs.sum(), y_train_freqs.sum(), y_test_freqs.sum()
    compare_df = pd.DataFrame(columns=y_freqs.keys())
    compare_df.keys = ['y', 'y_train', 'y_test']
    compare_df.loc['y'] = y_freqs
    compare_df.loc['y_train'] = y_train_freqs
    compare_df.loc['y_test'] = y_test_freqs
    return compare_df

def add_missing_columns(all_y_cols, y_data):
    diff_train = set(all_y_cols.keys()).difference(set(y_data.keys()))
    if len(diff_train) > 0:
        for col in diff_train:
            y_data[col] = 0
    return y_data

compare_df = check_frequencies_of_labels_in_data(y, y_train, y_test)
compare_df

Let's now observe our new columns:

In [None]:
X_train.columns

We can see, as promised, only negligible amount of words don't have vector representation 👍:

number_of_found_word_vecs: 182467  
number_of_not_found_word_vecs: 6266  
percent of words without vects: 0.03320034122278563  

And let's see the Shapes of our train & test matrixes: 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

Now we need to choose a model.  
We tried a lot of sklearn options:
1. LogisticRegression
2. RandomForest
3. DecisionTree
4. SVM
5. NaiveBayes - both MultinomialNB and GaussianNB
6. PassiveAggressive
7. CRF
8. Multi-Layer perceptron

Also, because our data is imbalance, we tried to handle the imbalacing problem in serveral ways, including using      
**SMOTE: Synthetic Minority Over-sampling**  
https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

But finally, our best results were received with SVM model, which class_weight='balanced' parameter 👏

Now we have the code of our ClfModel, with few basic methods:  
1. init
2. train 
3. predict 
4. evaluate  

** The full ClfModel code with all of the other models is in NER.py. 

** Here we present the code of the chosen model

In [None]:
class ClfModel:
    def __init__(self, model_type):
        self.model_type = model_type
        self.clf = self.init_normal_model(model_type)
        self.classes_without_O = ['U-PERCENT', 'L-PERS', 'U-PERS', 'L-ORG', 'L-LOC', 'I-ORG', 'I-LOC', 'B-ORG', 'L-DATE', 'I-MONEY', 'B-MISC', 'L-MISC', 'L-MONEY', 'B-LOC', 'B-PERS', 'I-PERS', 'U-DATE', 'B-DATE', 'U-LOC', 'B-MONEY', 'U-MISC', 'I-MISC', 'I-DATE', 'L-PERCENT', 'I-TIME', 'U-ORG', 'L-TIME', 'B-PERCENT', 'B-TIME', 'U-TIME', 'I-PERCENT', 'U-MONEY' ]

    def init_normal_model(self, model_type):
        classifier = SVC(kernel='linear', C=1, class_weight='balanced')
        pipe = sklearn_pipeline([('classifier', classifier)])
        return pipe

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)
        
    def train_by_grid_search(self, X_train, y_train):
        parameters = self.prepare_svm_grid_params()
        grid_search = GridSearchCV(self.clf, parameters, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        print(best_parameters)
        self.clf = grid_search.best_estimator_

    def prepare_svm_grid_params(self):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1, 2]
        parameters = {
            'classifier__C': Cs,
            'classifier__gamma': gammas,
            'classifier__class_weight': ['balanced', None]
        }
        return parameters

    def predict(self, X_test):
        y_pred = self.clf.predict(X_test)
        return y_pred

    def evaluate(self, y_true, y_pred):
        y_true = pd.Series(y_true)
        y_pred = pd.Series(y_pred)
        cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
        report = classification_report(y_true, y_pred, labels=self.classes_without_O, target_names=self.classes_without_O)
        report_with_O = classification_report(y_true, y_pred)
        return cross_tab, report, report_with_O

In [None]:
model_type = SVM_MODEL

The grid search takes a lot of time, we've initialized the SVM model with the best parameters found in the grid search performed outside of the notebook.

Let's create our model and train! 

In [None]:
clf_model = ClfModel(model_type=model_type)
clf_model.train(X_train, y_train)
# clf_model.train_by_grid_search(X_train, y_train)

Predict & Evaluate. Notice that we want to see a report with & without 'O',  
because we want to see the results of the other tags in a clear way.  

In [None]:
y_pred = clf_model.predict(X_test=X_test)
cross_tab, report, report_with_O = clf_model.evaluate(y_true=y_test, y_pred=y_pred)

In [None]:
print(report)

In [None]:
print(report_with_O)

In [None]:
cross_tab

Now, we understand that the predicted tag might be helpfull in order to predict the current tag.

For example, if the previous tag is B-X, it's likely that the current one will be I-X. 

So here we'll try again with exploiting previous tags.

In [None]:
def retrain_with_exploit_previous_tags(X_test, X_train, y_test, y_train, clf_type):
    X_train_with_tag, y_train = make_train_data_with_tags(X_train, clf_type, y_train)
    clf_model_with_tags = ClfModel(model_type=clf_type)
    clf_model_with_tags.train(X_train_with_tag, y_train)
    X_test_with_tag = init_prev_tag_dummy_variables_for_test_data_like_the_train(X_test, X_train_with_tag)
    new_y_pred = loop_of_predict_with_previous_tag(X_test_with_tag, clf_model_with_tags)
    cross_tab, report, report_with_O = clf_model_with_tags.evaluate(y_true=y_test, y_pred=new_y_pred)
    return report, report_with_O, cross_tab

def loop_of_predict_with_previous_tag(X_test_with_tag, clf_model_with_tags):
    X_test_with_tag.loc[X_test_with_tag.index[0], 'prev_tag_O'] = 1
    new_y_pred = []
    for i in range(0, len(X_test_with_tag)):
        curr_df_to_predict = pd.DataFrame(X_test_with_tag.iloc[i]).T
        pred = clf_model_with_tags.predict(X_test=curr_df_to_predict)[0]
        if i + 1 < len(X_test_with_tag):
            X_test_with_tag.loc[X_test_with_tag.index[i + 1], 'prev_tag_' + pred] = 1
        new_y_pred.append(pred)
    return new_y_pred

def init_prev_tag_dummy_variables_for_test_data_like_the_train(X_test, X_train_with_tag):
    X_test_with_tag = deepcopy(X_test)
    all_prev_tag_train_dummies_cols = [col for col in X_train_with_tag.columns if col.startswith("prev_tag")]
    for col in all_prev_tag_train_dummies_cols:
        X_test_with_tag[col] = 0
    return X_test_with_tag

def make_train_data_with_tags(X_train, clf_type, y_train):
    X_train_with_tag = deepcopy(X_train)
    X_train_with_tag['prev_tag'] = ['O'] + list(y_train)[:-1]
    prev_tag_dummies = pd.get_dummies(X_train_with_tag[['prev_tag']])
    X_train_with_tag = pd.concat([X_train, prev_tag_dummies], axis=1, sort=False)
    return X_train_with_tag, y_train

In [None]:
report, report_with_O, cross_tab = retrain_with_exploit_previous_tags(X_test, X_train, y_test, y_train, model_type)

In [None]:
print(report)

In [None]:
print(report_with_O)

In [None]:
cross_tab

As we can see the results are sort of the same.  
We produced a lot of features, so probarely this one feature isn't strong enough to change.

### Conclusion

We presented you machine learning classifier with performance of 95% on hebrew corpus of 66K train instances.  
This was done with a lot of experimenting which let to the use of:

1. Good feature extraction of POS and Morphological attributes to begin with
2. BILUO taggging instead of BIO 
3. Using context features 
4. Using word embeddings 
5. Using Gazzet features
6. Relating to stop words
7. Choosing best ML model for our experiment and handeling the imbalance problem

Hope you had fun ✋ 