In [1]:
## Copyright 2020 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Set up the environment

In [2]:
import logging
import os
import pandas as pd
import numpy as np
from numpy import random
import nltk
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from nltk.corpus import stopwords
import re
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

Point to the training file and model output directory

In [3]:
# CSV file holding the labelled sentences (ground truth).
training_data = "data/ground_truth.csv"
# Base path from which the output paths of the trained models are built.
base_out_model_path='models'

The ground truth should contain two columns, sentenceText (containing the sentence) and Label (the ground-truth class), for this sentence classification task.

In [4]:
# Load only the label and sentence columns of the ground-truth file.
df = pd.read_csv(training_data, encoding = "utf-8", usecols=['Label', 'sentenceText'])
print ("len = ", len(df))
# Preview the first rows as the cell output.
df.head()

len =  1415


Unnamed: 0,Label,sentenceText
0,travelrestrictions,Russia halts all international air traffic wit...
1,serviceorplaceclosed,"Moscow closes all restaurants, bars, parks, an..."
2,economy,"Sweden's central bank, Sveriges Riksbank, exte..."
3,unk,UK Prime Minister Boris Johnson announces that...
4,serviceorplaceclosed,"The 2020 Open Championship, was originally sch..."


Label distribution is shown below

In [5]:
# Number of sentences per label, most frequent first.
df['Label'].value_counts()

unk                     516
serviceorplaceclosed    270
travelrestrictions      215
misc                    162
confinement             132
economy                  53
stateofemergency         45
gatheringrestriction     13
tracing                   9
Name: Label, dtype: int64

In [6]:
# Raw string: the regex escapes (\[, \], \|) must reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a sentence before feature extraction.

    The text is lowercased, every character matched by ``REPLACE_BY_SPACE_RE``
    is replaced by a space, every character matched by ``BAD_SYMBOLS_RE`` is
    deleted, and tokens contained in ``STOPWORDS`` are dropped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, with the remaining tokens joined by single spaces.
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # punctuation-like separators become spaces
    text = BAD_SYMBOLS_RE.sub('', text)  # delete every symbol outside the allowed set
    # Whitespace-tokenize and drop the stopwords.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

### Cross Validation

Perform 10-fold cross-validation.

In [7]:
k_fold = KFold(n_splits=10)

def cross_validate(model):
    """Cross-validate ``model`` over the global ``df`` using the ``k_fold`` splitter.

    For each split the model is fitted on the training rows
    (``sentenceText`` as input, ``Label`` as target) and then asked to
    predict the held-out rows.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A pair ``(gold, predicted)`` of lists with the true labels and the
        predictions of all held-out rows, concatenated in fold order.
    """
    sentences = df['sentenceText'].values
    labels = df['Label'].values
    gold, predicted = [], []
    for train_idx, test_idx in k_fold.split(df):
        model.fit(sentences[train_idx], labels[train_idx])
        gold.extend(labels[test_idx])
        predicted.extend(model.predict(sentences[test_idx]))
    return gold, predicted

Fit + Save model on 100% of train data

In [8]:
def train_save_model(model, out_path):
    """Fit ``model`` on the whole ground truth and pickle it to ``out_path``.

    The training data is read from the global ``df`` (``sentenceText`` as
    input, ``Label`` as target).

    Args:
        model: an estimator exposing ``fit(X, y)``; it is fitted in place.
        out_path: file path the pickled model is written to. Missing parent
            directories are created.

    Returns:
        The fitted ``model`` (the same object that was passed in).
    """
    print (model)
    X_train, y_train = df.sentenceText, df.Label
    model.fit(X_train, y_train)
    # Create the target directory up front so a fresh checkout without it
    # does not fail when the file is opened.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # The context manager flushes and closes the file even if pickling raises.
    with open(out_path, 'wb') as out_file:
        pickle.dump(model, out_file)
    print ('Saved model in ', out_path)
    return model

### Linear SVM

In [9]:
# Token counts -> TF-IDF weighting -> linear classifier trained with SGD on the hinge loss.
# random_state is fixed so repeated runs give the same model; max_iter=5 with tol=None
# presumably means exactly five passes with no early stop - confirm against the sklearn docs.
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

In [10]:
# Gold labels and predictions pooled over all cross-validation folds.
y, yhat = cross_validate(sgd)
# Sorted label names, used as the row names of the report; sorted() already returns a list.
tags = sorted(df['Label'].unique())
# Metrics take (y_true, y_pred) in that order, as in the logistic-regression evaluation.
print('accuracy %s' % accuracy_score(y, yhat))
print(classification_report(y, yhat,target_names=tags))

accuracy 0.665017667844523
                      precision    recall  f1-score   support

         confinement       0.73      0.64      0.68       132
             economy       0.80      0.68      0.73        53
gatheringrestriction       1.00      0.08      0.14        13
                misc       0.33      0.08      0.13       162
serviceorplaceclosed       0.77      0.76      0.76       270
    stateofemergency       0.59      0.44      0.51        45
             tracing       1.00      0.33      0.50         9
  travelrestrictions       0.72      0.80      0.76       215
                 unk       0.61      0.79      0.68       516

           micro avg       0.67      0.67      0.67      1415
           macro avg       0.73      0.51      0.54      1415
        weighted avg       0.65      0.67      0.64      1415



In [11]:
# Refit the SVM pipeline on the full data set and pickle it as svm.sav.
saved_model = train_save_model(sgd, os.path.join(base_out_model_path, 'svm.sav'))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])
Saved model in  models/svm.sav


Saving SGD Estimator for confidence estimation purposes (to be used during apply)

In [12]:
# Same pipeline as `sgd`, but trained with the modified_huber loss.
# NOTE(review): presumably chosen because this loss lets the classifier expose
# probability estimates for the confidence scores - confirm against the sklearn docs.
sgd_mod = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='modified_huber', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
# Fit on the full data set and pickle it as svm_estimator.sav.
saved_model = train_save_model(sgd_mod, os.path.join(base_out_model_path, 'svm_estimator.sav'))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])
Saved model in  models/svm_estimator.sav


### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Token counts -> TF-IDF weighting -> logistic regression.
# NOTE(review): C=1e5 presumably means very weak regularization - confirm against the sklearn docs.
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])

In [14]:
# Gold labels and predictions pooled over all cross-validation folds.
y, yhat = cross_validate(logreg)
print('accuracy %s' % accuracy_score(y, yhat))
# `tags` is the sorted label list computed in the Linear SVM evaluation cell.
print(classification_report(y, yhat,target_names=tags))

accuracy 0.6530035335689046
                      precision    recall  f1-score   support

         confinement       0.73      0.68      0.70       132
             economy       0.84      0.68      0.75        53
gatheringrestriction       1.00      0.08      0.14        13
                misc       0.29      0.25      0.27       162
serviceorplaceclosed       0.78      0.76      0.77       270
    stateofemergency       0.68      0.42      0.52        45
             tracing       1.00      0.33      0.50         9
  travelrestrictions       0.71      0.76      0.74       215
                 unk       0.63      0.71      0.66       516

           micro avg       0.65      0.65      0.65      1415
           macro avg       0.74      0.52      0.56      1415
        weighted avg       0.65      0.65      0.65      1415



In [15]:
# Refit the logistic-regression pipeline on the full data set and pickle it as lr.sav.
saved_model = train_save_model(logreg, os.path.join(base_out_model_path, 'lr.sav'))

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])
Saved model in  models/lr.sav


### BERT

In [16]:
from simpletransformers.classification import ClassificationModel

Various model hyperparameters. Set use_cuda=True only when CUDA is available; keep it False to run on the CPU.

In [17]:
# Hyperparameters and runtime switches for the BERT classifier.
use_cuda = False  # run on CPU; switch to True only when a CUDA device is available
model_arch= 'bert'
model_type = 'bert-base-uncased'
# Pass the two components separately so os.path.join inserts the separator;
# concatenating them with '+' produced 'modelsbert_model' next to the models directory.
output_dir = os.path.join(base_out_model_path, 'bert_model')
overwrite_output_dir = True
num_train_epochs = 3
train_size_ratio=0.8  # fraction of the rows used for training; the rest is the dev set
early_stopping_patience=3
reprocess_input_data=True

Preparing the data

In [18]:
# Encode every label as its position in the sorted `tags` list. Iterating the column
# directly avoids the per-row Series that the row-wise DataFrame.apply had to build.
df['labels'] = [tags.index(str(label)) for label in df['Label']]
# Expose the sentences under a 'text' column (presumably the name the simpletransformers
# model reads - confirm) while keeping 'sentenceText', so cross_validate and
# train_save_model still work when their cells are re-run after this one.
df['text'] = df['sentenceText']

Train Dev Splitting

In [19]:
# Number of leading rows that go into the training set.
train_size = int(train_size_ratio * len(df))
# Sequential split: the first `train_size` rows train the model, the remainder evaluates it.
# NOTE(review): the rows are neither shuffled nor stratified, so a rare label can be
# missing from eval_df (the evaluation report below has no row for class 6).
train_df = df[:train_size]
eval_df = df[train_size:]

In [20]:
# Build the classifier from the hyperparameters defined in the configuration cell;
# one output class per entry of `tags`.
model = ClassificationModel(
    model_arch,
    model_type,
    num_labels=len(tags),
    use_cuda=use_cuda,
    args={
        'output_dir': output_dir,
        'reprocess_input_data': reprocess_input_data,
        'overwrite_output_dir': overwrite_output_dir,
        'early_stopping_patience': early_stopping_patience,
        'num_train_epochs': num_train_epochs,
        'learning_rate': 5e-5,
        'evaluate_during_training': True,
    },
)

In [21]:
# Fine-tune on train_df; eval_df is supplied because the model args enable evaluate_during_training.
model.train_model(train_df, eval_df=eval_df)

HBox(children=(FloatProgress(value=0.0, max=1132.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=142.0, style=ProgressStyle(descri…

Running loss: 2.006536


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=142.0, style=ProgressStyle(descri…

Running loss: 0.234115


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=142.0, style=ProgressStyle(descri…

Running loss: 0.032582



In [22]:
# Evaluate on the held-out rows. The extra `acc` keyword hands classification_report to
# eval_model as an additional metric; its output is read back from result['acc'] below.
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=classification_report)

HBox(children=(FloatProgress(value=0.0, max=283.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




In [23]:
# Per-class report; classes are listed by the integer ids stored in the `labels` column
# (positions in the sorted `tags` list).
print(result['acc'])

              precision    recall  f1-score   support

           0       0.71      0.57      0.63        21
           1       1.00      0.67      0.80         6
           2       1.00      1.00      1.00         2
           3       0.80      0.26      0.39        46
           4       0.91      0.93      0.92        69
           5       1.00      1.00      1.00         1
           7       0.89      0.87      0.88        54
           8       0.63      0.90      0.74        84

   micro avg       0.77      0.77      0.77       283
   macro avg       0.87      0.78      0.80       283
weighted avg       0.79      0.77      0.75       283

