## Neural nets on labeled 10K dataset

We tried a few different approaches to training a neural net to predict labels on the 10K dataset:
1. First, a plain vanilla neural net to classify excerpts as "Relevant" or "No Disclosure"
2. An LSTM model to do the same
3. A CNN, to eventually be used in transfer learning if successful enough

Before this, we train word embeddings on the labeled dataset and vectorize the text using these embeddings.


In [None]:
# Imports

# Keras and Tensorflow
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Conv1D, GlobalMaxPooling1D
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import tensorflow as tf

# Use scikit-learn for grid search, other basics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt


# Progress tracker
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# Import XGboost in case needed
import xgboost as xgb

# Recall metric object for Keras `compile(metrics=[...])`.
# Every constructor argument is left at its default (None).
recall = tf.keras.metrics.Recall()

In [None]:
# More packages 

from sklearn.model_selection import train_test_split, GridSearchCV, \
StratifiedKFold, cross_val_predict, \
StratifiedShuffleSplit
from sklearn.feature_selection import chi2
from sklearn.metrics import roc_curve, \
precision_recall_curve, auc, make_scorer, \
recall_score, accuracy_score, precision_score, \
confusion_matrix, classification_report

#import gensim 
from gensim.models import Word2Vec, KeyedVectors

### Train and apply word embeddings
 

In [None]:
# Import json file
# The data directory can be overridden with the DFG_DATA_DIR environment
# variable; when it is unset the original local path is used unchanged.
path = os.environ.get(
    "DFG_DATA_DIR",
    "/Users/ishashah/Documents/DFG/dfg-humanrights0/from-sasb",
)
# Later cells read and write files relative to this directory.
os.chdir(path)
# NOTE(review): the name `json` shadows the standard-library module of the
# same name; it is kept because the rest of the notebook refers to it.
json = pd.read_json("di_hc_rel_train.json")
json.head()


In [None]:
# Load the disclosure-topic lookup table and lowercase its column names
toplabel = pd.read_csv("disclosure_topic.csv").rename(columns=str.lower)


In [None]:
# Flag lookup rows whose topic name mentions "labor" (case-insensitive),
# then attach the lookup columns to every excerpt via a left join.
labor_mask = toplabel["disclosure_topic_name"].str.contains("labor", case=False)
toplabel["disclosure_islabor"] = labor_mask
json = json.merge(toplabel, on="disclosure_topic_id", how="left")


In [None]:
# Widen the column display so long excerpts can be read in full
pd.set_option("display.max_colwidth", 500)
json["excerpt"].head()

# Flag excerpts that are both labor-related and assessed as "Relevant"
is_relevant = json["relevance_assessment"] == "Relevant"
json["relevant_islabor"] = json["disclosure_islabor"] & is_relevant
json["relevant_islabor"].value_counts()

In [None]:
# Cleaning function
# The NLTK names are imported in this cell because it runs before the
# notebook's later NLTK import cell; without them a fresh kernel fails here.
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stopset = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(in_text):
    """Normalize one excerpt into a space-separated string of stemmed tokens.

    Steps: collapse line breaks and other whitespace to single spaces,
    lowercase, delete every character that is not a-z or a space, tokenize,
    drop English stopwords and non-alphabetic tokens, then Porter-stem.

    Parameters
    ----------
    in_text : str
        Raw excerpt text. Non-string values (e.g. NaN) yield "".

    Returns
    -------
    str
        Cleaned tokens joined by single spaces.
    """
    if not isinstance(in_text, str):
        return ""

    # Turn real whitespace (newlines, tabs) and literal "\n" escape sequences
    # into a space *before* stripping characters, so the words on either side
    # of a line break are not glued together.
    text = re.sub(r'(?:\\n|\s)+', ' ', in_text)

    # Lowercase, then keep only a-z and spaces. The previous class `A-z` also
    # let through the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
    tokens = word_tokenize(re.sub('[^a-z ]+', '', text.lower()))

    # Remove stopwords and any non-alphabetic leftovers, then stem
    tokens = [stemmer.stem(w) for w in tokens if w.isalpha() and w not in stopset]

    return " ".join(tokens)

# Apply cleaning function to json file text.
# `progress_apply` is the progress-bar variant of `apply` registered by
# `tqdm_notebook.pandas()` in the imports cell.
json["clean_text"] = json["excerpt"].progress_apply(clean_text)
json.head()

# Export csv of cleaned dataset (path is relative to the current working directory)
json.to_csv("json_clean.csv")


In [None]:
# Tokenize every cleaned excerpt and keep the tokens as a comma-joined string
json["cleantext_list"] = [
    ','.join(word_tokenize(text)) for text in json["clean_text"]
]
# Split the strings back into one token list per excerpt
sent = json["cleantext_list"].str.split(',').tolist()

# Train skip-gram (sg=1) embeddings on the excerpt corpus
model = Word2Vec(
    sent,
    size=300,
    window=3,
    min_count=5,
    sg=1,
    workers=3,
)

# Check vector size
model.vector_size

300

In [None]:
# Save the trained word vectors in word2vec text format (binary=False)
# to 'model.txt' in the current working directory.
model.wv.save_word2vec_format('model.txt', binary=False)

In [None]:
# Load trained word embeddings from the word2vec text file.
# Note: this rebinds `model` from the Word2Vec trainer to a KeyedVectors object.
model = KeyedVectors.load_word2vec_format('model.txt', binary=False)

In [None]:
# Vectorize using embeddings
import numpy as np

def sent_vectorizer(sent, model):
    """Represent a tokenized sentence as the mean of its word vectors.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence/excerpt.
    model : mapping-like
        Object supporting ``model[word]`` (raising ``KeyError`` for unknown
        words) and exposing ``vector_size``, e.g. gensim ``KeyedVectors``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. If no token is in the
        vocabulary, a zero vector is returned instead of an empty array, so
        that downstream DataFrames do not end up with all-NaN rows.
    """
    vectors = []
    for w in sent:
        try:
            vectors.append(model[w])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error is a real
            # problem and is allowed to propagate.
            continue

    if not vectors:
        # float32 assumed to match the embedding dtype — TODO confirm
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(vectors, axis=0)


# One averaged embedding vector per tokenized excerpt, in corpus order
V = [sent_vectorizer(sentence, model) for sentence in sent]
    
    

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


In [None]:
# Get into dataframe: one row per vector in V, indexed by excerpt_id
json2 = pd.DataFrame(V, index = json['excerpt_id'])
# Attach the label by matching json2's index (excerpt_id) against the
# excerpt_id column of json.
# NOTE(review): if excerpt_id is not unique in json this merge will duplicate
# rows — compare the two shapes displayed at the end of this cell.
json2 = json2.merge(right=json[["excerpt_id", "relevance_assessment"]], 
         left_index=True, right_on="excerpt_id")

# Sanity checks: ids/labels present and class balance
json2[["excerpt_id", "relevance_assessment"]].head()
json2["relevance_assessment"].value_counts()

# Row counts of the source and vectorized frames
json.shape
json2.shape

In [None]:
# Save vectorized version.
# NOTE(review): `to_csv` writes the index by default, so reading this file
# back without `index_col` yields an extra leading column ahead of the
# embedding columns — verify any positional column selection downstream.
json2.to_csv("json2_clean.csv")

### 1. Basic neural net with some gridsearch-based tuning
Summary: not too successful; cross-validated accuracy peaks at only about 0.67–0.68


In [None]:
# Colab only: mount Google Drive at ./drive so the dataset CSVs can be read
# from "drive/My Drive/...".
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive


In [None]:
# Keep all output: display every expression statement in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# # Set paths
# proxpath = "/Users/ishashah/Documents/DFG/dfg-humanrights0/trim_corp"
# os.chdir(proxpath)

# Import csv of cleaned dataset
json = pd.read_csv("drive/My Drive/DFG Cost of Human Rights Violations/Datasets/smallcorp_1/json_clean.csv")

# Import csv of cleaned dataset with vectorization using word embeddings trained on 10Ks
# NOTE(review): no `index_col` is given; if the file was written with its
# index, that index appears here as an extra first column — confirm before
# selecting feature columns by position.
json2 = pd.read_csv("drive/My Drive/DFG Cost of Human Rights Violations/Datasets/smallcorp_1/json2_clean.csv")


In [None]:
# Fix random seed
seed = 7
np.random.seed(seed)

# Split X and y (using dataset vectorized w/ embeddings).
# Select the embedding columns by NAME rather than by position. json2 was
# saved with `to_csv` (which writes the index) and read back without
# `index_col`, so `iloc[:, 0:300]` would pick up the saved index column as a
# feature and silently drop the last embedding dimension. The embedding
# columns are the ones whose labels are plain integers ("0" .. "299").
embedding_cols = [col for col in json2.columns if str(col).isdigit()]
assert len(embedding_cols) == 300, (
    "expected 300 embedding columns, found %d" % len(embedding_cols)
)
X = json2[embedding_cols]
# Binary target: 1 where the excerpt was assessed "Relevant"
y = pd.DataFrame(pd.get_dummies(json2['relevance_assessment']))["Relevant"]
print(X.shape)

(20626, 300)


In [None]:
# Simple trial run for using gridsearch cv

def create_model():
    """Build and compile the baseline dense binary classifier.

    Architecture: 300 inputs -> Dense(128, relu) -> Dense(64, relu)
    -> Dense(1, sigmoid), compiled with binary cross-entropy, the 'adam'
    optimizer and accuracy as the reported metric.
    """
    network = Sequential([
        Dense(128, input_dim=300, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network


# Wrap the builder so scikit-learn can treat it as an estimator
model = KerasClassifier(build_fn=create_model, epochs=100, verbose=0)

# Simple search grid that only varies the number of epochs
param_grid = {"epochs": [10, 20, 30]}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X, y)


In [None]:
# Print the best mean cross-validated score and the parameters that produced it
# (grid_result.cv_results_ holds the full per-candidate results)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best: 0.682970 using {'epochs': 30}


In [None]:
# Tuning different parameters

def create_model(learn_rate=0.001):
    """Build and compile the dense binary classifier used for tuning.

    Parameters
    ----------
    learn_rate : float, default 0.001
        Learning rate for the Adam optimizer. The parameter must exist on
        this builder because the grid search forwards grid keys to it by
        name; without it a grid containing ``learn_rate`` is rejected.
        The default equals Adam's own default, so calling ``create_model()``
        behaves as before.

    Returns
    -------
    A compiled Keras ``Sequential`` model:
    300 inputs -> Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid).
    """
    # Local import keeps this cell runnable on its own; the file's import
    # cell only brings in SGD from keras.optimizers.
    from keras.optimizers import Adam

    # Create model
    model = Sequential()
    model.add(Dense(128, input_dim=300, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    # Also tried SGD as optimizer but did not work as well
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=learn_rate),
                  metrics=['accuracy'])
    return model

# Wrap the model builder for use with scikit-learn
model = KerasClassifier(build_fn=create_model, epochs=20, verbose=0)

# Search space: number of epochs and learning rate.
# NOTE(review): grid keys are forwarded to create_model by name, so
# create_model must accept a `learn_rate` argument for this grid to run.
param_grid = {
    "epochs": [10, 20, 30, 50],
    "learn_rate": [0.001, 0.01],
}

grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X, y)

# Summarize results: best candidate first, then mean (std) for each candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.675503 using {'epochs': 30}
0.639529 (0.122126) with: {'epochs': 10}
0.650824 (0.101179) with: {'epochs': 20}
0.675503 (0.072801) with: {'epochs': 30}
0.664255 (0.092698) with: {'epochs': 50}


### 2. LSTM approach out of the box
Summary: Also not quite successful, max accuracy with 10 epochs only reaches about the same as basic net, 0.67
Note: did not use previous embeddings

In [None]:
# Import packages for LSTM, in case

import seaborn as sns
import re
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))


In [None]:

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each excerpt.
MAX_SEQUENCE_LENGTH = 300
EMBEDDING_DIM = 300
# The backslash in the filter set is written as `\\` so the literal contains
# the same characters as before without relying on the invalid escape `\]`
# (which triggers a Deprecation/SyntaxWarning on current Python versions).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(json['clean_text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))




Found 24911 unique tokens.


In [None]:
# Original code for LSTM model, does not use pretrained embeddings

# Integer-encode each cleaned excerpt and pad/truncate to a fixed length
X = tokenizer.texts_to_sequences(json['clean_text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# Single binary target column (1 = "Relevant"), shaped (n_samples, 1) to match
# the model's Dense(1, sigmoid) output. The previous code one-hot encoded the
# labels into two columns and then reshaped the split arrays to (-1, 1), which
# interleaved both columns and produced twice as many labels as samples.
Y = (json['relevance_assessment'] == 'Relevant').astype(int).values.reshape(-1, 1)
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)



Shape of data tensor: (20626, 300)
Shape of label tensor: (20626, 2)
(16500, 300) (16500, 2)
(4126, 300) (4126, 2)


In [None]:
import tensorflow as tf
# Recall metric for the LSTM's compile step; all arguments left at their defaults
recall = tf.keras.metrics.Recall()
from keras.layers import Flatten

In [None]:
# LSTM classifier: learned embedding -> spatial dropout -> LSTM -> sigmoid output
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', recall])

epochs = 10
batch_size = 64

# Early stopping watches the held-out validation loss produced by
# `validation_split`. Monitoring the training accuracy (as before) ignores the
# validation data and cannot detect overfitting.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3,
                               min_delta=0.0001)

history = model.fit(X_train, Y_train, epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.2,
                    callbacks=[early_stopping])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [None]:
# Persist the trained LSTM model to the mounted Google Drive folder
model.save("/content/drive/My Drive/DFG Cost of Human Rights Violations/Datasets/smallcorp_1/lstm1.model")


### 3. Convolutional neural net
Summary: Ran into RAM issues on local machine, timeout on Colab


In [None]:
# CNN settings used later in this cell
epochs = 20  # training epochs per fit (passed to KerasClassifier)
embedding_dim = 300  # output width of the Embedding layer
maxlen = 300  # sequences are padded/truncated to this many tokens

def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile a 1-D convolutional binary text classifier.

    Layers: Embedding(vocab_size, embedding_dim) over sequences of length
    `maxlen` -> Conv1D(num_filters, kernel_size, relu) -> global max pooling
    -> Dense(12, relu) -> Dense(1, sigmoid). Compiled with the 'adam'
    optimizer, binary cross-entropy loss and accuracy as the metric.
    """
    cnn = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=maxlen),
        Conv1D(num_filters, kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(12, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    cnn.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return cnn

# param_grid = dict(num_filters=[32, 64, 128],
#                   kernel_size=[3, 5, 7],
#                   vocab_size=[5000], 
#                   embedding_dim=[300],
#                   maxlen=[300])


sentences = json['clean_text'].values
y = json['relevance_assessment'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.20, random_state=seed)

# Tokenize words, keeping only the most frequent ones
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Size the embedding to the indices the tokenizer can actually emit.
# With `num_words` set, only indices below `max_words` appear in the
# sequences (0 is reserved for padding), so an embedding sized to the full
# vocabulary (len(word_index) + 1) wastes memory on rows that are never used.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=10,
                        verbose=False)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    cv=5, verbose=1)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# `model` here is the unfitted KerasClassifier wrapper, which has no `.save()`.
# Save the underlying Keras model of the best estimator that the grid search
# refit on the full training data instead.
grid_result.best_estimator_.model.save("/content/drive/My Drive/DFG Cost of Human Rights Violations/Datasets/smallcorp_1/cnn1.model")
