In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import time
from keras.wrappers.scikit_learn import KerasClassifier
import seaborn as sns
import matplotlib.pyplot as plt
# Ask TensorFlow to grow GPU memory on demand rather than reserving it all at
# start-up. Every visible GPU is configured, and a machine with no GPU simply
# skips the loop instead of failing on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [2]:
import os            ##  This module is for "operating system" interfaces
import sys           ##  This module is for functionality relevant to the python run time
# Folder that holds train.csv / test.csv. The default is the original author's
# local checkout; set the MDST_NLP_DATA environment variable to point the
# notebook at a different copy without editing this cell.
path_to_datafolder = os.environ.get(
    'MDST_NLP_DATA', 'C:/Users/mjdom/source/repos/mdst_nlp_2021/data')
print(os.listdir(path_to_datafolder))


['test.csv', 'train.csv']


In [3]:
# Load the labelled training sentences and preview the first rows.
train_csv_path = path_to_datafolder + '/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Model input: the raw sentence text (copied so later edits cannot touch df).
X = df["text"].copy()

authors = df["author"].copy()

# Single source of truth for the label encoding. The integer is both the
# class id in y_one_vector and the position of the 1 in the one-hot row of y.
AUTHOR_TO_INDEX = {"EAP": 0, "HPL": 1, "MWS": 2}

# Fail loudly on unknown or missing authors. Silently skipping such rows would
# leave the label arrays shorter than X and misaligned with it.
unknown_authors = set(authors.unique()) - set(AUTHOR_TO_INDEX)
if unknown_authors:
    raise ValueError(
        "Unexpected author label(s): %s" % sorted(map(str, unknown_authors)))

# Integer class ids, shape (n_samples,).
y_one_vector = authors.map(AUTHOR_TO_INDEX).to_numpy(dtype=int)

# One-hot labels, shape (n_samples, 3): row i of the identity matrix is the
# one-hot vector for class i.
y = np.eye(len(AUTHOR_TO_INDEX), dtype=int)[y_one_vector]

In [5]:
# Text-to-token-id layer shared by every model built in create_model.
# adapt() must run before get_vocabulary(): it builds the vocabulary from the
# sentences in X, and len(vocab) later sizes the Embedding layer.
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(X)
vocab = encoder.get_vocabulary()


In [6]:
def create_model(embed_dim=64,num_layer = 1,lstm_units=64,dropout_rate = 0.2, lstm_dr = 0.2):
    """Build and compile the bidirectional-LSTM author classifier.

    The network is: shared ``encoder`` (text vectorization) -> masked
    embedding -> ``num_layer`` bidirectional LSTM layers -> global max pooling
    -> dropout -> 3-way softmax.

    Parameters
    ----------
    embed_dim : int
        Output dimension of the embedding layer.
    num_layer : int
        Number of stacked bidirectional LSTM layers.
    lstm_units : int
        Units per LSTM direction.
    dropout_rate : float
        Dropout rate applied after pooling, before the output layer.
    lstm_dr : float
        ``dropout`` argument passed to each LSTM layer.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with categorical cross-entropy, Adam (learning rate
        1e-3) and an accuracy metric.
    """
    layers = tf.keras.layers

    with tf.device('/GPU:0'):
        # Assemble the layers in the same order they are stacked.
        stack = [
            encoder,
            layers.Embedding(input_dim=len(vocab), output_dim=embed_dim, mask_zero=True),
        ]
        # return_sequences=True keeps the full sequence so that layers can be
        # stacked and the pooling layer sees every time step.
        stack.extend(
            layers.Bidirectional(
                layers.LSTM(lstm_units, dropout=lstm_dr, return_sequences=True))
            for _ in range(num_layer)
        )
        stack += [
            layers.GlobalMaxPool1D(),
            layers.Dropout(dropout_rate),
            layers.Dense(3, activation='softmax'),
        ]

        network = tf.keras.Sequential(stack)
        network.compile(
            loss=tf.keras.losses.CategoricalCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(1e-3),
            metrics=['accuracy'],
        )
    return network

In [7]:
with tf.device('/GPU:0'):

    # Fix the random seeds for reproducibility. Seeding NumPy alone does not
    # cover TensorFlow's weight initialisation and dropout, so seed both.
    # NOTE(review): some GPU kernels may still be non-deterministic.
    seed = 7
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Wrap the Keras builder so scikit-learn can clone and fit it per candidate.
    model = KerasClassifier(build_fn=create_model, batch_size=64, verbose=0)

    # Candidate values for each hyper-parameter. All keys except `epochs` are
    # create_model arguments; `epochs` is a fit-time setting of the wrapper.
    embed_dim = [32,64,128,256]
    num_layer = [1,2,3]
    lstm_units =  [32,64,128,256]
    dropout_rate = [0.0,0.1,0.2,0.3]
    epochs = [1,2]
    lstm_dr = [0.0,0.1,0.2,0.3]

    param_grid = dict(embed_dim=embed_dim,num_layer = num_layer,lstm_units = lstm_units,
                      dropout_rate = dropout_rate, epochs=epochs,lstm_dr = lstm_dr)

    # Sample 10 random combinations, each scored by 3-fold CV on negative log
    # loss. random_state pins which combinations are drawn, independently of
    # whatever else has consumed NumPy's global random state.
    grid = RandomizedSearchCV(model, param_grid, n_iter=10,scoring = 'neg_log_loss',
                              n_jobs=1, cv=3, verbose=3, random_state=seed)

  model = KerasClassifier(build_fn=create_model, batch_size=64, verbose=0)


In [None]:
# Run the randomized search and measure how long the whole thing takes.
start_time = time.time()
with tf.device('/GPU:0'):
    grid_result = grid.fit(X, y)
stop_time = time.time()

# Report the wall-clock duration in seconds.
elapsed_seconds = stop_time - start_time
print('time search took:', elapsed_seconds)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END dropout_rate=0.0, embed_dim=64, epochs=2, lstm_dr=0.2, lstm_units=128, num_layer=2;, score=-0.490 total time=  34.6s
[CV 2/3] END dropout_rate=0.0, embed_dim=64, epochs=2, lstm_dr=0.2, lstm_units=128, num_layer=2;, score=-0.460 total time=  33.5s


In [None]:
# Report the best mean cross-validated score and the parameters that gave it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

In [None]:
# Pull the per-candidate summary out of the search results: mean and standard
# deviation of the test score across folds, plus the sampled parameter dicts.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)

In [None]:
# Build one model with create_model's default hyper-parameters.
l = create_model()

In [14]:
# Train a default model for two epochs under a CPU device scope.
# NOTE(review): create_model opens its own '/GPU:0' scope internally — confirm
# this outer CPU scope has the intended effect.
with tf.device('/CPU:0'):
    l = create_model()
    l.fit(X, y, epochs = 2)

Epoch 1/2

KeyboardInterrupt: 

In [12]:
# Train a default model for two epochs under a GPU device scope.
# This rebinds `l`, replacing any model built by an earlier cell.
with tf.device('/GPU:0'):
    l = create_model()
    l.fit(X, y, epochs = 2)

Epoch 1/2
Epoch 2/2


In [None]:
# One row per sampled candidate: its hyper-parameters plus the mean and
# standard deviation of its cross-validated score.
d = pd.DataFrame(params).assign(**{'Mean': means, 'Std. Dev': stds})


In [None]:
# Hyper-parameters to plot. These must be columns of `d`, i.e. keys of the
# search space; the six of them fill the 2x3 grid of axes exactly.
param_ = ["num_layer","lstm_units","epochs","embed_dim","dropout_rate","lstm_dr"]
fig, ax = plt.subplots(2,3,figsize=(14,8), squeeze=False)
ax = ax.ravel()
# zip pairs each parameter with its own axis and stops at the shorter sequence,
# so the loop cannot index past either list.
for axis, name in zip(ax, param_):
    # 'Mean' holds the mean cross-validated score, which is negative log loss
    # for this search (not accuracy).
    axis.set_title('Distribution of mean neg. log loss with {}'.format(name))
    sns.violinplot(x=name,y='Mean',data=d,ax=axis)
fig.tight_layout(pad=1.5)

In [None]:
# Rank the candidates from best to worst mean score.
d.sort_values('Mean', ascending=False)