In [None]:
from files import read_training_data, read_test_data, read_val_data


# Load the train / test / validation splits through the project's reader helpers.
df_train, df_test, df_val = (
    read_training_data(),
    read_test_data(),
    read_val_data(),
)

In [None]:
from pre_process import under_sample, encode_label

In [None]:
# Derive the "label_l1" column from the "l1" column on all three splits.
# The SVM section of this notebook unpacks a fourth return value (the fitted
# encoder) from this same helper, so any trailing values are absorbed here
# instead of failing with "too many values to unpack".
df_train, df_test, df_val, *_encode_extras = encode_label(
    df_train=df_train,
    df_test=df_test,
    df_val=df_val,
    source_label="l1",
    new_label_name="label_l1",
)

In [None]:
import numpy as np
import tensorflow as tf
import keras_tuner as kt

# Number of level-1 classes; used as the one-hot depth for every split.
NUM_CLASSES = 9

print('loading embeddings')
train_embeddings = np.load('under_sampled_embeddings.npy')
test_embeddings = np.load('test_embeddings.npy')
val_embeddings = np.load('val_embeddings.npy')

print('checkpoint 5: getting labels')
y_train = df_train["label_l1"]
y_test = df_test["label_l1"]
y_val = df_val['label_l1']

# The embeddings come from pre-computed files while the labels come from the
# freshly loaded frames; fail here with a clear message if they are out of sync
# rather than deep inside fit()/evaluate().
assert len(train_embeddings) == len(y_train), (
    f"train: {len(train_embeddings)} embeddings vs {len(y_train)} labels"
)
assert len(test_embeddings) == len(y_test), (
    f"test: {len(test_embeddings)} embeddings vs {len(y_test)} labels"
)
assert len(val_embeddings) == len(y_val), (
    f"val: {len(val_embeddings)} embeddings vs {len(y_val)} labels"
)

# One-hot targets for the categorical cross-entropy loss.
y_train_one_hot = tf.one_hot(y_train, NUM_CLASSES)
y_test_one_hot = tf.one_hot(y_test, NUM_CLASSES)
y_val_one_hot = tf.one_hot(y_val, NUM_CLASSES)

In [None]:
# Sanity check on the loaded training embeddings; build_model below expects
# 768 features per sample.
print(train_embeddings.shape)

In [None]:
def build_model(hp):
    """Build and compile a one-hidden-layer softmax classifier for tuning.

    Args:
        hp: Hyperparameter container supplied by Keras Tuner. Two entries are
            registered on it, in this order: ``units`` (int, 10 to 600 in
            steps of 20) and ``learning_rate`` (one of 1e-2, 1e-3, 1e-4).

    Returns:
        A compiled ``tf.keras.Model`` that maps 768-dimensional inputs through
        one ReLU hidden layer to a 9-way softmax, trained with Adam on
        categorical cross-entropy and reporting accuracy.
    """
    features = tf.keras.Input(shape=(768,))

    # Width of the single hidden layer is the first tunable.
    hidden_units = hp.Int('units', min_value=10, max_value=600, step=20)
    hidden = tf.keras.layers.Dense(units=hidden_units, activation=tf.nn.relu)(features)
    class_probabilities = tf.keras.layers.Dense(9, activation=tf.nn.softmax)(hidden)
    classifier = tf.keras.Model(inputs=features, outputs=class_probabilities)

    # Adam step size is the second tunable.
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    classifier.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# tuner = kt.Hyperband(build_model,
#                      objective='val_accuracy',
#                      max_epochs=10,
#                      factor=3,
#                      directory='my_dir',
#                      project_name='kdd_2')

In [None]:
# Smoke-test build_model with default hyperparameter values before tuning.
# build_model only reads `units` and `learning_rate`; the `model_type` and
# `objective` entries that used to be set here were never consulted, so the two
# builds were identical. One build with defaults shows the same architecture.
hp = kt.HyperParameters()
model = build_model(hp)
# Print a summary of the model.
model.summary()

In [None]:
# Random-search tuner over build_model's hyperparameters, ranked by validation
# accuracy. Intermediate results go to my_dir/kdd_f; overwrite=True means an
# earlier run stored there is not resumed.
tuner = kt.RandomSearch(
    build_model,
    objective="val_accuracy",
    directory="my_dir",
    project_name='kdd_f',
    overwrite=True,
)

In [None]:
# Early-stopping callback watching `val_loss` with a patience of 5 epochs.
# NOTE(review): the tuner search in this notebook trains each trial for only
# 5 epochs, so a patience of 5 looks like it can never trigger there — confirm
# that this is intended.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
import datetime
# Each execution of this cell gets its own timestamped folder under logs/fit/.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
# Load the TensorBoard notebook extension and open it on logs/fit, the parent
# directory of the timestamped log folders created for the TensorBoard callback.
%load_ext tensorboard
%tensorboard --logdir logs/fit


In [None]:

# Run the hyperparameter search: every trial is fitted on the training
# embeddings and scored on the validation split.
search_fit_kwargs = dict(
    epochs=5,
    validation_data=(val_embeddings, y_val_one_hot),
    callbacks=[stop_early, tensorboard_callback],
)
tuner.search(train_embeddings, y_train_one_hot, **search_fit_kwargs)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
# Build a fresh model from the best hyperparameters found by the search and
# show its architecture.
model = tuner.hypermodel.build(best_hps)
model.summary()


In [None]:
# Train the best-hyperparameter model for 20 epochs, tracking validation metrics.
history = model.fit(
    train_embeddings,
    y_train_one_hot,
    epochs=20,
    validation_data=(val_embeddings, y_val_one_hot),
)

val_loss_per_epoch = history.history['val_loss']
# Lower loss is better, so the best epoch is the one with the MINIMUM validation
# loss (taking max() here would pick the worst epoch). +1 converts the 0-based
# list index into a 1-based epoch number.
best_epoch = val_loss_per_epoch.index(min(val_loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
from matplotlib import pyplot as plt

# Training vs. validation loss per epoch for the run stored in `history`.
fig1, ax1 = plt.subplots()
loss_curves = history.history
ax1.plot(loss_curves['loss'])
ax1.plot(loss_curves['val_loss'])
# ax1.set_ylim(0.95,1)
ax1.set(title='model loss', ylabel='loss', xlabel='epoch')
ax1.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Build another fresh model from the same best hyperparameters; this is the one
# that later cells retrain and evaluate on the test split.
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.summary()

In [None]:
# Training vs. validation accuracy per epoch for the run stored in `history`.
# The y-axis is clamped to [0.95, 1], so values below 0.95 are not visible.
fig, ax = plt.subplots()
accuracy_curves = history.history
ax.plot(accuracy_curves['accuracy'])
ax.plot(accuracy_curves['val_accuracy'])
ax.set(title='model accuracy', ylim=(0.95, 1), ylabel='accuracy', xlabel='epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# Retrain the model
# Fits the freshly built `hypermodel` for a fixed 7 epochs on the training
# embeddings, validating on the validation split.
# NOTE(review): `best_epoch` is computed after the 20-epoch fit but is not used
# here — confirm whether the hard-coded 7 is meant to track it.
hypermodel.fit(train_embeddings, y_train_one_hot, epochs=7, validation_data=(val_embeddings, y_val_one_hot))

In [None]:
# Score the retrained `hypermodel` on the held-out test split; the model was
# compiled with a single metric (accuracy), so the result is [loss, accuracy].
eval_result = hypermodel.evaluate(test_embeddings, y_test_one_hot)
print("[test loss, test accuracy]:", eval_result)

In [None]:
# Softmax outputs of `hypermodel` for every test sample (one column per class).
y_pred = hypermodel.predict(test_embeddings)

In [None]:
# Binarise the predictions: 1 where a class probability exceeds 0.9, else 0.
y_pred_th = np.where(y_pred > 0.9, 1, 0)

In [None]:
# Display both shapes side by side to check the thresholded predictions line up
# with the one-hot test targets.
y_pred_th.shape, y_test_one_hot.shape

In [None]:
# Largest number of classes that cleared the 0.9 threshold for any single
# sample; since the outputs are softmax probabilities this should not exceed 1.
y_pred_th.sum(1).max()

In [None]:
# Predicted class id per test sample = index of the highest softmax output.
y_pred_eval = y_pred.argmax(axis=1)

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
# Confusion matrix of the network's argmax predictions against the integer test
# labels (true labels first, predictions second).
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_eval,)

In [None]:
# history = model.fit(train_embeddings, y_train_one_hot, epochs=10, validation_data=(val_embeddings, y_val_one_hot))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1, keyed by the integer class ids.
# A one-element `target_names` list does not match the number of classes and
# makes classification_report raise ValueError, so it is omitted here.
print(classification_report(y_test, y_pred_eval))

In [None]:
# Same report with readable class names. The list is assumed to be in encoded-id
# order (0..8) — the commented-out groupby below can be used to verify that.
l1_class_names = [
    'Agent',
    'Device',
    'Event',
    'Place',
    'Species',
    'SportsSeason',
    'TopicalConcept',
    'UnitOfWork',
    'Work',
]
print(classification_report(y_test, y_pred_eval, target_names=l1_class_names))
# df_test.groupby(['label_l1','l1']).size().reset_index().rename(columns={0:'count' })

In [None]:
from matplotlib import pyplot as plt


In [None]:
# Accuracy curves (train vs. validation) from `history`, y-axis limited to
# the [0.95, 1] band.
fig, ax = plt.subplots()
for series_name in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[series_name])
ax.set(title='model accuracy', ylim=(0.95, 1), ylabel='accuracy', xlabel='epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:

# Loss curves (train vs. validation) from `history`.
fig1, ax1 = plt.subplots()
for series_name in ('loss', 'val_loss'):
    ax1.plot(history.history[series_name])
# ax1.set_ylim(0.95,1)
ax1.set(title='model loss', ylabel='loss', xlabel='epoch')
ax1.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# Loss curves once more, drawn through the pyplot state-machine interface.
recorded = history.history
plt.plot(recorded['loss'])
plt.plot(recorded['val_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:

# Evaluate `model` on the test split in batches of 128.
# NOTE(review): `model` is whatever that name was last bound to — it looks like
# the network fitted for 20 epochs rather than the retrained `hypermodel`;
# confirm which one is meant to be reported.
results = model.evaluate(test_embeddings, y_test_one_hot, batch_size=128)
print("test loss, test acc:", results)

In [None]:
%load_ext tensorboard
%tensorboard --logdir /my_dir/kdd

In [None]:
## SVM

from files import read_training_data, read_test_data, read_val_data
from pre_process import under_sample, encode_label
from embeddings import create_embedding
from models import bert_model_and_tokenizer
from classifications import logisticRegression
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn import svm

import numpy as np



# Reload the raw splits so the classical-model section starts from fresh frames.
print('checkpoint 1: Loading data')
df_train, df_test, df_val = (
    read_training_data(),
    read_test_data(),
    read_val_data(),
)

# Add the "label_l1" column to each split; the fourth return value is the
# encoder object, used just below to map class ids back to names.
df_train, df_test, df_val, LE = encode_label(
    df_train=df_train,
    df_test=df_test,
    df_val=df_val,
    source_label="l1",
    new_label_name="label_l1",
)

# Readable names for class ids 0..8, in id order.
labels = LE.inverse_transform(list(range(9)))

# NOTE(review): the training embeddings file is named "under_sampled" while
# y_train is taken from the frame returned by read_training_data(); presumably
# that frame is already under-sampled — confirm the two are row-aligned.
print('loading embeddings')
train_embeddings = np.load('under_sampled_embeddings.npy')
test_embeddings = np.load('test_embeddings.npy')

print('checkpoint 5: getting labels')
y_train, y_test = df_train["label_l1"], df_test["label_l1"]

print('checkpoint 6: Running the classification model')

# y_pred = logisticRegression(
#     x_train=train_embeddings, x_test=test_embeddings, y_train=y_train, y_test=y_test
# )
    
    


In [None]:
# Fit a support-vector classifier (library default settings) on the training
# embeddings, then predict class ids for the test embeddings.
svm_clf = svm.SVC()
svm_clf.fit(train_embeddings, y_train)
y_pred = svm_clf.predict(test_embeddings)

In [None]:
# Evaluate the SVM on the test split: overall accuracy, confusion matrix and
# per-class precision / recall / F1.
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
print(accuracy)

# Matrix rows/columns follow svm_clf.classes_; `labels` supplies readable names.
cm = confusion_matrix(y_pred=y_pred, y_true=y_test, labels=svm_clf.classes_)

# Pass the names and tick rotation to the display itself instead of patching
# the tick labels after plotting.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(xticks_rotation="vertical")
disp.ax_.set_title("Confusion matrix for svm")
plt.show()

print(classification_report(y_test, y_pred, target_names=labels))



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth-5 trees; random_state is fixed so repeated
# runs build the same forest.
rf_clf = RandomForestClassifier(max_depth=5, random_state=0)
rf_clf.fit(train_embeddings, y_train)

In [None]:
# Evaluate the random forest on the test split: overall accuracy, confusion
# matrix and per-class precision / recall / F1.
rf_clf_predicted_label = rf_clf.predict(test_embeddings)

accuracy_rf = accuracy_score(y_pred=rf_clf_predicted_label, y_true=y_test)
print(accuracy_rf)

# Matrix rows/columns follow rf_clf.classes_; `labels` supplies readable names.
cm_rf = confusion_matrix(
    y_pred=rf_clf_predicted_label,
    y_true=y_test,
    labels=rf_clf.classes_,
)

# Pass the names and tick rotation to the display itself instead of patching
# the tick labels after plotting.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=labels)
disp.plot(xticks_rotation="vertical")
disp.ax_.set_title("Confusion matrix for random forest")
plt.show()

print(classification_report(y_test, rf_clf_predicted_label, target_names=labels))