In [None]:
# Load the nb_black IPython extension
# NOTE(review): presumably this auto-formats code cells with Black — confirm
%load_ext nb_black

In [None]:
from os.path import join
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
from scipy.stats import pearsonr

# Seed NumPy's global random number generator so runs are repeatable
rng_seed = 399
np.random.seed(seed=rng_seed)
import persim
import joblib

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, LassoCV

import tensorflow as tf

try:
    # Hide every GPU from TensorFlow so that all computation runs on the CPU
    tf.config.set_visible_devices([], "GPU")
    visible_devices = tf.config.get_visible_devices()
    for device in visible_devices:
        # Sanity check: no GPU may remain visible after the call above
        assert device.device_type != "GPU"
except (ValueError, RuntimeError) as gpu_config_error:
    # ValueError: invalid device specification.
    # RuntimeError: the TensorFlow runtime is already initialized, so the
    # visible devices can no longer be modified.
    # Only these two documented failures are tolerated (best effort); anything
    # else, including a failed sanity check or a keyboard interrupt, propagates.
    print(f"Could not disable GPUs, continuing with current devices: {gpu_config_error}")

from tensorflow.keras.activations import relu
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l1
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Directory constants (all paths are relative to the notebook's working directory)
root_code_dir = ".."
output_dir = join(root_code_dir, "output")
word2vec_training_dir = join(output_dir, "word2vec_training")
word2vec_ann_indices_dir = join(output_dir, "word2vec_ann_indices")
word2vec_cluster_analysis_dir = join(output_dir, "word2vec_cluster_analysis")

# Extend sys path for importing custom Python files
import sys

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path
if root_code_dir not in sys.path:
    sys.path.append(root_code_dir)

from topological_data_analysis.topological_polysemy import tps
from word_embeddings.word2vec import load_model_training_output

## Prepare data

In [None]:
# Read the pre-computed train/test tables and preview the first training rows
word_meaning_train_data, word_meaning_test_data = (
    pd.read_csv(filepath_or_buffer="data/word_meaning_train_data.csv"),
    pd.read_csv(filepath_or_buffer="data/word_meaning_test_data.csv"),
)
word_meaning_train_data.head()

In [None]:
# Feature columns: "estimated_id", then "tps_<size>" and "tps_<size>_bottle"
# for every neighbourhood size 10, 20, ..., 100
tps_neighbourhood_sizes = np.linspace(10, 100, num=10, dtype=int)
feature_cols = np.array(
    ["estimated_id"]
    + [f"tps_{n_size}" for n_size in tps_neighbourhood_sizes]
    + [f"tps_{n_size}_bottle" for n_size in tps_neighbourhood_sizes]
)
# Feature matrices (X) and target vectors (y) for the train and test splits
X_train, X_test = (
    split_data[feature_cols].values
    for split_data in (word_meaning_train_data, word_meaning_test_data)
)
y_train, y_test = (
    split_data["y"].values
    for split_data in (word_meaning_train_data, word_meaning_test_data)
)

In [None]:
# Load the word2vec training output, requesting normalized embeddings
w2v_training_output = load_model_training_output(
    model_training_output_dir=join(
        word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase"
    ),
    model_name="word2vec",
    dataset_name="enwiki",
    return_normalized_embeddings=True,
)

# Pull the individual entries used by this notebook out of the result dict
(last_embedding_weights_normalized, words, word_to_int, word_counts) = (
    w2v_training_output[output_key]
    for output_key in (
        "last_embedding_weights_normalized",
        "words",
        "word_to_int",
        "word_counts",
    )
)

In [None]:
# Load the SemEval-2010 task 14 word senses from the project's data directory
semeval_2010_14_word_senses = joblib.load(
    join(
        "..", "topological_data_analysis", "data", "semeval_2010_14_word_senses.joblib"
    )
)

# Target words and their corresponding values from the "all" entry, in matching order
semeval_target_words = np.array([*semeval_2010_14_word_senses["all"].keys()])
semeval_gs_clusters = np.array([*semeval_2010_14_word_senses["all"].values()])

# Positions of the target words that exist in the word2vec vocabulary
semeval_target_words_in_vocab_filter = [
    word_idx
    for word_idx, target_word in enumerate(semeval_target_words)
    if target_word in word_to_int
]

# Restrict both arrays to the in-vocabulary words
semeval_target_words_in_vocab = semeval_target_words[
    semeval_target_words_in_vocab_filter
]
semeval_gs_clusters_in_vocab = semeval_gs_clusters[semeval_target_words_in_vocab_filter]
num_semeval_words = len(semeval_target_words_in_vocab)

## Do modeling

In [None]:
def create_model(
    input_dim: int,
    lasso_alpha: float,
    learning_rate: float,
) -> Model:
    """
    Builds and compiles a single-layer regression network with an L1 penalty.

    The network maps the input features directly to one output unit through a
    Dense layer with no explicit activation. The L1 regularizer is applied to
    the kernel weights of that layer. The model is compiled with an SGD
    optimizer and the MSE loss.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    lasso_alpha : float
        Factor passed to the L1 kernel regularizer of the output layer.
    learning_rate : float
        Learning rate of the SGD optimizer.

    Returns
    -------
    model : Model
        Compiled Keras model.
    """
    # Wire the input features straight into a single regularized output unit
    features_in = Input(shape=(input_dim,), name="input")
    regularized_output = Dense(
        1, kernel_regularizer=l1(lasso_alpha), name="output"
    )
    prediction = regularized_output(features_in)

    # Assemble the network and attach optimizer and loss
    lasso_net = Model(inputs=features_in, outputs=prediction)
    sgd_optimizer = SGD(learning_rate=learning_rate)
    lasso_net.compile(optimizer=sgd_optimizer, loss=MSE)

    return lasso_net

In [None]:
def plot_corr(true_labels: np.ndarray, pred_labels: np.ndarray) -> None:
    """
    Shows a scatter plot of predicted against true labels.

    Predictions are drawn on the x-axis and true labels on the y-axis. The
    Pearson correlation between the two is shown in the figure title with
    three decimals.

    Parameters
    ----------
    true_labels : np.ndarray
        True labels.
    pred_labels : np.ndarray
        Predicted labels.
    """
    # pearsonr returns (correlation, p-value); only the correlation is used
    pearson_corr = pearsonr(pred_labels, true_labels)[0]

    plt.figure(figsize=(10, 10))
    plt.scatter(x=pred_labels, y=true_labels)
    plt.title(f"true/pred correlation: {pearson_corr:.3f}")
    plt.show()

### Hyperparameter search using grid search CV

In [None]:
# Training settings forwarded to every Keras fit during the search
n_epochs, batch_size = 200, 1024

# Search space: 1000 evenly spaced L1 factors crossed with 5 learning rates
lasso_alphas = np.linspace(start=0.0001, stop=0.9999, num=1000)
param_grid = dict(
    input_dim=[X_train.shape[1]],
    lasso_alpha=lasso_alphas,
    learning_rate=[0.025, 0.001, 0.0025, 0.0001, 0.00025],
)

# scikit-learn compatible wrapper around the Keras model factory
model = KerasRegressor(build_fn=create_model)

In [None]:
# 5-fold cross-validated grid search in a single process; the epoch count and
# batch size are passed through to the wrapped Keras model's fit
grid = GridSearchCV(model, param_grid=param_grid, cv=5, n_jobs=1)
grid_result = grid.fit(X_train, y_train, batch_size=batch_size, epochs=n_epochs)

In [None]:
# Training loss per epoch of the best model found by the grid search.
# GridSearchCV refits the best parameter combination on the whole training set
# (refit=True is the default); the Keras wrapper keeps that fitted network in
# `.model`, whose `history` attribute holds the per-epoch metrics of that fit.
# NOTE(review): `wm_model_hist` was not defined in any earlier cell, so this
# cell raised a NameError on a fresh kernel — confirm the refit history is the
# curve intended here.
wm_model_hist = grid_result.best_estimator_.model.history

plt.figure(figsize=(10, 8))
plt.plot(wm_model_hist.history["loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train"], loc="upper left")
plt.show()

In [None]:
# Lasso regression: pick alpha by 5-fold CV over the same grid of alphas,
# using all available cores
lasso_cv = LassoCV(
    cv=5,
    alphas=lasso_alphas,
    max_iter=100000,
    n_jobs=-1,
    random_state=rng_seed,
).fit(X_train, y_train)

# Mean squared error on the training data
y_pred = lasso_cv.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred)

In [None]:
# Regularization strength (alpha) selected by LassoCV's cross-validation
lasso_cv.alpha_

In [None]:
# Feature indices ordered by signed coefficient value, largest first
# (this is not an ordering by magnitude)
sorted_coeffs = np.argsort(lasso_cv.coef_)[::-1]

# (feature name, coefficient) pairs in that order
[
    (feature_name, coef_value)
    for feature_name, coef_value in zip(
        feature_cols[sorted_coeffs], lasso_cv.coef_[sorted_coeffs]
    )
]

In [None]:
# Scatter the Lasso predictions on the training data against the true training
# targets; the title shows their Pearson correlation
plot_corr(y_train, y_pred)

In [None]:
# Mean squared error of the cross-validated Lasso model on the test data
y_pred_test = lasso_cv.predict(X=X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred_test)

In [None]:
# Test-set predictions (x-axis) against the test targets (y-axis), with the
# Pearson correlation between them in the title
plt.figure(figsize=(10, 7))
plt.scatter(y_pred_test, y_test)
plt.xlabel("Pred number of meanings")
plt.ylabel("GS")
plt.title(f"Correlation: {pearsonr(y_pred_test, y_test)[0]:.5f}")
plt.show()