In [None]:
%load_ext nb_black

In [None]:
from os.path import join
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
from scipy.stats import pearsonr
from IPython.display import display
import seaborn as sns

# Seed NumPy's global random number generator with a fixed value
rng_seed = 399
np.random.seed(rng_seed)
import persim
import joblib

from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LogisticRegressionCV
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, RobustScaler, minmax_scale

import tensorflow as tf

try:
    # Hide every GPU from TensorFlow so that the notebook runs on the CPU only
    tf.config.set_visible_devices([], "GPU")
    visible_devices = tf.config.get_visible_devices()
    for device in visible_devices:
        # Sanity check: no GPU may remain visible after the call above
        assert device.device_type != "GPU"
except (ValueError, RuntimeError) as gpu_config_error:
    # ValueError: invalid device specification.
    # RuntimeError: the visible devices cannot be modified once the TensorFlow
    # runtime has been initialized (e.g. when this cell is re-run).
    # Both are non-fatal here, so they are reported rather than hidden. Any
    # other exception (KeyboardInterrupt, a failing sanity check, ...) is no
    # longer swallowed.
    print(f"Could not disable GPUs: {gpu_config_error}")

from tensorflow.keras.activations import relu
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.regularizers import l1
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Directory layout: every word2vec folder lives below <root>/output
root_code_dir = ".."
output_dir = join(root_code_dir, "output")
(
    word2vec_training_dir,
    word2vec_ann_indices_dir,
    word2vec_cluster_analysis_dir,
) = (
    join(output_dir, sub_dir)
    for sub_dir in (
        "word2vec_training",
        "word2vec_ann_indices",
        "word2vec_cluster_analysis",
    )
)

# Extend sys path for importing custom Python files
import sys

# Make the project root importable. The membership check keeps the cell
# idempotent: re-running it does not append the same entry a second time.
if root_code_dir not in sys.path:
    sys.path.append(root_code_dir)

from topological_data_analysis.topological_polysemy import tps
from word_embeddings.word2vec import load_model_training_output
from analysis_of_embeddings.estimate_num_meanings_supervised import (
    plot_pred_vs_true_labels,
    create_multi_class_labels,
)

## Prepare data

In [None]:
# Read the training, test and SemEval test tables from disk
word_meaning_train_data = pd.read_csv("data/word_meaning_train_data.csv")
word_meaning_test_data = pd.read_csv("data/word_meaning_test_data.csv")
word_meaning_semeval_test_data = pd.read_csv("data/word_meaning_semeval_test_data.csv")

# Feature columns are the training-table columns whose name starts with "X_"
word_meaning_data_cols = word_meaning_train_data.columns.values
word_meaning_data_feature_cols = np.array(
    list(filter(lambda col_name: col_name.startswith("X_"), word_meaning_data_cols))
)

In [None]:
# Print a caption, then let the notebook render the training frame as the
# cell's last expression
print("Train")
word_meaning_train_data

In [None]:
# Histogram of the training labels; the number of bins equals the largest label
train_labels = word_meaning_train_data["y"]
fig, ax = plt.subplots()
ax.hist(train_labels, bins=train_labels.max())
ax.set_xlabel("Label y")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Print a caption, then let the notebook render the test frame as the cell's
# last expression
print("Test")
word_meaning_test_data

In [None]:
# Histogram of the test labels; the number of bins equals the largest label
test_labels = word_meaning_test_data["y"]
fig, ax = plt.subplots()
ax.hist(test_labels, bins=test_labels.max())
ax.set_xlabel("Label y")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Split into X and y.
# The scaler is fitted on the training features only and re-used for both test
# sets, so that a given raw feature value maps to the same scaled value in
# every split. Scaling each split with its own min/max would put the test
# features on a different scale than the training features.
# NOTE(review): assumes the stored models were fitted on min-max scaled
# training features -- confirm against the training script.
feature_scaler = MinMaxScaler()
X_train = feature_scaler.fit_transform(
    word_meaning_train_data[word_meaning_data_feature_cols].values
)
X_test = feature_scaler.transform(
    word_meaning_test_data[word_meaning_data_feature_cols].values
)
X_test_semeval = feature_scaler.transform(
    word_meaning_semeval_test_data[word_meaning_data_feature_cols].values
)

# Targets are taken from the "y" column of each table
y_train = word_meaning_train_data["y"].values
y_test = word_meaning_test_data["y"].values
y_test_semeval = word_meaning_semeval_test_data["y"].values

In [None]:
# Load output from training word2vec
w2v_model_dir = join(word2vec_training_dir, "word2vec_enwiki_jan_2021_word2phrase")
w2v_training_output = load_model_training_output(
    model_training_output_dir=w2v_model_dir,
    model_name="word2vec",
    dataset_name="enwiki",
    return_normalized_embeddings=True,
)

# Pull the individual entries out of the returned mapping
(
    last_embedding_weights_normalized,
    words,
    word_to_int,
    word_counts,
) = (
    w2v_training_output[output_key]
    for output_key in (
        "last_embedding_weights_normalized",
        "words",
        "word_to_int",
        "word_counts",
    )
)

In [None]:
# Load SemEval-2010 task 14 words
semeval_word_senses_path = join(
    "..", "topological_data_analysis", "data", "semeval_2010_14_word_senses.joblib"
)
semeval_2010_14_word_senses = joblib.load(semeval_word_senses_path)
semeval_all_word_senses = semeval_2010_14_word_senses["all"]

# Keys are the target words, values the corresponding gold-standard entries
semeval_target_words = np.array(list(semeval_all_word_senses.keys()))
semeval_gs_clusters = np.array(list(semeval_all_word_senses.values()))

# Positions of the target words that are present in the word2vec vocabulary
semeval_target_words_in_vocab_filter = []
for word_idx, target_word in enumerate(semeval_target_words):
    if target_word in word_to_int:
        semeval_target_words_in_vocab_filter.append(word_idx)

semeval_target_words_in_vocab = semeval_target_words[
    semeval_target_words_in_vocab_filter
]
semeval_gs_clusters_in_vocab = semeval_gs_clusters[semeval_target_words_in_vocab_filter]

num_semeval_words = len(semeval_target_words_in_vocab)

## Do modeling

### Lasso regression with K-fold CV

In [None]:
# Restore the previously fitted Lasso model and report its alpha_ attribute
lasso_reg_path = "data/lasso_reg.joblib"
lasso_cv = joblib.load(lasso_reg_path)
print(f"Selected alpha: {lasso_cv.alpha_}")

In [None]:
# Rank the features by the magnitude of their Lasso coefficient, largest first
sorted_feature_weights_indices = np.argsort(np.abs(lasso_cv.coef_))[::-1]

# Kept for backward compatibility only: stacking names and coefficients into a
# single array coerces the coefficients to strings, so this array is no longer
# used to build the table below.
sorted_features_arr = np.array(
    list(
        zip(
            word_meaning_data_feature_cols[sorted_feature_weights_indices],
            lasso_cv.coef_[sorted_feature_weights_indices],
        )
    )
)

# Build the table from the typed arrays so that "weight" stays numeric
sorted_features_df = pd.DataFrame(
    {
        "feature": word_meaning_data_feature_cols[sorted_feature_weights_indices],
        "weight": lasso_cv.coef_[sorted_feature_weights_indices],
    }
)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(sorted_features_df)

In [None]:
# Predict on the training features and plot the predictions against the
# training labels
y_pred = lasso_cv.predict(X_train)
plot_pred_vs_true_labels(
    y_pred, y_train, xlabel="Pred number of synsets", ylabel="Synsets in Wordnet"
)

In [None]:
# Predict on the test features and plot the predictions against the test labels.
# NOTE(review): the axis labels here ("meanings" / "Clusters in GS") differ from
# the training plot ("synsets" / "Synsets in Wordnet") although y_test comes
# from word_meaning_test_data -- confirm that these labels are intended.
y_pred_test = lasso_cv.predict(X_test)
plot_pred_vs_true_labels(
    y_pred_test, y_test, xlabel="Pred number of meanings", ylabel="Clusters in GS"
)

In [None]:
# Predict on the SemEval test features and plot the predictions against the
# SemEval test labels
y_pred_test_semeval = lasso_cv.predict(X_test_semeval)
plot_pred_vs_true_labels(
    y_pred_test_semeval,
    y_test_semeval,
    xlabel="Pred number of meanings",
    ylabel="Clusters in GS",
)

### Multi-class logistic regression with L1 penalty, finding the optimal regularization strength (alpha = 1/C) with K-fold CV

In [None]:
# Create multi-class labels; max_label is the 90% quantile of the training
# labels and is shared by both splits
max_y_multi = np.quantile(y_train, q=0.9)
y_train_multi_class, y_test_multi_class = (
    create_multi_class_labels(labels=split_labels, max_label=max_y_multi)
    for split_labels in (y_train, y_test)
)

In [None]:
# Restore the previously fitted logistic regression model from disk
log_reg_cv_path = "data/log_reg_cv.joblib"
log_reg_cv = joblib.load(log_reg_cv_path)

In [None]:
# Display the reciprocal of the fitted C_ value(s); presumably this is the
# "alpha" referred to in the section heading -- confirm
1 / log_reg_cv.C_

In [None]:
# Display names for the classes: the first four classes are shown as
# "label + 1", every later class is shown as ">= 5"
labels_str = []
for class_idx, class_label in enumerate(log_reg_cv.classes_):
    if class_idx < 4:
        labels_str.append(str(class_label + 1))
    else:
        labels_str.append(">= 5")

In [None]:
# For every class, list the features by decreasing absolute coefficient
df_dict = {}
for label_str, coeffs in zip(labels_str, log_reg_cv.coef_):
    sorted_feature_weights_indices = np.abs(coeffs).argsort()[::-1]
    df_dict.update(
        {
            f"feature_{label_str}": word_meaning_data_feature_cols[
                sorted_feature_weights_indices
            ],
            f"weight_{label_str}": coeffs[sorted_feature_weights_indices],
        }
    )

# Show the full table without pandas truncating rows or columns
sorted_features_df = pd.DataFrame(df_dict)
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(sorted_features_df)

In [None]:
# Predict on the training set and tabulate predictions against the true classes.
# `labels` pins the row/column order of the matrix to log_reg_cv.classes_, the
# same sequence labels_str is derived from, so the tick labels always line up
# with the matrix even if a class is missing from the data.
y_pred = log_reg_cv.predict(X_train)
cm = confusion_matrix(y_train_multi_class, y_pred, labels=log_reg_cv.classes_)

# Draw on an explicit axes object instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    cmap="YlGnBu",
    annot=True,
    fmt="d",
    annot_kws={"size": 16},
    square=True,
    xticklabels=labels_str,
    yticklabels=labels_str,
    ax=ax,
)
ax.set_title("Word meaning prediction vs. true")
ax.set_xlabel("Predicted No. word meanings")
ax.set_ylabel("True No. word meanings")
plt.show()

# Weighted F1-score of the training-set predictions
pred_f1_score = f1_score(y_train_multi_class, y_pred, average="weighted")
print(f"F1-score: {pred_f1_score:.3f}")