In [None]:
import torch
import pandas as pd
import random
import seaborn as sns
from compare import getExampleSentencesBySense
import matplotlib.pyplot as plt
import json
from experiment import train_lemma_classifiers_with_vec
from bert import generate_vectorization
from vectrain import *
from lemmas import *

In [None]:
%matplotlib inline
sns.set(rc={'figure.figsize':(16,10)})

In [None]:
# Prepare data for the BERT graphs: load the per-lemma results, attach the
# frequency of each of the two senses, and order rows by best average accuracy.
data_file = "data/bert_all_lemmas_data.csv"
df = pd.read_csv(data_file)
freq_dict = create_sense_freq_dict()


def get_freq(sense):
    """Return the entry stored for ``sense`` in ``freq_dict``.

    Raises ``KeyError`` when the sense is not present in the dictionary.
    """
    return freq_dict[sense]


# Both frequency columns are appended in one concat (sense1 first, then sense2)
# before sorting.
df = pd.concat(
    [
        df,
        df["sense1"].apply(get_freq).rename("sense1_freq"),
        df["sense2"].apply(get_freq).rename("sense2_freq"),
    ],
    axis=1,
).sort_values(by="best_avg_acc")

In [None]:
# Show the BERT results table (sorted by best_avg_acc, with the two
# sense-frequency columns appended).
df

In [None]:
# Best average accuracy for every lemma in the BERT results.
all_lemmas = df
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
all_lemmas_graph = sns.barplot(data=all_lemmas, x="lemma", y="best_avg_acc")

In [None]:
# Restrict the BERT results to rows whose pos1 and pos2 values are equal.
same_pos = df.loc[df["pos1"].eq(df["pos2"])]
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
same_pos_graph = sns.barplot(data=same_pos, x="lemma", y="best_avg_acc")

In [None]:
# Restrict the BERT results to rows whose pos1 and pos2 values differ.
diff_pos = df[df["pos1"] != df["pos2"]]
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
# Only diff_pos_graph is bound here; the previous chained assignment also bound
# an unrelated `specs` name, which belongs to the spec-comparison plot.
diff_pos_graph = sns.barplot(x="lemma", y="best_avg_acc", data=diff_pos)

In [None]:


# Load the sense lookup used by get_pair_type (presumably sense id ->
# part-of-speech tag, judging by the file name -- confirm against the data).
with open("data/sense_to_pofs_dict.json") as f:
    sense_pos_dict = json.loads(f.read())
        
def get_pair_type(row):
    """Return the order-independent POS pair label for one results row.

    The values at positions 3 and 4 of ``row`` are looked up in
    ``sense_pos_dict``; the two results are sorted and joined as
    ``"<smaller>, <larger>"`` so that (A, B) and (B, A) give the same label.

    Parameters
    ----------
    row : pandas.Series or sequence
        One row, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
        Positions 3 and 4 are presumably the two sense ids -- verify against
        the CSV column order.

    Returns
    -------
    str
        The joined pair label.

    Raises
    ------
    KeyError
        If either looked-up value is missing from ``sense_pos_dict``.
    """
    # Index by position explicitly: integer keys on a label-indexed Series
    # (row[3]) are deprecated in pandas 2.x and no longer positional in 3.0.
    cells = row.iloc if hasattr(row, "iloc") else row
    pos_tags = sorted((sense_pos_dict[cells[3]], sense_pos_dict[cells[4]]))
    return ", ".join(pos_tags)

In [None]:
# Compute the POS pair for every BERT row and append it as a "pos_combo" column.
tmp = df.apply(get_pair_type, axis=1)
tmp.name = "pos_combo"
df_with_pos_combo = pd.concat((df, tmp), axis="columns")
df_with_pos_combo

In [None]:
# Drop rows whose POS combination occurs only once; keep the combinations
# listed here (the ones that get bars in the plot).
kept_pos_combos = [
    "VERB, VERB",
    "ADV, ADV",
    "ADJ, ADJ",
    "NOUN, NOUN",
    "ADJ, ADV",
    "NOUN, VERB",
    "ADJ, NOUN",
    "ADJ, VERB",
]
df_with_pos_combo = df_with_pos_combo.loc[
    df_with_pos_combo["pos_combo"].isin(kept_pos_combos)
]
# NOTE(review): this rebinds diff_pos_graph, replacing the per-lemma plot handle.
diff_pos_graph = sns.barplot(data=df_with_pos_combo, x="pos_combo", y="best_avg_acc")

In [None]:
# Best average accuracy against the frequency of the second sense (BERT).
max_samp_data = df
plt.xticks(rotation=45)
max_samp_graph = sns.scatterplot(
    data=max_samp_data,
    x="sense2_freq",
    y="best_avg_acc",
)
# Put the frequency axis on a log scale.
max_samp_graph.set(xscale="log")

In [None]:
# Best average accuracy per "spec" setting.
# Note: these results are based on a subset of the words only.
spec_data = pd.read_csv("classifier_data_spec8.csv")
specs = sns.barplot(data=spec_data, x="spec", y="best_avg_acc")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the neighbour-test results: one bar per key of the JSON object,
# with the corresponding value as its height.
with open("neighbor_test_result.json", "r") as f:
    neighbors_data = json.loads(f.read())
neighbors_data_graph = sns.barplot(
    x=list(neighbors_data.keys()),
    y=list(neighbors_data.values()),
)

In [None]:
# ELMo section starts here: load the per-lemma ELMo results and order them by
# best average accuracy.
elmo_file = "data/elmo_all_lemmas_data.csv"
elmo_df = pd.read_csv(elmo_file).sort_values(by="best_avg_acc")

In [None]:
# Show the ELMo results table (sorted by best_avg_acc).
elmo_df

In [None]:
# Best average accuracy for every lemma in the ELMo results.
elmo_all_lemmas = elmo_df
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
elmo_all_lemmas_graph = sns.barplot(
    data=elmo_all_lemmas,
    x="lemma",
    y="best_avg_acc",
)

In [None]:
# Restrict the ELMo results to rows whose pos1 and pos2 values are equal.
elmo_same_pos = elmo_df.loc[elmo_df["pos1"].eq(elmo_df["pos2"])]
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
elmo_same_pos_graph = sns.barplot(
    data=elmo_same_pos,
    x="lemma",
    y="best_avg_acc",
)

In [None]:
# Restrict the ELMo results to rows whose pos1 and pos2 values differ.
elmo_diff_pos = elmo_df.loc[elmo_df["pos1"].ne(elmo_df["pos2"])]
plt.xticks(rotation=45)
# Switch off the x-axis tick marks (major and minor) and their labels.
plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
elmo_diff_pos_graph = sns.barplot(
    data=elmo_diff_pos,
    x="lemma",
    y="best_avg_acc",
)

In [None]:


# Load the sense lookup used by get_pair_type (presumably sense id ->
# part-of-speech tag, judging by the file name -- confirm against the data).
with open("data/sense_to_pofs_dict.json") as f:
    sense_pos_dict = json.loads(f.read())
        
def get_pair_type(row):
    """Return the order-independent POS pair label for one results row.

    The values at positions 3 and 4 of ``row`` are looked up in
    ``sense_pos_dict``; the two results are sorted and joined as
    ``"<smaller>, <larger>"``. A joined string (rather than a tuple) is
    returned so the labels match the ``"POS, POS"`` strings that the
    ``pos_combo`` filters in this notebook compare against.

    Parameters
    ----------
    row : pandas.Series or sequence
        One row, e.g. as passed by ``DataFrame.apply(..., axis=1)``.
        Positions 3 and 4 are presumably the two sense ids -- verify against
        the CSV column order.

    Returns
    -------
    str
        The joined pair label.

    Raises
    ------
    KeyError
        If either looked-up value is missing from ``sense_pos_dict``.
    """
    # Index by position explicitly: integer keys on a label-indexed Series
    # (row[3]) are deprecated in pandas 2.x and no longer positional in 3.0.
    cells = row.iloc if hasattr(row, "iloc") else row
    pos_tags = sorted((sense_pos_dict[cells[3]], sense_pos_dict[cells[4]]))
    return ", ".join(pos_tags)

In [None]:
# Compute the POS pair from the ELMo rows themselves, not from the BERT table,
# so each label belongs to the row it is attached to.
# NOTE(review): assumes elmo_df has the same column layout as the BERT table,
# since get_pair_type reads row positions 3 and 4 -- confirm.
tmp = elmo_df.apply(get_pair_type, axis=1).rename("pos_combo")

elmo_df_with_pos_combo = pd.concat([elmo_df, tmp], axis=1)
elmo_df_with_pos_combo

In [None]:
# Note: many of the pos_combos have very few datapoints, in fact several have
# just one, so only the combinations listed below are plotted.
plt.xticks(rotation=45)
# pos_combo may be stored either as a (pos, pos) tuple or as an already joined
# "POS, POS" string; normalise to the string form used by the allow-list.
# Missing values are left alone and are dropped by the isin filter.
elmo_df_with_pos_combo = elmo_df_with_pos_combo.assign(
    pos_combo=elmo_df_with_pos_combo["pos_combo"].map(
        lambda combo: combo if isinstance(combo, str) else ", ".join(combo),
        na_action="ignore",
    )
)
# Filter the ELMo table itself, so the plot shows ELMo rather than BERT accuracy.
elmo_df_with_pos_combo = elmo_df_with_pos_combo[
    elmo_df_with_pos_combo["pos_combo"].isin(
        [
            "VERB, VERB",
            "ADV, ADV",
            "ADJ, ADJ",
            "NOUN, NOUN",
            "ADJ, ADV",
            "NOUN, VERB",
            "ADJ, NOUN",
            "ADJ, VERB",
        ]
    )
]
# Only the ELMo handle is bound; the stray `specs` alias is gone so the
# spec-comparison plot handle is not overwritten.
elmo_diff_pos_graph = sns.barplot(
    x="pos_combo", y="best_avg_acc", data=elmo_df_with_pos_combo
)

In [None]:
# Best average accuracy against the frequency of the second sense (ELMo).
elmo_max_samp_data = elmo_df
# The notebook only adds sense2_freq to the BERT table, so derive it here when
# the ELMo CSV does not already provide it. elmo_df itself is left unmodified.
# NOTE(review): assumes elmo_df has a "sense2" column like the BERT table -- confirm.
if "sense2_freq" not in elmo_max_samp_data.columns:
    elmo_max_samp_data = elmo_max_samp_data.assign(
        sense2_freq=elmo_max_samp_data["sense2"].apply(get_freq)
    )
plt.xticks(rotation=45)
elmo_max_samp_graph = sns.scatterplot(
    x="sense2_freq", y="best_avg_acc", data=elmo_max_samp_data
)
# Put the frequency axis on a log scale.
elmo_max_samp_graph.set(xscale="log")