In [108]:
from datasets import get_dataset_config_names, load_dataset
import gensim
import gensim.downloader
import gensim.parsing.preprocessing as pre
from gensim.models import KeyedVectors
from gensim import corpora
import polars as pl
import numpy as np
import pandas as pd
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.metrics.WEAT import WEAT
from wefe.utils import plot_queries_results, run_queries
from wefe.datasets import load_weat
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

TRAIN_EMBED = False

## Train Embeddings

In [2]:
configs = get_dataset_config_names("wikitext")
print(configs)

['wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', 'wikitext-2-raw-v1']


In [3]:
data = load_dataset("wikitext", "wikitext-103-v1")

In [4]:
data["train"]

Dataset({
    features: ['text'],
    num_rows: 1801350
})

In [5]:
data["train"]["text"][5]

" It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \n"

In [6]:
g_news = gensim.downloader.load('word2vec-google-news-300')

In [7]:
twitter = gensim.downloader.load("glove-twitter-50")

In [8]:
def compare_embeddings(wv):
    # Given examples
    output = []
    x0 = wv["piano"]
    output.append(x0)
    x1 = wv["cherry"] - wv["michigan"] + wv["georgia"]
    output.append(x1)
    # Classic
    x2 = wv["king"] - wv["man"] + wv["woman"]
    output.append(x2)
    x3 = wv["zoo"] + wv["pet"]
    output.append(x3)
    x4 = wv["game"] - wv["computer"]
    output.append(x4)
    x5 = wv["orchestra"] - wv["flute"] + wv["saxophone"]
    output.append(x5)
    x6 = wv["dinosaur"] + wv["fly"]
    output.append(x6)
    x7 = wv["book"] + wv["digital"]
    output.append(x7)
    output = [wv.most_similar(x) for x in output]
    return output

In [9]:
len(twitter)

1193514

In [10]:
len(g_news)

3000000

In [11]:
compare_embeddings(g_news)[-1]

[('digital', 0.7597694396972656),
 ('book', 0.7047815918922424),
 ('books', 0.6316168308258057),
 ('eBook', 0.6188921332359314),
 ('hardcover_paperback', 0.6001059412956238),
 ('ebook', 0.5957868695259094),
 ('downloadable_audiobook', 0.5945138931274414),
 ('ebooks', 0.5858449339866638),
 ('hardbound_edition', 0.5780873894691467),
 ('eBooks', 0.5696583390235901)]

In [140]:
preprocess = [pre.strip_multiple_whitespaces, pre.lower_to_unicode, pre.strip_punctuation, pre.strip_tags]
if TRAIN_EMBED:
    sentences = [pre.preprocess_string(x["text"], preprocess) for x in data["train"] if len(x["text"]) > 3]
    cbow = gensim.models.Word2Vec(sentences, vector_size=64, workers=8, epochs=10)
    cbow.wv.save("cbow.wordvectors")
else:
    cbow = KeyedVectors.load("cbow.wordvectors")

In [13]:
if TRAIN_EMBED:
    skip = gensim.models.Word2Vec(sentences, vector_size=64, workers=8, epochs=10, sg=1)
    skip.wv.save("skip.wordvectors")
else:
    skip = KeyedVectors.load("skip.wordvectors")

In [14]:
if TRAIN_EMBED:
    skip = skip.wv
    cbow = cbow.wv    

In [15]:
queries = ["piano", "cherry - michigan + georgia", "king - man + woman", "zoo + pet", "game - computer", "orchestra - flute + saxophone", 
           "dinosaur + fly", "book + digital"]
records = []
# emb = compare_embeddings(skip.wv)
methods = {"cbow": cbow, "skip": skip, 
           "twitter": twitter, "g_news": g_news}

for method, wv in methods.items():
    emb = compare_embeddings(wv)
    emb = [",".join([x[0] for x in list]) for list in emb]
    for k, v in zip(queries, emb):
        record = {"method": method, "query": k, "nearest": v}
        records.append(record)

df = pl.from_records(records)
df.head()

method,query,nearest
str,str,str
"""cbow""","""piano""","""piano,violin,c…"
"""cbow""","""cherry - michi…","""ginger,cherry,…"
"""cbow""","""king - man + w…","""queen,king,emp…"
"""cbow""","""zoo + pet""","""pet,zoo,goldfi…"
"""cbow""","""game - compute…","""season,game,ma…"


In [16]:
with open("data1.tex", "w") as f:
    f.write(r"\begin{lstlisting}")
    f.write("\n")
    for group in df.group_by(pl.col("query")):
        query = group[0]
        f.write(f"{query}\n")
        for row in group[1].select(["method", "nearest"]).rows():
            if len(row[1]) > 60:
                first = " ".join(row[1].split(",")[0:5])
                second = " ".join(row[1].split(",")[5:])
                f.write(f"{row[0]}: {first}\n\t\t{second}\n")
            else:
                f.write(f"{row[0]}: {row[1]}\n")
        f.write("\n")
    f.write(r"\end{lstlisting}")

In [50]:
def weat_culture_query():
    target_sets = [['asian', 'Chinese', 'Japanese'], ['white', 'European', 'American']]
    target_sets_names = ["Asian", "Western"]
    attribute_sets = [["piano", "violin", "prodigy"], ["sport", "sports", "athlete"]]
    attribute_sets_names = ["Music", "Sports"]
    query = Query(target_sets, attribute_sets, target_sets_names,
                attribute_sets_names)
    return query

### WEAT experiments

In [64]:
# WEAT example experiments + two custom experiments
weat_wordset = load_weat()

# Define the 10 Queries:
queries = [
    # Flowers vs Insects wrt Pleasant (5) and Unpleasant (5)
    Query([weat_wordset['flowers'], weat_wordset['insects']],
          [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['Flowers', 'Insects'], ['Pleasant(5)', 'Unpleasant(5)']),

    # Instruments vs Weapons wrt Pleasant (5) and Unpleasant (5)
    Query([weat_wordset['instruments'], weat_wordset['weapons']],
          [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['Instruments', 'Weapons'], ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(5) vs African american names(5)
    # wrt Pleasant (5) and Unpleasant (5)
    Query([
        weat_wordset['european_american_names_5'],
        weat_wordset['african_american_names_5']
    ], [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['European american names(5)', 'African american names(5)'],
          ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(7) vs African american names(7)
    # wrt Pleasant (5) and Unpleasant (5)
    Query([
        weat_wordset['european_american_names_7'],
        weat_wordset['african_american_names_7']
    ], [weat_wordset['pleasant_5'], weat_wordset['unpleasant_5']],
          ['European american names(7)', 'African american names(7)'],
          ['Pleasant(5)', 'Unpleasant(5)']),

    # European american names(7) vs African american names(7)
    # wrt Pleasant (9) and Unpleasant (9)
    Query([
        weat_wordset['european_american_names_7'],
        weat_wordset['african_american_names_7']
    ], [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
          ['European american names(7)', 'African american names(7)'],
          ['Pleasant(9)', 'Unpleasant(9)']),

    # Male and female names wrt Career and family
    Query([weat_wordset['male_names'], weat_wordset['female_names']],
          [weat_wordset['career'], weat_wordset['family']],
          ['Male names', 'Female names'], ['Career', 'Family']),

    # Math and arts wrt male and female terms
    Query([weat_wordset['math'], weat_wordset['arts']],
          [weat_wordset['male_terms'], weat_wordset['female_terms']],
          ['Math', 'Arts'], ['Male terms', 'Female terms']),

    # Science and arts wrt male and female terms
    Query([weat_wordset['science'], weat_wordset['arts_2']],
          [weat_wordset['male_terms'], weat_wordset['female_terms']],
          ['Science', 'Arts 2'], ['Male terms', 'Female terms']),

    # Mental and Physical disease wrt Temporary and Permanent
    Query([weat_wordset['mental_disease'], weat_wordset['physical_disease']],
          [weat_wordset['temporary'], weat_wordset['permanent']],
          ['Mental disease', 'Physical disease'], ['Temporary', 'Permanent']),

    # Young people names and Old people names disease wrt Pleasant(9) and Unpleasant(9)
    Query(
        [weat_wordset['young_people_names'], weat_wordset['old_people_names']],
        [weat_wordset['pleasant_9'], weat_wordset['unpleasant_9']],
        ['Young names', 'Old names'],
        ['Pleasant(9)', 'Unpleasant(9)']), weat_culture_query(),
      Query([weat_wordset["male_names"], weat_wordset["female_names"]], 
            [["rpg", "tabletop", "card"], ["knit", "yarn", "crochet"]],
            ["Male", "Female"], ["Tabletop Games", "Textiles"])
]

models = [WordEmbeddingModel(wv, name) for name, wv in methods.items()]

In [65]:
metric_params = {
    "preprocessors": [{}, {"lowercase": True}],
}
wefe_results = run_queries(WEAT,
                      queries,
                      models,
                      metric_params=metric_params, warn_not_found_words=False).T.round(2)

# joined_results = pd.concat([wefe_results, original_results], axis=1)
# joined_results = joined_results.iloc[:, [2,0,3,1]] # reorder the columns
wefe_results

model_name,cbow,skip,twitter,g_news
query_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Flowers and Insects wrt Pleasant(5) and Unpleasant(5),2.77,2.27,3.06,1.41
Instruments and Weapons wrt Pleasant(5) and Unpleasant(5),4.63,3.48,2.61,1.75
European american names(5) and African american names(5) wrt Pleasant(5) and Unpleasant(5),1.26,0.98,2.56,0.45
European american names(7) and African american names(7) wrt Pleasant(5) and Unpleasant(5),0.36,0.3,1.43,0.45
European american names(7) and African american names(7) wrt Pleasant(9) and Unpleasant(9),0.37,0.25,1.18,0.34
Male names and Female names wrt Career and Family,2.67,2.06,0.74,1.25
Math and Arts wrt Male terms and Female terms,0.97,0.92,-0.21,0.23
Science and Arts 2 wrt Male terms and Female terms,0.7,0.38,-0.18,0.29
Mental disease and Physical disease wrt Temporary and Permanent,0.33,0.54,0.36,0.34
Young names and Old names wrt Pleasant(9) and Unpleasant(9),-0.32,-0.23,0.35,-0.05


In [66]:
wefe_results.to_latex("table2.tex", index=True)

  wefe_results.to_latex("table2.tex", index=True)


In [67]:
for col in methods:
    print(f"{col}: {wefe_results[col].abs().mean():.2f}")

cbow: 1.30
skip: 1.02
twitter: 1.22
g_news: 0.62


In [86]:
plot_queries_results(wefe_results)

## Classification

In [168]:
imdb = load_dataset("imdb")

In [147]:
train_clean = [pre.preprocess_string(x["text"], preprocess) for x in imdb["train"]]
test_clean = [pre.preprocess_string(x["text"], preprocess) for x in imdb["test"]]

In [159]:
len(imdb["train"]["text"])

25000

In [163]:
vectorizer = TfidfVectorizer(input="content")
train_x_one_hot = vectorizer.fit_transform(imdb["train"]["text"])
test_x_one_hot = vectorizer.transform(imdb["test"]["text"])

In [165]:
def eval_classifier(x_train, y_train, x_test, y_test):
    model = sklearn.linear_model.LogisticRegression(multi_class='multinomial', max_iter=500)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    print(y_test.shape)
    print(pred.shape)
    f1 = sklearn.metrics.precision_recall_fscore_support(y_test, pred, average="weighted")
    conf = sklearn.metrics.confusion_matrix(y_test, pred)
    # f1 = sklearn.metrics.f1_score(y_test, pred, average="micro")
    acc = sklearn.metrics.accuracy_score(y_test, pred)
    # return acc, f1
    return acc, f1, conf
    # return sklearn.metrics.classification_report(y_test, pred, labels=class_names)

eval_classifier(train_x_one_hot, np.array(imdb["train"]["label"]), test_x_one_hot, np.array(imdb["test"]["label"]))

(25000,)
(25000,)


(0.8868,
 (0.8868344722124228, 0.8868, 0.8867974780289344, None),
 array([[11144,  1356],
        [ 1474, 11026]]))

In [154]:
def compute_embeddings(sentences, lookup):
    out = []
    for sentence in sentences:
        avg = np.mean([lookup[x] for x in sentence if x in lookup], axis=0)
        out.append(avg)
    return out

train_x_emb = compute_embeddings(train_clean, skip)
test_x_emb = compute_embeddings(test_clean, skip)
eval_classifier(train_x_emb, np.array(imdb["train"]["label"]), test_x_emb, np.array(imdb["test"]["label"]))

(25000,)
(25000,)


(0.76152,
 (0.7619733606189527, 0.76152, 0.7614167793554204, None),
 array([[9779, 2721],
        [3241, 9259]]))

In [169]:
poem = load_dataset("poem_sentiment")

In [185]:
poem["train"]["verse_text"][0:2]

['with pale blue berries. in these peaceful shades--',
 'it flows so long as falls the rain,']

In [180]:
train_clean = [pre.preprocess_string(x["verse_text"], preprocess) for x in poem["train"]]
test_clean = [pre.preprocess_string(x["verse_text"], preprocess) for x in poem["test"]]

In [176]:
vectorizer = TfidfVectorizer(input="content")
train_x_one_hot = vectorizer.fit_transform(poem["train"]["verse_text"])
test_x_one_hot = vectorizer.transform(poem["test"]["verse_text"])

In [177]:
eval_classifier(train_x_one_hot, np.array(poem["train"]["label"]), test_x_one_hot, np.array(poem["test"]["label"]))

(104,)
(104,)


(0.6634615384615384,
 (0.44881221719457015, 0.6634615384615384, 0.535425101214575, None),
 array([[ 0,  1, 18],
        [ 1,  0, 15],
        [ 0,  0, 69]]))

In [184]:
train_x_emb = compute_embeddings(train_clean, skip)
test_x_emb = compute_embeddings(test_clean, skip)
eval_classifier(train_x_emb, np.array(poem["train"]["label"]), test_x_emb, np.array(poem["test"]["label"]))

(104,)
(104,)


(0.7596153846153846,
 (0.8086538461538461, 0.7596153846153846, 0.7080228200917856, None),
 array([[ 9,  0, 10],
        [ 0,  2, 14],
        [ 1,  0, 68]]))