In [54]:
from datasets import get_dataset_config_names, load_dataset
import gensim
import gensim.downloader
import gensim.parsing.preprocessing as pre
from gensim.models import KeyedVectors
import polars as pl
import numpy as np
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.metrics.WEAT import WEAT
from wefe.utils import plot_queries_results, run_queries

# When True, the CBOW and skip-gram Word2Vec models are retrained on the WikiText
# training split and their vectors are saved to disk; when False, the previously
# saved "cbow.wordvectors" / "skip.wordvectors" files are loaded instead.
TRAIN_EMBED = False

In [2]:
# List the configuration names published for the "wikitext" dataset.
configs = get_dataset_config_names("wikitext")
print(configs)

['wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', 'wikitext-2-raw-v1']


In [3]:
# Load the non-raw WikiText-103 configuration; its "train" split is the corpus
# used when the Word2Vec models are trained locally.
data = load_dataset("wikitext", "wikitext-103-v1")

In [4]:
# Inspect the training split (feature names and row count).
data["train"]

Dataset({
    features: ['text'],
    num_rows: 1801350
})

In [5]:
# Peek at one training paragraph. The row is indexed first so that only this
# record is read, rather than pulling the entire "text" column to pick one item.
data["train"][5]["text"]

" It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \n"

In [6]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
g_news = gensim.downloader.load('word2vec-google-news-300')

In [7]:
# Load the pretrained "glove-twitter-50" vectors through gensim's downloader API.
twitter = gensim.downloader.load("glove-twitter-50")

In [8]:
def compare_embeddings(wv):
    """Return the nearest neighbours of a fixed set of word-vector expressions.

    Args:
        wv: object supporting ``wv[word]`` lookups and ``wv.most_similar(vector)``
            (the notebook passes gensim keyed vectors).

    Returns:
        A list with one ``wv.most_similar(...)`` result per expression, in this
        order: piano; cherry - michigan + georgia; king - man + woman; zoo + pet;
        game - computer; orchestra - flute + saxophone; dinosaur + fly;
        book + digital.
    """
    # All query vectors are built first; the similarity lookups happen afterwards.
    query_vectors = [
        # Given examples
        wv["piano"],
        wv["cherry"] - wv["michigan"] + wv["georgia"],
        # Classic analogy
        wv["king"] - wv["man"] + wv["woman"],
        wv["zoo"] + wv["pet"],
        wv["game"] - wv["computer"],
        wv["orchestra"] - wv["flute"] + wv["saxophone"],
        wv["dinosaur"] + wv["fly"],
        wv["book"] + wv["digital"],
    ]
    return [wv.most_similar(vector) for vector in query_vectors]

In [9]:
# Number of entries in the "glove-twitter-50" vectors.
len(twitter)

1193514

In [10]:
# Number of entries in the "word2vec-google-news-300" vectors.
len(g_news)

3000000

In [11]:
# Neighbours of the last expression (book + digital) under the Google News vectors.
compare_embeddings(g_news)[-1]

[('digital', 0.7597694396972656),
 ('book', 0.7047815918922424),
 ('books', 0.6316168308258057),
 ('eBook', 0.6188921332359314),
 ('hardcover_paperback', 0.6001059412956238),
 ('ebook', 0.5957868695259094),
 ('downloadable_audiobook', 0.5945138931274414),
 ('ebooks', 0.5858449339866638),
 ('hardbound_edition', 0.5780873894691467),
 ('eBooks', 0.5696583390235901)]

In [12]:
# Either train a Word2Vec model on the WikiText training split and save its
# vectors, or load the vectors saved by an earlier run.
# Note: in the training branch `cbow` is a full Word2Vec model, while in the
# loading branch it is already a KeyedVectors object.
if TRAIN_EMBED:
    # Filters are applied in list order.
    # NOTE(review): strip_tags runs after strip_punctuation, so any angle brackets
    # have presumably been removed already and strip_tags never matches — confirm
    # whether the tag filter was meant to come first.
    preprocess = [pre.strip_multiple_whitespaces, pre.lower_to_unicode, pre.strip_punctuation, pre.strip_tags]
    # Rows whose text has 3 or fewer characters are skipped.
    sentences = [pre.preprocess_string(x["text"], preprocess) for x in data["train"] if len(x["text"]) > 3]
    # No `sg` argument is given, so gensim's default training algorithm is used
    # (CBOW, per the variable name — TODO confirm against the installed gensim).
    cbow = gensim.models.Word2Vec(sentences, vector_size=64, workers=8, epochs=10)
    # Only the word vectors are persisted, not the full trainable model.
    cbow.wv.save("cbow.wordvectors")
else:
    cbow = KeyedVectors.load("cbow.wordvectors")

In [13]:
# Same train-or-load pattern as the CBOW cell, but with sg=1 (skip-gram).
# The training branch reuses `sentences`, which is only defined by the CBOW cell
# when TRAIN_EMBED is True, so that cell must have run first.
if TRAIN_EMBED:
    skip = gensim.models.Word2Vec(sentences, vector_size=64, workers=8, epochs=10, sg=1)
    # Only the word vectors are persisted, not the full trainable model.
    skip.wv.save("skip.wordvectors")
else:
    skip = KeyedVectors.load("skip.wordvectors")

In [14]:
# After training in this session, `skip` and `cbow` are full Word2Vec models;
# keep only their KeyedVectors so they match what the load-from-disk branch gives.
# getattr(..., "wv", obj) makes the cell safe to re-run: an object that is already
# a KeyedVectors (no `.wv` attribute) is left untouched instead of raising.
if TRAIN_EMBED:
    skip = getattr(skip, "wv", skip)
    cbow = getattr(cbow, "wv", cbow)

In [22]:
# Human-readable labels for the expressions evaluated by compare_embeddings,
# listed in the same order as the results it returns.
queries = [
    "piano",
    "cherry - michigan + georgia",
    "king - man + woman",
    "zoo + pet",
    "game - computer",
    "orchestra - flute + saxophone",
    "dinosaur + fly",
    "book + digital",
]

# Embeddings under comparison, keyed by the label used in the result table.
methods = {"cbow": cbow, "skip": skip,
           "twitter": twitter, "g_news": g_news}

# One record per (embedding, query); the neighbour words are joined into a single
# comma-separated string and the similarity scores are dropped.
records = []
for method, wv in methods.items():
    neighbour_lists = compare_embeddings(wv)
    for query_label, neighbours in zip(queries, neighbour_lists):
        nearest = ",".join(word for word, _score in neighbours)
        records.append({"method": method, "query": query_label, "nearest": nearest})

df = pl.from_records(records)
df.head()

method,query,nearest
str,str,str
"""cbow""","""piano""","""piano,violin,c…"
"""cbow""","""cherry - michi…","""ginger,cherry,…"
"""cbow""","""king - man + w…","""queen,king,emp…"
"""cbow""","""zoo + pet""","""pet,zoo,goldfi…"
"""cbow""","""game - compute…","""season,game,ma…"


In [16]:
# Dump the nearest-neighbour table into a LaTeX lstlisting, one paragraph per query.
# The encoding is explicit so non-ASCII neighbour words are written the same way
# on every platform.
with open("data1.tex", "w", encoding="utf-8") as f:
    f.write(r"\begin{lstlisting}")
    f.write("\n")
    # maintain_order=True keeps the queries in first-appearance order, so the
    # generated file does not change from run to run.
    for key, group_df in df.group_by(pl.col("query"), maintain_order=True):
        # Depending on the polars version the group key is a scalar or a 1-tuple;
        # unwrap it so the header is the plain query text in both cases.
        query = key[0] if isinstance(key, tuple) else key
        f.write(f"{query}\n")
        for method, nearest in group_df.select(["method", "nearest"]).rows():
            if len(nearest) > 60:
                # Long lists are wrapped: first five words, then the rest on an
                # indented continuation line.
                words = nearest.split(",")
                first = " ".join(words[0:5])
                second = " ".join(words[5:])
                f.write(f"{method}: {first}\n\t\t{second}\n")
            else:
                # NOTE(review): short lists keep their commas while wrapped lists
                # are space-separated — confirm the mixed formatting is intended.
                f.write(f"{method}: {nearest}\n")
        f.write("\n")
    f.write(r"\end{lstlisting}")

In [24]:
# WEFE example query: gender terms against arts/science attributes
def wefe_example(model):
    """Run the WEAT gender query (female/male terms vs. arts/science words).

    Args:
        model: embedding wrapper handed to ``WEAT.run_query`` (the notebook
            passes ``WordEmbeddingModel`` instances).

    Returns:
        The value returned by ``WEAT.run_query`` for this query, with the
        p-value calculation enabled.
    """
    female_terms = ['she', 'woman', 'girl']
    male_terms = ['he', 'man', 'boy']
    arts_words = ['poetry', 'dance', 'literature']
    science_words = ['math', 'physics', 'chemistry']

    gender_query = Query(
        [female_terms, male_terms],
        [arts_words, science_words],
        ['Female Terms', 'Male Terms'],
        ['Arts', 'Science'],
    )
    # A fresh WEAT metric instance is created for every call.
    return WEAT().run_query(gender_query, model, calculate_p_value=True)

# Score every embedding in `methods` with the gender query, keyed by its label.
gender_bias = {
    name: wefe_example(WordEmbeddingModel(wv, name))
    for name, wv in methods.items()
}

In [25]:
# Show the WEAT result collected for each embedding by the gender query.
gender_bias

{'cbow': {'query_name': 'Female Terms and Male Terms wrt Arts and Science',
  'result': 0.46166253089904785,
  'weat': 0.46166253089904785,
  'effect_size': 1.357256198626603,
  'p_value': 0.038834951456310676},
 'skip': {'query_name': 'Female Terms and Male Terms wrt Arts and Science',
  'result': 0.41074570020039886,
  'weat': 0.41074570020039886,
  'effect_size': 1.3333383112146575,
  'p_value': 0.047156726768377254},
 'twitter': {'query_name': 'Female Terms and Male Terms wrt Arts and Science',
  'result': 0.07661843299865723,
  'weat': 0.07661843299865723,
  'effect_size': 0.806086739803075,
  'p_value': 0.15811373092926492},
 'g_news': {'query_name': 'Female Terms and Male Terms wrt Arts and Science',
  'result': 0.2539586052298546,
  'weat': 0.2539586052298546,
  'effect_size': 1.8524392657902091,
  'p_value': 0.018030513176144243}}

In [53]:
def wefe_example(model):
    """Run the WEAT culture query (Asian/Western terms vs. music/sports words).

    NOTE(review): this reuses the name of the earlier gender-query function and
    therefore replaces it once this cell has run.

    Args:
        model: embedding wrapper handed to ``WEAT.run_query`` (the notebook
            passes ``WordEmbeddingModel`` instances).

    Returns:
        The value returned by ``WEAT.run_query`` for this query, with the
        p-value calculation enabled.
    """
    # Insertion order of these dicts fixes the order of the sets and their names.
    targets = {
        "Asian": ['asian', 'chinese', 'japanese'],
        "Western": ['white', 'european', 'american'],
    }
    attributes = {
        "Music": ["piano", "violin", "prodigy"],
        "Sports": ["sport", "sports", "athlete"],
    }

    culture_query = Query(
        list(targets.values()),
        list(attributes.values()),
        list(targets.keys()),
        list(attributes.keys()),
    )
    return WEAT().run_query(culture_query, model, calculate_p_value=True)

# Score every embedding in `methods` with the culture query, keyed by its label,
# then display the collected results.
culture_bias = {}
for (name, wv) in methods.items():
    culture_bias[name] = wefe_example(WordEmbeddingModel(wv, name))
culture_bias

{'cbow': {'query_name': 'Asian and Western wrt Music and Sports',
  'result': 0.009961791336536518,
  'weat': 0.009961791336536518,
  'effect_size': 0.02770080402656437,
  'p_value': 0.4618585298196949},
 'skip': {'query_name': 'Asian and Western wrt Music and Sports',
  'result': -0.06884972999493277,
  'weat': -0.06884972999493277,
  'effect_size': -0.2100393069555281,
  'p_value': 0.5963938973647711},
 'twitter': {'query_name': 'Asian and Western wrt Music and Sports',
  'result': 0.42101111014684045,
  'weat': 0.42101111014684045,
  'effect_size': 1.4366601251904048,
  'p_value': 0.05131761442441054},
 'g_news': {'query_name': 'Asian and Western wrt Music and Sports',
  'result': 0.23539705574512482,
  'weat': 0.23539705574512482,
  'effect_size': 1.2973573123875393,
  'p_value': 0.08183079056865465}}