In [144]:
import numpy as np
import gensim.downloader as api
from packaging import version

class Embedder:
    def __init__(self, model_name = "glove-wiki-gigaword-200", gensim_version = "4.3.1") -> None:
        self._model = api.load(model_name)
        self._version = gensim_version

    def word_vector(self, word:str) -> np.ndarray:
        embedding = np.zeros(200)
        if version.parse(self._version) >= version.parse("4.0.0"):
            if word in self._model.index_to_key:
                embedding = self._model[word]
                return embedding
        try:
            if word in self._model.vocab:
                embedding = self._model[word]
                return embedding

        except Exception as e:
            print(e)
        
        return embedding

global_embedding = Embedder()


In [7]:
s = "all men work harder than women"
sembed = []

for word in s:
    sembed.append(global_embedding.word_vector(word))


In [145]:
TargetSet1 = ["sister" , "female" , "woman" , "girl" , "daughter" , "she" , "hers" , "her"]
TargetSet2   = ["brother" , "male" , "man" , "boy" , "son" , "he" , "his" , "him"]  

ts1embed = np.array([global_embedding.word_vector(x) for x in TargetSet1])
ts2embed = np.array([global_embedding.word_vector(x) for x in TargetSet2])


In [146]:
testword1 = "sister"
testembed = global_embedding.word_vector(testword1)

cosine_similarity(testembed, ts1_centroid), cosine_similarity(testembed, ts2_centroid)

(0.78610134, 0.5603989)

In [147]:
res1 = np.zeros(200, dtype=np.float32)
res2 = np.zeros(200, dtype=np.float32)

for wv1, wv2 in zip(ts1embed, ts2embed):
    res1 = np.add(wv1, res1)
    res2 = np.add(wv2, res2)

ts1_centroid = res1/len(TargetSet1)
ts2_centroid = res2/len(TargetSet2)

ts2_centroid

array([ 0.01206375, -0.07944419, -0.10408063, -0.31214666, -0.01717401,
       -0.11277359, -0.12706774,  0.108562  ,  0.18310212, -0.0990435 ,
        0.09444749, -0.04034712,  0.30625486,  0.27373976,  0.08894188,
        0.051149  , -0.08148637,  0.26639086, -0.08206573, -0.0391455 ,
        0.53911   ,  2.5930874 ,  0.21234095, -0.06613263, -0.05418612,
       -0.14996573, -0.12196288, -0.47565588,  0.053641  , -0.055419  ,
       -0.22990675, -0.00714725,  0.19922075, -0.06963374,  0.13361025,
       -0.18342724, -0.66888374, -0.149113  , -0.2579364 ,  0.11621888,
       -0.5702888 , -0.18591113, -0.39070973,  0.2900375 , -0.23154894,
        0.10466513,  0.26367775,  0.43243372,  0.13435838,  0.12582338,
        0.13172275,  0.23172626,  0.01978325,  0.18352251,  0.21626629,
       -0.00465563,  0.01120687, -0.10032412,  0.12292199, -0.09141812,
       -0.31018165, -0.35901165, -0.53636247,  0.17737874,  0.07024096,
       -0.22957174,  0.21508287,  0.43140876,  0.12131831,  0.31

In [148]:
import pandas as pd
askmen = pd.read_csv("data/askmen.csv")
askmen_sample = askmen.sample(100)

askredd = pd.read_csv("data/test_data.csv")
askredd_sample = askredd.sample(7000)
askredd_sample

Unnamed: 0,Username,Subreddit,created_utc,Comment
586120,ComablackMM,beyondthebump,1.390153e+09,Glad it went fairly smoothly. I was a bridesma...
752259,SamTarlyLovesMilk,asoiaf,1.390476e+09,We know Mel does use powders to influence peop...
931693,MrN4T3,projectcar,1.390693e+09,No I've just put so much work into it trying t...
1004094,MadHatter69,videos,1.390912e+09,"Damn it, I had plans for today."
970583,razorbeamz,thatHappened,1.390801e+09,Not in Spanish. The official way of writing it...
...,...,...,...,...
643802,dodgermask,nfl,1.390161e+09,Woah didn't know this was a thing. Who let t...
754253,rocmisok,technology,1.390486e+09,A human put on earth by God.
985098,supermario420,truegaming,1.390854e+09,"Yes I most definitely have, I've just played t..."
706062,PM_ME_YOUR_FAVE_SONG,AskReddit,1.390342e+09,The day I found out I can have chicken *and* b...


In [149]:
import nltk
nltk.download("punkt")

def find_adjectives_nouns(sample_text:str):
    sample_text = nltk.word_tokenize(sample_text)
    semantics = nltk.pos_tag(sample_text)
    adjectives_nouns = []
    for word, semantic in semantics:
        if semantic == "JJ" or semantic == "JJR" or semantic == "JJS":
            adjectives_nouns.append(word)


    return adjectives_nouns

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prashant(Gaz)\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [150]:
find_adjectives_nouns(askmen_sample.iloc[10].Comment)

['particular']

In [151]:
import numpy as np

def cosine_similarity(word_vector1, word_vector2):
  # Calculate the dot product of the two word vectors.
  dot_product = np.dot(word_vector1, word_vector2)

  # Calculate the magnitude of the two word vectors.
  magnitude1 = np.linalg.norm(word_vector1)
  magnitude2 = np.linalg.norm(word_vector2)

  # Calculate the cosine similarity.
  cosine_similarity = dot_product / (magnitude1 * magnitude2)

  return cosine_similarity

cos_w_c1 = []
cos_w_c2 = []
adj_nouns = find_adjectives_nouns(askmen_sample.iloc[10].Comment)
for word in adj_nouns:
    word_embed = global_embedding.word_vector(word)
    cos_w_c1.append(cosine_similarity(word_embed, ts1_centroid))
    cos_w_c2.append(cosine_similarity(word_embed, ts2_centroid))


In [14]:
[x-y for x, y in zip(cos_w_c1, cos_w_c2)]

[0.07512390613555908,
 -0.01021432876586914,
 0.1007637083530426,
 0.0019506514072418213,
 5.4836273193359375e-05,
 0.1153692752122879,
 0.09286022186279297,
 0.09708663821220398,
 0.051609158515930176,
 0.03818514943122864,
 -0.017665743827819824,
 0.05924615263938904]

In [15]:
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer

def get_word_sentiment(word):
    synsets = wn.synsets(word)
    if not synsets:
        return 0  # Word not found in WordNet

    sentiment_scores = []
    sid = SentimentIntensityAnalyzer()

    for synset in synsets:
        synset_sentiment = sid.polarity_scores(synset.definition())
        sentiment_scores.append(synset_sentiment['compound'])

    average_sentiment = sum(sentiment_scores) / len(sentiment_scores)
    return average_sentiment

# Example usage
word = "happiest"
sentiment = get_word_sentiment(word)
print(f"Sentiment of '{word}': {sentiment}")


Sentiment of 'happiest': 0.498325


In [16]:
SA = [get_word_sentiment(word) for word in adj_nouns]
SA

[0.10606666666666666,
 0,
 0.08392727272727275,
 0.12405454545454546,
 -0.0006,
 0.2553,
 0.378475,
 0.06171666666666666,
 -0.6039,
 0.10115,
 0.0,
 0.2935555555555555]

In [17]:
sentW = sum(SA)/len(adj_nouns)
sentW 

0.06664547558922561

In [152]:
# CLUSTERING WORDS

import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans

# Word embeddings of the words (200-dimensional)
total_word_embeddings = {}
total_words = []
total_biases = []
# Centroid vectors for the male and female gender groups (200-dimensional)
# centroid_male = np.array([0.1, 0.2, ..., 0.3])
# centroid_female = np.array([0.5, 0.6, ..., 0.7])

for comment in askredd_sample.Comment:
    adj_nouns = find_adjectives_nouns(comment)
    for word in adj_nouns:
        if word in total_word_embeddings : continue

        word_embed = global_embedding.word_vector(word)
        if (word == "beautiful"):
            print(cosine_similarity(word_embed, ts1_centroid))
            print(cosine_similarity(word_embed, ts2_centroid))

        bias = cosine_similarity(word_embed, ts1_centroid) - cosine_similarity(word_embed, ts2_centroid)
        total_words.append(word)
        total_word_embeddings[word] = word_embed
        total_biases.append(bias)

# clf = KMeans(n_clusters=2)
# clf.fit(np.array(total_word_embeddings.values()).reshape(-1, 1))


The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and me

  cosine_similarity = dot_product / (magnitude1 * magnitude2)


The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and me

In [153]:
clf = KMeans(n_clusters=2)
clf.fit(np.array(list(total_word_embeddings.values())).reshape(-1, 1))

KMeans(n_clusters=2)

In [103]:
cl1 = [word for i, word in enumerate(total_words) if clf.labels_[i] == 0]
cl2 = [word for i, word in enumerate(total_words) if clf.labels_[i] == 1]


In [155]:
cl_wbias = [(total_words[i], b) for i, (_, b) in enumerate(zip(clf.labels_ , total_biases))]
cl_wbias.sort(key=lambda x : x[1])
cl_wbias

[('great', -0.15194038),
 ('good', -0.124341875),
 ('late', -0.114944816),
 ('better', -0.107393414),
 ('only', -0.10661352),
 ('able', -0.10522434),
 ('much', -0.1051029),
 ('official', -0.098887116),
 ('illegal', -0.091786996),
 ('big', -0.089235276),
 ('long', -0.08627012),
 ('i', -0.08240569),
 ('least', -0.08076236),
 ('more', -0.07979515),
 ('other', -0.078235626),
 ('elevated', -0.077527985),
 ('many', -0.076260895),
 ('ready', -0.07551125),
 ('live', -0.06057474),
 ('figure', -0.051110536),
 ('old', -0.03286302),
 ('Spanish', nan),
 ('poor', -0.08511603),
 ('public', -0.053845763),
 ('finish', -0.051558346),
 ('insane', -0.046693668),
 ('low', -0.04029584),
 ('additional', -0.03615892),
 ('mad', -0.0357894),
 ('distracted', -0.031624496),
 ('commendable', -0.031567167),
 ('interesting', -0.023363173),
 ('favorite', -0.010839194),
 ('normal', -0.008895695),
 ('horrible', -0.0036470443),
 ('frozen', -0.0031115413),
 ('trivial', 0.005471833),
 ('tagline', 0.010733325),
 ('*', 0.02

In [92]:
words_bias = [(word, bias) for word, bias in zip(total_words, total_biases)]
words_bias

[('other', 0.07823571562767029),
 ('much', 0.10510295629501343),
 ('particular', 0.08912786841392517),
 ('early', 0.0946040153503418),
 ('super', 0.06659221649169922),
 ('trauma', 0.004261001944541931),
 ('Seek', 0),
 ('same', 0.10040491819381714),
 ('douchebag', 0),
 ('most', 0.1046435534954071),
 ('nightshift', -0.10937779396772385),
 ('ok', 0.021766632795333862),
 ('many', 0.07626080513000488),
 ('good', 0.12434187531471252),
 ('best', 0.07849708199501038),
 ('inner', 0.03379160165786743),
 ('shitty', -0.015032857656478882),
 ('own', 0.10582613945007324),
 ('big', 0.08923527598381042),
 ('individualist', -0.023587927222251892),
 ('odd', 0.033532947301864624),
 ('whole', 0.05724853277206421),
 ('ridiculous', 0.018905028700828552),
 ('more', 0.07979512214660645),
 ('sure', 0.0789741575717926),
 ('wrong', 0.07946839928627014),
 ('worse', 0.08775311708450317),
 ('next', 0.08731654286384583),
 ('pretty', -0.014760792255401611),
 ('uncomfortable', -0.034107059240341187),
 ('figure', 0.051

In [93]:
words_bias.sort(key=lambda x : x[1])
words_bias

[('sex-positive', -0.24502624571323395),
 ('feminist', -0.24222198128700256),
 ('ditzy', -0.23421357572078705),
 ('sister', -0.22570228576660156),
 ('sparkly', -0.21725696325302124),
 ('pregnant', -0.21031558513641357),
 ('schoolgirl', -0.21022802591323853),
 ('blonde', -0.20907984673976898),
 ('lingerie', -0.20723314583301544),
 ('slutty', -0.206636443734169),
 ('lesbian', -0.198031485080719),
 ('sexy', -0.19681838154792786),
 ('busty', -0.19507433474063873),
 ('frumpy', -0.1929323822259903),
 ('curvy', -0.18788550794124603),
 ('feminine', -0.18525263667106628),
 ('trashy', -0.18445608764886856),
 ('lovely', -0.18385887145996094),
 ('free-spirited', -0.18340463191270828),
 ('motherly', -0.1830116342753172),
 ('unfeminine', -0.18141299113631248),
 ('girlfriend', -0.1802680492401123),
 ('petite', -0.17757748812437057),
 ('girl-next-door', -0.17656629532575607),
 ('flirty', -0.17647230625152588),
 ('waifish', -0.17562266066670418),
 ('girl', -0.17440158128738403),
 ('20-week', -0.1733605

In [94]:
male_biased = [(word, bias) for word, bias in words_bias if bias < 0]
female_biased = [(word, bias) for word, bias in words_bias if bias > 0]

In [None]:
female_biased

[('strange', 5.4836273193359375e-05),
 ('taller', 6.236135959625244e-05),
 ('late-night', 0.00011915713548660278),
 ('nonexistent', 0.00013811886310577393),
 ('serial', 0.00021600723266601562),
 ('unlucky', 0.0002333521842956543),
 ('respiratory', 0.00026026368141174316),
 ('lifestyle', 0.00026986002922058105),
 ('dry', 0.00032597780227661133),
 ('chill', 0.0003351867198944092),
 ('lick', 0.0004587620496749878),
 ('unemotional', 0.00047761574387550354),
 ('2-3', 0.0005043074488639832),
 ('oppressive', 0.0005068331956863403),
 ('converse', 0.0005839020013809204),
 ('obsolete', 0.0006929337978363037),
 ('semi', 0.0007715523242950439),
 ('damned', 0.0008497089147567749),
 ('communal', 0.0008646473288536072),
 ('behind-the-scenes', 0.0009113065898418427),
 ('relax', 0.0009367167949676514),
 ('sympathetic', 0.0009518563747406006),
 ('random', 0.0009652674198150635),
 ('coolest', 0.001024588942527771),
 ('armpit', 0.001117020845413208),
 ('habitable', 0.0012960880994796753),
 ('synthetic', 0

In [95]:
male_biased.sort(key=lambda x : x[1])
male_biased

[('sex-positive', -0.24502624571323395),
 ('feminist', -0.24222198128700256),
 ('ditzy', -0.23421357572078705),
 ('sister', -0.22570228576660156),
 ('sparkly', -0.21725696325302124),
 ('pregnant', -0.21031558513641357),
 ('schoolgirl', -0.21022802591323853),
 ('blonde', -0.20907984673976898),
 ('lingerie', -0.20723314583301544),
 ('slutty', -0.206636443734169),
 ('lesbian', -0.198031485080719),
 ('sexy', -0.19681838154792786),
 ('busty', -0.19507433474063873),
 ('frumpy', -0.1929323822259903),
 ('curvy', -0.18788550794124603),
 ('feminine', -0.18525263667106628),
 ('trashy', -0.18445608764886856),
 ('lovely', -0.18385887145996094),
 ('free-spirited', -0.18340463191270828),
 ('motherly', -0.1830116342753172),
 ('unfeminine', -0.18141299113631248),
 ('girlfriend', -0.1802680492401123),
 ('petite', -0.17757748812437057),
 ('girl-next-door', -0.17656629532575607),
 ('flirty', -0.17647230625152588),
 ('waifish', -0.17562266066670418),
 ('girl', -0.17440158128738403),
 ('20-week', -0.1733605

In [52]:
def Cluster(biasc1, biasc2, r, repeatk, verbose = True):
	'''
	biasc1 list<words> : List of words biased towards target concept1 as returned by GetTopMostBiasedWords
	biasc2 list<words> : List of words biased towards target concept2 as returned by GetTopMostBiasedWords
	r <int> : reduction factor used to determine k for the kmeans; k = r * len(voc) 
	repeatk <int> : Number of Clustering to perform only to keep the partition with best intrasim
	'''
	def getCosineDistance(embedding1, embedding2): 
		return spatial.distance.cosine(embedding1, embedding2)
	def getIntraSim(partition):
		iS = 0
		for cluster in partition:
			iS += getIntraSimCluster(cluster)
		return iS/len(partition)
	def getIntraSimCluster(cluster):
		if(len(cluster)<=1):
			return 0
		sim = 0; c = 0
		for i in range(len(cluster)):
			w1 = total_word_embeddings[cluster[i]]
			for j in range(i+1, len(cluster)):
				w2 = total_word_embeddings[cluster[j]]
				sim+= 1-getCosineDistance(w1,w2)
				c+=1
		return sim/c
	def createPartition(embeddings, biasw, k):
		preds = KMeans (n_clusters=k).fit_predict(embeddings)
		#first create the proper clusters, then estiamte avg intra sim
		all_clusters = []
		for i in range(0, k):
			clust = []
			indexes = np.where(preds == i)[0]
			for idx in indexes:
				clust.append(biasw[idx])
			all_clusters.append(clust)
		score = getIntraSim(all_clusters)
		return [score, all_clusters]


	k = int(r * (len(biasc1)+len(biasc2))/2)
	emb1, emb2  = [total_word_embeddings[w] for w in biasc1], [total_word_embeddings[w] for w in biasc2]
	mis1, mis2 = [0,[]], [0,[]]	#here we will save partitions with max sim for both target sets
	for run in range(repeatk):
		p1 = createPartition(emb1, biasc1, k)
		if(p1[0] > mis1[0]):
			mis1 = p1
		p2 = createPartition(emb2, biasc2, k)
		if(p2[0] > mis2[0]):
			mis2 = p2
		if(verbose == True):
			print('New partition for ts1, intrasim: ', p1[0])
			print('New partition for ts2, intrasim: ', p2[0])

	print('[*] Intrasim of best partition found for ts1, ', mis1[0])
	print('[*] Intrasim of best partition found for ts2, ', mis2[0])
	return [mis1[1], mis2[1]]
		


In [58]:
int(0.15 * (len(female_biased)+len(male_biased)))/2


2096.5

In [54]:
clust1, clust2 = Cluster(female_biased, male_biased, 0.15, 100)

  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_


New partition for ts1, intrasim:  0.504293893129771
New partition for ts2, intrasim:  0.23043893129770993


  return self.fit(X, sample_weight=sample_weight).labels_


KeyboardInterrupt: 

In [None]:
2/7000

[['unconnected'],
 ['girl', 'girl', 'girl', 'girl', 'girl', 'girl'],
 ['sexuality',
  'sex',
  'sex',
  'sex',
  'sex',
  'sex',
  'sex',
  'sex',
  'sex',
  'sexual'],
 ['unattractive', 'attractiveness'],
 ['*', '*', '*', '*', '*', '*', '*'],
 ['marriage',
  'marriage',
  'marriage',
  'marriage',
  'marriage',
  'marriage',
  'marriage',
  'divorce'],
 ['hair', 'hair', 'hair', 'hair', 'hair'],
 ['http', 'http', 'http'],
 ['okay', 'pretty', 'routine'],
 ['%', '%', '%'],
 ['comfort', 'stranger', 'casual', 'uncommon', 'desperate', 'shy'],
 ['attractive',
  'attractive',
  'attractive',
  'attractive',
  'attractive',
  'attractive',
  'attractive'],
 ['dancer', 'dance', 'dance', 'dance', 'dance', 'dancing'],
 ['housekeeper', 'babysitter', 'co-worker', 'manipulative', 'rep', 'crush'],
 ['ratio', 'ratio'],
 ['subtle', 'subtle', 'hint'],
 ['sister', 'mom', 'mother'],
 ['expectancy', 'obese'],
 ['feminism', 'feminism'],
 ['woman', 'woman', 'woman', 'woman', 'woman', 'woman', 'woman'],
 ['gr