# Form2Meaning Mapping

* Author: Stefano Scola
* Supervisor: Giovanni Cassani
* University: Tilburg University

The purpose of this notebook is to give users a clear and easy-to-use approach for generating aligned word embeddings (Word2Vec) using the Compass Aligned Distributional Embeddings (CADE) library. Moreover, it allows users to generate form-based semantic vectors from the names of fictional characters by deploying two form-meaning mapping functions: Orthographic Semantic Consistency (OSC) & Linear Discriminative Learning (LDL).



# Word2Vec HyperParameters and Imports libraries


---



In [None]:
# Word2Vec hyperparameters shared by the compass model and the CADE aligner.
# Passed as `min_count` to Word2Vec / CADE.
min_count = 5
# Passed as `size` to Word2Vec / CADE (dimensionality of the embeddings).
size = 50
# Passed as `sg`: 1 = skip-gram, 0 = CBOW.
model_type = 1
# Passed as `window` to Word2Vec / CADE.
window = 5

In [None]:
# Mount Google Drive inside the Colab runtime; PATH below is resolved relative to this mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)



Mounted at /content/drive


In [None]:
# Root folder of the project on Drive; every input and output path in this notebook is built from it.
PATH = "drive/MyDrive/word_embeddings/ch_ya/"

In [None]:
!pip install cade

In [None]:
# Imports

# Plaintext2Emb
import nltk
import string
import gensim
import spacy
import pickle as p
import numpy as np

from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer

# Visualising the wordspace
from sklearn.decomposition import PCA
from matplotlib import pyplot

import pandas as pd

# Loading the model
from scipy import stats, spatial
from sklearn import preprocessing
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt

# CADE
import os
import pickle
#from cade.cade import CADE
from sklearn.manifold import TSNE
from gensim.models.word2vec import Word2Vec

# Visualisation
import plotly.express as px


# Classifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Importing data
Our approach takes two types of files. First, we need a labeled list of characters extracted from the corpora we wish to investigate. To provide it, create one or more .csv/.xlsx files in the labeled_names folder with the following structure:

* title: the name of the file which contains the corpus where the name appears
* full_name: the name of the character
* newID: how the name is referenced in the corpus
* gender: male/female
* age_recoded: young/old
* freq_sum: the cumulative frequency
* class: Real/Talking/Madeup

The second type of file we need is the corpus in which each name appears; these files should be placed in the corpus folder.

## Names

In [None]:
# Load the labeled character list.
complete_df = pd.read_csv(PATH + "labeled_names/total_ChYA_names.csv")
# Drop columns that are not used anywhere below.
complete_df.drop(columns=["id", "age.stage.original", "name_age"], inplace=True)

# Manual corrections of individual `full_name` entries, addressed by row label.
complete_df.loc[ 28, "full_name"] = "Sprout"
complete_df.loc[ 23, "full_name"] = "Silky"
# NOTE(review): row 24 is assigned twice, so "Myrtle" is immediately overwritten by
# "Virginia". One of the two lines probably targets a different row - confirm the index.
complete_df.loc[ 24, "full_name"] = "Myrtle"
complete_df.loc[ 24, "full_name"] = "Virginia"

In [None]:
# Display the first 80 rows of the labeled character list.
complete_df.head(80)

Unnamed: 0,title,full_name,gender,rs_frequency,author,newID,age_recoded,freq_sum,class
0,WILSON_dustbinbaby_2001,April Johnson,female,1525,WILSON,April_Johnson_WILSON_char,young,3122,Talking
1,ALMOND_jackdawsummer_2008,Ball,male,95,ALMOND,Ball_ALMOND_char,old,95,Talking
2,GAVIN_blackberryblue_2013,Blackberry Blue,female,17,GAVIN,Blackberry_Blue_GAVIN_char,young,249,Talking
3,FINE_charmschool_1999,Bonny Bramble,female,1387,FINE,Bonny_Bramble_FINE_char,young,1393,Talking
4,MURPHY_firstprizefortheworstwitch_2018,Brilliantine,female,143,MURPHY,Brilliantine_MURPHY_char,old,279,Talking
...,...,...,...,...,...,...,...,...,...
75,FINE_uponcloudnine_2002,Stuart Terence Oliver,male,20,FINE,Stuart_Terence_Oliver_FINE_char,young,1782,Real
76,FINE_troubleintoadpool_2012,Susan Harlow,female,533,FINE,Susan_Harlow_FINE_char,old,533,Real
77,ROWLING_harrypotterandthedeathlyhallows_2007,Sybill Trelawney,female,6,ROWLING,Sybill_Trelawney_ROWLING_char,old,922,Real
78,FINE_eatingthingsonsticks_2009,Tristram,male,1392,FINE,Tristram_FINE_char,old,2266,Real


## Corpus

The corpus consists of 94 different books. 

Let's check if the names we wish to investigate appear in those books.

In [None]:

# Check, for every labeled character, that its corpus ID (`newID`) occurs at least
# once in the recoded story it is attributed to. Characters whose ID is never found
# are reported in detail and counted in `issues`.
issues = 0
for i in range(len(complete_df)):

    # Read the recoded story of this character; `with` guarantees the handle is closed.
    corpus = complete_df.loc[i].title
    file_path = PATH + 'final_recoded_corpus/' + corpus + '_recoded.txt'
    with open(file_path, "r") as story_file:
        txt = story_file.read()

    name = complete_df['full_name'].loc[i]
    # Named `char_id` rather than `id` so the Python builtin is not shadowed.
    char_id = complete_df["newID"].loc[i]

    # Number of whitespace-separated tokens that match the ID exactly.
    count_check = txt.split().count(char_id)
    # The matched tag equals the ID itself when found, and stays empty otherwise.
    name_tag = char_id if count_check else ''

    if count_check == 0:
      issues += 1
      print('\nName to look up:', name)
      print('\nStory file: ', corpus)
      print('Preview: ', txt[:500])
      print("\nID to check:", char_id)

      print('Tag: ', name_tag)
      print('\nExpected name count: ', complete_df['rs_frequency'].loc[i])
      print('Resulting name count: ', count_check)
      print('\nCumulative freq count: ', complete_df['freq_sum'].loc[i])
      print("index in the dataframe:", i)

print("\nNumber of issues", issues)

# Cleaning corpus

The corpus is cleaned and saved in the folder "cleaned_corpus". 

We have lowercased, tokenized and removed stopwords.

In [None]:
# Download the NLTK resources requested here ('punkt' and 'stopwords') before cleaning the corpus.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Function to clean and prepare the corpus for word2vec

def clean_text(test_text, preview=False):
    """Turn a raw story into a list of cleaned, lemmatized token lists (one per sentence).

    Steps: sentence splitting, lowercasing + regex tokenization, spaCy lemmatization,
    and stop-word removal (gendered pronouns are deliberately kept).

    test_text: the full text of one story as a single string
    preview: if True, print samples of every intermediate stage

    returns: list of sentences, each sentence being a list of token strings
    """
    # Separating the sentences of the corpus
    nltk_tokens = nltk.sent_tokenize(test_text)

    # Split each sentence into word lists; lowercasing avoids case doubles
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9_\.]+\b')
    split_sent = [tokenizer.tokenize(sent.lower()) for sent in nltk_tokens]

    # Lemmatization with spaCy. Loading the model is expensive and this function is
    # called once per book, so the pipeline is cached on the function after first use.
    nlp = getattr(clean_text, "_nlp", None)
    if nlp is None:
        nlp = spacy.load('en')
        clean_text._nlp = nlp

    lemma_sent = []
    for sent in split_sent:
        doc = nlp(" ".join(sent))
        # spaCy reports pronouns with the placeholder lemma '-PRON-'; keep the surface
        # form for those. (The original had an extra `_char` branch that appended
        # exactly the same value as this fallback, so it is folded in here.)
        lemma_sent.append([
            token.lemma_ if token.lemma_ != '-PRON-' else str(token)
            for token in doc
        ])

    # Removing stopwords, but DO NOT REMOVE GENDERED PRONOUNS (him, her, ...)
    kept_pronouns = {'him', 'her', 'hers', 'his', 'he', 'she'}
    stop_words = set(stopwords.words('english')) - kept_pronouns
    no_stop = [
        [word for word in sent if word not in stop_words]
        for sent in lemma_sent
    ]

    # Print previews
    if preview:
        print('-'*40 + ' Original file samples ' + '-'*40)
        print(test_text[:500])
        print('-'*40 + ' Sentence samples ' + '-'*40)
        for sent in nltk_tokens[:35]: print(sent)
        print('-'*35 + ' No punctuation sentence samples ' + '-'*35)
        for sent in split_sent[:35]: print(sent)
        print('-'*40 + ' Lemma-sentence samples ' + '-'*40)
        for sent in lemma_sent[:35]: print(sent)
        print('-'*35 + ' No stop words sentence samples ' + '-'*35)
        for sent in no_stop[:35]: print(sent)

    return no_stop

In [None]:
# Clean all stories and save them in a pickle format
# Recoded stories are named '<title>_recoded.txt'; the suffix is stripped to name the output.
RECODED_SUFFIX = '_recoded.txt'

file_list = os.listdir(PATH + 'final_recoded_corpus')
print(len(file_list))
for idx, file in enumerate(file_list):
    print(file)
    print(idx)

    # Anything that does not follow the naming scheme would get a mangled output
    # name from the suffix slice, so it is skipped explicitly.
    if not file.endswith(RECODED_SUFFIX):
        print('skipping (not a recoded story): ' + file)
        continue

    # Opening file (the handle is closed as soon as the text is read)
    with open(PATH + 'final_recoded_corpus/' + file, "r") as story_file:
        txt = story_file.read()

    # Clean before opening the output so a failure does not leave an empty pickle behind
    cleaned_sentences = clean_text(txt)

    # Saving in pickle to keep list structure
    output_path = PATH + 'cleaned_corpus/' + file[:-len(RECODED_SUFFIX)] + ".p"
    with open(output_path, "wb") as pickle_file:
        p.dump(cleaned_sentences, pickle_file)

# Compass

We create the compass, that is to say a concatenation of all the books. Compass file is saved in the folder "CADE".

We use the compass to train word2vec in order to get a general semantic space.

In [None]:
# Build the compass: every cleaned story is written out as a plain-text training
# file for CADE, and all stories are concatenated into one compass file.
compass_parts = []
clean_compass = []
file_list = os.listdir(PATH + 'cleaned_corpus')

for idx, file in enumerate(file_list):
    print("currently processing " + file)
    print(idx)

    # Load the pickled sentence lists written by the cleaning step.
    # NOTE: pickle must only be used on files this notebook produced itself.
    with open(PATH + 'cleaned_corpus/' + file, "rb") as pickled_story:
        sentence_list = p.load(pickled_story)

    # Joining all sentences to make a single plaintext string for this story
    story_text = " ".join(" ".join(sentence) for sentence in sentence_list)
    clean_compass.append(sentence_list)

    # Saving individual stories (file[:-2] strips the '.p' extension)
    with open(PATH + "CADE/training/" + file[:-2] + ".txt", "w") as text_file:
        text_file.write(story_text)

    # Adding corpus to compass
    compass_parts.append(story_text)

# Stories are joined with a space. Plain `+=` concatenation glued the last token of
# one book to the first token of the next, creating a spurious token per boundary.
compass = " ".join(compass_parts)

# Saving the Compass
with open(PATH + "CADE/compass.txt", "w") as text_file:
    text_file.write(compass)

In [None]:
# Flatten the per-book sentence lists into one list of sentences for Word2Vec.
train = [sentence for book in clean_compass for sentence in book]

len(train)

284537

## Word2Vec training on Compass

The model is saved either in the folder "skipgram" or "CBOW".

In [None]:
# Train Word2Vec on the whole compass and save it under the architecture's folder.

# Output folder follows the architecture flag: sg = 1 -> skipgram, sg = 0 -> CBOW
model_folder = 'skipgram' if model_type else 'CBOW'

# train model
model = Word2Vec(
    train,
    min_count=min_count,
    window=window,
    size=size,
    alpha=0.01,
    sg=model_type,
    negative=10,
)

# summarize the trained model
print(model)

# summarize vocabulary
words = list(model.wv.vocab)
print("\nVocabulary (first 50): \n", words[:50])

# save model
model.save(f"{PATH}{model_folder}/model_{min_count}_{size}.bin")

Word2Vec(vocab=16822, size=50, alpha=0.01)

Vocabulary (first 50): 
 ['together', 'year', 'call', 'bad', 'lad', 'joke', 'mischief', 'maker', 'pest', 'never', 'cause', 'proper', 'trouble', 'least', 'till', 'autumn', 'round', 'time', 'turn', '13', 'klaus_vogel_almond_char', 'come', 'regular', 'tonto', 'court', 'dan', 'digby', 'spark', 'twins', 'fred', 'frank', 'fill', 'felling', 'go', 'st', 'john', 'joe_gillespie_almond_char', 'old', 'rest', 'us', 'keep', 'bit', 'apart', 'leader', 'great', 'long', 'curl', 'collar', 'wear', 'fade']



This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



## Inspecting the vocab and loading the model

We can inspect the vocabulary of the model that we have trained on the compass.

Moreover, we load the model (twice). We will need it for the form-meaning mapping functions.





In [None]:
# Load the compass model twice: `compass_model` stays untouched, while
# `clean_compass_model` is trimmed later for the LDL mapping.
# The path is derived from the hyperparameters at the top of the notebook, so a change
# of min_count / size / model_type cannot silently load a stale model. With the
# defaults it resolves to "skipgram/model_5_50.bin", the previously hard-coded file.
compass_model_path = (PATH + ('skipgram' if model_type else 'CBOW')
                      + '/model_' + str(min_count) + '_' + str(size) + '.bin')

clean_compass_model = Word2Vec.load(compass_model_path)
compass_model = Word2Vec.load(compass_model_path)

# Vocabulary of the compass model, in the iteration order of the vocab dict
words = list(compass_model.wv.vocab)


## MEN benchmark on Compass
SpearmanrResults:
1.   correlation=0.4004791629689339
2.   pvalue=1.2052347801339114e-96



In [None]:
# benchmark dataframe
spear_df = pd.DataFrame(columns=(['model', 'vocab_size', 'spearmanr_corr', 'spearmanr_p']))

# Importing MEN dataset: one "word1-pos word2-pos score" entry per line.
# `with` closes the file handle once the content has been read.
with open(PATH + "MEN/MEN_dataset_lemma_form_full", "r") as men_file:
    dataset = men_file.read().split('\n')

In [None]:
# Preview only the first entries; printing the whole list floods the notebook output.
print(dataset[:10])

['sun-n sunlight-n 50.000000', 'automobile-n car-n 50.000000', 'river-n water-n 49.000000', 'stair-n staircase-n 49.000000', 'morning-n sunrise-n 49.000000', 'rain-n storm-n 49.000000', 'cat-n kitten-n 49.000000', 'dance-n dancer-n 49.000000', 'camera-n photography-n 49.000000', 'cat-n feline-j 48.000000', 'sunny-j sunshine-n 48.000000', 'pregnancy-n pregnant-j 48.000000', 'beach-n sand-n 48.000000', 'bakery-n bread-n 48.000000', 'flower-n garden-n 48.000000', 'grass-n lawn-n 48.000000', 'copper-n metal-n 48.000000', 'photo-n photography-n 47.000000', 'cemetery-n graveyard-n 47.000000', 'gravestone-n graveyard-n 47.000000', 'sun-n sunshine-n 47.000000', 'black-j dark-j 47.000000', 'cathedral-n church-n 47.000000', 'frozen-j ice-n 47.000000', 'station-n subway-n 47.000000', 'child-n kid-n 46.000000', 'aquarium-n fish-n 46.000000', 'light-n lighting-n 46.000000', 'fungus-n mushroom-n 46.000000', 'frost-n snow-n 46.000000', 'burn-v flame-n 46.000000', 'ocean-n sea-n 46.000000', 'candy-n c

In [None]:
# Build the MEN comparison frame: one row per MEN pair for which the model can
# produce a cosine similarity. Rows are collected in a list and turned into a
# DataFrame once, instead of growing the frame with `append` on every iteration.
men_columns = ['word_1', '1_in_vocab',
               'word_2', '2_in_vocab',
               'MEN_score', 'EMB_cos']
men_records = []

for line in dataset:
    temp_line = line.split()
    # Skip blank or truncated lines (e.g. the empty string after the final newline)
    if len(temp_line) < 3:
        continue

    # Entries look like "sun-n sunlight-n 50.000000": strip the POS suffix
    first_word = temp_line[0].split('-')[0]
    second_word = temp_line[1].split('-')[0]

    try:
        emb_cos = model.wv.similarity(first_word, second_word)
    except KeyError:
        # At least one word is not in the model vocabulary: the pair cannot be scored
        continue

    men_records.append({'word_1': first_word,
                        '1_in_vocab': first_word in model.wv.vocab,
                        'word_2': second_word,
                        '2_in_vocab': second_word in model.wv.vocab,
                        'MEN_score': float(temp_line[2]),
                        'EMB_cos': emb_cos})

df = pd.DataFrame(men_records, columns=men_columns)

In [None]:
# Summary of the MEN comparison frame.
print('MEN rows: ', df.size/len(df.columns))
print('Dictionary size: ', len(model.wv.vocab))
# Count rows flagged as out-of-vocabulary. `.size` on the filtered frame counted
# cells (rows x columns), which overstated the number of missing words.
# NOTE(review): pairs whose similarity lookup fails are skipped when the frame is
# built, so this count is expected to stay at 0 - confirm that is intended.
missing_first = (df['1_in_vocab'] == False).sum()
missing_second = (df['2_in_vocab'] == False).sum()
print('Total missing words: ', int(missing_first + missing_second))

MEN rows:  2682.0
Dictionary size:  16822
Total missing words:  0


In [None]:
# Rescale the MEN ratings to [0, 1] and correlate them with the model's cosine similarities.

men_scores = df.MEN_score.values.reshape(-1, 1)
scaled_scores = preprocessing.MinMaxScaler().fit_transform(men_scores)
df['norm_MEN_score'] = pd.DataFrame(scaled_scores)

# Rank correlation between human ratings and embedding similarities
print(stats.spearmanr(df.norm_MEN_score, df.EMB_cos))

SpearmanrResult(correlation=0.40683729980717315, pvalue=1.8937703629283692e-107)


# Model(s) with CADE

We use CADE to generate **aligned** word embeddings (comparable even if coming from different sources).

CADE uses the Compass to generate a DSM for each story.

CADE saves the model files in the folder "CADE/models".


In [None]:

# The CADE import is commented out in the notebook's import cell, so on a fresh
# kernel `CADE` would be undefined here. Import it where it is first used.
from cade.cade import CADE

# Aligner configured with the same hyperparameters as the compass Word2Vec model;
# trained models are written below CADE/models/.
aligner = CADE(size=size,
               sg=model_type,
               min_count=min_count,
               window=window,
               opath=PATH + 'CADE/models/model_'+str(min_count)+'_'+str(size))

# train on the compass: the text should be the concatenation of the text from the slices
aligner.train_compass(PATH + "CADE/compass.txt", overwrite=False) # keep an eye on the overwrite behaviour



Training the compass from scratch.



This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function



In [None]:
# With the compass trained, fit one aligned model per story file in the training folder.
path = PATH + 'CADE/training'
folder = os.listdir(path)
print(sorted(folder))

# Only files whose name starts with an uppercase letter are treated as stories.
slice_list = [
    aligner.train_slice(path + '/' + story, save=True)
    for story in sorted(folder)
    if story[0].isupper()
]

print(f'Number of models trained: {len(slice_list)}')

# Word-Form Matrix
 
In order to deploy the form-meaning mapping functions we need to vectorize the orthographic form of the names we want to investigate. 

To this end, we use letter n-grams to featurize the name of the characters.


## n-grams featurization Class

In [None]:
class FormMatrix:
    """
    Vectorizes a list of words with letter n-gram counts.

    word_list : a list of words
    n: the size of the letter n-grams (1 = single letters, 2 = bigrams, ...)
    map_dict: optional existing mapping from n-gram to column index; when given,
              the words are encoded in that feature space and unknown n-grams are ignored

    Attributes:
    form_matrix: 2d NumPy array, one row per word, one column per n-gram in the mapping
    map_dict: the n-gram -> column index mapping used for form_matrix
    idx2word / word2idx: row index <-> word lookups
    ngram_size: the n-gram size n
    """

    def __init__(self, word_list, n, map_dict=None):
        self.word_list = word_list
        self.ngram_size = n
        self.idx2word = {i: w for i, w in enumerate(self.word_list)}
        self.word2idx = {w: i for i, w in enumerate(self.word_list)}
        # ngrams_encoding builds a fresh mapping itself when map_dict is empty/None
        self.form_matrix, self.map_dict = self.ngrams_encoding(self.word_list, n, map_dict)

    def ngram_featurizer(self, s, n):
        """Takes in a string and an integer defining the size of ngrams.
        Returns the ngrams of desired size in the input string, after padding the
        string on both sides with '#' boundary markers."""
        # n-1 boundary markers per side; unigrams still get a single marker
        t = 1 if n == 1 else n - 1
        padded = '#' * t + s + '#' * t
        return [padded[i:i + n] for i in range(len(padded) - n + 1)]

    def ngrams_encoding(self, word_list, n, mapping=None):
        """
        Takes in a list of strings, an integer indicating the character ngrams' size,
        and a dictionary mapping ngrams to numerical indices. If no dictionary is passed,
        one is created inside the function.
        The function outputs a 2d NumPy array with as many rows as there are strings in
        the input list, and the mapping from ngrams to indices, representing the columns
        of the NumPy array.
        """
        if not mapping:
            all_ngrams = set()
            for word in word_list:
                all_ngrams.update(self.ngram_featurizer(word, n))
            # Sort before numbering: set iteration order changes between interpreter
            # runs, which made the column layout non-reproducible.
            mapping = {ngram: i for i, ngram in enumerate(sorted(all_ngrams))}

        matrix = np.zeros((len(word_list), len(mapping)))
        for i, instance in enumerate(word_list):
            for ngram in self.ngram_featurizer(instance, n):
                column = mapping.get(ngram)
                # n-grams missing from a supplied mapping are ignored
                if column is not None:
                    matrix[i, column] += 1

        return matrix, mapping




## Creating Form Matrix

We instantiate the Word-Form Matrix by feeding it a clean version of the vocabulary (Compass). We have removed the names of the characters in order to avoid bias when estimating form-based semantic vectors.

In [None]:
# Keep only purely alphabetic vocabulary entries that are not character tags.
w2v_clean_vocab = [word for word in words if word.isalpha() and not word.endswith("_char")]

# Build the word-form matrix over the cleaned vocabulary.
# NOTE(review): saved outputs further down show a form matrix with 5411 columns,
# which n = 1 (27 columns) cannot produce - confirm which n-gram size is intended.
ngram_size = 1
FM = FormMatrix(word_list=w2v_clean_vocab, n=ngram_size)


# The form matrix has exactly one row per entry of the cleaned vocabulary.
print(len(w2v_clean_vocab))

FM.form_matrix.shape

15361


(15361, 27)

# Mapping Functions



## Orthographic Semantic Consistency

In [None]:
def find_most_similar(TW, FM, k=5):
  """Finds the k words whose form vectors are most similar (cosine) to a target form vector.

  inputs: target_word_form_matrix (TW), holding the form vector of a single word
  general_word_form_matrix (FM)
  number of neighbors (k), must be >= 1

  Words whose similarity is exactly 1 (identical form vector) are excluded, and every
  word tied with the k-th neighbour is included, so the result may be longer than k.
  If fewer than k candidates exist, all of them are returned.

  returns: list of (similarity, word) tuples, most similar first
  raises: ValueError if k < 1"""
  if k < 1:
    raise ValueError("k must be a positive integer")

  # The target form matrix has a single row; flatten it because
  # scipy's cosine distance expects 1-D vectors.
  target_vector = np.ravel(TW.form_matrix)

  # tuple -> (similarity, word)
  most_similar = [
    (1 - spatial.distance.cosine(target_vector, vector), FM.idx2word[idx])
    for idx, vector in enumerate(FM.form_matrix)
  ]

  # sorting descending: most similar first (ties ordered by word, descending)
  most_similar.sort(reverse=True)

  # making sure there isn't a case in which similarity is 1
  most_similar = [(sim, name) for sim, name in most_similar if sim != 1]

  # fewer candidates than requested: nothing to cut, and indexing k-1 would fail
  if len(most_similar) <= k:
    return most_similar

  # similarity of the k-th neighbour, used to pull in ties
  k_distance = most_similar[k-1][0]
  k_similar = most_similar[:k]

  # extend with every following word that is exactly as similar as the k-th one
  for candidate in most_similar[k:]:
    if candidate[0] != k_distance:
      break
    k_similar.append(candidate)

  return k_similar


In [None]:
def osc(target_s, model, FM, k):
  """
  We return a form-based semantic vector by averaging (weighted by form similarity)
  the semantic vectors of words with a similar orthographic form.

  target_s: input name string
  model: semantic model to retrieve semantic vectors
  FM: form matrix to retrieve form vectors
  k: number of similar words (in form space) to include in the computation

  returns: 1-D array of length model.vector_size
  raises: ValueError if the similarities of the selected neighbours sum to zero
  """
  # form-encoding of target word, in the feature space of FM
  target_w = target_s.split()
  TW = FormMatrix(target_w, FM.ngram_size, FM.map_dict)

  k_similar = find_most_similar(TW=TW, FM=FM, k=k)

  # similarity-weighted sum of the neighbours' semantic vectors
  vectors_sum = np.zeros(model.vector_size)
  sim_sum = 0
  for sim, word in k_similar:
    sim_sum += sim
    print("most similar in form space: ", word, sim)

    w2v_word = model.wv[word]
    vectors_sum += (w2v_word*sim)

  # Without this guard a zero weight sum silently produced a vector of NaNs
  if sim_sum == 0:
    raise ValueError("no form neighbours with non-zero similarity for: " + repr(target_s))

  return vectors_sum/sim_sum






## Linear Discriminative Learning


In [None]:
# Shape of the form matrix F: (number of words, number of n-gram features).
FM.form_matrix.shape

(15361, 5411)

In [None]:
# Shape of the semantic matrix before trimming: (vocabulary size, embedding size).
clean_compass_model.wv.vectors.shape

(16822, 50)

In [None]:
# WE MULTIPLY THE SEMANTIC MATRIX (S) BY THE INVERSE OF THE FORM MATRIX (F)
# F and S MUST HAVE THE SAME NUMBER OF ROWS, AND ROW i OF BOTH MUST DESCRIBE THE SAME WORD
#
# The form matrix rows follow the order of `words` (iteration order of the vocab dict),
# whereas gensim stores `wv.vectors` rows by `vocab[w].index`. These two orders are
# not the same in general, so deleting the dirty rows is not enough: the kept rows
# are re-ordered here to follow `words`, and the vocab indices are renumbered so the
# trimmed model stays internally consistent.
clean_wv = clean_compass_model.wv

# TRIMMING DIRTY WORDS, THAT IS CHARACTER NAMES (ending with _char) AND NON ALPHABETIC
# (the `in clean_wv.vocab` test makes the cell safe to re-run)
w2v_dirty_vocab = [word for word in words
                   if (word.endswith("_char") or not word.isalpha())
                   and word in clean_wv.vocab]
ids_to_trim = [clean_wv.vocab[w].index for w in w2v_dirty_vocab]

# Words to keep, in the same order used to build the form matrix
kept_words = [word for word in words
              if word.isalpha() and not word.endswith("_char")]
kept_rows = [clean_wv.vocab[w].index for w in kept_words]

# Select and re-order the semantic vectors in one step
clean_wv.vectors = clean_wv.vectors[kept_rows]

for w in w2v_dirty_vocab:
    del clean_wv.vocab[w]

# Row i of the trimmed matrix now belongs to kept_words[i]
clean_wv.index2word = list(kept_words)
for new_index, w in enumerate(kept_words):
    clean_wv.vocab[w].index = new_index

# L2-normalise the remaining vectors in place
clean_wv.init_sims(replace=True)


In [None]:
# The number of rows of the semantic matrix should now be equal to the number of rows
# of the form matrix. Both shapes are printed: a bare expression that is not the
# last line of a cell is not displayed.
print(clean_compass_model.wv.vectors.shape)
print(FM.form_matrix.shape)
assert clean_compass_model.wv.vectors.shape[0] == FM.form_matrix.shape[0], \
    "semantic matrix and form matrix must have the same number of rows"

(15361, 50)

In [None]:
def ldl_mapping(FormM, w2v,):
  """
  Computes the linear form-to-meaning mapping as pinv(F) @ S.

  FormM: Word-Form Matrix object; FormM.form_matrix (F) has one row per word
  w2v: word2vec model; w2v.wv.vectors (S) has one row per word
  F and S must have the same number of rows, and row i of both is expected to
  describe the same word.

  returns: mapping matrix with as many rows as F has columns and as many columns
  as the w2v model has dimensions
  """
  # Moore-Penrose pseudo-inverse of F: (n_words, n_features) -> (n_features, n_words)
  form_pinv = np.linalg.pinv(FormM.form_matrix)

  # (n_features, n_words) @ (n_words, n_dims) -> (n_features, n_dims)
  return form_pinv @ w2v.wv.vectors


In [None]:
# Pass the word-form matrix and the trimmed semantic model to generate the mapping matrix.
# It is used later when generating form-based semantic vectors.
linearDL_mapping = ldl_mapping(FormM=FM, w2v=clean_compass_model)

In [None]:
# Shape of the mapping matrix: (number of n-gram features, embedding size).
linearDL_mapping.shape

(5411, 50)

In [None]:
def ldl(target_str, ldl_mapping, FormM):
  """
  Generates a form-based semantic vector with Linear Discriminative Learning.

  target_str: target name
  ldl_mapping: mapping matrix that projects form features onto semantic dimensions
  FormM: Word-Form Matrix whose n-gram size and n-gram mapping are reused to
         vectorize the target name

  returns: the projected semantic vector, with singleton dimensions squeezed out
  """
  # The featurizer expects a list of words
  tokens = target_str.split()

  # Encode the target in the same n-gram feature space as FormM
  target_forms = FormMatrix(tokens, FormM.ngram_size, FormM.map_dict)

  # (n_tokens, n_features) @ (n_features, n_dims)
  projected = target_forms.form_matrix @ ldl_mapping

  return np.squeeze(projected)


  

# Generation of Form Based Semantic Vectors 

We deploy the form-meaning mapping functions in order to generate form-based semantic vectors from the orthographic form of character names.

Form-based semantic vectors are saved in a pickle file.


In [None]:

# PREPARING DATASET
# Create the two vector columns and make them object-typed, so that a whole
# array can later be stored in a single cell.
for vector_column in ("osc_vector", "ldl_vector"):
  complete_df.loc[0, vector_column] = 0
  complete_df[vector_column] = complete_df[vector_column].astype(object)

# Preview of the lowercased first name that will be used for every character
for i in range(len(complete_df)):
  full_name = complete_df.loc[i, "full_name"].lower()
  name = full_name.split()[0]
  print(name, i)

  

In [None]:
# APPENDING TO THE DATASET OSC & LDL VECTORS FOR EACH CHARACTER
for i in range(len(complete_df)):
  # first name of character, lowercased
  name = complete_df.loc[i, "full_name"].lower().split()[0]
  print(name, i)
  # generate osc
  osc_form_based = osc(target_s=name, model=compass_model, FM=FM, k=5)
  # generate ldl
  ldl_form_based = ldl(target_str=name, ldl_mapping=linearDL_mapping, FormM=FM)
  # `.at` writes a single cell directly on the frame. The previous chained form
  # (df[col][i] = value) may assign to a temporary copy and triggers
  # SettingWithCopyWarning.
  complete_df.at[i, "osc_vector"] = osc_form_based
  complete_df.at[i, "ldl_vector"] = ldl_form_based





In [None]:
# Persist the frame as a pickle so the array-valued vector columns are stored as Python objects.
complete_df.to_pickle(PATH + "complete_df.pkl")

# Evaluation



*   Semantic Neighborhood Density (SND)
*   OSC and LDL vs centroid
*   form-based vs context based
*   Cosine similarity with words of interest.



## SND on COMPASS & similarity of OSC and LDL to centroid


In [None]:
def semantic_n_distance(w2v_model, target_vector, N = 5):
  """Semantic neighbourhood density of a vector: the mean similarity between
  target_vector and its N most similar words in the given model's semantic space.
  The retrieved neighbours are printed as a side effect."""
  # N nearest words as (word, similarity) pairs
  neighbours = w2v_model.wv.similar_by_vector(target_vector, topn=N)
  print(neighbours)
  # average the similarities, ignoring the words themselves
  similarities = [score for _word, score in neighbours]
  return np.mean(similarities)

In [None]:
def diff_sim_wordlist(words_list_1, words_list_2, a_vector, centroid_v, a_model):
  """Compares how strongly a semantic vector leans towards two word lists.

  For every word, the similarity of a_vector to the word is computed relative to the
  similarity of centroid_v to that word (vector similarity minus centroid similarity).
  Returns the mean of these values over words_list_1 minus the mean over words_list_2."""

  def mean_gain_over_centroid(word_list):
    # mean of (similarity to a_vector - similarity to centroid_v) over the list
    gains = []
    for word in word_list:
      word_vector = a_model.wv[word]
      sim_vector = 1 - spatial.distance.cosine(a_vector, word_vector)
      sim_centroid = 1 - spatial.distance.cosine(centroid_v, word_vector)
      gains.append(sim_vector - sim_centroid)
    return np.mean(gains)

  return mean_gain_over_centroid(words_list_1) - mean_gain_over_centroid(words_list_2)



In [None]:
# SND FOR EACH CHARACTER IN COMPASS SPACE, PLUS SIMILARITY OF OSC / LDL TO THE CENTROID
# getting centroid of the semantic space based on compass
centroid = np.mean(compass_model.wv.vectors, axis=0)

# Rows are collected as dicts and turned into a DataFrame once. Writing cells with
# chained indexing (df[col][i] = value) into a float-typed frame grown row by row
# can assign to a temporary copy, and storing the string class label that way is
# not reliable.
snd_columns = ["Class", "SND_OSC", "centroid_vs_OSC", "centroid_vs_LDL", "SND_LDL"]
snd_records = []

for i in range(len(complete_df)):
  Class = complete_df.loc[i, "class"]
  # getting the form-based semantic vectors from the dataset
  osc_vector = complete_df.loc[i, "osc_vector"]
  ldl_vector = complete_df.loc[i, "ldl_vector"]
  print(i)

  # COMPUTING SND over the 20 nearest neighbours
  osc_snd = semantic_n_distance(compass_model, osc_vector, N = 20)
  ldl_snd = semantic_n_distance(compass_model, ldl_vector, N = 20)

  # similarity between osc and ldl (computed but not stored in the table)
  osc_vs_ldl = 1 - spatial.distance.cosine(osc_vector, ldl_vector)
  # similarity to centroid
  centroid_vs_osc = 1 - spatial.distance.cosine(centroid, osc_vector)
  centroid_vs_ldl = 1 - spatial.distance.cosine(centroid, ldl_vector)

  snd_records.append({"Class": Class,
                      "SND_OSC": osc_snd,
                      "centroid_vs_OSC": centroid_vs_osc,
                      "centroid_vs_LDL": centroid_vs_ldl,
                      "SND_LDL": ldl_snd})

# Row labels 0..n-1 match the loop index i used for complete_df
snd_df = pd.DataFrame(snd_records, columns=snd_columns)

In [None]:
# Save the SND / centroid-similarity table as CSV next to the other project outputs.
snd_df.to_csv(PATH + "snd_df.csv")

## form_based vs context_based, factoring in centroid. CADE aligned slices


In [None]:


# Form-based vs. context-based similarity for every character.
# For each CADE slice whose file name contains the character's author, the
# similarity between the form-based vector (OSC / LDL) and the character's
# context vector in that slice is compared with a baseline: the similarity
# between the slice centroid and the same context vector. The per-slice
# differences are averaged per character.

path = PATH + 'CADE/models/model_5_50'
cade_models = os.listdir(path)
print(sorted(cade_models))
print(len(cade_models)) # length is one too many because the folder also holds a log.txt file

# the slice order is the same for every character, so sort only once
sorted_cade_models = sorted(cade_models)

# Lazy cache: slice file name -> (word vectors, centroid of the slice).
# Each slice is loaded from disk and its centroid computed at most once instead
# of once per character. Only the word vectors are kept, not the full model.
slice_cache = {}

# One dict per character; the frame is built once after the loop, which avoids
# chained assignment (form_context_df["col"][i] = value).
form_context_records = []

for i in range(len(complete_df)):
  osc_sim_temp = []
  ldl_sim_temp = []

  # getting character annotation in corpus
  id_name = complete_df.loc[i, "newID"].lower()
  print(i, id_name)
  # retrieving author and class of the character
  author = complete_df.loc[i, "author"]
  Class = complete_df.loc[i, "class"]
  # retrieving form based semantic vectors
  osc_vector = complete_df.loc[i, "osc_vector"]
  ldl_vector = complete_df.loc[i, "ldl_vector"]

  # loop over CADE slices
  for file in sorted_cade_models:
    # only slices by the same author as the character
    if author not in file:
      continue

    if file not in slice_cache:
      model_cade = Word2Vec.load(path + "/" + file)
      slice_cache[file] = (model_cade.wv, np.mean(model_cade.wv.vectors, axis = 0))
    slice_vectors, centroid_cade = slice_cache[file]

    try:
      # context vector of the character in this slice
      context_vector = slice_vectors[id_name]
    except KeyError:
      # the character is not in this slice's vocabulary: nothing to compare
      continue

    # similarity between centroid and context vector, as a baseline
    centroid_context = 1 - spatial.distance.cosine(centroid_cade, context_vector)

    # ORTHOGRAPHIC SEMANTIC CONSISTENCY: form based vs context based.
    # A positive difference means the OSC vector resembles the context vector
    # more than the centroid does.
    osc_context = 1 - spatial.distance.cosine(osc_vector, context_vector)
    osc_sim_temp.append(osc_context - centroid_context)

    # LINEAR DISCRIMINATIVE LEARNING: form based vs context based.
    # A positive difference means the LDL vector resembles the context vector
    # more than the centroid does.
    ldl_context = 1 - spatial.distance.cosine(ldl_vector, context_vector)
    ldl_sim_temp.append(ldl_context - centroid_context)

    print("done")

  # A character found in no slice gets NaN; stated explicitly so that
  # np.mean is never called on an empty list (which warns and returns NaN).
  form_context_records.append({
      "Class": Class,
      "form_vs_context_OSC": np.mean(osc_sim_temp) if osc_sim_temp else np.nan,
      "form_vs_context_LDL": np.mean(ldl_sim_temp) if ldl_sim_temp else np.nan,
  })

# same column order as before; the index is 0..len(complete_df)-1
form_context_df = pd.DataFrame(form_context_records, columns=["Class", "form_vs_context_OSC", "form_vs_context_LDL"])

  








In [None]:
# persist the form-vs-context table (index included) under PATH
form_context_df.to_csv(path_or_buf=PATH + "form_context_df.csv")



## Cosine between target and theme words for gender in:
- FORMSPACE
- SEMANTIC COMPASS for context based

- SEMANTIC COMPASS for form based (OSC & LDL)

In [None]:
# Theme words for the gender attribute (11 terms per list)
female_terms = ["she", "daughter", "hers", "her", "mother",
                "woman", "girl", "female", "sister", "aunt", "niece"]
male_terms = ["he", "son", "his", "him", "father",
              "man", "boy", "male", "brother", "uncle", "nephew"]

# centroid of the compass semantic space: mean of all word vectors
centroid = np.mean(compass_model.wv.vectors, axis=0)

# One dict per character; the frame is built once after the loop, which avoids
# chained assignment (gender_df["col"][i] = value).
gender_records = []

for i in range(len(complete_df)):
  # labels copied into the evaluation dataframe
  Class = complete_df.loc[i, "class"]
  gender = complete_df.loc[i, "gender"]

  # IN SEMANTIC SPACE COMPASS
  # getting id of character
  id_name = complete_df.loc[i, "newID"].lower()
  # get context vector based on compass (raises KeyError if the id is not in the vocabulary)
  context_compass_vector = compass_model.wv[id_name]
  # get form based vectors
  osc_vector = complete_df.loc[i, "osc_vector"]
  ldl_vector = complete_df.loc[i, "ldl_vector"]

  # Difference in similarity to the male vs the female terms, factoring in the
  # similarity of the centroid. Presumably a positive value means "closer to the
  # male terms" -- confirm against diff_sim_wordlist.
  gender_diff_osc = diff_sim_wordlist(male_terms, female_terms, a_vector = osc_vector, centroid_v=centroid ,a_model = compass_model)
  gender_diff_ldl = diff_sim_wordlist(male_terms, female_terms, ldl_vector, centroid, compass_model)
  gender_diff_context = diff_sim_wordlist(male_terms, female_terms, context_compass_vector, centroid, compass_model)

  # IN FORM SPACE
  # first token of the character's name, lower-cased
  name = complete_df.loc[i, "full_name"].lower().split()[0]
  # vectorize target name
  target_name = name.split()
  TW = FormMatrix(target_name, FM.ngram_size, FM.map_dict)
  target_form_vector = TW.form_matrix[0]

  # Summed cosine similarity to each term list in form space. Both lists hold
  # 11 terms, so the difference of sums is proportional to the difference of means.
  form_male_sim = sum(
      1 - spatial.distance.cosine(target_form_vector, FM.form_matrix[FM.word2idx[m_t]])
      for m_t in male_terms)
  form_female_sim = sum(
      1 - spatial.distance.cosine(target_form_vector, FM.form_matrix[FM.word2idx[f_t]])
      for f_t in female_terms)

  gender_records.append({
      "Class": Class,
      "gender": gender,
      "OSC_male_female": gender_diff_osc,
      "LDL_male_female": gender_diff_ldl,
      "context_male_female": gender_diff_context,
      # positive values indicate that the form vector is closer to male terms
      "WordForm_male_female": form_male_sim - form_female_sim,
  })

# same column order as before; the index is 0..len(complete_df)-1
gender_df = pd.DataFrame(gender_records, columns=["Class", "gender", "OSC_male_female" , "LDL_male_female", "context_male_female", "WordForm_male_female"])



In [None]:
# persist the gender table (index included) under PATH
gender_df.to_csv(path_or_buf=PATH + "gender_df.csv")

## Cosine between target and theme words for age in:
- FORMSPACE
- SEMANTIC COMPASS for context based

- SEMANTIC COMPASS for form based (OSC & LDL)

In [None]:
# Theme words for the age attribute (9 young terms, 6 old terms)
young_terms = ["kid", "youth", "young", "youngster", "infant", "junior", "child", "adolescent", "teenager"]
old_terms = ["grandfather", "grandmother", "old", "senior", "elderly", "elder"]

# centroid of the compass semantic space: mean of all word vectors
centroid = np.mean(compass_model.wv.vectors, axis=0)

# One dict per character; the frame is built once after the loop, which avoids
# chained assignment (age_df["col"][i] = value).
age_records = []

for i in range(len(complete_df)):
  # labels copied into the evaluation dataframe
  Class = complete_df.loc[i, "class"]
  age = complete_df.loc[i, "age_recoded"]

  # IN SEMANTIC SPACE COMPASS
  # getting id of character
  id_name = complete_df.loc[i, "newID"].lower()
  # get context vector based on compass (raises KeyError if the id is not in the vocabulary)
  context_compass_vector = compass_model.wv[id_name]
  # get form based vectors
  osc_vector = complete_df.loc[i, "osc_vector"]
  ldl_vector = complete_df.loc[i, "ldl_vector"]

  # Difference in similarity to the young vs the old terms, factoring in the
  # similarity of the centroid. Presumably a positive value means "closer to the
  # young terms" -- confirm against diff_sim_wordlist.
  age_diff_osc = diff_sim_wordlist(young_terms, old_terms, a_vector = osc_vector, centroid_v=centroid, a_model = compass_model)
  age_diff_ldl = diff_sim_wordlist(young_terms, old_terms, ldl_vector, centroid ,compass_model)
  age_diff_context = diff_sim_wordlist(young_terms, old_terms, context_compass_vector, centroid, compass_model)

  # IN FORM SPACE
  # first token of the character's name, lower-cased
  name = complete_df.loc[i, "full_name"].lower().split()[0]
  # vectorize target name
  target_name = name.split()
  TW = FormMatrix(target_name, FM.ngram_size, FM.map_dict)
  target_form_vector = TW.form_matrix[0]

  # MEAN cosine similarity to each term list in form space. The lists differ in
  # length (9 vs 6), so subtracting plain sums would skew the score towards the
  # longer list; averaging puts both sides on the same scale.
  form_young_sim = np.mean([
      1 - spatial.distance.cosine(target_form_vector, FM.form_matrix[FM.word2idx[y_t]])
      for y_t in young_terms])
  form_old_sim = np.mean([
      1 - spatial.distance.cosine(target_form_vector, FM.form_matrix[FM.word2idx[o_t]])
      for o_t in old_terms])

  age_records.append({
      "Class": Class,
      "age": age,
      "OSC_young_old": age_diff_osc,
      "LDL_young_old": age_diff_ldl,
      "context_young_old": age_diff_context,
      # positive values indicate that the form vector is closer to young terms
      "WordForm_young_old": form_young_sim - form_old_sim,
  })

# same column order as before; the index is 0..len(complete_df)-1
age_df = pd.DataFrame(age_records, columns=["Class", "age", "OSC_young_old" , "LDL_young_old", "context_young_old", "WordForm_young_old"])

In [None]:
# persist the age table (index included) under PATH
age_df.to_csv(path_or_buf=PATH + "age_df.csv")

# Descriptive Stats

In [None]:
# The file was written with its index as the first column; read it back as the
# index so no "Unnamed: 0" column ends up in the averages below.
snd_df = pd.read_csv(PATH + "snd_df.csv", index_col=0)

In [None]:
# The file was written with its index as the first column; read it back as the
# index so no "Unnamed: 0" column ends up in the averages below.
form_context_df = pd.read_csv(PATH + "form_context_df.csv", index_col=0)

In [None]:
# The file was written with its index as the first column; read it back as the
# index so no "Unnamed: 0" column ends up in the averages below.
age_df = pd.read_csv(PATH + "age_df.csv", index_col=0)

In [None]:
# The file was written with its index as the first column; read it back as the
# index so no "Unnamed: 0" column ends up in the averages below.
gender_df = pd.read_csv(PATH + "gender_df.csv", index_col=0)

In [None]:
# Per-class mean of every remaining column of the SND table
snd_df.groupby(["Class"]).mean()

In [None]:
# Per-class mean of the form-vs-context scores
form_context_df.groupby(["Class"]).mean()

In [None]:
# Mean gender scores for every (Class, gender) combination
gender_df.groupby(["Class", "gender"]).mean()

In [None]:
# Mean age scores for every (Class, age) combination
age_df.groupby(["Class", "age"]).mean()

In [None]:
# Write the four evaluation tables back to the files they were loaded from.
age_df.to_csv(PATH + "age_df.csv")
# file name fixed: it was misspelled "gendef_df.csv", which left gender_df.csv
# untouched and created a stray file
gender_df.to_csv(PATH + "gender_df.csv")
snd_df.to_csv(PATH + "snd_df.csv")
form_context_df.to_csv(PATH + "form_context_df.csv")