<a href="https://colab.research.google.com/github/LuciaPitarch/Colexification-Patterns/blob/main/3_Feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Load data

In [None]:
# Import libraries
import pandas
from google.colab import files
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
# fetch the WordNet corpus; `wn` is queried by the part-of-speech and
# taxonomic-similarity helpers defined further down
nltk.download('wordnet')
# download pre-trained embedding model from nltk library
nltk.download('word2vec_sample')
# locate the downloaded embedding text file; it is parsed later to build the
# word -> vector lookup used for cosine similarity
path_to_word2vec_sample = nltk.data.find('models/word2vec_sample/pruned.word2vec.txt')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [None]:
# Download the input csv files from Google Drive (one gdown call per file id)
!gdown --id 1GOhH20R9ToDnsoCqO0MjX0NUVSmZ2vqG #polynesian colexifications df
!gdown --id 1HYTxiQMrpz_IYCsXoZIYFoH54mdkbixR #romance colexifications df
!gdown --id 1YJz8XHfCn6_HGR7gbRwB2Ry2Z4ZSvYWK #swow_strength

Downloading...
From: https://drive.google.com/uc?id=1GOhH20R9ToDnsoCqO0MjX0NUVSmZ2vqG
To: /content/polynesian_df_colexifications.csv
100% 515k/515k [00:00<00:00, 33.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1HYTxiQMrpz_IYCsXoZIYFoH54mdkbixR
To: /content/romance_df_colexifications.csv
100% 256k/256k [00:00<00:00, 74.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YJz8XHfCn6_HGR7gbRwB2Ry2Z4ZSvYWK
To: /content/st_swow.csv
18.5MB [00:00, 86.3MB/s]


In [None]:
# Load the downloaded tables into data frames.
# The two colexification tables are comma-separated; the SWOW strengths file uses ';'.
polynesian_df = pandas.read_csv('polynesian_df_colexifications.csv')
romance_df = pandas.read_csv('romance_df_colexifications.csv')
swow = pandas.read_csv('st_swow.csv', delimiter=';')

  interactivity=interactivity, compiler=compiler, result=result)


First, lowercase the concept glosses and build the concept pairs, which are needed to compute some of the features.

In [None]:
def lowercase (df):
  """Lowercase both concept-gloss columns of ``df`` in place and return ``df``."""
  for gloss_column in ('Concepticon_Gloss.x', 'Concepticon_Gloss.y'):
    df[gloss_column] = df[gloss_column].str.lower()
  return df

In [None]:
def add_colex_pairs (df):
  """Add a ``pairs`` column holding ``(gloss.x, gloss.y)`` tuples and return ``df``.

  The frame is modified in place; the tuples are built row by row from the
  ``Concepticon_Gloss.x`` and ``Concepticon_Gloss.y`` columns.
  """
  first_glosses = df['Concepticon_Gloss.x']
  second_glosses = df['Concepticon_Gloss.y']
  df['pairs'] = [(first, second) for first, second in zip(first_glosses, second_glosses)]
  return df

# 1. Features
To compute some of these features, WordNet was used; from it, only the most frequent synset was taken.

# 1.1. Grammatical features: part of speech

In [None]:
def part_of_speech (x):
  """Return the WordNet part-of-speech tag of the first synset of ``x``.

  Only the first synset returned by ``wn.synsets`` is consulted (treated in this
  notebook as the most frequent sense). Returns ``None`` when ``x`` has no synset.
  """
  candidate_synsets = wn.synsets(x)
  if candidate_synsets == []:
    return None
  return candidate_synsets[0].pos()

In [None]:
#which pos
def add_pos (df):
  """Add part-of-speech columns for both glosses and their pair; return ``df``.

  Columns written (in place):
    * ``pos.x`` / ``pos.y`` -- result of ``part_of_speech`` for each gloss
      (``None`` when the gloss has no WordNet synset).
    * ``pos_pairs`` -- the tuple ``(pos.x, pos.y)``, or ``None`` when either
      tag is missing.
  """
  df['pos.x'] = df['Concepticon_Gloss.x'].apply(part_of_speech)
  df['pos.y'] = df['Concepticon_Gloss.y'].apply(part_of_speech)
  # Build the column in one pass instead of patching it afterwards with
  # df['pos_pairs'].iloc[i] = None: that chained assignment raises
  # SettingWithCopyWarning and does not write through under copy-on-write.
  df['pos_pairs'] = [
      None if pandas.isna(pos_x) or pandas.isna(pos_y) else (pos_x, pos_y)
      for pos_x, pos_y in zip(df['pos.x'], df['pos.y'])
  ]
  return df

In [None]:
# same or different pos
def add_compared_pos(df):
  """Add ``pos_same``: 1 if both glosses share a part of speech, 0 otherwise.

  Rows where ``pos.x`` or ``pos.y`` is missing (``None``/NaN) get a missing
  value (NaN) instead of 0/1. The frame is modified in place and returned.
  """
  pos_same = []
  for pos_x, pos_y in zip(df['pos.x'], df['pos.y']):
    if pandas.isna(pos_x) or pandas.isna(pos_y):
      # unknown tag on either side: the comparison is undefined
      pos_same.append(float('nan'))
    else:
      pos_same.append(1 if pos_x == pos_y else 0)
  # Assign the whole column at once; the previous per-row
  # df['pos_same'].iloc[i] = ... was a chained assignment that pandas warns
  # about and that is silently ignored under copy-on-write.
  df['pos_same'] = pos_same
  return df

# 1.2. Semantic features: 
cosine similarity, semantic taxonomic similarity, semantic field and ontological entity

In [None]:
#cosine similarity
# Parse the embedding text file into three module-level containers:
#   words    -- list of tokens, in file order
#   vectors  -- list of float lists, parallel to `words`
#   word2vec -- dict mapping each token to its float list
words = []
vectors = []
word2vec = {}
with open(path_to_word2vec_sample) as file:
    for i, line in enumerate(file):
        # the first line is skipped (presumably the word2vec text-format
        # header with vocabulary size and dimension -- confirm)
        if i == 0:
            continue
        # each remaining line: token followed by whitespace-separated components
        line = line.split()
        word = line[0]
        vector = line[1:]
        vector = [float(s) for s in vector]
        
        words.append(word)
        vectors.append(vector)
        word2vec[word] = vector
  
def norm(vector):
    """Return the Euclidean length of ``vector`` (square root of the sum of squares)."""
    squared_total = sum(component * component for component in vector)
    return np.sqrt(squared_total)

def dot(vector1, vector2):
    """Return the dot product of two vectors.

    Components are paired with ``zip``, so extra components of the longer
    vector are ignored.
    """
    total = 0
    for left, right in zip(vector1, vector2):
        total = total + left * right
    return total

def cosine(vector1, vector2):
    """Return the cosine similarity: dot product divided by the product of the norms."""
    numerator = dot(vector1, vector2)
    denominator = norm(vector1) * norm(vector2)
    return numerator / denominator

def concept_to_embedding (concept1, concept2):
  """Return the cosine similarity between the embeddings of two concepts.

  Each argument is a sequence whose first element is the word to look up in
  the module-level ``word2vec`` dict. Returns ``None`` when either word has no
  embedding.
  """
  if concept1[0] not in word2vec or concept2[0] not in word2vec:
    return None
  first_vector = word2vec[concept1[0]]
  second_vector = word2vec[concept2[0]]
  return cosine(first_vector, second_vector)

def add_cosine_sim(df):
  """Add ``cosine_sim``: embedding cosine similarity of the two glosses of each row.

  The frame is modified in place and returned. Rows for which
  ``concept_to_embedding`` returns ``None`` carry a missing value.
  """
  def row_similarity(row):
    # concept_to_embedding expects each concept wrapped in a one-element list
    first_concept = [row['Concepticon_Gloss.x']]
    second_concept = [row['Concepticon_Gloss.y']]
    return concept_to_embedding(first_concept, second_concept)

  df['cosine_sim'] = df.apply(row_similarity, axis=1)
  return df

In [None]:
def taxonomic_similarity (x, y, output):
  """Return a WordNet taxonomic similarity between words ``x`` and ``y``.

  The first synset of ``x`` is compared with every synset of ``y``; the
  defined (non-``None``) scores are averaged.

  Parameters:
    x, y   -- words looked up with ``wn.synsets``.
    output -- ``'path_similarity'`` or ``'wup_similarity'``; selects the metric.

  Returns the mean score as a float, or ``None`` when either word has no
  synset, when no score is defined for any synset of ``y``, or when ``output``
  is not one of the two supported names.

  Path and Wu-Palmer similarity are used because they are the most
  straightforward to retrieve: Leacock-Chodorow requires both synsets to share
  a part of speech, and res/lin/jcn similarity need information-content data.
  """
  synsets_x = wn.synsets(x)
  if not synsets_x:
    return None
  synsets_y = wn.synsets(y)
  if not synsets_y:
    return None

  # NOTE(review): only the first synset of x is used while all synsets of y
  # are averaged, so the measure is not symmetric in (x, y). The former dead
  # assignment `syny = wn.synsets(x)[0]` (overwritten by the loop) suggests a
  # first-synset-vs-first-synset comparison may have been intended -- confirm.
  synx = synsets_x[0]
  if output == 'path_similarity':
    measure = synx.path_similarity
  elif output == 'wup_similarity':
    measure = synx.wup_similarity
  else:
    # unsupported metric name: keep the historical behaviour of returning None
    return None

  # compute each score once and keep only the defined ones
  scores = [score for score in (measure(syny) for syny in synsets_y) if score is not None]
  if not scores:
    return None
  return sum(scores) / len(scores)

In [None]:
def add_wup_pairs(df):
  """Add ``wup_pairs``: Wu-Palmer taxonomic similarity of the two glosses per row.

  Values come from ``taxonomic_similarity(..., output='wup_similarity')``.
  The frame is modified in place and returned.
  """
  wup_scores = [
      taxonomic_similarity(gloss_x, gloss_y, output='wup_similarity')
      for gloss_x, gloss_y in zip(df['Concepticon_Gloss.x'], df['Concepticon_Gloss.y'])
  ]
  df['wup_pairs'] = np.array(wup_scores)
  return df

In [None]:
def add_path_pairs(df):
  """Add ``path_pairs``: path-based taxonomic similarity of the two glosses per row.

  Values come from ``taxonomic_similarity(..., output='path_similarity')``.
  The frame is modified in place and returned.
  """
  path_scores = [
      taxonomic_similarity(gloss_x, gloss_y, output='path_similarity')
      for gloss_x, gloss_y in zip(df['Concepticon_Gloss.x'], df['Concepticon_Gloss.y'])
  ]
  df['path_pairs'] = np.array(path_scores)
  return df

In [None]:
def add_ontological_pairs (df):
  """Add ``Ontological_pairs``: 1 if both concepts share an ontological category, else 0.

  Rows where ``Ontological_Category.x`` or ``Ontological_Category.y`` is
  missing get a missing value (NaN). The frame is modified in place and returned.
  """
  ontological_flags = []
  for category_x, category_y in zip(df['Ontological_Category.x'], df['Ontological_Category.y']):
    # pandas.isna catches both None and the NaN that read_csv / concat produce;
    # the former `== None` test never matched NaN, so rows with an unknown
    # category were scored 0 ("different") instead of being flagged as missing.
    if pandas.isna(category_x) or pandas.isna(category_y):
      ontological_flags.append(float('nan'))
    else:
      ontological_flags.append(1 if category_x == category_y else 0)
  # whole-column assignment avoids the chained df[col].iloc[i] = ... writes
  df['Ontological_pairs'] = ontological_flags
  return df

In [None]:
def add_semantic_pairs (df):
  """Add ``Semantic_pairs``: 1 if both concepts share a semantic field, else 0.

  Rows where ``Semantic_Field.x`` or ``Semantic_Field.y`` is missing get a
  missing value (NaN). The frame is modified in place and returned.
  """
  semantic_flags = []
  for field_x, field_y in zip(df['Semantic_Field.x'], df['Semantic_Field.y']):
    # pandas.isna catches both None and the NaN that read_csv / concat produce;
    # the former `== None` test never matched NaN, so rows with an unknown
    # field were scored 0 ("different") instead of being flagged as missing.
    if pandas.isna(field_x) or pandas.isna(field_y):
      semantic_flags.append(float('nan'))
    else:
      semantic_flags.append(1 if field_x == field_y else 0)
  # whole-column assignment avoids the chained df[col].iloc[i] = ... writes
  df['Semantic_pairs'] = semantic_flags
  return df

# 1.3. Phonetic features:
word length and phonetic similarity between two word-forms. 

In [None]:
def min_edit_distance (x, y):
  """Return NLTK's edit distance between ``x`` and ``y`` (default settings).

  Used to explore the orthographic difference between ``Form`` and the CLICS
  normalized form.
  """
  from nltk import edit_distance as nltk_edit_distance
  distance = nltk_edit_distance(x, y)
  return distance

In [None]:
def add_word_length (df):
  """Add ``n_char``: the character length of ``clics_form``; return ``df``."""
  form_lengths = df['clics_form'].str.len()
  df['n_char'] = form_lengths
  return df

In [None]:
def add_phonetic_similarity (df):
  """Add ``phonetic_pairs``: edit distance between ``Form.x`` and ``clics_form`` per row.

  The frame is modified in place and returned.
  """
  distances = [
      min_edit_distance(form, clics_form)
      for form, clics_form in zip(df['Form.x'], df['clics_form'])
  ]
  df['phonetic_pairs'] = np.array(distances)
  return df

# 1.4. Descriptive features 

needed for the analysis and data visualization

Mixed df with attested colexifications and unattested, computer-generated ones

In [None]:
def add_unattested (df, seed=None):
  """Return ``df`` plus randomly generated unattested colexifications.

  ``df`` is flagged in place with ``colexifies = 1``. Then up to ``len(df)``
  random pairs of distinct concepts (drawn from the concepts occurring in
  ``df['pairs']``) are generated; a draw is kept only if that exact tuple is
  not an attested pair, so the number of unattested rows can be slightly
  smaller than the number of attested ones. Kept pairs are appended as rows
  with ``colexifies = 0``; only ``Concepticon_Gloss.x``, ``Concepticon_Gloss.y``,
  ``pairs`` and ``colexifies`` are filled for them.

  Parameters:
    df   -- frame with a ``pairs`` column of ``(gloss.x, gloss.y)`` tuples.
    seed -- optional seed for reproducible sampling. With the default ``None``
            the global ``random`` module state is used, as before.

  Returns the concatenation of ``df`` and the unattested rows (the original
  index labels are kept, so labels may repeat).

  (Sampling approach credited to Sara in the original notebook.)
  """
  import random

  # flag attested colexifications
  df['colexifies'] = 1

  # build the attested set once; it was previously rebuilt on every iteration
  attested_pairs = set(df['pairs'])
  # unique concepts, sorted so that a given seed always yields the same draws
  concepts = sorted({concept for pair in attested_pairs for concept in pair}, key=str)

  rng = random if seed is None else random.Random(seed)

  unattested_col = []
  # with fewer than two distinct concepts no pair of different concepts exists;
  # skipping the loop avoids the former endless `while el1 == el2` retry
  if len(concepts) >= 2:
    for _ in range(df.shape[0]):
      el1 = rng.choice(concepts)
      el2 = rng.choice(concepts)
      while el1 == el2:
        el2 = rng.choice(concepts)
      candidate = (el1, el2)
      # keep only tuples that are not attested --> size not the same but almost
      if candidate not in attested_pairs:
        unattested_col.append(candidate)

  df_unattested_supp = pandas.DataFrame(unattested_col, columns=['Concepticon_Gloss.x', 'Concepticon_Gloss.y'])
  df_unattested_supp['pairs'] = unattested_col
  df_unattested_supp['colexifies'] = 0
  # add unattested colex to df
  mixed_df = pandas.concat([df, df_unattested_supp])
  return mixed_df

Flag the colexifications as maintained or lost

In [None]:
def add_status (old_variety, df):
  """Flag colexifications of the oldest variety as maintained or lost.

  Parameters:
    old_variety -- value of the ``variety`` column identifying the ancestor
                   variety (e.g. the proto-language).
    df          -- frame with ``variety`` and ``pairs`` columns.

  Returns a new frame (``df`` itself is not modified) made of:
    * lost rows (``maintained = 0``): rows of ``old_variety`` whose pair does
      not occur in any other variety;
    * maintained rows (``maintained = 1``): rows of the other varieties whose
      pair also occurs in ``old_variety``.
  Rows of ``old_variety`` whose pair is maintained, and rows of other
  varieties whose pair is not found in ``old_variety``, are dropped.

  Implementation notes: the selection uses boolean masks instead of growing a
  frame with ``DataFrame.append`` (removed in pandas 2.0). As a consequence,
  maintained rows keep the row order of ``df`` and are not repeated when a
  pair occurs more than once in ``old_variety``, and an input with no
  maintained pair no longer raises ``KeyError``.
  """
  is_old = (df['variety'] == old_variety).to_numpy(dtype=bool)

  # pairs attested in the oldest variety
  old_pairs = set(df.loc[is_old, 'pairs'])
  pair_in_old = np.array([pair in old_pairs for pair in df['pairs']], dtype=bool)

  # .copy() so that adding the flag column does not write into a slice of df
  maintained_colex_df = df.loc[~is_old & pair_in_old].copy()
  maintained_colex_df['maintained'] = 1

  # pairs of the oldest variety that survive in at least one other variety
  maintained_pairs = set(maintained_colex_df['pairs'])
  pair_is_maintained = np.array([pair in maintained_pairs for pair in df['pairs']], dtype=bool)

  lost_colex_df = df.loc[is_old & ~pair_is_maintained].copy()
  lost_colex_df['maintained'] = 0

  mixed_df = pandas.concat([lost_colex_df, maintained_colex_df])
  return mixed_df


# 2. Add all features to the dfs, save and download

In [None]:
def add_features (old_variety, df):
  """Run the full feature-extraction pipeline on ``df`` and return the result.

  Steps, in order: lowercase the glosses and build concept pairs (both modify
  the caller's frame in place), flag maintained/lost colexifications relative
  to ``old_variety``, compute the edit-distance feature, append unattested
  pairs, then add the embedding, length, part-of-speech, taxonomic, semantic
  and ontological features.
  """
  df = add_colex_pairs(lowercase(df))
  df = add_status(old_variety, df)
  # computed before the unattested rows are appended
  df = add_phonetic_similarity(df)
  df = add_unattested(df)
  # add_associativity(df) # TODO: needs fixing before it can be enabled
  remaining_steps = (
      add_cosine_sim,
      add_word_length,
      add_pos,
      add_compared_pos,
      add_path_pairs,
      add_wup_pairs,
      add_semantic_pairs,
      add_ontological_pairs,
  )
  for feature_step in remaining_steps:
    df = feature_step(df)
  return df

In [None]:
 def save_and_download (df, csv_name): 
  """Write ``df`` to ``csv_name`` as CSV, then trigger a Colab browser download of that file."""
  # to_csv is called with defaults, so the index is written as the first column
  df.to_csv(csv_name)
  # `files` is google.colab.files; this only works inside a Colab session
  files.download(csv_name)

In [None]:
# Build the feature tables for both language families and download them.
# The second argument names the ancestor variety used for the maintained/lost flag.
# NOTE: the input frames are overwritten with the feature frames, so re-running
# this cell without re-running the "Read files" cell applies the pipeline to its
# own output.
polynesian_df = add_features('Proto Polynesian', polynesian_df)
save_and_download(polynesian_df, 'polynesian_df_features.csv')
romance_df = add_features('Latin', romance_df)
save_and_download(romance_df, 'romance_df_features.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>