<a href="https://colab.research.google.com/github/LuciaPitarch/Colexification-Patterns/blob/main/4_Data_analysis_and_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code takes the CLICS3 colexification data as its basis and explores the diachronic patterns of colexification in Romance and Polynesian languages.

This notebook uses both Python and R.

# 0. LOAD LIBRARIES AND DATA

In [None]:
#import libraries
import pandas
import numpy as np

In [None]:
#download data
# Fetch the two feature tables from Google Drive by file id; the trailing
# comment on each line names the dataset being downloaded.
# NOTE(review): newer gdown releases appear to treat `--id` as deprecated --
# confirm against the installed gdown version before re-running.
!gdown --id 1AgNMJq7hhuL2hsxrj5hGEDKfrTsNUPAF #polynesian features
!gdown --id 1xlmO2gnPnncoLjfO8-_qYm_EVAg4KSVF #romance features

Downloading...
From: https://drive.google.com/uc?id=1AgNMJq7hhuL2hsxrj5hGEDKfrTsNUPAF
To: /content/polynesian_df_features.csv
100% 317k/317k [00:00<00:00, 46.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xlmO2gnPnncoLjfO8-_qYm_EVAg4KSVF
To: /content/romance_df_features.csv
100% 348k/348k [00:00<00:00, 49.8MB/s]


In [None]:
#load the data
# Read both downloaded feature tables in one pass; the unpacking order matches
# the order of the file names.
polynesian_df, romance_df = map(
    pandas.read_csv,
    ('polynesian_df_features.csv', 'romance_df_features.csv'),
)

In [None]:
#merge polynesian and romance df into a single one
# Stack the two frames row-wise. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both inputs keep their own row labels, so the
# result carries duplicated index values (label-based lookups would then return
# several rows for one label).
df = pandas.concat([polynesian_df, romance_df], ignore_index=True)

# 1. DATA OVERVIEW

In [None]:
#data summary
# Non-null count for every column of the combined frame.
print(df.count())
# Number of rows per value of 'maintained', first for the Polynesian frame and
# then for the Romance frame.
for family_df in (polynesian_df, romance_df):
  print(family_df.groupby('maintained').size())
# Bare last expression so the notebook renders the combined frame.
df

Unnamed: 0                3653
Form.x                    1829
clics_form                1829
Glottocode                1829
Concepticon_ID.x          1829
Concepticon_Gloss.x       3653
Family                    1829
variety                   1829
Ontological_Category.x    1829
Semantic_Field.x          1829
Form.y                    1829
Concepticon_ID.y          1829
Concepticon_Gloss.y       3653
Ontological_Category.y    1829
Semantic_Field.y          1829
pairs                     3653
maintained                1829
phonetic_pairs            1829
colexifies                3653
cosine_sim                2742
n_char                    1829
pos.x                     3387
pos.y                     3355
pos_pairs                 3120
pos_same                  3120
path_pairs                2678
wup_pairs                 2678
Semantic_pairs            3653
Ontological_pairs         3653
dtype: int64
maintained
0.0    594
1.0    286
dtype: int64
maintained
0.0    333
1.0    616
dtype: in

Unnamed: 0.1,Unnamed: 0,Form.x,clics_form,Glottocode,Concepticon_ID.x,Concepticon_Gloss.x,Family,variety,Ontological_Category.x,Semantic_Field.x,Form.y,Concepticon_ID.y,Concepticon_Gloss.y,Ontological_Category.y,Semantic_Field.y,pairs,maintained,phonetic_pairs,colexifies,cosine_sim,n_char,pos.x,pos.y,pos_pairs,pos_same,path_pairs,wup_pairs,Semantic_pairs,Ontological_pairs
0,187,*nuku,nuku,poly1242,626.0,land,Austronesian,Proto Polynesian,Person/Thing,The physical world,*nuku,2023.0,crowd,Person/Thing,Quantity,"('land', 'crowd')",0.0,1.0,1,0.084035,4.0,n,n,"('n', 'n')",1.0,0.100000,0.307692,0,1
1,210,*refu,refu,poly1242,2.0,dust,Austronesian,Proto Polynesian,Person/Thing,The physical world,*refu,646.0,ash,Person/Thing,The physical world,"('dust', 'ash')",0.0,1.0,1,0.460755,4.0,n,n,"('n', 'n')",1.0,0.125397,0.476035,1,1
2,211,*lefu,lefu,poly1242,2.0,dust,Austronesian,Proto Polynesian,Person/Thing,The physical world,*lefu,1843.0,thousand,Number,Quantity,"('dust', 'thousand')",0.0,1.0,1,0.124580,4.0,n,n,"('n', 'n')",1.0,0.076923,0.250000,0,0
3,218,*qoru,qoru,poly1242,640.0,mud,Austronesian,Proto Polynesian,Person/Thing,The physical world,*qoru,1145.0,swamp,Person/Thing,The physical world,"('mud', 'swamp')",0.0,1.0,1,0.407885,4.0,n,n,"('n', 'n')",1.0,0.081169,0.260504,1,1
4,219,*pela,pela,poly1242,640.0,mud,Austronesian,Proto Polynesian,Person/Thing,The physical world,*pela,1558.0,similar,Property,Spatial relations,"('mud', 'similar')",0.0,1.0,1,-0.001168,4.0,n,a,"('n', 'a')",0.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1891,942,,,,,side,,,,,,,fowl,,,"('side', 'fowl')",,,0,0.064977,,n,n,"('n', 'n')",1.0,0.088462,0.320513,0,0
1892,943,,,,,swamp,,,,,,,lagoon,,,"('swamp', 'lagoon')",,,0,0.410354,,n,n,"('n', 'n')",1.0,0.111111,0.333333,0,0
1893,944,,,,,fishing line,,,,,,,barley,,,"('fishing line', 'barley')",,,0,,,,n,,,,,0,0
1894,945,,,,,towel,,,,,,,rag,,,"('towel', 'rag')",,,0,0.432502,,n,n,"('n', 'n')",1.0,0.129936,0.335935,0,0


In [None]:
# Load the rpy2 IPython extension; the cells starting with %%R rely on it.
%load_ext rpy2.ipython

In [None]:
%%R
# This cell runs in R (through the rpy2 extension) so that it works on Colab.
# The explanation lives here rather than on the %%R line, because text placed
# after the magic is parsed as arguments of the magic itself.

library(dplyr)
library(readr)
library(tidyr)

# Read the same two feature tables that the Python cells load.
polynesian_df <- read_csv('polynesian_df_features.csv')
romance_df <- read_csv('romance_df_features.csv')

# Join on all shared columns and keep every row from both tables.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
# NOTE(review): the Python side stacks the frames with pandas.concat; confirm
# that this join yields the same set of rows.
df <- merge(romance_df, polynesian_df, all.x = TRUE, all.y = TRUE)

#set categorical data
cat_cols <- c('maintained', 'colexifies', 'pos_pairs', 'pos_same',
              'Semantic_pairs', 'Ontological_pairs', 'Family')
df[, cat_cols] <- lapply(df[, cat_cols], as.factor)

#data overview
print(summary(df))
str(df)


── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  X1 = col_double(),
  Concepticon_ID.x = col_double(),
  Concepticon_ID.y = col_double(),
  maintained = col_double(),
  phonetic_pairs = col_double(),
  colexifies = col_double(),
  cosine_sim = col_double(),
  n_char = col_double(),
  pos_same = col_double(),
  path_pairs = col_double(),
  wup_pairs = col_double(),
  Semantic_pairs = col_double(),
  Ontological_pairs = col_double()
)
ℹ Use `spec()` for the full column specifications.


── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  X1 = col_double(),
  Concepticon_ID.x = col_double(),
  Concepticon_ID.y = col_double(),
  maintained = col_double(),
  phonetic_pairs = col_double(),
  colexifies = col_double(),
  cosine_sim = col_double(),
  n_char = col_double(),
  pos_same = col_double(),
  path_pairs = col_double(),
  wup_pairs = col_double(),
  

# 2. QUANTITATIVE ANALYSIS: LOGISTIC REGRESSION MODELS

 2.1. Which features predict diachronic patterns of colexification (which colexifications are lost/maintained)?

To run this model we select only the attested colexifications as data. Then we run different logistic regression models to analyse which features best predict the loss and maintenance of colexifications.

In [None]:
%%R
#logreg model
# Binomial GLMs fitted on the attested colexifications (colexifies == 1) with
# `maintained` (loss/maintained) as the response.

# Subset once and reuse it, instead of repeating the filter in every glm() call.
attested_colex <- df[df$colexifies == 1, ]

#model1:all features, loss/maintained
m1 <- glm(maintained ~ phonetic_pairs + cosine_sim + n_char + pos_same +
            path_pairs + wup_pairs + Semantic_pairs + Ontological_pairs,
          data = attested_colex,
          family = 'binomial')

#models for individual features, loss/maintained
m2 <- glm(maintained ~ phonetic_pairs,    data = attested_colex, family = 'binomial')
m3 <- glm(maintained ~ cosine_sim,        data = attested_colex, family = 'binomial')
m4 <- glm(maintained ~ n_char,            data = attested_colex, family = 'binomial')
m5 <- glm(maintained ~ pos_same,          data = attested_colex, family = 'binomial')
m6 <- glm(maintained ~ path_pairs,        data = attested_colex, family = 'binomial')
m7 <- glm(maintained ~ wup_pairs,         data = attested_colex, family = 'binomial')
m8 <- glm(maintained ~ Semantic_pairs,    data = attested_colex, family = 'binomial')
m9 <- glm(maintained ~ Ontological_pairs, data = attested_colex, family = 'binomial')

#best model(in terms of AIC)
m10 <- glm(maintained ~ phonetic_pairs + cosine_sim + path_pairs +
             wup_pairs + Semantic_pairs + Ontological_pairs,
           data = attested_colex,
           family = 'binomial')

# Print every summary explicitly; a bare summary() in the middle of the cell
# would only repeat what is printed here.
print('ALL FEATURES')
print(summary(m1))
print('INDIVIDUAL MODELS')
for (single_model in list(m2, m3, m4, m5, m6, m7, m8, m9)) {
  print(summary(single_model))
}
print('BEST MODEL')
print(summary(m10))

Now, for the best model, train and test it 10 times to check the accuracy and robustness of the model.

In [None]:
#multiple models at once
import random
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 10       # base seed for reproducible splits
N_SPLITS = 10   # number of random train/test repetitions
FEATURES = ['phonetic_pairs', 'cosine_sim', 'path_pairs',
            'wup_pairs', 'Semantic_pairs', 'Ontological_pairs']
TARGET = 'maintained'

#set random seed
# random.seed only affects Python's own `random` module; scikit-learn's splitter
# is seeded separately through `random_state` below.
random.seed(SEED)

# Keep only complete rows. .copy() makes supp_df an independent frame, so
# adding the prediction column does not raise SettingWithCopyWarning.
# NOTE(review): dropna() drops rows with a missing value in ANY column, not
# only in the model columns -- confirm this is the intended sample.
supp_df = df.dropna().copy()

#fit the model with the wanted features
x = supp_df[FEATURES]
y = supp_df[TARGET]
model = LogisticRegression()
model.fit(x, y)
# In-sample predictions of the model fitted on all complete rows.
supp_df['log_reg_prediction'] = model.predict(x)

#now lets set several train-test sets (in this case 10) to acquire a more robust model
#this way we ensure the result is not just because of the data set in the train or test model
accuracies = []
for i in range(N_SPLITS):
  # Default split is 75% train / 25% test; a different but fixed seed per
  # repetition keeps the splits distinct and reproducible.
  data_train, data_test = train_test_split(supp_df, random_state=SEED + i)
  x_train = data_train[FEATURES]
  y_train = data_train[TARGET]
  x_test = data_test[FEATURES]
  y_test = data_test[TARGET]
  # Fit a fresh model on the training part only, so the test rows are really
  # unseen; scoring the all-data `model` here would leak the test rows.
  split_model = LogisticRegression()
  split_model.fit(x_train, y_train)
  accuracies.append(split_model.score(x_test, y_test))
print(accuracies)
#mean of all the accuracies
print(sum(accuracies)/len(accuracies))

[0.780327868852459, 0.8, 0.7901639344262295, 0.8262295081967214, 0.780327868852459, 0.8229508196721311, 0.8360655737704918, 0.8, 0.8163934426229508, 0.7967213114754098]
0.8049180327868852


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


2.2 Check for crosslinguistic heterogeneity


In [None]:
%%R
# Heterogeneity check on the attested colexifications (colexifies == 1):
# clm1 adds Family to the full feature set, clm2 uses Family as the only predictor.
clm1 <- glm(
  formula = maintained ~ Family + phonetic_pairs + cosine_sim + n_char +
    pos_same + path_pairs + wup_pairs + Semantic_pairs + Ontological_pairs,
  family = "binomial",
  data = df[df$colexifies == 1, ]
)
print(summary(clm1))

clm2 <- glm(
  formula = maintained ~ Family,
  family = "binomial",
  data = df[df$colexifies == 1, ]
)
print(summary(clm2))


Call:
glm(formula = maintained ~ Family + phonetic_pairs + cosine_sim + 
    n_char + pos_same + path_pairs + wup_pairs + Semantic_pairs + 
    Ontological_pairs, family = "binomial", data = df[df$colexifies == 
    1, ])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.2185  -0.6933   0.2190   0.6736   2.9576  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -1.36883    0.40343  -3.393 0.000691 ***
FamilyIndo-European  0.09884    0.17607   0.561 0.574533    
phonetic_pairs      -0.74017    0.08747  -8.462  < 2e-16 ***
cosine_sim           7.03741    0.61139  11.511  < 2e-16 ***
n_char              -0.02584    0.04698  -0.550 0.582322    
pos_same1            0.04699    0.32600   0.144 0.885390    
path_pairs           4.63146    1.88200   2.461 0.013858 *  
wup_pairs           -2.89712    0.82136  -3.527 0.000420 ***
Semantic_pairs1      0.58966    0.17649   3.341 0.000835 ***
Ontological_pairs1   0.34554    0.2042

# 3. QUALITATIVE ANALYSIS

Plots for every feature, with percentages comparing loss/maintained and between linguistic families, to further analyse what happens to colexifications through time.

In [None]:
%%R
#qualitative analysis
library(ggplot2)

#first select just the attested colexifications and split the df into polynesian and romance
# The Polynesian rows carry Family == 'Austronesian' ('Proto Polynesian' is a
# value of the `variety` column); the Romance rows carry Family == 'Indo-European'.
attested_colex <- df[df$colexifies == 1, ]
pol_att <- attested_colex[attested_colex$Family == 'Austronesian', ]
rom_att <- attested_colex[attested_colex$Family == 'Indo-European', ]

# Plot the proportions of one column within each level of another.
#   data  : data frame to tabulate (e.g. attested_colex, pol_att, rom_att)
#   var1  : name (string) of the column shown on the x axis
#   var2  : name (string) of the grouping column; proportions sum to 1 within
#           each of its levels
#   title : plot title
# Returns the ggplot object.
plot_proportions <- function(data, var1, var2, title = 'title') {
  #turn count data into proportions for a better comparison
  counts <- table(data[[var1]], data[[var2]], dnn = c(var1, var2))
  props <- as.data.frame(prop.table(counts, 2))
  #plot the data
  ggplot(props, aes(x = .data[[var1]], y = Freq, fill = .data[[var2]])) +
    geom_col(position = 'dodge') +
    labs(x = var1, fill = var2) +
    ggtitle(title)
}

# Example call -- change the data frame, the two column names and the title to
# the chosen ones.
print(plot_proportions(attested_colex, 'Semantic_pairs', 'maintained', 'title'))