<a href="https://colab.research.google.com/github/LuciaPitarch/Colexification-Patterns/blob/main/1_Raw_to_clean_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas
import nltk
# Tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetch both so the notebook runs on old and new NLTK versions alike.
nltk.download('punkt_tab')
from google.colab import files


In [None]:
# Load raw data. Accessible at: https://clics.clld.org/download
# The gdown shell command fetches the file by its ID; the read_csv call below
# expects it to be saved as 'df_all_raw.csv' in the working directory.
!gdown --id 1EySPG8ZDMfMTvhUQaw8w1k6NgywI4N4S #df_all_raw
raw_df = pandas.read_csv('df_all_raw.csv')

This part shows an overview of the raw data

In [None]:
# Raw data overview: print a summary of raw_df (columns, dtypes, non-null counts)
raw_df.info()

In [None]:
# Number of rows in the raw data for each language family, largest first
(
    raw_df
    .groupby('Family')
    .size()
    .sort_values(ascending=False)
)

In [None]:
# Varieties per language family:
# keep one row per distinct (Family, Glottocode) pair ...
df_fam = raw_df[['Family', 'Glottocode']].drop_duplicates()
# ... then count those pairs per family, largest family first
n_varieties = df_fam.groupby('Family').size().sort_values(ascending=False)
n_varieties

In [None]:
# Diachronic data overview
# Tokenize each variety name so that keywords are matched as whole tokens.
raw_df['tk_variety'] = raw_df['variety'].apply(nltk.tokenize.word_tokenize)
# Keywords used to spot diachronic varieties in the tokenized names
diachronic_keywords = {'Old', 'Middle', 'Classic', 'Ancient', 'Proto'}
# Flag rows whose variety name contains at least one keyword (1) or none (0).
# The whole column is assigned at once rather than through chained
# `raw_df['flag'].iloc[i] += 1`, which is slow, triggers
# SettingWithCopyWarning and does not update raw_df under pandas Copy-on-Write.
raw_df['flag'] = (
    raw_df['tk_variety']
    .apply(lambda tokens: any(token in diachronic_keywords for token in tokens))
    .astype(int)
)
# Show data for diachronic varieties
diachronic_varieties = raw_df[raw_df['flag'] > 0]
diachronic_varieties

In [None]:
# Rows per diachronic variety within the Indo-European and Austronesian families
for family in ('Indo-European', 'Austronesian'):
    family_rows = diachronic_varieties[diachronic_varieties['Family'] == family]
    print(family_rows.groupby('variety').size())

Data cleaning

In [None]:
# Keep only the columns of interest, as an independent copy of the raw data
wanted_columns = [
    'Form',
    'clics_form',
    'Concepticon_Gloss',
    'Ontological_Category',
    'Semantic_Field',
    'variety',
    'Family',
    'Latitude',
    'Longitude',
]
clean_df = raw_df.loc[:, wanted_columns].copy()

In [None]:
# Select wanted varieties (Romance and Polynesian)
def select_varieties(language_list, df=None):
  """Return the rows whose 'variety' is one of the names in `language_list`.

  Parameters
  ----------
  language_list : iterable of str
      Variety names to keep; matching is exact (case-sensitive equality).
  df : pandas.DataFrame, optional
      Frame to filter; must have a 'variety' column. Defaults to the
      notebook-level `clean_df`.

  Returns
  -------
  pandas.DataFrame
      A new frame with the matching rows, a fresh 0..n-1 index and a trailing
      'flag' column set to 0 (kept so the output has the same columns as
      before). The input frame is never modified.
  """
  source_df = clean_df if df is None else df
  # Vectorised membership test: no per-row Python loop, no chained assignment,
  # and no assumption that the index is a default RangeIndex.
  wanted = source_df['variety'].isin(list(language_list))
  # .copy() guarantees the result is independent of the source frame.
  new_df = source_df.loc[wanted].copy().reset_index(drop=True)
  new_df['flag'] = 0
  return new_df

In [None]:
# Lists of hand-selected wanted varieties, grouped by language family.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated ('Occitan' 'Galician-std' -> 'OccitanGalician-std'),
# which would drop both varieties from the selection.
romance_languages = [
    'Spanish', 'Friulian', 'Ladin', 'Seychelles creole', 'Portuguese',
    'Catalan', 'French', 'Provençal', 'Italian', 'Latin', 'Sardinian',
    'Latin-std', 'Romanian', 'Occitan',
    'Galician-std', 'Romagnol-std', 'Old Spanish', 'Old Italian',
    'Old French', 'Old Provençal', 'Middle French',
]
polynesian_languages = [
    'Proto Polynesian', 'Tongan', 'Tikopia', 'Anuta', 'Rennell', 'Samoan',
    'Rapa Nui', 'Rapanui', 'Tahitian', 'Maori', 'Tuamotuan', 'Marquesan',
    'Hawaiian',
]

In [None]:
# Build one filtered DataFrame per language family from the hand-picked lists
romance_df, polynesian_df = (
    select_varieties(romance_languages),
    select_varieties(polynesian_languages),
)

In [None]:
# Write each family's DataFrame to CSV, then hand the file to files.download
for frame, filename in (
    (romance_df, 'romance_df.csv'),
    (polynesian_df, 'polynesian_df.csv'),
):
  frame.to_csv(filename)
  files.download(filename)