<a href="https://colab.research.google.com/github/Garrafao/WUGs/blob/main/scripts/misc/one_for_all.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook loads datasets of semantic proximity (Word-in-Context) for various languages in the [WUG format](https://www.ims.uni-stuttgart.de/en/research/resources/experiment-data/wugs/). We provide the data in a minimal and an extended format. There are in total 4 dataframes: judgments_full, judgments_wug, uses_full and uses_wug. There are 20 transformed datasets. The notebook should run off-the-shelf in a Colab environment with Python 3.8.

Many of the data sets are transformed when running the notebook. We cannot guarantee that there are no errors. Hence, please make sure that you compare the created data frames to the original data sets before doing serious research with them.

Note: Please run this script without gpu on colab.

The datasets and their versions are as follows:

#RuDSI - Russian
rudsi = 'https://github.com/kategavrishina/RuDSI/tree/main/data'

#NorDiaChange - Norwegian
nordia1 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset1/data'
nordia2 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset2/data'

#RuShiftEval - Russian
rushifteval1 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval1/data'
rushifteval2 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval2/data'
rushifteval3 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval3/data'

#RuSemShift - Russian
rusemshift1 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_1/DWUG/data'
rusemshift2 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_2/DWUG/data'

#DiscoWUG - German (Version: 1.1.1)
https://zenodo.org/record/7396225/files/discowug.zip



#SURel - German (Version: 3.0.0)
https://zenodo.org/record/5784569/files/surel.zip


#DURel - German (Version: 3.0.0)
https://zenodo.org/record/5784453/files/durel.zip


#DWUG DE- German (Version: 2.3.0)
https://zenodo.org/record/7441645/files/dwug_de.zip


#RefWUG - German (Version: 1.1.0)
https://zenodo.org/record/5791269/files/refwug.zip


#DWUG EN - English (Version: 2.0.1)
https://zenodo.org/record/7387261/files/dwug_en.zip


#DWUG SV - Swedish(Version: 2.0.1)
https://zenodo.org/record/7389506/files/dwug_sv.zip


#DWUG ES - Spanish(Version: 4.0.0)
https://zenodo.org/record/6433667/files/dwug_es.zip


#DiaWUG - Spanish (Version: 1.1.0)
https://zenodo.org/record/5791193/files/diawug.zip


#DUPS_WUG - English (version 2.0.0)
https://zenodo.org/record/5500223/files/DUPS-WUG.zip

#WIC - English (version v1.0)
https://pilehvar.github.io/wic/package/WiC_dataset.zip

#TempoWIC - English
https://codalab.lisn.upsaclay.fr/my/datasets/download/3e22f138-ca00-4b10-a0fd-2e914892200d

#Raw-C - English
https://raw.githubusercontent.com/seantrott/raw-c/main/data/processed/raw-c.csv

#Usim - English
http://www.dianamccarthy.co.uk/downloads/WordMeaningAnno2012/

#CosimLex - English, Croatian, Finnish
https://www.clarin.si/repository/xmlui/handle/11356/1308/allzip


In [141]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import io
import numpy as np
import os
from zipfile import ZipFile
import csv

In [142]:
# Fetch the WUGs repository; the conversion scripts run in the following cells live in its scripts/misc directory.
!git clone https://github.com/Garrafao/WUGs.git #contains transformation scripts

fatal: destination path 'WUGs' already exists and is not an empty directory.


In [143]:
# Install fuzzywuzzy, a dependency of the Raw-C conversion script (rawc2wug.py) that is run further down.
!pip install fuzzywuzzy #needed for rawc script

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Run the USim conversion script from the cloned repository; `bash -e` stops the script at the first failing command.
!cd /content/WUGs/scripts/misc && bash -e usim2data.sh #transform USim to WUG

In [None]:
# Run the TempoWiC conversion script (evonlp2wug.sh) from the cloned repository; `bash -e` stops it at the first failing command.
!cd /content/WUGs/scripts/misc && bash -e evonlp2wug.sh  #transforms tempowic to wug

In [None]:
# Download the spaCy model fi_core_news_sm, which the CoSimLex conversion below requires.
!python3 -m spacy download fi_core_news_sm #needed for cosimlex

In [None]:
# Download the spaCy model hr_core_news_sm, which the CoSimLex conversion below requires.
!python3 -m spacy download hr_core_news_sm #needed for cosimlex

In [None]:
# Run the CoSimLex conversion script from the cloned repository; `bash -e` stops it at the first failing command.
!cd /content/WUGs/scripts/misc && bash -e cosimlex2wug.sh #transforms cosimlex to wug

In [None]:
#RuDSI
rudsi = 'https://github.com/kategavrishina/RuDSI/tree/main/data'

#NorDiaChange
nordia1 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset1/data'
nordia2 = 'https://github.com/ltgoslo/nor_dia_change/tree/main/subset2/data'

#RuShiftEval
rushifteval1 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval1/data'
rushifteval2 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval2/data'
rushifteval3 = 'https://github.com/akutuzov/rushifteval_public/tree/main/durel/rushifteval3/data'

#RuSemShift
rusemshift1 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_1/DWUG/data'
rusemshift2 = 'https://github.com/juliarodina/RuSemShift/tree/master/rusemshift_2/DWUG/data'

#Discowug
!wget https://zenodo.org/record/7396225/files/discowug.zip
with ZipFile('discowug.zip', 'r') as discowug:
    discowug.extractall()


#surel
!wget https://zenodo.org/record/5784569/files/surel.zip
with ZipFile('surel.zip', 'r') as surel:
    surel.extractall()

#durel
!wget https://zenodo.org/record/5784453/files/durel.zip
with ZipFile('durel.zip', 'r') as durel:
    durel.extractall()

#DWUG DE
!wget https://zenodo.org/record/7441645/files/dwug_de.zip
with ZipFile('dwug_de.zip', 'r') as dwug_de:
    dwug_de.extractall()

#RefWUG
!wget https://zenodo.org/record/5791269/files/refwug.zip
with ZipFile('refwug.zip', 'r') as refwug:
    refwug.extractall()

#DWUG EN
!wget https://zenodo.org/record/7387261/files/dwug_en.zip
with ZipFile('dwug_en.zip', 'r') as dwug_en:
    dwug_en.extractall()


#DWUG SV
!wget https://zenodo.org/record/7389506/files/dwug_sv.zip
with ZipFile('dwug_sv.zip', 'r') as dwug_sv:
    dwug_sv.extractall()


#DWUG ES
!wget https://zenodo.org/record/6433667/files/dwug_es.zip
with ZipFile('dwug_es.zip', 'r') as dwug_es:
    dwug_es.extractall()

#DiaWUG
!wget https://zenodo.org/record/5791193/files/diawug.zip
with ZipFile('diawug.zip', 'r') as diawug:
    diawug.extractall()


#DUPS_WUG
!wget https://zenodo.org/record/5500223/files/DUPS-WUG.zip
with ZipFile('DUPS-WUG.zip', 'r') as dups:
    dups.extractall()




In [None]:
# Execute the WiC conversion notebook from the cloned repository inside this session.
%run /content/WUGs/scripts/misc/wic2wug.ipynb #transforms WIC dataset to wug

In [151]:
# Execute the Raw-C conversion script from the cloned repository inside this session.
%run /content/WUGs/scripts/misc/rawc2wug.py #Raw-C to wug

In [152]:
# Directory listings of the two conversion output roots:
# direc[0] -> wugdata entries, direc[1] -> wugformat entries.
# (The next cell treats the first as tempowic and the second as cosimlex.)
direc = [
    os.listdir('WUGs/scripts/misc/wugdata'),
    os.listdir('WUGs/scripts/misc/wugformat'),
]



In [153]:
# Data root directories; later cells address them by position in this list.
paths = ['WUGs/scripts/misc/wugdata/' + name + '/data/' for name in direc[0]]    #tempowic
paths.extend(
    'WUGs/scripts/misc/wugformat/' + name + '/wug_all/data/all' for name in direc[1]   #cosimlex
)
paths.extend([
    '/content/WiC_data/',          #wic
    "WUGs/scripts/misc/data/",     #usim
    "/content/raw-c/",             #rawc
])




In [154]:
# For every data root in `paths`, the names of the entries it contains
# (lemma-wise folders for tempowic, cosimlex, wic, usim and rawc).
folders = [os.listdir(root) for root in paths]


In [155]:
# judgments.csv paths for tempowic (path_j) and usim (path_usim).
# NOTE(review): the positions are hard-coded and depend on the order that
# os.listdir returned earlier; the uses.csv cell reads positions 0, 3 and 4
# whereas this one reads 0, 3 and 5 - confirm which set is intended.
path_j = [
    paths[position] + entry + "/judgments.csv"     #tempowic
    for position in (0, 3, 5)
    for entry in folders[position]
]

path_usim = [paths[10] + entry + "/judgments.csv" for entry in folders[10]]    #usim

In [156]:
# judgments.csv paths of the three cosimlex data roots (positions 6-8 of `paths`).
# The path for position 8 used to be stored under a different name and the
# previous path was appended a second time, so position 7 was read twice and
# position 8 never; all three are now listed exactly once.
pathco = [paths[position] + "/judgments.csv" for position in (6, 7, 8)]    #cosimlex


In [158]:
#final list judgments paths and dataframe for wic and rawc
path_k = [paths[9] + entry + "/judgments.csv" for entry in folders[9]]     #wic
p = [paths[11] + entry + "/judgments.csv" for entry in folders[11]]        #rawc

#judgments dataframes for the wic and rawc datasets
# The per-file frames are collected and concatenated once; calling pd.concat
# inside the loop copies all rows read so far on every iteration.
wic_frames = []
for csv_path in path_k:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[3]     # folder name directly below the wic root
    wic_frames.append(frame)
wic_df = pd.concat(wic_frames) if wic_frames else pd.DataFrame()

rawc_frames = []
for csv_path in p:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)
    frame['dataset'] = csv_path.split('/')[2]     # third '/'-separated component of the path
    rawc_frames.append(frame)
rawc_df = pd.concat(rawc_frames) if rawc_frames else pd.DataFrame()

In [159]:
# Combined wic + rawc judgments. (The former first assignment bound the
# DataFrame class itself and was overwritten immediately, so it is dropped.)
raw_df = pd.concat([wic_df, rawc_df])

In [160]:
# Every row of the wic/rawc judgments gets the language label English.
raw_df = raw_df.assign(language='English')

In [161]:
# Renumber the rows from 0; the old index is discarded.
raw_df.reset_index(drop=True, inplace=True)

In [163]:
# Give the dataset values dev/train/test their wic_* names.
wic_dataset_names = {"dev": 'wic_dev', "train": 'wic_train', "test": 'wic_test'}
for folder_name, dataset_name in wic_dataset_names.items():
    raw_df.loc[raw_df["dataset"] == folder_name, "dataset"] = dataset_name


In [164]:
#cosimlex judgments dataframe
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
cosim_frames = []
for csv_path in pathco:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[4]     # folder name below the wugformat directory
    cosim_frames.append(frame)
cosim_df = pd.concat(cosim_frames) if cosim_frames else pd.DataFrame()

In [165]:
# Derive the language column from the folder code stored in "dataset".
language_by_code = {"fi": 'Finnish', "hr": 'Croatian', "en": 'English'}
for code, language in language_by_code.items():
    cosim_df.loc[cosim_df["dataset"] == code, "language"] = language


In [166]:
# Replace the folder code in "dataset" by the per-language Cosimlex name.
dataset_by_language = {
    "Finnish": 'Cosimlex_fi',
    "Croatian": 'Cosimlex_hr',
    "English": 'Cosimlex_en',
}
for language, dataset_name in dataset_by_language.items():
    cosim_df.loc[cosim_df["language"] == language, "dataset"] = dataset_name

In [167]:
# Renumber the rows from 0; the old index is discarded.
cosim_df.reset_index(drop=True, inplace=True)

In [168]:
# Take the dwug_en entry out of the usim judgment paths
# (raises ValueError when that path is not in the list).
excluded_judgments = 'WUGs/scripts/misc/data/dwug_en/judgments.csv'
path_usim.remove(excluded_judgments)

In [169]:
#judgments dataframes for tempowic (judge_df) and usim (jud_df)
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
tempowic_frames = []
for csv_path in path_j:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[4]     # folder name below the wugdata directory
    tempowic_frames.append(frame)
judge_df = pd.concat(tempowic_frames) if tempowic_frames else pd.DataFrame()

usim_frames = []
for csv_path in path_usim:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)
    frame['dataset'] = csv_path.split('/')[3]     # yields "data" for the usim root; renamed later
    usim_frames.append(frame)
jud_df = pd.concat(usim_frames) if usim_frames else pd.DataFrame()


In [170]:
# tempowic judgments followed by usim judgments, in one frame.
tempowic_then_usim = [judge_df, jud_df]
judgemt_df = pd.concat(tempowic_then_usim)

In [172]:
# Rows whose dataset value is the folder name "data" are renamed to USim.
is_usim_row = judgemt_df["dataset"].eq("data")
judgemt_df.loc[is_usim_row, "dataset"] = 'USim'

In [173]:
# Label the TempoWic and USim rows as English.
for dataset_name in ("TempoWic", "USim"):
    judgemt_df.loc[judgemt_df["dataset"] == dataset_name, "language"] = 'English'

In [174]:
# Renumber the rows from 0; the old index is discarded.
judgemt_df.reset_index(drop=True, inplace=True)

In [175]:
#WUG data directory paths (created by unpacking the downloaded archives)
dwugde = "dwug_de/data"
dwugen = "dwug_en/data"
dwugsv = "dwug_sv/data"
discowugg = "discowug/data"
durel = "durel/data"
surel = "surel/data"
refwug = "refwug/data"
dwuges = 'dwug_es/data'
diawug = 'diawug/data'
dups = 'DUPS-WUG/data'
dupswug = ''     # not read by any of the visible cells
dwug = [dwugde, dwugen, dwugsv, discowugg, durel, surel, refwug, dwuges, diawug, dups]

# One directory listing per dataset, in the same order as `dwug`.
# Built without a variable named `dir`, which would shadow the builtin for
# the rest of the notebook session.
dirlist = [os.listdir(dataset) for dataset in dwug]

In [176]:
#dwug data paths: judgments.csv of every entry listed in `dirlist`
# The prefixes are given in the order in which `dirlist` was filled.
judgment_roots = (
    "dwug_de/data/",
    "dwug_en/data/",
    "dwug_sv/data/",
    "discowug/data/",
    "durel/data/",
    "surel/data/",
    "refwug/data/",
    "dwug_es/data/",
    "diawug/data/",
    "DUPS-WUG/data/",
)
dwug_j = [
    root + entry + "/judgments.csv"
    for root, entries in zip(judgment_roots, dirlist)
    for entry in entries
]

In [177]:
#dwug data judgments df
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
dwug_judgment_frames = []
for csv_path in dwug_j:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[0]     # top-level folder of the unpacked archive
    dwug_judgment_frames.append(frame)
judgemnt_df = pd.concat(dwug_judgment_frames) if dwug_judgment_frames else pd.DataFrame()


In [179]:
# uses.csv paths for tempowic (path_u) and usim (path_us).
# NOTE(review): the positions are hard-coded and depend on the order that
# os.listdir returned earlier; the judgments.csv cell reads positions 0, 3
# and 5 whereas this one reads 0, 3 and 4 - confirm which set is intended.
path_u = [
    paths[position] + entry + "/uses.csv"      #tempowic
    for position in (0, 3, 4)
    for entry in folders[position]
]
path_us = [paths[10] + entry + "/uses.csv" for entry in folders[10]]    #usim

In [180]:
# uses.csv paths of the three cosimlex data roots (positions 6-8 of `paths`).
path_cou = [paths[position] + "/uses.csv" for position in (6, 7, 8)]

In [181]:
#cosimlex uses df
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
cosim_use_frames = []
for csv_path in path_cou:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[4]     # folder name below the wugformat directory
    cosim_use_frames.append(frame)
cosim_uses_df = pd.concat(cosim_use_frames) if cosim_use_frames else pd.DataFrame()

In [182]:
# Derive the language column from the folder code stored in "dataset".
language_by_code = {"fi": 'Finnish', "hr": 'Croatian', "en": 'English'}
for code, language in language_by_code.items():
    cosim_uses_df.loc[cosim_uses_df["dataset"] == code, "language"] = language

In [183]:
# Replace the folder code in "dataset" by the per-language Cosimlex name.
dataset_by_language = {
    "Finnish": 'Cosimlex_fi',
    "Croatian": 'Cosimlex_hr',
    "English": 'Cosimlex_en',
}
for language, dataset_name in dataset_by_language.items():
    cosim_uses_df.loc[cosim_uses_df["language"] == language, "dataset"] = dataset_name

In [184]:
# Renumber the rows from 0; the old index is discarded.
cosim_uses_df.reset_index(drop=True, inplace=True)

In [186]:
#wic and rawc uses paths and dataframes
path_k = [paths[9] + entry + "/uses.csv" for entry in folders[9]]      #wic
path_r = [paths[11] + entry + "/uses.csv" for entry in folders[11]]    #rawc

# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
wic_use_frames = []
for csv_path in path_k:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[3]     # folder name directly below the wic root
    wic_use_frames.append(frame)
raw_u_df = pd.concat(wic_use_frames) if wic_use_frames else pd.DataFrame()

rawc_use_frames = []
for csv_path in path_r:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)
    frame['dataset'] = csv_path.split('/')[2]     # third '/'-separated component of the path
    rawc_use_frames.append(frame)
raw_us_df = pd.concat(rawc_use_frames) if rawc_use_frames else pd.DataFrame()

In [187]:
# wic uses followed by rawc uses, in one frame.
wic_then_rawc = [raw_u_df, raw_us_df]
raw_uses_df = pd.concat(wic_then_rawc)

In [188]:
# Every row of the wic/rawc uses gets the language label English.
raw_uses_df = raw_uses_df.assign(language='English')

In [189]:
# Give the dataset values dev/train/test their wic_* names.
wic_dataset_names = {"dev": 'wic_dev', "train": 'wic_train', "test": 'wic_test'}
for folder_name, dataset_name in wic_dataset_names.items():
    raw_uses_df.loc[raw_uses_df["dataset"] == folder_name, "dataset"] = dataset_name

In [190]:
# Take the dwug_en entry out of the usim uses paths
# (raises ValueError when that path is not in the list).
excluded_uses = "WUGs/scripts/misc/data/dwug_en/uses.csv"
path_us.remove(excluded_uses)

In [191]:
#uses dataframes for tempowic (u_df) and usim (ud_df)
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
tempowic_use_frames = []
for csv_path in path_u:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[4]     # folder name below the wugdata directory
    tempowic_use_frames.append(frame)
u_df = pd.concat(tempowic_use_frames) if tempowic_use_frames else pd.DataFrame()

usim_use_frames = []
for csv_path in path_us:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)
    frame['dataset'] = csv_path.split('/')[3]     # yields "data" for the usim root; renamed later
    usim_use_frames.append(frame)
ud_df = pd.concat(usim_use_frames) if usim_use_frames else pd.DataFrame()


In [192]:
# tempowic uses followed by usim uses, in one frame.
tempowic_then_usim = [u_df, ud_df]
use_df = pd.concat(tempowic_then_usim)

In [193]:
# Rows whose dataset value is the folder name "data" are renamed to USim.
is_usim_row = use_df["dataset"].eq("data")
use_df.loc[is_usim_row, "dataset"] = 'USim'

In [195]:
# Every row of the tempowic/usim uses gets the language label English.
use_df = use_df.assign(language='English')

In [196]:
#dwug data uses paths: uses.csv of every entry listed in `dirlist`
# The prefixes are given in the order in which `dirlist` was filled.
use_roots = (
    "dwug_de/data/",
    "dwug_en/data/",
    "dwug_sv/data/",
    "discowug/data/",
    "durel/data/",
    "surel/data/",
    "refwug/data/",
    "dwug_es/data/",
    "diawug/data/",
    "DUPS-WUG/data/",
)
dwug_u = [
    root + entry + "/uses.csv"
    for root, entries in zip(use_roots, dirlist)
    for entry in entries
]

In [197]:
# Language of each downloaded WUG dataset, keyed by its "dataset" value.
language_by_dataset = {
    "dwug_de": 'German',
    "dwug_en": 'English',
    "DUPS-WUG": 'English',
    "dwug_es": 'Spanish',
    "dwug_sv": 'Swedish',
    "durel": 'German',
    "surel": 'German',
    "discowug": 'German',
    "refwug": 'German',
    "diawug": 'Spanish',
}
for dataset_name, language in language_by_dataset.items():
    judgemnt_df.loc[judgemnt_df["dataset"] == dataset_name, "language"] = language


In [198]:
#final judgments df (without russian and norwegian datasets)
# One concat over all parts, in the original order; this avoids seeding the
# result with an empty frame and re-copying it for every additional part.
judgment_df = pd.concat([judgemt_df, judgemnt_df, raw_df, cosim_df], axis=0)

In [199]:
# Renumber the rows from 0; the old index is discarded.
judgment_df.reset_index(drop=True, inplace=True)

In [200]:
#uses dwug df
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
dwug_use_frames = []
for csv_path in dwug_u:
    frame = pd.read_csv(csv_path, delimiter='\t', quoting=3)    # 3 == csv.QUOTE_NONE
    frame['dataset'] = csv_path.split('/')[0]     # top-level folder of the unpacked archive
    dwug_use_frames.append(frame)
usee_df = pd.concat(dwug_use_frames) if dwug_use_frames else pd.DataFrame()

In [201]:
# Language of each downloaded WUG dataset, keyed by its "dataset" value.
language_by_dataset = {
    "dwug_de": 'German',
    "dwug_en": 'English',
    "DUPS-WUG": 'English',
    "dwug_es": 'Spanish',
    # "dwug_la": 'latin',   (disabled in the original cell)
    "dwug_sv": 'Swedish',
    "durel": 'German',
    "surel": 'German',
    "discowug": 'German',
    "refwug": 'German',
    "diawug": 'Spanish',
}
for dataset_name, language in language_by_dataset.items():
    usee_df.loc[usee_df["dataset"] == dataset_name, "language"] = language

In [202]:
#combining uses df
# Row order: dwug uses, tempowic/usim uses, wic/rawc uses, cosimlex uses.
use_parts = [usee_df, use_df, raw_uses_df, cosim_uses_df]
uses_full_df = pd.concat(use_parts[:2], axis=0)
uses1_df = pd.concat([uses_full_df, use_parts[2]], axis=0)
uses_df_full = pd.concat([uses1_df, use_parts[3]], axis=0)

In [203]:
#getting the data
# Group the listing-page URLs per dataset family.
# The single RuDSI URL is wrapped only while it is still a string, so that
# re-running this cell does not nest the list one level deeper each time.
if isinstance(rudsi, str):
    rudsi = [rudsi]
nordia = [nordia1, nordia2]
rushift = [rushifteval1, rushifteval2, rushifteval3]
rusem = [rusemshift1, rusemshift2]

# Filled by a later cell with the links scraped from each listing page.
find_class_nordia = []
find_class_rudsi = []
find_class_rusem = []
find_class_rushift = []


In [204]:
def get_class(URL):
    """Return the links scraped from one GitHub directory listing page.

    Args:
        URL: address of the page to download.

    Returns:
        The ``<a>`` tags with class ``Link--primary`` found on the page,
        without the first three matches.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    page = requests.get(URL, timeout=60)
    # Fail loudly on an error response instead of silently returning no links.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): the first three matches are skipped - presumably they are
    # not data folders; confirm against the current GitHub page markup.
    classy = soup.find_all('a', class_="Link--primary")[3:]
    return classy

In [205]:
# Scrape every listing page; each result list receives one entry per page.
# The families are processed in the order rusem, rushift, nordia, rudsi.
for page_urls, found_links in (
    (rusem, find_class_rusem),
    (rushift, find_class_rushift),
    (nordia, find_class_nordia),
    (rudsi, find_class_rudsi),
):
    found_links.extend(get_class(page_url) for page_url in page_urls)

In [206]:
# Empty URL collectors, one judgments list and one uses list per dataset family.
judgements_rusem, judgements_nordia, judgements_rudsi, judgements_rushift = [], [], [], []
uses_rusem, uses_nordia, uses_rudsi, uses_rushift = [], [], [], []


In [207]:
# Turn every scraped link into the URLs of its judgments.csv and uses.csv
# on raw.githubusercontent.com. Families are handled in the order
# rudsi, rusem, rushift, nordia.
raw_host = "https://raw.githubusercontent.com/"
for listings, judgment_urls, use_urls in (
    (find_class_rudsi, judgements_rudsi, uses_rudsi),
    (find_class_rusem, judgements_rusem, uses_rusem),
    (find_class_rushift, judgements_rushift, uses_rushift),
    (find_class_nordia, judgements_nordia, uses_nordia),
):
    for links in listings:
        for link in links:
            judgment_urls.append(raw_host + link['href'] + "/judgments.csv")
            use_urls.append(raw_host + link['href'] + "/uses.csv")

In [208]:
#judgments dataframe for rudsi
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rudsi_judgment_frames = []
for url in judgements_rudsi:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[5]      # sixth '/'-separated component of the URL
    rudsi_judgment_frames.append(frame)
jud_rudsi = pd.concat(rudsi_judgment_frames) if rudsi_judgment_frames else pd.DataFrame()


In [209]:
#judgments dataframe for rusemshift
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rusem_judgment_frames = []
for url in judgements_rusem:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[8]      # ninth '/'-separated component of the URL
    rusem_judgment_frames.append(frame)
jud_rusems = pd.concat(rusem_judgment_frames) if rusem_judgment_frames else pd.DataFrame()


In [210]:
#judgments dataframe for rushifteval
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rushift_judgment_frames = []
for url in judgements_rushift:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[9]      # tenth '/'-separated component of the URL
    rushift_judgment_frames.append(frame)
jud_rushift = pd.concat(rushift_judgment_frames) if rushift_judgment_frames else pd.DataFrame()

In [211]:
#judgments dataframe for nordiachange
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
nordia_judgment_frames = []
for url in judgements_nordia:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[8]      # ninth component; subset1/subset2 are expected here
    nordia_judgment_frames.append(frame)
jud_nordia = pd.concat(nordia_judgment_frames) if nordia_judgment_frames else pd.DataFrame()

In [212]:
# Replace the folder names subset1/subset2 by the NorDiaChange dataset names.
nordia_dataset_names = {'subset1': 'NorDiaChange1', 'subset2': 'NorDiaChange2'}
for folder_name, dataset_name in nordia_dataset_names.items():
    jud_nordia.loc[jud_nordia['dataset'] == folder_name, 'dataset'] = dataset_name

In [213]:
# Judgments of the GitHub-hosted datasets in one frame.
# One concat over all parts, in the original order; this avoids seeding the
# result with an empty frame and re-copying it for every additional part.
judgements_df = pd.concat([jud_rudsi, jud_rusems, jud_rushift, jud_nordia])

In [214]:
# Start with Russian for every row; the Norwegian rows are relabelled in the next cell.
judgements_df = judgements_df.assign(language="Russian")

In [215]:
# Both NorDiaChange subsets are Norwegian.
is_nordia_row = judgements_df["dataset"].isin(["NorDiaChange1", "NorDiaChange2"])
judgements_df.loc[is_nordia_row, "language"] = 'Norwegian'

In [216]:
#uses dataframe for rudsi
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rudsi_use_frames = []
for url in uses_rudsi:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[5]      # sixth '/'-separated component of the URL
    rudsi_use_frames.append(frame)
use_rudsi = pd.concat(rudsi_use_frames) if rudsi_use_frames else pd.DataFrame()

In [217]:
#uses dataframe for rusemshift
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rusem_use_frames = []
for url in uses_rusem:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[8]      # ninth '/'-separated component of the URL
    rusem_use_frames.append(frame)
use_rusems = pd.concat(rusem_use_frames) if rusem_use_frames else pd.DataFrame()

In [218]:
#uses dataframe for rushifteval
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
rushift_use_frames = []
for url in uses_rushift:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[9]      # tenth '/'-separated component of the URL
    rushift_use_frames.append(frame)
use_rushift = pd.concat(rushift_use_frames) if rushift_use_frames else pd.DataFrame()

In [219]:
#uses dataframe for nordiachange
# Frames are collected and concatenated once instead of growing the result
# with pd.concat on every iteration (which re-copies all earlier rows).
nordia_use_frames = []
for url in uses_nordia:
    # Drop the "/tree" segment that the scraped GitHub link carries.
    text = requests.get(url.replace("/tree", "")).content.decode('utf-8')
    frame = pd.read_csv(io.StringIO(text), delimiter='\t')
    frame['dataset'] = url.split('/')[8]      # ninth '/'-separated component of the URL
    nordia_use_frames.append(frame)
use_nordia = pd.concat(nordia_use_frames) if nordia_use_frames else pd.DataFrame()

In [220]:
# Uses of the GitHub-hosted datasets in one frame.
# One concat over all parts, in the original order; this avoids seeding the
# result with an empty frame and re-copying it for every additional part.
usees_df = pd.concat([use_rudsi, use_rusems, use_rushift, use_nordia])

In [221]:
# The NorDiaChange uses are tagged 'subset1'/'subset2' when they are read and,
# unlike the judgments, were never renamed; the Norwegian label below could
# therefore never match and these rows stayed labelled Russian. Rename them
# first so uses and judgments share the same dataset names.
usees_df.loc[usees_df['dataset'] == 'subset1', 'dataset'] = 'NorDiaChange1'
usees_df.loc[usees_df['dataset'] == 'subset2', 'dataset'] = 'NorDiaChange2'

# Russian is the default; the two NorDiaChange subsets are Norwegian.
usees_df['language'] = 'Russian'
usees_df.loc[usees_df["dataset"] == "NorDiaChange1", "language"] = 'Norwegian'
usees_df.loc[usees_df["dataset"] == "NorDiaChange2", "language"] = 'Norwegian'

In [222]:
#final judgments dataframe full format
# Rows of judgment_df first, then the GitHub-hosted datasets (judgements_df).
judgment_parts = [judgment_df, judgements_df]
judgments_full = pd.concat(judgment_parts, axis="index")


In [223]:
#final uses dataframe full format
# Rows of uses_df_full first, then the GitHub-hosted datasets (usees_df).
use_parts = [uses_df_full, usees_df]
uses_full = pd.concat(use_parts, axis="index")

In [224]:
#the concatenated uses and judgments frames carry repeated index values, so both are renumbered from 0
judgments_full.reset_index(drop=True, inplace=True)
uses_full.reset_index(drop=True, inplace=True)

In [225]:
#final uses and judgments in wug format: a fixed column subset of the full frames
judgment_columns = [
    "identifier1", "identifier2", "annotator", "judgment",
    "comment", "lemma", "dataset", "language",
]
use_columns = [
    'lemma', 'pos', 'date', 'grouping', 'identifier', 'description',
    'context', 'indexes_target_token', 'indexes_target_sentence',
    'dataset', 'language',
]
judgments_wug = judgments_full.loc[:, judgment_columns]
uses_wug = uses_full.loc[:, use_columns]

In [233]:
# Rows of uses_full that repeat an earlier row in every column.
is_repeated_row = uses_full.duplicated()
dup = uses_full.loc[is_repeated_row]

In [227]:
# Write the minimal-format judgments, one tab-separated file per dataset,
# into a folder named after the dataset.
# NOTE(review): quotechar='' is reportedly rejected by the csv module of
# newer Python versions (3.11+) - confirm before moving off Python 3.8.
for dataset_name in judgments_wug["dataset"].value_counts().index:
    rows_of_dataset = judgments_wug.loc[judgments_wug["dataset"] == dataset_name]
    if not os.path.exists(dataset_name):
        os.mkdir(dataset_name)
    rows_of_dataset.to_csv(
        dataset_name + '/judgments.csv',
        index=False,
        sep='\t',
        encoding='utf-8',
        quoting=csv.QUOTE_NONE,
        quotechar='',
    )

In [228]:
# Write the minimal-format uses, one tab-separated file per dataset,
# into a folder named after the dataset.
for dataset_name in uses_wug["dataset"].value_counts().index:
    rows_of_dataset = uses_wug.loc[uses_wug["dataset"] == dataset_name]
    if not os.path.exists(dataset_name):
        os.mkdir(dataset_name)
    rows_of_dataset.to_csv(
        dataset_name + '/uses.csv',
        index=False,
        sep='\t',
        encoding='utf-8',
        quoting=csv.QUOTE_NONE,
        quotechar='',
    )

In [229]:
# Write the full-format judgments, one tab-separated file per dataset.
# NOTE(review): the target is the same <dataset>/judgments.csv that the
# minimal-format export writes, so that file ends up in the full format -
# confirm this is intended.
for dataset_name in judgments_full["dataset"].value_counts().index:
    rows_of_dataset = judgments_full.loc[judgments_full["dataset"] == dataset_name]
    if not os.path.exists(dataset_name):
        os.mkdir(dataset_name)
    rows_of_dataset.to_csv(
        dataset_name + '/judgments.csv',
        index=False,
        sep='\t',
        encoding='utf-8',
        quoting=csv.QUOTE_NONE,
        quotechar='',
    )

In [230]:
# Write the full-format uses, one tab-separated file per dataset.
# NOTE(review): the target is the same <dataset>/uses.csv that the
# minimal-format export writes, so that file ends up in the full format -
# confirm this is intended.
for dataset_name in uses_full["dataset"].value_counts().index:
    rows_of_dataset = uses_full.loc[uses_full["dataset"] == dataset_name]
    if not os.path.exists(dataset_name):
        os.mkdir(dataset_name)
    rows_of_dataset.to_csv(
        dataset_name + '/uses.csv',
        index=False,
        sep='\t',
        encoding='utf-8',
        quoting=csv.QUOTE_NONE,
        quotechar='',
    )