In [1]:
import json
import re
import pandas as pd
import numpy as np
import os

In [2]:
# Function for extracting the case information from the morph tag
def get_case(ma):
    indeclinable = ['ind','prep','interj','prep','conj','part']
    case_list = ['nom','voc','acc','i','inst','dat','abl','g','loc']
    gender_list = ['n','f','m','*']
    person_list = ['1','2','3']
    no_list = ['du','sg','pl']
    pops = [' ac',' ps']
    ma=ma.replace('sgpl','sg').replace('sgdu','sg')
    temp = re.sub("([\(\[]).*?([\)\]])", "\g<1>\g<2>", ma).replace('[] ','').strip(' []')
    temp = temp.split('.')
    if temp[-1] == '':
        temp.pop(-1)
    # Remove active passive
    case=''
    no=''
    person=''
    gender=''
    tense=''
    coarse=''
    for a,b in enumerate(temp):
        if b in pops:
            temp.pop(a)
    # Get gender
    for a,b in enumerate(temp):
        if b.strip() in gender_list:
            gender = b.strip()
            temp.pop(a)
    # Get case
    for a,b in enumerate(temp):
        if b.strip() in case_list:
            case = b.strip()
            temp.pop(a)
    if case!= '':
        coarse ='Noun'
    # Get person
    for a,b in enumerate(temp):
        if b.strip() in person_list:
            person = b.strip()
            temp.pop(a)
    # Get no
    for a,b in enumerate(temp):
        if b.strip() in no_list:
            no = b.strip()
            temp.pop(a)
    # Get Tense
    for b in temp:
        tense=tense+ ' '+b.strip()
    tense=tense.strip()

    if tense == 'adv':
        coarse = 'adv'
    for ind in indeclinable:
        if tense == ind:
            coarse = 'Ind'
    if tense == 'abs' or tense == 'ca abs':
        coarse = 'IV'
    if tense!='' and coarse=='':
        if person !='' or no!='':
            coarse= 'FV'
        else:
            coarse = 'IV'
    if case == 'i':
        return 'inst'

    if case !='':
        return case
    else:
        return coarse

In [3]:
store = pd.DataFrame(columns = ['modifier_case','head_case','relation','modifier_pos','head_pos','file_name'])
for file in os.listdir('./files/data/STBC_Train/'):
    df = pd.read_csv('./files/data/STBC_Train/'+file)
    for i in range(len(df)):
        if df.iloc[i,-1]!=df.iloc[i,-1]:
            continue
        store = store.append({'modifier_case' : get_case(df.iloc[i,3]), 
                              'head_case' : get_case(df.iloc[int(df.iloc[i,-1]),3]), 
                              'relation' : df.iloc[i,-2],
                              'modifier_pos' : df.iloc[i,3],
                              'head_pos' :df.iloc[int(df.iloc[i,-1]),3],
                              'file_name': file},ignore_index = True)
store

Unnamed: 0,modifier_case,head_case,relation,modifier_pos,head_pos,file_name
0,nom,Ind,karwa,nom. sg. f.,conj.,Gold_1852.csv
1,loc,Ind,axikaranam,loc. sg. n.,conj.,Gold_1852.csv
2,Ind,FV,prayojanam,conj.,pr. md. sg. 3,Gold_1852.csv
3,nom,nom,viseranam,nom. sg. f.,nom. sg. f.,Gold_1852.csv
4,Ind,nom,sambanxah,conj.,nom. sg. n.,Gold_1852.csv
...,...,...,...,...,...,...
20095,nom,nom,viseranam,nom. sg. m.,nom. sg. m.,Shishu_train_218.csv
20096,nom,FV,karma,nom. sg. m.,pr. ps. sg. 3,Shishu_train_218.csv
20097,Ind,FV,sambanxah,ind.,pr. ps. sg. 3,Shishu_train_218.csv
20098,inst,FV,karwa,i. pl. m.,pr. ps. sg. 3,Shishu_train_218.csv


In [4]:
# Load the triplet file
# df = pd.read_csv("files/tripples.csv")
data = store.groupby(["head_case","modifier_case","relation"]).count().sort_values("head_pos",ascending=False)
data['Link'] = ''
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,modifier_pos,head_pos,file_name,Link
head_case,modifier_case,relation,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FV,nom,karwa,2036,2036,2036,
nom,nom,viseranam,1401,1401,1401,
FV,acc,karma,1180,1180,1180,
acc,acc,viseranam,922,922,922,
FV,Ind,sambanxah,922,922,922,


In [5]:
# Add the GitHub links for the evidence examples whose frequencey is less than 30
tripple = dict()
for i in range(len(data)):
    if int(data.iloc[i,0]) < 30:
        temp = list(data.index[i])
        if temp[2] == 'bavalakranasapwami_samanakalah':
            temp[2]= 'bavalakranasapwamisamanakalah'
        data.iloc[i,-1] = 'https://github.com/Jivnesh/STBC_Analysis/blob/master/'+str(data.iloc[i,0])+'_'+'_'.join(temp)
        tripple['_'.join(temp)] = int(data.iloc[i,0])
data.to_excel("./files/tripples.xlsx")  

In [13]:
# tripple_files is the dict to store the list of files having the coarse-coarse-relation evidences
# Store the names of these evidence files in the dictionary
tripple_files = dict()
df = store
for key in tripple:
    tripple_files[key] = []
for key in tripple_files.keys():
    a,b,c = key.split('_')
    for i in range(len(df)):
        if df.iloc[i,1] == a and df.iloc[i,0] == b and df.iloc[i,2].replace('_','') == c:
            if df.iloc[i,-1] not in tripple_files[key]:
                tripple_files[key].append(df.iloc[i,-1])
    

In [15]:
tripple_files

{'Ind_nom_karwa': ['Gold_1852.csv',
  'Gold_490.csv',
  'Gold_1677.csv',
  'Gold_1029.csv',
  'Gold_2093.csv',
  'Gold_211.csv',
  'Gold_1307.csv',
  'Gold_2087.csv',
  'Gold_1961.csv',
  'Gold_1849.csv',
  'Gold_668.csv',
  'Gold_2256.csv',
  'Gold_1650.csv',
  'Gold_347.csv',
  'Gold_1346.csv',
  'Gold_2463.csv',
  'Gold_996.csv',
  'Gold_2217.csv',
  'Gold_506.csv',
  'Gold_725.csv',
  'Gold_1670.csv',
  'Shishu_train_276.csv',
  'Shishu_train_299.csv',
  'Gold_1184.csv',
  'Gold_831.csv',
  'Gold_71.csv',
  'Gold_589.csv'],
 'inst_IV_viseranam': ['Shishu_train_7.csv',
  'Shishu_train_60.csv',
  'Shishu_train_40.csv',
  'Shishu_train_22.csv',
  'Shishu_train_105.csv',
  'Shishu_train_296.csv',
  'Shishu_train_200.csv',
  'Shishu_train_87.csv',
  'Shishu_train_169.csv',
  'Shishu_train_131.csv',
  'Shishu_train_153.csv',
  'Shishu_train_143.csv',
  'Shishu_train_160.csv',
  'Shishu_train_11.csv',
  'Shishu_train_209.csv',
  'Shishu_train_61.csv',
  'Shishu_train_8.csv',
  'Shishu_tra

In [23]:
# Write down the files which has evidence examples
# For example, IV_IV_prayojanam: denotes all the examples with these instances
for key in tripple_files.keys():
    file_name = str(tripple[key])+'_'+key
    w = open('./files/samples/'+file_name,'w')
    
    h,c,l = key.split('_')
    for file in set(tripple_files[key]):
        f = open('./files/data/STBC_Train/'+file,'r')
        print(file)
        lines = f.readlines()
        f.close()
        j=0
        for line in lines:
            if j==0:
                j+=1
                continue
            if line.split(',')[5] == line.split(',')[5] and line.split(',')[5] != '':
                r = line.split(',')[5].replace('_','')
            else:
                r = 'root'
                
            if line.split(',')[6] == line.split(',')[6] and line.split(',')[6] != '\n':
                try:
                    t = get_case(lines[int(line.split(',')[6])+1].split(',')[3])
                except:
                    t = 'ROOT'
            else:
                t = 'ROOT'
            if c == get_case(line.split(',')[3]) and l == r and h == t:
                w.write(line.replace('\n',' ####### '+ key + ' #######'+'\n'))
            else:
                w.write(line)
            j=j+1
        w.write('\n')
    w.close()   

Gold_996.csv
Gold_506.csv
Shishu_train_299.csv
Gold_2256.csv
Gold_589.csv
Gold_725.csv
Gold_2463.csv
Gold_71.csv
Gold_831.csv
Gold_1184.csv
Gold_2217.csv
Shishu_train_276.csv
Gold_2093.csv
Gold_211.csv
Gold_1670.csv
Gold_1852.csv
Gold_1961.csv
Gold_1849.csv
Gold_347.csv
Gold_1307.csv
Gold_490.csv
Gold_1650.csv
Gold_1346.csv
Gold_668.csv
Gold_1677.csv
Gold_1029.csv
Gold_2087.csv
Shishu_train_61.csv
Shishu_train_60.csv
Shishu_train_109.csv
Shishu_train_80.csv
Shishu_train_22.csv
Shishu_train_131.csv
Shishu_train_40.csv
Shishu_train_182.csv
Shishu_train_144.csv
Shishu_train_296.csv
Shishu_train_222.csv
Shishu_train_53.csv
Shishu_train_153.csv
Shishu_train_87.csv
Shishu_train_142.csv
Shishu_train_143.csv
Shishu_train_8.csv
Shishu_train_160.csv
Shishu_train_200.csv
Shishu_train_177.csv
Gold_14.csv
Shishu_train_11.csv
Shishu_train_169.csv
Shishu_train_209.csv
Shishu_train_105.csv
Shishu_train_7.csv
Shishu_train_136.csv
Gold_2251.csv
Gold_1944.csv
Gold_832.csv
Gold_1315.csv
Gold_1038.csv
Gold

Gold_866.csv
Gold_2120.csv
Gold_100.csv
Gold_1113.csv
Gold_2213.csv
Gold_61.csv
Gold_306.csv
Gold_1657.csv
Gold_415.csv
Gold_544.csv
Gold_1946.csv
Gold_635.csv
Gold_1614.csv
Gold_1331.csv
Gold_1126.csv
Gold_1297.csv
Gold_966.csv
Gold_1181.csv
Gold_633.csv
Shishu_train_205.csv
Gold_1222.csv
Shishu_train_263.csv
Shishu_train_221.csv
Gold_98.csv
Shishu_train_207.csv
Gold_1723.csv
Gold_1883.csv
Shishu_train_151.csv
Gold_1551.csv
Gold_1615.csv
Shishu_train_183.csv
Gold_1541.csv
Gold_1958.csv
Gold_737.csv
Gold_1879.csv
Gold_2164.csv
Gold_1739.csv
Gold_161.csv
Gold_1265.csv
Gold_353.csv
Gold_1248.csv
Gold_990.csv
Gold_858.csv
Gold_1073.csv
Gold_1183.csv
Gold_2442.csv
Gold_1871.csv
Gold_2201.csv
Gold_646.csv
Shishu_train_71.csv
Gold_1888.csv
Gold_1735.csv
Gold_1801.csv
Gold_2282.csv
Gold_1202.csv
Gold_505.csv
Shishu_train_289.csv
Gold_509.csv
Gold_2165.csv
Shishu_train_161.csv
Gold_634.csv
Gold_1406.csv
Gold_515.csv
Gold_2264.csv
Gold_1685.csv
Gold_1732.csv
Gold_1611.csv
Gold_1418.csv
Gold_488

Gold_1211.csv
Gold_2098.csv
Gold_2437.csv
Gold_101.csv
Gold_1524.csv
Gold_2423.csv
Gold_739.csv
Gold_1723.csv
Gold_722.csv
Gold_996.csv
Gold_2380.csv
Shishu_train_276.csv
Gold_1486.csv
Gold_24.csv
Gold_624.csv
Gold_1858.csv
Gold_910.csv
Gold_834.csv
Gold_746.csv
Gold_281.csv
Shishu_train_72.csv
Shishu_train_82.csv
Gold_2323.csv
Shishu_train_42.csv
Gold_75.csv
Gold_1043.csv
Gold_533.csv
Gold_331.csv
Gold_2062.csv
Gold_2022.csv
Gold_510.csv
Gold_2057.csv
Gold_446.csv
Gold_1709.csv
Gold_1185.csv
Gold_1889.csv
Shishu_train_66.csv
Gold_2418.csv
Gold_333.csv
Gold_1976.csv
Gold_634.csv
Gold_776.csv
Gold_726.csv
Gold_2290.csv
Gold_164.csv
Gold_506.csv
Gold_148.csv
Shishu_train_158.csv
Gold_2483.csv
Gold_1379.csv
Gold_2205.csv
Gold_1182.csv
Gold_117.csv
Gold_1134.csv
Gold_317.csv
Gold_1841.csv
Gold_1088.csv
Gold_1525.csv
Gold_707.csv
Gold_1650.csv
Gold_2224.csv
Gold_624.csv
Gold_1724.csv
Gold_510.csv
Gold_391.csv
Gold_2062.csv
Gold_1718.csv
Shishu_train_125.csv
Gold_2423.csv
Gold_354.csv
Gold_1