In [1]:
import json
import re
import pandas as pd
import numpy as np
import os

In [2]:
# Function for extracting the case information from the morph tag
def get_case(ma):
    """Reduce a morphological tag to its case label or, failing that, a coarse category.

    The tag is split on '.' into tokens. Voice, gender, case, person and number
    tokens are removed; whatever is left is joined into a "tense" string that
    drives the coarse category.

    Parameters
    ----------
    ma : str
        Dot-separated morphological tag, e.g. ``'m. sg. nom.'``. Text inside
        ``(...)`` or ``[...]`` is discarded before tokenising.

    Returns
    -------
    str
        * ``'inst'`` when the case token is ``'i'``;
        * otherwise the case token itself when one is present;
        * otherwise one of ``'adv'``, ``'Ind'``, ``'IV'``, ``'FV'``, or ``''``
          when nothing but gender/person/number tokens were found.

    Notes
    -----
    If a feature occurs more than once in the tag, the last occurrence is the
    one reported, and every occurrence is removed from the leftover tokens.
    """
    indeclinable = ('ind', 'prep', 'interj', 'conj', 'part')
    case_list = ('nom', 'voc', 'acc', 'i', 'inst', 'dat', 'abl', 'g', 'loc')
    gender_list = ('n', 'f', 'm', '*')
    person_list = ('1', '2', '3')
    no_list = ('du', 'sg', 'pl')
    # Voice markers are compared verbatim, i.e. they only match with the
    # leading space that follows the '.' separator.
    pops = (' ac', ' ps')

    def _take(tokens, vocabulary):
        """Return (last matching value or '', tokens that are not in vocabulary)."""
        value = ''
        remaining = []
        for token in tokens:
            stripped = token.strip()
            if stripped in vocabulary:
                value = stripped
            else:
                remaining.append(token)
        return value, remaining

    # Collapse combined number tags to singular.
    ma = ma.replace('sgpl', 'sg').replace('sgdu', 'sg')
    # Empty out bracketed/parenthesised content, then drop the empty '[] '
    # left behind and any stray brackets/spaces at the ends.
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    cleaned = re.sub(r"([\(\[]).*?([\)\]])", r"\g<1>\g<2>", ma)
    cleaned = cleaned.replace('[] ', '').strip(' []')

    tokens = cleaned.split('.')
    if tokens[-1] == '':
        # A trailing '.' produces an empty last token.
        tokens.pop()

    # Build new lists instead of popping while iterating: popping inside the
    # loop makes the iterator skip the element that follows every removal.
    tokens = [token for token in tokens if token not in pops]
    _, tokens = _take(tokens, gender_list)  # gender is removed but not reported
    case, tokens = _take(tokens, case_list)
    person, tokens = _take(tokens, person_list)
    no, tokens = _take(tokens, no_list)

    # Everything that is left describes tense/mood or an indeclinable class.
    tense = ' '.join(token.strip() for token in tokens).strip()

    coarse = 'Noun' if case != '' else ''
    if tense == 'adv':
        coarse = 'adv'
    if tense in indeclinable:
        coarse = 'Ind'
    if tense in ('abs', 'ca abs'):
        coarse = 'IV'
    if tense != '' and coarse == '':
        # A verbal form counts as finite when it carries person or number.
        coarse = 'FV' if (person != '' or no != '') else 'IV'

    if case == 'i':
        # 'i' is the short spelling of the instrumental.
        return 'inst'
    if case != '':
        return case
    return coarse

In [3]:
# Build one row per dependency edge (modifier -> head) found in the training files.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
train_dir = './files/data/STBC_Train/'
store_columns = ['modifier_case', 'head_case', 'relation', 'modifier_pos', 'head_pos', 'file_name']
records = []
# Sorted so the row order does not depend on the file system's listing order.
for file in sorted(os.listdir(train_dir)):
    df = pd.read_csv(train_dir + file)
    for i in range(len(df)):
        head = df.iloc[i, -1]
        # Last column holds the row position of the head; rows without one are skipped.
        if pd.isna(head):
            continue
        head_row = int(head)
        modifier_tag = df.iloc[i, 3]
        head_tag = df.iloc[head_row, 3]
        records.append({'modifier_case': get_case(modifier_tag),
                        'head_case': get_case(head_tag),
                        'relation': df.iloc[i, -2],
                        'modifier_pos': modifier_tag,
                        'head_pos': head_tag,
                        'file_name': file})
store = pd.DataFrame(records, columns=store_columns)

In [4]:
# Count the occurrences of every (head_case, modifier_case, relation) triplet,
# order them from most to least frequent, and add an empty 'Link' column.
# Alternative input kept from the original cell:
# df = pd.read_csv("files/tripples.csv")
data = (
    store
    .groupby(["head_case", "modifier_case", "relation"])
    .count()
    .sort_values(by="head_pos", ascending=False)
    .assign(Link='')
)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,modifier_pos,head_pos,file_name,Link
head_case,modifier_case,relation,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FV,nom,karwa,2036,2036,2036,
nom,nom,viseranam,1401,1401,1401,
FV,acc,karma,1180,1180,1180,
acc,acc,viseranam,922,922,922,
FV,Ind,sambanxah,922,922,922,


In [5]:
# Add the GitHub links for the evidence examples whose frequency is below
# MAX_EVIDENCE_FREQ, and remember those triplets (key -> frequency) in `tripple`.
MAX_EVIDENCE_FREQ = 50
SAMPLES_URL = 'https://github.com/Jivnesh/STBC_Analysis/blob/main/files/samples/'

tripple = dict()
for i in range(len(data)):
    freq = int(data.iloc[i, 0])
    if freq >= MAX_EVIDENCE_FREQ:
        continue
    head_case, modifier_case, relation = data.index[i]
    # '_' is the key separator, so it must not occur inside the relation name
    # (e.g. 'bavalakranasapwami_samanakalah' -> 'bavalakranasapwamisamanakalah').
    key = '_'.join([head_case, modifier_case, relation.replace('_', '')])
    # 'Link' is the last column of `data`.
    data.iloc[i, -1] = SAMPLES_URL + str(freq) + '_' + key
    tripple[key] = freq
data.to_excel("./files/tripples.xlsx")

In [6]:
# tripple_files maps every "head_modifier_relation" key of `tripple` to the list
# of training files that contain at least one evidence of that triplet.
# The frame is scanned once; files keep the order in which they are first seen.
df = store
tripple_files = {key: [] for key in tripple}
seen_files = {key: set() for key in tripple}
for head_case, modifier_case, relation, file_name in zip(
        df['head_case'], df['modifier_case'], df['relation'], df['file_name']):
    if not isinstance(relation, str):
        # A missing relation cannot belong to any triplet.
        continue
    # Same key layout as in `tripple`: underscores are dropped from the relation.
    key = '_'.join([head_case, modifier_case, relation.replace('_', '')])
    if key in tripple_files and file_name not in seen_files[key]:
        seen_files[key].add(file_name)
        tripple_files[key].append(file_name)
    

In [7]:
# Write down the files which have evidence examples.
# For example, the sample file for IV_IV_prayojanam contains every training file
# with such an instance; the matching lines are tagged with ' ####### key #######'.
# Fields are obtained with a plain split(','), i.e. this assumes that no field
# contains a quoted comma.
for key in tripple_files.keys():
    file_name = str(tripple[key]) + '_' + key
    head_case, modifier_case, relation = key.split('_')

    # `with` closes the output file even if reading a training file fails.
    with open('./files/samples/' + file_name, 'w', encoding='utf-8') as w:
        # dict.fromkeys drops duplicates but, unlike set(), keeps a stable order.
        for file in dict.fromkeys(tripple_files[key]):
            with open('./files/data/STBC_Train/' + file, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            # lines[0] is the CSV header.
            for line in lines[1:]:
                fields = line.split(',')

                # Column 5: relation label; an empty label is treated as 'root'.
                r = fields[5].replace('_', '') if fields[5] != '' else 'root'

                # Column 6: row position of the head (empty for the root word).
                t = 'ROOT'
                head_field = fields[6].strip()
                if head_field != '':
                    try:
                        # float() first so that '3.0' is accepted like '3';
                        # +1 skips the header line.
                        head_line = lines[int(float(head_field)) + 1]
                        t = get_case(head_line.split(',')[3])
                    except (ValueError, OverflowError, IndexError):
                        # Unparsable or out-of-range head reference.
                        t = 'ROOT'

                if modifier_case == get_case(fields[3]) and relation == r and head_case == t:
                    w.write(line.replace('\n', ' ####### ' + key + ' #######' + '\n'))
                else:
                    w.write(line)
            # Blank line between two training files.
            w.write('\n')