In [1]:
import json
import numpy
import tqdm
import numpy as np

from tr_syntactic_parser.helper import *

# Noun Suffix Morphemes to CNF

In [2]:
suffix_classes = json.loads(open("tr_syntactic_parser/grammar/all_suffix_classes.json", encoding="utf-8").read())

In [3]:
cnf_suff = ""
for org, targ in CNF_TAG_DICT_NOUN_SUFF.items():
    for item in suffix_classes[org]:
        cnf_suff += targ + " -> " + item + "\n"

print(cnf_suff)

POSPRO1 -> m
POSPRO1 -> üm
POSPRO1 -> ım
POSPRO1 -> yım
POSPRO1 -> i
POSPRO1 -> um
POSPRO1 -> yim
POSPRO1 -> yum
POSPRO1 -> im
POSPROPL1 -> yiz
POSPROPL1 -> yüz
POSPROPL1 -> ız
POSPROPL1 -> üz
POSPROPL1 -> yuz
POSPROPL1 -> iz
POSPROPL1 -> yız
POSPROPL1 -> lım
POSPROPL1 -> uz
POSPROPL1 -> k
POSPRO2 -> sin
POSPRO2 -> s
POSPRO2 -> ı
POSPRO2 -> n
POSPRO2 -> sun
POSPRO2 -> sın
POSPROPL2 -> yin
POSPROPL2 -> ın
POSPROPL2 -> nüz
POSPROPL2 -> siniz
POSPROPL2 -> niz
POSPROPL2 -> sınız
POSPROPL2 -> ı
POSPROPL2 -> n
POSPROPL2 -> nız
POSPROPL2 -> sunuz
POSPROPL2 -> nuz
POSPRO3  -> ı
POSPRO3  -> s
POSPRO3  -> n
POSPRONPL3  -> ler
POSPRONPL3  -> r
POSPRONPL3  -> lar
POSPRONPL3  -> l
POSPRONPL3  -> a



# Lexicon

In [53]:
lexicon = json.loads(open("tr_syntactic_parser/grammar/the_lexicon_reversed.json", encoding="utf-8").read())

In [54]:
POS_CONVERT_DICT = {
    "ADP" : "PP",
    "CCONJ" : "PP",
    "SCONJ" : "pp",
    "VERB" : "VP",
    "NOUN" : "NP",
    "PUNCT" : "PUNC"
}

POS_IMP = {
    "PUNC" : 0,
    "NUM" : 1,
    "VP" : 2,
    "NP" : 3
}

In [55]:
CNF_lexicon = {}
for key, values in lexicon.items():
    for val in values:
        if key in POS_CONVERT_DICT.keys(): key = POS_CONVERT_DICT[key]
            
        if val in CNF_lexicon.keys():
            if CNF_lexicon[val] in POS_IMP.keys(): cost = POS_IMP[CNF_lexicon[val]]
            else: cost = 100
            
            if key in POS_IMP.keys(): curr_cost = POS_IMP[key]
            else: curr_cost = 99
            if cost > curr_cost: CNF_lexicon[val] = key
        else: 
            CNF_lexicon[val] = key

In [56]:
with open("tr_syntactic_parser/grammar/the_lexicon.json","w", encoding='utf-8') as f:
    json.dump(CNF_lexicon, f)

print("Lexicon was saved to :", "tr_syntactic_parser/grammar/the_lexicon.json")

Lexicon was saved to : tr_syntactic_parser/grammar/the_lexicon.json


In [57]:
lex = ""

for key, val in CNF_lexicon.items():
    lex += val + " -> " + key + "\n"

# Merged Verb Suffixes

In [58]:
verb_suff_comb = json.loads(open("tr_syntactic_parser/grammar/combined_verb_suffix_morphemes.json", encoding="utf-8").read())

In [59]:
suff_comb = ""
for key in verb_suff_comb.keys():
    if "Past" in key: tense = "PAST"
    elif "Prog" in key: tense = "PRE"
    elif "Fut" in key: tense = "FUT"
    else: continue

    if "1" in key: finalkey = tense+"1"
    elif "2" in key: finalkey = tense+"2"
    elif "3" in key: finalkey = tense+"3"
    else: finalkey = tense+"3"

    for suff in verb_suff_comb[key]:
        suff_comb += "\n" + finalkey + " -> " + suff
print(suff_comb)


PAST3 -> ydi
PAST3 -> tı
PAST3 -> du
PAST3 -> tu
PAST3 -> ydı
PAST3 -> dü
PAST3 -> tü
PAST3 -> ydu
PAST3 -> dı
PAST3 -> ydü
PAST3 -> di
PAST3 -> ti
PAST3 -> medi
PAST3 -> madı
PRE1 -> iyor
PRE1 -> ıyor
PRE1 -> yor
FUT1 -> eceğim
PAST1 -> dım
PAST1 -> dum
PAST1 -> dim
PAST1 -> tım
PAST1 -> dük
PAST1 -> dık
PAST1 -> dik
FUT3 -> yecek
PAST3 -> emediler
FUT3 -> ecektir
FUT1 -> meyeceğiz
FUT3 -> meyecek
FUT3 -> mayacak
PRE1 -> mıyorlar
PRE1 -> miyor
PAST3 -> tiler
PAST3 -> dılar
PAST3 -> rdı
FUT3 -> meyecekler
PRE2 -> maktadır
PAST3 -> mişti
PAST3 -> mıştı
PRE2 -> mekte
PAST2 -> din
PAST1 -> iyordu
PRE1 -> ıyorum
PAST2 -> dunuz
PRE1 -> yorlar
PRE1 -> ıyorlar
PRE1 -> ıyorsun


# Predefined Rules

In [68]:
pre_rules = open("tr_syntactic_parser/grammar/tr_cnf_rules.txt", encoding="utf-8").read()

In [69]:
lines = pre_rules + suff_comb + cnf_suff + lex
print(lines)

###### PAST TENSE ######
########################
# Pronoun, Verb+past 
S -> PRO1 VPPAST1
S -> PRO2 VPPAST2
S -> PRO3 VPPAST3

# NounPhrase, Verb+past
S -> NP VPPAST1
S -> NP VPPAST2
S -> NP VPPAST3

# PronounClause(Pronoun + NounPhrase), Verb+past
S -> PRONOUNCLAUSE1 VPPAST1
S -> PRONOUNCLAUSE2 VPPAST2
S -> PRONOUNCLAUSE3 VPPAST3

VPPAST1 -> VP PAST1
VPPAST2 -> VP PAST2
VPPAST3 -> VP PAST3

###### PRESENT TENSE ######
###########################
# Pronoun, Verb+present
S -> PRO1 VPPRE1
S -> PRO2 VPPRE2
S -> PRO3 VPPRE3

# NounPhrase, Verb+present
S -> NP VPPRE1
S -> NP VPPRE2
S -> NP VPPRE3

# PronounClause(Pronoun + NounPhrase), Verb+present
S -> PRONOUNCLAUSE1 VPPRE1
S -> PRONOUNCLAUSE2 VPPRE2
S -> PRONOUNCLAUSE3 VPPRE3

VPPRE1 -> VP PRE1
VPPRE2 -> VP PRE2
VPPRE3 -> VP PRE3

###### FUTURE TENSE ######
##########################
# Pronoun, Verb+future
S -> PRO1 VPFUT1
S -> PRO2 VPFUT2
S -> PRO3 VPFUT3

# NounPhrase, Verb+future
S -> NP VPFUT1
S -> NP VPFUT2
S -> NP VPFUT3

# PronounC

In [70]:
for line in lines.split("\n"):
    print(line)

###### PAST TENSE ######
########################
# Pronoun, Verb+past 
S -> PRO1 VPPAST1
S -> PRO2 VPPAST2
S -> PRO3 VPPAST3

# NounPhrase, Verb+past
S -> NP VPPAST1
S -> NP VPPAST2
S -> NP VPPAST3

# PronounClause(Pronoun + NounPhrase), Verb+past
S -> PRONOUNCLAUSE1 VPPAST1
S -> PRONOUNCLAUSE2 VPPAST2
S -> PRONOUNCLAUSE3 VPPAST3

VPPAST1 -> VP PAST1
VPPAST2 -> VP PAST2
VPPAST3 -> VP PAST3

###### PRESENT TENSE ######
###########################
# Pronoun, Verb+present
S -> PRO1 VPPRE1
S -> PRO2 VPPRE2
S -> PRO3 VPPRE3

# NounPhrase, Verb+present
S -> NP VPPRE1
S -> NP VPPRE2
S -> NP VPPRE3

# PronounClause(Pronoun + NounPhrase), Verb+present
S -> PRONOUNCLAUSE1 VPPRE1
S -> PRONOUNCLAUSE2 VPPRE2
S -> PRONOUNCLAUSE3 VPPRE3

VPPRE1 -> VP PRE1
VPPRE2 -> VP PRE2
VPPRE3 -> VP PRE3

###### FUTURE TENSE ######
##########################
# Pronoun, Verb+future
S -> PRO1 VPFUT1
S -> PRO2 VPFUT2
S -> PRO3 VPFUT3

# NounPhrase, Verb+future
S -> NP VPFUT1
S -> NP VPFUT2
S -> NP VPFUT3

# PronounC

In [71]:
grammar_all = []
for line in lines.split("\n"):
    if "#" in line or len(line) < 1: continue
    grammar_all.append(line)
    
grammar_all = "\n".join(grammar_all)
print(grammar_all)

S -> PRO1 VPPAST1
S -> PRO2 VPPAST2
S -> PRO3 VPPAST3
S -> NP VPPAST1
S -> NP VPPAST2
S -> NP VPPAST3
S -> PRONOUNCLAUSE1 VPPAST1
S -> PRONOUNCLAUSE2 VPPAST2
S -> PRONOUNCLAUSE3 VPPAST3
VPPAST1 -> VP PAST1
VPPAST2 -> VP PAST2
VPPAST3 -> VP PAST3
S -> PRO1 VPPRE1
S -> PRO2 VPPRE2
S -> PRO3 VPPRE3
S -> NP VPPRE1
S -> NP VPPRE2
S -> NP VPPRE3
S -> PRONOUNCLAUSE1 VPPRE1
S -> PRONOUNCLAUSE2 VPPRE2
S -> PRONOUNCLAUSE3 VPPRE3
VPPRE1 -> VP PRE1
VPPRE2 -> VP PRE2
VPPRE3 -> VP PRE3
S -> PRO1 VPFUT1
S -> PRO2 VPFUT2
S -> PRO3 VPFUT3
S -> NP VPFUT1
S -> NP VPFUT2
S -> NP VPFUT3
S -> PRONOUNCLAUSE1 VPFUT1
S -> PRONOUNCLAUSE2 VPFUT2
S -> PRONOUNCLAUSE3 VPFUT3
VPFUT1 -> VP FUT1
VPFUT2 -> VP FUT2
VPFUT3 -> VP FUT3
S -> PRO1 NP POSPRO1
S -> PRO2 NP POSPRO2
S -> PRO3 NP POSPRO3
NP -> NP POSPRO1
NP -> NP POSPRO2
NP -> NP POSPRO3
PRONOUNCLAUSE1 -> PRO1 NP
PRONOUNCLAUSE2 -> PRO2 NP
PRONOUNCLAUSE3 -> PRO3 NP
S -> NP VP PREPARTICLE
NP -> ADJ NP
NP -> ADJ NP PL
NP -> ADJ SING NP
PLNOUN -> NP PL
SINGNP -> SING

In [72]:
with open("tr_syntactic_parser/grammar/grammar_all.txt","w", encoding='utf-8') as f:
    f.write(grammar_all)

print("Grammar was saved to :", "tr_syntactic_parser/grammar/grammar_all.txt")

Grammar was saved to : tr_syntactic_parser/grammar/grammar_all.txt


# TEST

In [79]:
temp_lexicon = lexicon.copy()

for key in temp_lexicon.keys():
    temp_lexicon[key] = temp_lexicon[key][0:5]

    
temp_lexicon

{'NOUN': ['35-uluslu', 'bakiye', 'amerikalı', 'broncs', 'ikmal'],
 'PROPN': ["mcdonald's'ın", 'asheville', 'hud', 'marsam', 'cannes'],
 'VERB': ['çıldır', 'yayın', 'kuvvet', 'bollaş', 'magazin'],
 'PUNCT': ['-', '…', '$', '”', '?'],
 'ADJ': ['atomik', 'yayın', 'kuvvet', '1987', 'diplomatik'],
 'NUM': ['1300000', '8.347', '3\\/4', '1987', '142.75'],
 'DET': ['bir', 'birçok', 'her', 'birkaç', 'hiçbir'],
 'ADV': ['kat', 'inatla', 'var', 'karşı', 'dol'],
 'ADP': ['yanısıra', 'yukarı', 'birlikte', 'nazaran', 'gibi'],
 'PRON': ['birbiri', 'sen', 'hepsi', 'biz', 'ne'],
 'CCONJ': ['hem', 'olsun', 'ne', 've', 'örneğin'],
 'AUX': ['değil', 'mi'],
 'X': ['%3.5', '%7.875', '%8.01', '21:00-22:30', '%83'],
 'SCONJ': ['eğer', 'ki'],
 'INTJ': ['oh', 'yaşasın', 'hadi', 'aman', 'ah']}

In [66]:
temp = {}
for key,value in lexicon.items():
    for val in value:
        if val in temp.keys(): temp[val].append(key)
        else: temp[val] = [key]

In [67]:
temp["."]

['VERB', 'PUNCT']

In [94]:
import pandas as pd
df = pd.DataFrame({
    "Combined Tags" : list(verb_suff_comb.keys()),
    "Suffixes" : list(verb_suff_comb.values())
})
for i in range(len(df)):
    key = df["Combined Tags"][i]
    if "Past" in key: tense = "PAST"
    elif "Prog" in key: tense = "PRE"
    elif "Fut" in key: tense = "FUT"
    else: continue

    if "1" in key: finalkey = tense+"1"
    elif "2" in key: finalkey = tense+"2"
    elif "3" in key: finalkey = tense+"3"
    else: finalkey = tense+"3"
    
    df.loc[i,"CNF Tag"] = finalkey

In [95]:
df

Unnamed: 0,Combined Tags,Suffixes,CNF Tag
0,Past,"[ydi, tı, du, tu, ydı, dü, tü, ydu, dı, ydü, d...",PAST3
1,NegPast,"[medi, madı]",PAST3
2,Prog1,"[iyor, ıyor, yor]",PRE1
3,FutA1sg,[eceğim],FUT1
4,PastA1sg,"[dım, dum, dim, tım]",PAST1
5,PastA1pl,"[dük, dık, dik]",PAST1
6,Fut,[yecek],FUT3
7,UnablePastA3pl,[emediler],PAST3
8,FutCop,[ecektir],FUT3
9,NegFutA1pl,[meyeceğiz],FUT1
