In [21]:
# import calamancy for dependency parsing
import calamancy
import tl_calamancy_trf
print(calamancy.models())

nlp = tl_calamancy_trf.load()

#doc = nlp("Kumain is Juan sa Rizal kanina.")
#calamancy_model_trf = calamancy.load("tl_calamancy_trf-0.2.0", "tl-calamancy-trf" = 0.2 )

['tl_calamancy_md-0.2.0', 'tl_calamancy_lg-0.2.0', 'tl_calamancy_trf-0.2.0', 'tl_calamancy_md-0.1.0', 'tl_calamancy_lg-0.1.0', 'tl_calamancy_trf-0.1.0']



If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


In [9]:
tl_inp_list = ['Nabali ang bintana dahil sa larangan ng minahan.',
 'Ang bukid ng minahan ay nasa mesa.',
 'Matatag ang minahan.',
 'Sumabog ang larangan ng minahan.',
 'Ang larangan ng minahan ay yari sa metal.',
 'Ibinigay sa akin ng may - ari ang minahan.']

In [12]:
tl_docs = nlp_calamancy.pipe(tl_inp_list)

for doc in tl_docs:
    print(doc)
    for t in doc:
        print(t,t.dep_)

Nabali ang bintana dahil sa larangan ng minahan.
Nabali ROOT
ang det
bintana nsubj
dahil case
sa fixed
larangan advcl
ng case
minahan nmod
. punct
Ang bukid ng minahan ay nasa mesa.
Ang det
bukid nsubj
ng case
minahan nmod
ay discourse
nasa case
mesa ROOT
. punct
Matatag ang minahan.
Matatag ROOT
ang det
minahan nsubj
. punct
Sumabog ang larangan ng minahan.
Sumabog ROOT
ang det
larangan nsubj
ng case
minahan nmod:poss
. punct
Ang larangan ng minahan ay yari sa metal.
Ang det
larangan nsubj
ng case
minahan nmod:poss
ay discourse
yari ROOT
sa case
metal nmod
. punct
Ibinigay sa akin ng may - ari ang minahan.
Ibinigay ROOT
sa case
akin obl
ng case
may compound
- punct
ari obj:agent
ang det
minahan nsubj
. punct


In [None]:
from spacy import displacy

vocab_dict_tl = {}

for t in tl_inp_list:
    doc = nlp(t)
    for token in doc:
        print(token, token.dep_)
        if token.dep_ == "nsubj":
            if str(token) not in vocab_dict_tl.keys():
                vocab_dict_tl[str(token)] = 1
            else: 
                vocab_dict_tl[str(token)] += 1
    displacy.render(doc, style='dep',jupyter=True)
    print("==================")

vocab_dict_tl

In [None]:
# import spacytrf for dependency parsing in english
import spacy

nlp_spacy = spacy.load('en_core_web_trf')

In [25]:
en_inp_list = ['The delinquent eats pizza.',
 'The delinquent is in the store.',
 'The delinquent is happy.',
 'The delinquent is running.',
 'The delinquent was invited to the party.',
 'I was given a gift from the delinquent.']

In [None]:
vocab_dict_en = {}

for t in en_inp_list:
    doc = nlp_spacy(t)
    for token in doc:
        print(token, token.dep_)
        if token.dep_ == "nsubj":
            if str(token) not in vocab_dict_en.keys():
                vocab_dict_en[str(token)] = 1
            else: 
                vocab_dict_en[str(token)] += 1
    print("==================")

vocab_dict_en

# Actual implementation

In [2]:
# imports
# import calamancy for dependency parsing
# import spacytrf for dependency parsing in english

import spacy
import calamancy
import tl_calamancy_trf

nlp_calamancy = tl_calamancy_trf.load()
nlp_spacy = spacy.load('en_core_web_trf')

# config
print(nlp_calamancy.pipe_names)
nlp_calamancy.disable_pipe('ner')
print(nlp_calamancy.pipe_names)
print(nlp_spacy.pipe_names)
nlp_spacy.disable_pipes(['attribute_ruler', 'lemmatizer', 'ner'])
print(nlp_spacy.pipe_names)


If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


['transformer', 'trainable_lemmatizer', 'tagger', 'morphologizer', 'parser', 'ner']
['transformer', 'trainable_lemmatizer', 'tagger', 'morphologizer', 'parser']
['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
['transformer', 'tagger', 'parser']


In [59]:
# for ETA
from datetime import timedelta,datetime

def calamancy_dp(sentence_list, batch_size=64): # processes the text in batches then returns a list of calamancy docs
    doc_list_master = [] # batching is needed for checkpoints
    eta_rolling_average = [timedelta(0),timedelta(0),timedelta(0),timedelta(0),timedelta(0)]
    batch_index=0
    
    # splits the list into batches
    batched_sentences = [sentence_list[i:i + batch_size] for i in range(0, len(sentence_list), batch_size)]
    batched_sentences_len = len(batched_sentences)

    try:
        for sentence_list_batch in batched_sentences: # get the list of doc
            if batch_index % 2 == 0: # check eta every couple of batches
                start_time = datetime.now()
                doc_list_small = nlp_calamancy.pipe(sentence_list_batch)
                doc_list_master += doc_list_small
                end_time = datetime.now()
                batch_index += 1

                # measure ETA
                time_elapsed = ((end_time - start_time) * (batched_sentences_len - batch_index))
                eta_rolling_average.pop(0)
                eta_rolling_average.append(time_elapsed)
                eta = (sum(eta_rolling_average, timedelta(0))/5)
                print(f"calamancy dependecy parsing - parsed: {batch_index*batch_size} ETA: {eta}")
            else:
                doc_list_small = nlp_calamancy.pipe(sentence_list_batch)
                doc_list_master += doc_list_small
                batch_index += 1
        return doc_list_master

    except Exception as e: #if interrupted unexpectedly, save progress
        print(e)
        print("calamancy was unexpectedly stopped. Here's the progress.")
        a = {"batch_index" : batch_index, "master_list" : doc_list_master}
        print(a)
        return a

#def spacy_dp(sentence_list, batch_size=64): # processes the text in batches then returns a list of spacy docs
#    doc_list_master = [] # batching is needed for checkpoints
#    eta_rolling_average = [0,0,0,0,0]
#    batch_index=0
#    
#    # splits the list into batches
#    batched_sentences = [sentence_list[i:i + batch_size] for i in range(0, len(sentence_list), batch_size)]
#    batched_sentences_len = len(batched_sentences)
#
#    try:
#        for sentence_list_batch in batched_sentences: # get the list of doc
#            if batch_index % 2 == 0: # check eta every couple of batches
#                start_time = datetime.now()
#                doc_list_small = nlp_spacy.pipe(sentence_list_batch)
#                doc_list_master += doc_list_small
#                end_time = datetime.now()
#                batch_index += 1
#
#                # measure ETA
#                time_elapsed = ((end_time - start_time) * (batched_sentences_len - batch_index))
#                eta_rolling_average.pop(0)
#                eta_rolling_average.append(time_elapsed)
#                eta = (sum(eta_rolling_average)/5)
#                print(f"spacy dependecy parsing - parsed: {batch_index*batch_size} ETA: {eta}")
#            else:
#                doc_list_small = nlp_calamancy.pipe(sentence_list_batch)
#                doc_list_master += doc_list_small
#                batch_index += 1
#
#    except Exception as e: #if interrupted unexpectedly, save progress
#        print(e)
#        print("calamancy was unexpectedly stopped. Here's the progress.")
#        a = {"batch_index" : batch_index, "master_list" : doc_list_master}
#        print(a)
#        return a
#    

In [57]:
def docList_to_nsubjList(doc_list): # returns a dictionary list of the tallies of the words that got counted as the nsubj
    outputList = []
    for doc in doc_list:
        output_mini = []
        for token in doc:
            if token.dep_ == 'nsubj':
                output_mini.append(token)
        outputList.append(output_mini)
    return outputList

In [60]:
import pandas as pd
# small scale test just to get and idea for an ETA

test_df = pd.read_csv("translated_helsinki.csv")['translated_en_helsinki'].sample(1000)
test_inp = test_df.tolist()

qq = calamancy_dp(test_inp)
print(qq)
ww = docList_to_nsubjList(qq)
ww


calamancy dependecy parsing - parsed: 64 ETA: 0:00:04.630245
calamancy dependecy parsing - parsed: 192 ETA: 0:00:09.141180
calamancy dependecy parsing - parsed: 320 ETA: 0:00:12.588644
calamancy dependecy parsing - parsed: 448 ETA: 0:00:15.158014
calamancy dependecy parsing - parsed: 576 ETA: 0:00:17.248032
calamancy dependecy parsing - parsed: 704 ETA: 0:00:13.992982
calamancy dependecy parsing - parsed: 832 ETA: 0:00:10.262690
calamancy dependecy parsing - parsed: 960 ETA: 0:00:07.109845


[[bintana],
 [],
 [gal],
 [pangkat],
 [bintana],
 [resort],
 [pagsubok],
 [pagmomodelo],
 [bintana],
 [kapatid],
 [pizza],
 [bangin],
 [meridyano],
 [picket],
 [hayop],
 [bard],
 [wika],
 [takip],
 [attendant],
 [obserhista],
 [bintana],
 [pizza],
 [bodega],
 [lumaon],
 [defender],
 [knell],
 [pandinig],
 [austin],
 [gladiator],
 [upuan],
 [pintor],
 [alperes],
 [ugat],
 [masonerya],
 [baseboard],
 [magbubukid],
 [larangan],
 [pagsasanib],
 [pizza],
 [demagoguery],
 [konserbatoryo],
 [pusa],
 [rogasyon],
 [mycenaen],
 [ako],
 [taniman],
 [boot],
 [quoter],
 [ako],
 [pusa],
 [bookend],
 [echolalia],
 [negosyo],
 [lumalabag],
 [ulo],
 [parusang],
 [iroquois],
 [pulso],
 [pizza],
 [gremonyo],
 [pagbabago],
 [pagsasanay],
 [nobela],
 [cashmere],
 [shrike],
 [myelograpiya],
 [tao],
 [bangkong],
 [kumain],
 [storker],
 [ako],
 [tao],
 [pulutong],
 [bahay],
 [browser],
 [epekto],
 [koponan],
 [tlingit],
 [solfege],
 [urceole],
 [ako],
 [slit],
 [bintana],
 [lalaki],
 [subvelitie],
 [pundasyon

In [67]:
# real implementation
tl_df = pd.read_csv("translated_helsinki.csv").drop(columns=['Unnamed: 0'])
to_tl_dp = tl_df['translated_en_helsinki'].values.tolist()

nsubj_from_en_tl_doc = calamancy_dp(to_tl_dp)
nsubj_from_en_tl_list = docList_to_nsubjList(nsubj_from_en_tl_doc)
tl_df['nsubj_from_helsinki_en_tl'] = nsubj_from_en_tl_list
tl_df.to_csv("translated_helsinki_with_tl_dp.csv")
tl_df


calamancy dependecy parsing - parsed: 64 ETA: 0:10:52.672591
calamancy dependecy parsing - parsed: 192 ETA: 0:19:47.694534
calamancy dependecy parsing - parsed: 320 ETA: 0:29:57.467794
calamancy dependecy parsing - parsed: 448 ETA: 0:40:02.665720
calamancy dependecy parsing - parsed: 576 ETA: 0:50:05.988974
calamancy dependecy parsing - parsed: 704 ETA: 0:49:21.185853
calamancy dependecy parsing - parsed: 832 ETA: 0:49:19.056384
calamancy dependecy parsing - parsed: 960 ETA: 0:49:16.730257
calamancy dependecy parsing - parsed: 1088 ETA: 0:48:40.008590
calamancy dependecy parsing - parsed: 1216 ETA: 0:48:49.073288
calamancy dependecy parsing - parsed: 1344 ETA: 0:48:42.736893
calamancy dependecy parsing - parsed: 1472 ETA: 0:49:50.447170
calamancy dependecy parsing - parsed: 1600 ETA: 0:49:14.290775
calamancy dependecy parsing - parsed: 1728 ETA: 0:49:39.044348
calamancy dependecy parsing - parsed: 1856 ETA: 0:49:30.811086
calamancy dependecy parsing - parsed: 1984 ETA: 0:49:22.034287
c

Unnamed: 0,category,lemma,just_category,in_sentenceV2,translated_en_helsinki,nsubj_from_helsinki_en_tl
0,person.n.01,person,person,The person ate the pizza.,Ang taong iyon ay kumain ng pizza.,[taong]
1,person.n.01,person,person,The person is at the store.,Ang tao ay nasa tindahan.,[tao]
2,person.n.01,person,person,The person is happy.,Maligaya ang taong iyon.,[taong]
3,person.n.01,person,person,The person runs.,Tumatakbo ang tao.,[tao]
4,person.n.01,person,person,The person was invited to the party.,Ang taong iyon ay inanyayahan sa salu - salo.,[taong]
...,...,...,...,...,...,...
138589,consequence.n.01,placebo_effect,consequence,The placebo effect is on Saturday.,Ang epekto ng placebo ay sa Sabado.,[epekto]
138590,consequence.n.01,placebo_effect,consequence,The placebo effect is successful.,Ang epekto ng placebo ay matagumpay.,[epekto]
138591,consequence.n.01,placebo_effect,consequence,The placebo effect happened.,Ang epekto ng placebo ay nangyari.,[epekto]
138592,consequence.n.01,placebo_effect,consequence,The placebo effect was cancelled due to rain.,Ang epekto ng placebo ay kinansela dahil sa ulan.,[epekto]


NameError: name 'tl_df' is not defined

# formatting for translation quality assessment

In [2]:
import pandas as pd

translated_to_tl = pd.read_csv("translated_helsinki_with_tl_dp.csv").drop(columns=['Unnamed: 0'])
translated_to_tl

Unnamed: 0,category,lemma,just_category,in_sentenceV2,translated_en_helsinki,nsubj_from_helsinki_en_tl
0,person.n.01,person,person,The person ate the pizza.,Ang taong iyon ay kumain ng pizza.,[taong]
1,person.n.01,person,person,The person is at the store.,Ang tao ay nasa tindahan.,[tao]
2,person.n.01,person,person,The person is happy.,Maligaya ang taong iyon.,[taong]
3,person.n.01,person,person,The person runs.,Tumatakbo ang tao.,[tao]
4,person.n.01,person,person,The person was invited to the party.,Ang taong iyon ay inanyayahan sa salu - salo.,[taong]
...,...,...,...,...,...,...
138589,consequence.n.01,placebo_effect,consequence,The placebo effect is on Saturday.,Ang epekto ng placebo ay sa Sabado.,[epekto]
138590,consequence.n.01,placebo_effect,consequence,The placebo effect is successful.,Ang epekto ng placebo ay matagumpay.,[epekto]
138591,consequence.n.01,placebo_effect,consequence,The placebo effect happened.,Ang epekto ng placebo ay nangyari.,[epekto]
138592,consequence.n.01,placebo_effect,consequence,The placebo effect was cancelled due to rain.,Ang epekto ng placebo ay kinansela dahil sa ulan.,[epekto]


In [41]:
translated_to_tl.dtypes

category                     object
lemma                        object
just_category                object
in_sentenceV2                object
translated_en_helsinki       object
nsubj_from_helsinki_en_tl    object
dtype: object

In [13]:
grouped = translated_to_tl.groupby(by=['lemma', 'just_category'])
#grouped.first()
listed_df = grouped.agg(list)
listed_df = listed_df.reset_index()
listed_df

Unnamed: 0,lemma,just_category,category,in_sentenceV2,translated_en_helsinki,nsubj_from_helsinki_en_tl
0,'hood,location,"[location.n.01, location.n.01, location.n.01, ...","[People visit the 'hood., The 'hood is near th...","[Dinadalaw ng mga tao ang 'hood., Malapit sa i...","[[ang, ', hood], [ang, ', kanluran], [ang, ', ..."
1,'hood,object,"[object.n.01, object.n.01, object.n.01, object...","[The 'hood broke the window., The 'hood is on ...","[Nabali ang bintana dahil sa 'kaguluhan., Ang ...","[[bintana], [], [', kapuua, '], [ang, ', kaput..."
2,a'man,organization,"[organization.n.01, organization.n.01, organiz...","[The a'man launched a new product., The a'man ...",[Ang a'man ay naglunsad ng isang bagong produk...,"[[a'man], [a'man], [a'man], [tao], [a'man], [l..."
3,a-line,object,"[object.n.01, object.n.01, object.n.01, object...","[The a-line broke the window., The a-line is o...","[Nabali ang bintana dahil sa a-line., Ang a-li...","[[bintana], [a, -, line], [a, -, line], [ang, ..."
4,a-team,organization,"[organization.n.01, organization.n.01, organiz...","[The a-team launched a new product., The a-tea...",[Ang a-team ay naglunsad ng isang bagong produ...,"[[a, -, team], [a, -, team], [a, -, team], [a,..."
...,...,...,...,...,...,...
23093,zulu,object,"[object.n.01, object.n.01, object.n.01, object...","[The zulu broke the window., The zulu is on th...","[Nabali ang bintana dahil sa zulu., Nasa mesa ...","[[bintana], [zulu], [zulu], [zulu], [zulu], [a..."
23094,zulu,person,"[person.n.01, person.n.01, person.n.01, person...","[The zulu ate the pizza., The zulu is at the s...","[Kinain ng zulu ang pizza., Ang zulu ay nasa t...","[[pizza], [zulu], [zulu], [zulu], [zulu], [ako]]"
23095,zuni,person,"[person.n.01, person.n.01, person.n.01, person...","[The zuni ate the pizza., The zuni is at the s...","[Kinain ng zuni ang pizza., Ang zuni ay nasa t...","[[pizza], [zuni], [zuni], [zuni], [zuni], [ako]]"
23096,zurvanism,organization,"[organization.n.01, organization.n.01, organiz...","[The zurvanism launched a new product., The zu...",[Ang zurvanismo ay naglunsad ng isang bagong p...,"[[zurvanismo], [zurvanismo], [zurvanismo], [zu..."


In [51]:
import re
def list_of_lists_to_dict(list_inp):
    final_dict = {}
    #print(list_inp)
    for in_list_string in list_inp:
        # a little string builder
        a = []
        for i in in_list_string.split(','):
            i_new = i
            i_new = i_new.replace("[",'')
            i_new = i_new.replace("]",'')
            i_new = i_new.replace(" ",'')
            a.append(i_new)

        #print(a)
        # check if the key exists:
        for i in a:
            if i not in final_dict.keys():
                final_dict[i] = 1
            else:
                final_dict[i] += 1
    return final_dict

listed_df["consolidated_nsubj_helsinki_tl"] = listed_df['nsubj_from_helsinki_en_tl'].apply(lambda x: list_of_lists_to_dict(x))
listed_df

Unnamed: 0,lemma,just_category,category,in_sentenceV2,translated_en_helsinki,nsubj_from_helsinki_en_tl,consolidated_nsubj_helsinki_tl
0,'hood,location,"[location.n.01, location.n.01, location.n.01, ...","[People visit the 'hood., The 'hood is near th...","[Dinadalaw ng mga tao ang 'hood., Malapit sa i...","[[ang, ', hood], [ang, ', kanluran], [ang, ', ...","{'ang': 3, ''': 6, 'hood': 1, 'kanluran': 1, '..."
1,'hood,object,"[object.n.01, object.n.01, object.n.01, object...","[The 'hood broke the window., The 'hood is on ...","[Nabali ang bintana dahil sa 'kaguluhan., Ang ...","[[bintana], [], [', kapuua, '], [ang, ', kaput...","{'bintana': 1, '': 1, ''': 4, 'kapuua': 2, 'an..."
2,a'man,organization,"[organization.n.01, organization.n.01, organiz...","[The a'man launched a new product., The a'man ...",[Ang a'man ay naglunsad ng isang bagong produk...,"[[a'man], [a'man], [a'man], [tao], [a'man], [l...","{'a'man': 4, 'tao': 1, 'lupon': 1}"
3,a-line,object,"[object.n.01, object.n.01, object.n.01, object...","[The a-line broke the window., The a-line is o...","[Nabali ang bintana dahil sa a-line., Ang a-li...","[[bintana], [a, -, line], [a, -, line], [ang, ...","{'bintana': 1, 'a': 4, '-': 4, 'line': 4, 'ang..."
4,a-team,organization,"[organization.n.01, organization.n.01, organiz...","[The a-team launched a new product., The a-tea...",[Ang a-team ay naglunsad ng isang bagong produ...,"[[a, -, team], [a, -, team], [a, -, team], [a,...","{'a': 5, '-': 5, 'team': 5, 'Ang': 1, 'lupon': 1}"
...,...,...,...,...,...,...,...
23093,zulu,object,"[object.n.01, object.n.01, object.n.01, object...","[The zulu broke the window., The zulu is on th...","[Nabali ang bintana dahil sa zulu., Nasa mesa ...","[[bintana], [zulu], [zulu], [zulu], [zulu], [a...","{'bintana': 1, 'zulu': 4, 'ako': 1}"
23094,zulu,person,"[person.n.01, person.n.01, person.n.01, person...","[The zulu ate the pizza., The zulu is at the s...","[Kinain ng zulu ang pizza., Ang zulu ay nasa t...","[[pizza], [zulu], [zulu], [zulu], [zulu], [ako]]","{'pizza': 1, 'zulu': 4, 'ako': 1}"
23095,zuni,person,"[person.n.01, person.n.01, person.n.01, person...","[The zuni ate the pizza., The zuni is at the s...","[Kinain ng zuni ang pizza., Ang zuni ay nasa t...","[[pizza], [zuni], [zuni], [zuni], [zuni], [ako]]","{'pizza': 1, 'zuni': 4, 'ako': 1}"
23096,zurvanism,organization,"[organization.n.01, organization.n.01, organiz...","[The zurvanism launched a new product., The zu...",[Ang zurvanismo ay naglunsad ng isang bagong p...,"[[zurvanismo], [zurvanismo], [zurvanismo], [zu...","{'zurvanismo': 5, 'lupon': 1}"


In [79]:
# a list of stopwords
tl_stopwords = [
    "ako",
    "'",
    "kanila",
    "ang",
    "pizza",
]

tl_stopwords = set(tl_stopwords)

In [80]:
def get_probable_nsubj(dict_inp):
    output_list = []
    #get max value from valid words
    max_val = 0
    for key in dict_inp.keys():
        if key not in tl_stopwords:
            if dict_inp[key] > max_val:
                max_val = dict_inp[key]
    #get the words with the max_val
    for key in dict_inp.keys():
        if key not in tl_stopwords:
            if dict_inp[key] == max_val:
                output_list.append(key)
    return output_list

listed_df['probable_nsubj'] = listed_df["consolidated_nsubj_helsinki_tl"].apply(lambda x: get_probable_nsubj(x))
listed_df

Unnamed: 0,lemma,just_category,category,in_sentenceV2,translated_en_helsinki,nsubj_from_helsinki_en_tl,consolidated_nsubj_helsinki_tl,probable_nsubj
0,'hood,location,"[location.n.01, location.n.01, location.n.01, ...","[People visit the 'hood., The 'hood is near th...","[Dinadalaw ng mga tao ang 'hood., Malapit sa i...","[[ang, ', hood], [ang, ', kanluran], [ang, ', ...","{'ang': 3, ''': 6, 'hood': 1, 'kanluran': 1, '...",[kapuua]
1,'hood,object,"[object.n.01, object.n.01, object.n.01, object...","[The 'hood broke the window., The 'hood is on ...","[Nabali ang bintana dahil sa 'kaguluhan., Ang ...","[[bintana], [], [', kapuua, '], [ang, ', kaput...","{'bintana': 1, '': 1, ''': 4, 'kapuua': 2, 'an...",[kapuua]
2,a'man,organization,"[organization.n.01, organization.n.01, organiz...","[The a'man launched a new product., The a'man ...",[Ang a'man ay naglunsad ng isang bagong produk...,"[[a'man], [a'man], [a'man], [tao], [a'man], [l...","{'a'man': 4, 'tao': 1, 'lupon': 1}",[a'man]
3,a-line,object,"[object.n.01, object.n.01, object.n.01, object...","[The a-line broke the window., The a-line is o...","[Nabali ang bintana dahil sa a-line., Ang a-li...","[[bintana], [a, -, line], [a, -, line], [ang, ...","{'bintana': 1, 'a': 4, '-': 4, 'line': 4, 'ang...","[a, -, line]"
4,a-team,organization,"[organization.n.01, organization.n.01, organiz...","[The a-team launched a new product., The a-tea...",[Ang a-team ay naglunsad ng isang bagong produ...,"[[a, -, team], [a, -, team], [a, -, team], [a,...","{'a': 5, '-': 5, 'team': 5, 'Ang': 1, 'lupon': 1}","[a, -, team]"
...,...,...,...,...,...,...,...,...
23093,zulu,object,"[object.n.01, object.n.01, object.n.01, object...","[The zulu broke the window., The zulu is on th...","[Nabali ang bintana dahil sa zulu., Nasa mesa ...","[[bintana], [zulu], [zulu], [zulu], [zulu], [a...","{'bintana': 1, 'zulu': 4, 'ako': 1}",[zulu]
23094,zulu,person,"[person.n.01, person.n.01, person.n.01, person...","[The zulu ate the pizza., The zulu is at the s...","[Kinain ng zulu ang pizza., Ang zulu ay nasa t...","[[pizza], [zulu], [zulu], [zulu], [zulu], [ako]]","{'pizza': 1, 'zulu': 4, 'ako': 1}",[zulu]
23095,zuni,person,"[person.n.01, person.n.01, person.n.01, person...","[The zuni ate the pizza., The zuni is at the s...","[Kinain ng zuni ang pizza., Ang zuni ay nasa t...","[[pizza], [zuni], [zuni], [zuni], [zuni], [ako]]","{'pizza': 1, 'zuni': 4, 'ako': 1}",[zuni]
23096,zurvanism,organization,"[organization.n.01, organization.n.01, organiz...","[The zurvanism launched a new product., The zu...",[Ang zurvanismo ay naglunsad ng isang bagong p...,"[[zurvanismo], [zurvanismo], [zurvanismo], [zu...","{'zurvanismo': 5, 'lupon': 1}",[zurvanismo]


In [81]:

listed_df.to_json("translated_helsinki_tl_dp_nsubj.json")

In [None]:
keywords_category_tl = listed_df[['lemma','just_category','probable_nsubj']]
#keywords_category_tl.loc[listed_df['probable_nsubj'].str.len() > 1 ]
keywords_category_tl

Unnamed: 0,lemma,just_category,probable_nsubj
0,'hood,location,[kapuua]
1,'hood,object,[kapuua]
2,a'man,organization,[a'man]
3,a-line,object,"[a, -, line]"
4,a-team,organization,"[a, -, team]"
...,...,...,...
23093,zulu,object,[zulu]
23094,zulu,person,[zulu]
23095,zuni,person,[zuni]
23096,zurvanism,organization,[zurvanismo]


In [86]:
def get_tl_for_categ(list_inp):
    if len(list_inp) >= 1 and len(list_inp) <= 2:
        return list_inp
    elif len(list_inp) >= 3:
        inp_copy = list_inp
        trio_words = []
        # check for trios
        start = 0
        end = 2
        while end < len(inp_copy):
            check_trio = [inp_copy[start], inp_copy[start+1], inp_copy[end]]
            if check_trio[1] == '-':
                trio_words.append(f"{inp_copy[start]}{inp_copy[start+1]}{inp_copy[end]}")
                inp_copy.pop(start)
                inp_copy.pop(start)
                inp_copy.pop(start)
            else:
                start += 1
                end += 1
        return list_inp+trio_words
        
keywords_category_tl['tl_word'] = keywords_category_tl['probable_nsubj'].apply(lambda x: get_tl_for_categ(x))
keywords_category_tl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keywords_category_tl['tl_word'] = keywords_category_tl['probable_nsubj'].apply(lambda x: get_tl_for_categ(x))


Unnamed: 0,lemma,just_category,probable_nsubj,tl_word
0,'hood,location,[kapuua],[kapuua]
1,'hood,object,[kapuua],[kapuua]
2,a'man,organization,[a'man],[a'man]
3,a-line,object,[],[a-line]
4,a-team,organization,[],[a-team]
...,...,...,...,...
23093,zulu,object,[zulu],[zulu]
23094,zulu,person,[zulu],[zulu]
23095,zuni,person,[zuni],[zuni]
23096,zurvanism,organization,[zurvanismo],[zurvanismo]


In [89]:
# don't run this alone, yung nauna muna
keywords_category_tl = keywords_category_tl[['just_category', 'tl_word']]
keywords_category_tl = keywords_category_tl.explode('tl_word')
keywords_category_tl = keywords_category_tl.groupby('just_category').agg(list).reset_index()
keywords_category_tl

Unnamed: 0,just_category,tl_word
0,administration,"[administrasyon, burukrasya, pangasiwaan, pang..."
1,aim,"[layon, negosyo, grail, okasyon, punto, bagay]"
2,animal,"[aardvark, abalone, aborsiyon, abrocome, kalal..."
3,consequence,"[resulta, resulta, epekto, bariansiya, epekto,..."
4,event,"[awiting, pag-awit, pagtalikod, pag-aalsa, abd..."
5,location,"[kapuua, abbasya, dingding, katuwaan, acutment..."
6,object,"[kapuua, a-line, aba, ababa, abacus, taong, ab..."
7,organization,"[a'man, a-team, abecedarian, akademiko, akadem..."
8,person,"[taong, abbetor, alaga, abator, tagapag-awdisy..."


In [99]:
# even simpler

person_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "person"]['tl_word'].values.tolist()[0]
object_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "object"]['tl_word'].values.tolist()[0] + keywords_category_tl.loc[keywords_category_tl['just_category'] == "aim"]['tl_word'].values.tolist()[0] 
location_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "location"]['tl_word'].values.tolist()[0]
organization_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "organization"]['tl_word'].values.tolist()[0] + keywords_category_tl.loc[keywords_category_tl['just_category'] == "administration"]['tl_word'].values.tolist()[0] 
event_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "event"]['tl_word'].values.tolist()[0] + keywords_category_tl.loc[keywords_category_tl['just_category'] == "consequence"]['tl_word'].values.tolist()[0] 
animal_list = keywords_category_tl.loc[keywords_category_tl['just_category'] == "animal"]['tl_word'].values.tolist()[0]

keywords_category_tl_final = pd.DataFrame({
    "TAO" : [person_list],
    "BAGAY" : [object_list],
    "LUGAR" : [location_list],
    "ORG" : [organization_list],
    "PANGYAYARI" : [event_list],
    "HAYOP" : [animal_list], 
})

keywords_category_tl_final

Unnamed: 0,TAO,BAGAY,LUGAR,ORG,PANGYAYARI,HAYOP
0,"[taong, abbetor, alaga, abator, tagapag-awdisy...","[kapuua, a-line, aba, ababa, abacus, taong, ab...","[kapuua, abbasya, dingding, katuwaan, acutment...","[a'man, a-team, abecedarian, akademiko, akadem...","[awiting, pag-awit, pagtalikod, pag-aalsa, abd...","[aardvark, abalone, aborsiyon, abrocome, kalal..."


In [100]:
keywords_category_tl_final.to_json("keyword_categ_tl_unassessed.json")