In [1]:
#my reference: https://pypi.org/project/conllu/

In [1]:
from io import open
from conllu import parse_tree_incr
from conllu import parse_incr
from conllu import parse
from collections import OrderedDict
import pandas as pd

In [2]:
file = 'TamilTB.v0.1\data\TamilTB.v0.1.utf8.conll'
data_file = open(file, "r", encoding="utf-8")


In [3]:
#Converting conll to a dataframe
columns = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head']
df = pd.DataFrame(columns = columns)
for tokenlist in parse_incr(data_file):
    for word in tokenlist:
        df = df.append({'id': word["id"], 
                        'form' : word["form"],
                        'lemma' : word["lemma"],
                        'upos' : word["upos"],
                        'xpos' : word["xpos"],
                        'feats' : word["feats"],
                        'head' : word["head"]},
                        ignore_index = True
                        )
df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2
1,2,அருகே,அருகே,P,PP-------,,18
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6


In [48]:
#Function to extract the morphemes by stripping away the lemma
def morpheme_extract(lemmalist, wordlist):
    if len(lemmalist)<len(wordlist):
        for i in range(len(lemmalist)):
            if (wordlist[i] != lemmalist[i]):
                difference.append(wordlist[i])
        index = i
        for i in range(len(wordlist)):
            if i > index:
                difference.append(wordlist[i])
                morpheme = ''.join(difference)
                row["Morpheme"] = morpheme
    else:
        row["Morpheme"] = ""


In [49]:
# Going through each row in the dataframe and calling the function to strip away the morphemes
for index, row in df.iterrows():
    word = row["form"]
    wordlist = [ch for ch in word] 
    lemma = row["lemma"]
    lemmalist = [ch for ch in lemma] 
    difference = []
    morpheme_extract(lemmalist, wordlist)

In [95]:
#Creating dataframe out of the frequencies of the morphemes
value_counts_df = pd.DataFrame(df["Morpheme"].value_counts())
value_counts_df.reset_index(inplace=True)
value_counts_df = value_counts_df.rename(columns = {'index':'Morpheme', 'Morpheme':'Counts'})
value_counts_df["Counts"][0]=0
value_counts_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  value_counts_df["Counts"][0]=0


Unnamed: 0,Morpheme,Counts
0,,0
1,கள்,220
2,யில்,166
3,ம்,144
4,ில்,136


In [99]:
#Merging value counts with the original dataframe
merged_df = pd.merge(df, value_counts_df, on="Morpheme", how = "left")
merged_df = merged_df.drop('values',axis=1)
merged_df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme,Counts
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,,0
1,2,அருகே,அருகே,P,PP-------,,18,,0
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,,0
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,136
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,,0


In [120]:
# Removing all the blank morpheme rows
for index, row in merged_df.iterrows():
    if row["Morpheme"] == '':
        merged_df.drop(index, inplace = True)


In [122]:
#Removing everything expect nouns and verbs
for index, row in merged_df.iterrows():
    if (row["upos"] != 'N') and (row["upos"] != 'V':
        merged_df.drop(index, inplace = True)

In [126]:
merged_df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme,Counts
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,136
10,11,நிலையத்துக்குக்க்,நிலையம்,N,NND-3SN--,"{'Cas': 'D', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",12,ததுக்குக்க்,1
16,17,வகையில்,வகை,N,NNL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,யில்,166
17,18,எடுக்கப்,எடு,V,Vu-T---AA,"{'Ten': 'T', 'Voi': 'A', 'Neg': 'A'}",20,க்கப்,69
18,19,படும்,படு,V,VR-F3SNPA,"{'Ten': 'F', 'Per': '3', 'Num': 'S', 'Gen': 'N...",18,ம்,144


In [148]:
#Cleaning up to only get the noun and verb morphemes along with their value counts
columns = ['Morpheme', 'Counts','upos']
adjusted_morpheme_df = pd.DataFrame(columns = columns)
adjusted_morpheme_df["Morpheme"] = merged_df["Morpheme"]
adjusted_morpheme_df["Counts"] = merged_df["Counts"]
adjusted_morpheme_df["upos"] = merged_df["upos"]
adjusted_morpheme_df = adjusted_morpheme_df.drop_duplicates(subset = ["Morpheme"])
adjusted_morpheme_df = adjusted_morpheme_df.sort_values(by='Counts', ascending=False)
adjusted_morpheme_df.head()

Unnamed: 0,Morpheme,Counts,upos
199,கள்,220,N
16,யில்,166,N
18,ம்,144,V
3,ில்,136,N
80,ப்,111,V


In [146]:
len(adjusted_morpheme_df.Morpheme)

381

In [None]:
#Need to add condition to consider morpheme connected to previous word as a full morpheme