In [1]:
#my reference: https://pypi.org/project/conllu/

In [2]:
from io import open
from conllu import parse_tree_incr
from conllu import parse_incr
from conllu import parse
from collections import OrderedDict
import pandas as pd

In [3]:
# Path to the TamilTB CoNLL file, relative to the notebook's working directory.
# Forward slashes work on Windows and POSIX alike and avoid the invalid '\d' and
# '\T' escape sequences that the backslash-separated literal contained.
file = 'TamilTB.v0.1/data/TamilTB.v0.1.utf8.conll'
# The handle is left open on purpose: the parsing cell below reads from it.
data_file = open(file, "r", encoding="utf-8")


In [25]:
#Setting precision of dataframe as 0
# Use the fully qualified option key: a bare 'precision' is ambiguous on pandas
# versions that also define 'styler.format.precision' and raises OptionError there.
pd.set_option('display.precision', 0)

#Setting Dataframe display to max
pd.set_option('display.max_rows', None)

In [5]:
#Converting conll to a dataframe
columns = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'Morpheme']
# Token fields copied from the parser; 'Morpheme' is not among them, so that
# column starts out empty (NaN).
token_fields = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head']

# Rewind the handle so that re-running this cell parses the whole file again
# instead of reading from an already exhausted stream.
data_file.seek(0)

# Collect one record per token and build the frame in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the entire frame on
# every call, which made the original loop quadratic in the number of tokens.
records = [
    {field: word[field] for field in token_fields}
    for tokenlist in parse_incr(data_file)
    for word in tokenlist
]

# dtype=object stores every value exactly as the parser returned it, so a
# missing head does not silently turn the whole column into floats.
df = pd.DataFrame(records, columns=columns, dtype=object)
df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,2,அருகே,அருகே,P,PP-------,,18,
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,


In [6]:
#Function to extract the morphemes by stripping away the lemma
def morpheme_extract(lemmalist, wordlist):
    """Derive the suffix ("morpheme") of a word relative to its lemma.

    The word and the lemma are compared character by character over the
    length of the lemma. Every word character that differs from the lemma
    character at the same position is collected, followed by all word
    characters beyond the lemma's length. If the lemma is not shorter than
    the word, the morpheme is the empty string.

    Side effects (kept for the existing caller, which ignores the return value):
      * the collected characters are appended to the module-level list
        ``difference``;
      * the resulting string is stored in ``row["Morpheme"]``, where ``row``
        is the module-level row object of the calling loop.

    Parameters
    ----------
    lemmalist : list of str
        Characters of the lemma.
    wordlist : list of str
        Characters of the surface form.

    Returns
    -------
    str
        The extracted morpheme ("" when the lemma is not shorter than the word).
    """
    if len(lemmalist) < len(wordlist):
        lemma_len = len(lemmalist)
        # Characters inside the lemma-length prefix that deviate from the lemma.
        difference.extend(
            word_ch
            for word_ch, lemma_ch in zip(wordlist, lemmalist)
            if word_ch != lemma_ch
        )
        # Everything after the lemma-length prefix. Slicing also covers an
        # empty lemma, which previously failed because the loop index was
        # never assigned.
        difference.extend(wordlist[lemma_len:])
        # Join once, after all characters have been collected.
        morpheme = ''.join(difference)
    else:
        morpheme = ""
    row["Morpheme"] = morpheme
    return morpheme


In [7]:
# Going through each row in the dataframe and calling the function to strip away the morphemes
for index, row in df.iterrows():
    word = row["form"]
    wordlist = list(word)
    lemma = row["lemma"]
    lemmalist = list(lemma)
    # morpheme_extract reads `difference` and `row` from the notebook namespace:
    # it fills `difference` and stores its result in row["Morpheme"].
    difference = []
    morpheme_extract(lemmalist, wordlist)
    # iterrows may hand out a copy of the row rather than a view, in which case
    # the assignment made inside morpheme_extract never reaches df. Write the
    # result back explicitly so the outcome does not depend on column dtypes.
    df.at[index, "Morpheme"] = row["Morpheme"]

In [8]:
# Preview df after the morpheme-extraction loop above has run
df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,2,அருகே,அருகே,P,PP-------,,18,
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,
...,...,...,...,...,...,...,...,...
9576,10,வேண்டும்,வேண்டு,V,VR-F3SNAA,"{'Ten': 'F', 'Per': '3', 'Num': 'S', 'Gen': 'N...",9,ம்
9577,11,என்பது,என்,T,TQ-------,,13,பது
9578,12,தான்,தான்,T,Tq-------,,11,
9579,13,அது,அது,R,RpN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",0,


In [9]:
#Dropping id column
# errors='ignore' makes the cell safe to re-run: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,அருகே,அருகே,P,PP-------,,18,
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்
4,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,


In [10]:
#Reading sentences csv
file_path = "Sentences.csv"
# Load the table, discard rows with missing values, remove the 'id' column and
# renumber the rows; reset_index keeps the previous labels in an 'index' column.
sentence_df = (
    pd.read_csv(file_path, encoding="utf-8")
    .dropna()
    .drop(columns='id')
    .reset_index()
)
sentence_df.head()

Unnamed: 0,index,NoSpaceAfter
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [11]:
#Adding the NoSpaceAfter column to the token dataframe.
#'no_space_after' is 1 when the following token is part of the current token,
#i.e. whenever a word was split, the attribute is 1 on the first piece.
#For example, 'no_space_after' is 1 for pATukAkkap, whereas it is 0 for um.
# The values are matched to df by index label.
df = df.assign(NoSpaceAfter=sentence_df['NoSpaceAfter'])
df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme,NoSpaceAfter
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,,0
1,அருகே,அருகே,P,PP-------,,18,,0
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,,0
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,0
4,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,,0


In [12]:
#Adding postpositions and clitics to morpheme list
# A row with NoSpaceAfter == 1 is followed by a token that belongs to it; that
# following token's form is recorded as its morpheme and the row is flagged
# with NoSpaceAfter = 2.
# The scan stops one row before the end: the last row has no successor, and
# looking up row i+1 there raised KeyError.
# NOTE: rows are addressed by label, so this assumes df still carries its
# default 0..n-1 integer index at this point.
for i in range(len(df) - 1):
    if df.at[i, 'NoSpaceAfter'] == 1:
        df.at[i + 1, 'Morpheme'] = df.at[i + 1, 'form']
        df.at[i + 1, 'NoSpaceAfter'] = 2


In [13]:
#Removing everything except nouns and verbs; rows flagged as postpositions/clitics
#(NoSpaceAfter == 2) are kept as well
# A single boolean mask replaces the row-by-row drop, which rebuilt the frame
# once per removed row. Index labels of the surviving rows are unchanged.
keep_mask = df["upos"].isin(['N', 'V']) | (df['NoSpaceAfter'] == 2)
df = df[keep_mask].copy()

        

In [14]:
#Creating dataframe out of the frequencies of the morphemes
# Naming the axis and the counts explicitly yields the columns
# ['Morpheme', 'Counts'] regardless of how the installed pandas version labels
# the result of value_counts().
value_counts_df = (
    df["Morpheme"]
    .value_counts()
    .rename_axis("Morpheme")
    .reset_index(name="Counts")
)
# Rows without a suffix carry an empty morpheme; their frequency is not of
# interest, so it is zeroed. Selecting by value (instead of assuming the blank
# entry sits in row 0) and assigning through .loc avoids both a wrong target
# row and the SettingWithCopyWarning caused by chained indexing.
value_counts_df.loc[value_counts_df["Morpheme"] == "", "Counts"] = 0
value_counts_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  value_counts_df["Counts"][0]=0


Unnamed: 0,Morpheme,Counts
0,,0
1,கள்,210
2,உம்,209
3,யில்,155
4,ஆக,127
...,...,...
434,றகும்,1
435,ியதைப்,1
436,ளதைய்,1
437,டகவ்,1


In [15]:
#Merging value counts with the original dataframe
# Left join on the morpheme: every row of df is kept and gains its 'Counts' value
merged_df = df.merge(value_counts_df, how="left", on="Morpheme")
merged_df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme,NoSpaceAfter,Counts
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,,0,0
1,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,,0,0
2,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,0,111
3,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,,0,0
4,பீல்டு,பீல்டு,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",11,,0,0


In [16]:
# Removing all the blank morpheme rows
# One boolean selection instead of dropping rows one at a time inside iterrows,
# which rebuilt the frame for every removed row. Index labels are preserved.
merged_df = merged_df[merged_df["Morpheme"] != ''].copy()


In [17]:
# Preview merged_df after the blank-morpheme rows have been removed
merged_df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme,NoSpaceAfter,Counts
2,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,0,111
6,நிலையத்துக்குக்க்,நிலையம்,N,NND-3SN--,"{'Cas': 'D', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",12,ததுக்குக்க்,1,1
7,ஆன,ஆகு,T,Tg-------,,13,ஆன,2,61
10,வகையில்,வகை,N,NNL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,யில்,0,155
11,எடுக்கப்,எடு,V,Vu-T---AA,"{'Ten': 'T', 'Voi': 'A', 'Neg': 'A'}",20,க்கப்,0,69


In [18]:
#Cleaning up to only get the noun and verb morphemes along with their value counts
columns = ['Morpheme', 'Counts','upos', 'NoSpaceAfter']
# Keep the four columns of interest, collapse rows that share the same
# (Morpheme, NoSpaceAfter, Counts) combination, and list the most frequent
# morphemes first.
adjusted_morpheme_df = (
    merged_df[columns]
    .drop_duplicates(subset=['Morpheme', 'NoSpaceAfter', 'Counts'])
    .sort_values(by='Counts', ascending=False)
)
adjusted_morpheme_df.head()

Unnamed: 0,Morpheme,Counts,upos,NoSpaceAfter
130,கள்,210,N,1
172,கள்,210,N,0
30,உம்,209,T,2
10,யில்,155,N,0
653,யில்,155,N,1
...,...,...,...,...
2644,ததைச்,1,N,0
2651,வைப்,1,N,0
2654,டடுக்கொள்கிறேன்,1,V,0
2696,டுத்தாது,1,V,0


In [19]:
# Making postpositions/clitics morpheme list
# Rows flagged with NoSpaceAfter == 2 are the tokens that were attached to a preceding token
is_clitic = adjusted_morpheme_df['NoSpaceAfter'].eq(2)
clitic_df = adjusted_morpheme_df.loc[is_clitic]
clitic_df.head()

Unnamed: 0,Morpheme,Counts,upos,NoSpaceAfter
30,உம்,209,T,2
131,ஆக,127,P,2
7,ஆன,61,T,2
358,பட்ட்,52,V,2
235,உள்ளது,47,V,2
...,...,...,...,...
3619,ஆகியதால்,1,V,2
2289,நிலையில்,1,N,2
2530,இருப்பத்,1,V,2
121,பகுதியில்,1,N,2


In [31]:
#Word list excluding postpositions
# Everything whose NoSpaceAfter flag is below 2, i.e. not marked as an attached token
is_not_clitic = adjusted_morpheme_df['NoSpaceAfter'].lt(2)
root_df = adjusted_morpheme_df.loc[is_not_clitic]
root_df.head()

Unnamed: 0,Morpheme,Counts,upos,NoSpaceAfter
130,கள்,210,N,1
172,கள்,210,N,0
10,யில்,155,N,0
653,யில்,155,N,1
2,ில்,111,N,0


In [35]:
# One row per distinct morpheme: the first occurrence in root_df is the one kept
no_repeat_df = root_df.drop_duplicates(subset='Morpheme', keep='first')
