In [4]:
#my reference: https://pypi.org/project/conllu/


In [5]:
from io import open
from conllu import parse_tree_incr
from conllu import parse_incr
from conllu import parse
from collections import OrderedDict
import pandas as pd
import itertools
import re

In [6]:
file = 'TamilTB.v0.1\data\TamilTB.v0.1.utf8.conll'
data_file = open(file, "r", encoding="utf-8")


In [7]:
#Setting precision of dataframe as 0
pd.set_option('precision', 0)

#Setting Dataframe display to max
pd.set_option('display.max_rows', None)

In [8]:
#Converting conll to a dataframe
columns = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'Morpheme']
df = pd.DataFrame(columns = columns)
for tokenlist in parse_incr(data_file):
    for word in tokenlist:
        df = df.append({'id': word["id"], 
                        'form' : word["form"],
                        'lemma' : word["lemma"],
                        'upos' : word["upos"],
                        'xpos' : word["xpos"],
                        'feats' : word["feats"],
                        'head' : word["head"]},
                        ignore_index = True
                        )
df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,2,அருகே,அருகே,P,PP-------,,18,
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,


In [9]:
#Function to extract the morphemes by stripping away the lemma
def morpheme_extract(lemmalist, wordlist):
    if len(lemmalist)<len(wordlist):
        for i in range(len(lemmalist)):
            if (wordlist[i] != lemmalist[i]):
                difference.append(wordlist[i])
        index = i
        for i in range(len(wordlist)):
            if i > index:
                difference.append(wordlist[i])
                morpheme = ''.join(difference)
                row["Morpheme"] = morpheme
    else:
        row["Morpheme"] = ""


In [10]:
# Going through each row in the dataframe and calling the function to strip away the morphemes
for index, row in df.iterrows():
    word = row["form"]
    wordlist = [ch for ch in word] 
    lemma = row["lemma"]
    lemmalist = [ch for ch in lemma] 
    difference = []
    morpheme_extract(lemmalist, wordlist)
    difference[:5]

In [11]:
df.head()

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,Morpheme
0,1,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,2,அருகே,அருகே,P,PP-------,,18,
2,3,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,4,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்
4,5,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,


In [12]:
#Dropping id column
df.drop(columns='id', inplace=True)
df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,
1,அருகே,அருகே,P,PP-------,,18,
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்
4,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,


In [13]:
#Reading sentences csv 
file_path = "Sentence_index.csv"
sentence_df = pd.read_csv(file_path, encoding="utf-8")
sentence_df.dropna(inplace=True)
sentence_df.drop(columns='id', inplace=True)
sentence_df.reset_index(inplace=True)
sentence_df.head()

Unnamed: 0,index,NoSpaceAfter
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [14]:
#Adding NoSpaceAfter column to dataframe. The 'no_space_after' will be set to 1 if the following token is part of the current token.
#Whenever the splitting takes place this attribute will be set to 1 for the first token.
#For example, The 'no_space_after' attribute for pATukAkkap  will be 1. Whereas the 'no_space_after' attribute for um will be 0.  
df['NoSpaceAfter'] = sentence_df['NoSpaceAfter']
df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme,NoSpaceAfter
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,,0
1,அருகே,அருகே,P,PP-------,,18,,0
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,,0
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,0
4,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,,0


In [15]:
#Adding postpositions and clitics to morpheme list
for i in range(0, len(df)):
    if df.loc[i,'NoSpaceAfter']==1:
        df.loc[i+1, 'Morpheme'] = df.loc[i+1, 'form']
        df.loc[i+1, 'NoSpaceAfter'] = 2


In [16]:
#Removing everything expect nouns and verbs
for index, row in df.iterrows():
    if (row["upos"] != 'N') and (row["upos"] != 'V') and (row['NoSpaceAfter'] != 2):
        df.drop(index, inplace = True)

        

In [17]:
#Creating dataframe out of the frequencies of the morphemes
value_counts_df = pd.DataFrame(df["Morpheme"].value_counts())
value_counts_df.reset_index(inplace=True)
value_counts_df = value_counts_df.rename(columns = {'index':'Morpheme', 'Morpheme':'Counts'})
value_counts_df["Counts"][0]=0
value_counts_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Morpheme,Counts
0,,0
1,கள்,210
2,உம்,209
3,யில்,155
4,ஆக,127


In [18]:
#Merging value counts with the original dataframe
merged_df = pd.merge(df, value_counts_df, on="Morpheme", how = "left")
merged_df.head()

Unnamed: 0,form,lemma,upos,xpos,feats,head,Morpheme,NoSpaceAfter,Counts
0,சென்னை,சென்னை,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",2,,0,0
1,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",4,,0,0
2,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,"{'Cas': 'L', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",18,ில்,0,111
3,கிரீன்,கிரீன்,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",6,,0,0
4,பீல்டு,பீல்டு,N,NEN-3SN--,"{'Cas': 'N', 'Per': '3', 'Num': 'S', 'Gen': 'N'}",11,,0,0


In [19]:
# Removing all the blank morpheme rows
for index, row in merged_df.iterrows():
    if row["Morpheme"] == '':
        merged_df.drop(index, inplace = True)


In [20]:
#Cleaning up to only get the noun and verb morphemes along with their value counts
columns = ['Morpheme', 'Counts','upos', 'xpos', 'NoSpaceAfter']
adjusted_morpheme_df = pd.DataFrame(columns = columns)
adjusted_morpheme_df["Morpheme"] = merged_df["Morpheme"]
adjusted_morpheme_df["Counts"] = merged_df["Counts"]
adjusted_morpheme_df["upos"] = merged_df["upos"]
adjusted_morpheme_df["xpos"] = merged_df["xpos"]
adjusted_morpheme_df["NoSpaceAfter"] = merged_df["NoSpaceAfter"]
adjusted_morpheme_df = adjusted_morpheme_df.drop_duplicates(subset = ['Morpheme', 'NoSpaceAfter', 'Counts'])
adjusted_morpheme_df = adjusted_morpheme_df.sort_values(by='Counts', ascending=False)
adjusted_morpheme_df.head()

Unnamed: 0,Morpheme,Counts,upos,xpos,NoSpaceAfter
130,கள்,210,N,NNN-3PA--,1
172,கள்,210,N,NNN-3PN--,0
30,உம்,209,T,Tv-------,2
10,யில்,155,N,NNL-3SN--,0
653,யில்,155,N,NNL-3SN--,1


In [28]:
# Making postpositions/clitics morpheme list
columns = ['form', 'Counts','upos', 'xpos', 'NoSpaceAfter']
clitics_df = pd.DataFrame(columns = columns)
clitics_df["form"] = merged_df["form"]
clitics_df["Counts"] = merged_df["Counts"]
clitics_df["upos"] = merged_df["upos"]
clitics_df["xpos"] = merged_df["xpos"]
clitics_df["NoSpaceAfter"] = merged_df["NoSpaceAfter"]
clitic_df = clitics_df[clitics_df['NoSpaceAfter']==2]
clitic_df = clitic_df.drop_duplicates(subset = ['form'])
clitic_df

Unnamed: 0,form,Counts,upos,xpos,NoSpaceAfter
7,ஆன,61,T,Tg-------,2
16,உள்ளார்,31,V,VR-T3SHAA,2
30,உம்,209,T,Tv-------,2
47,பட்டு,19,V,VT-T---PA,2
66,ப்பட,7,V,VU-T---PA,2
83,உள்ளனர்,16,V,VR-T3PAAA,2
86,உள்ள,18,J,Jd-T----A,2
121,பகுதியில்,1,N,NNL-3SN--,2
127,இலிருந்து,25,P,PP-------,2
131,ஆக,127,P,PP-------,2


In [22]:
#Word list excluding postpostions
root_df = adjusted_morpheme_df[adjusted_morpheme_df['NoSpaceAfter']<2]
root_df.reset_index(inplace=True)
root_df.head()

Unnamed: 0,index,Morpheme,Counts,upos,xpos,NoSpaceAfter
0,130,கள்,210,N,NNN-3PA--,1
1,172,கள்,210,N,NNN-3PN--,0
2,10,யில்,155,N,NNL-3SN--,0
3,653,யில்,155,N,NNL-3SN--,1
4,2,ில்,111,N,NEL-3SN--,0


In [23]:
#Creating morpheme list with no duplicates (without postpositions)
no_repeat_df = root_df.drop_duplicates(subset = ['Morpheme'])


# Getting Morpheme list for each type of noun and verb

## Noun - Accusative Case

In [24]:
# Creating function to make morphlist with counts
def morphcount(input_df):
    for i in range(len(input_df)):
        word = input_df.loc[i, 'Morpheme']
        a = [ch for ch in word]
        b= []
        n = len(a)
        for num_splits in range(n):
            for splits in itertools.combinations(range(1, n), num_splits):
                splices = zip([0] + list(splits), list(splits) + [n])
                b.append([a[i:j] for i, j in splices])

        df = pd.DataFrame.from_records(b)
        if i == 0:
            morphs = df[i]
        for j in range(len(df.columns)):
            morphs = morphs.append(df[j])
        
        morphs = morphs.map(lambda x: ''.join(x),na_action='ignore')
        morphs.dropna(inplace=True)


In [25]:
NA_df = no_repeat_df[no_repeat_df['xpos'].str.contains(r'(NNA.*)')]
NA_df.reset_index(inplace=True)
NA_df.drop_duplicates(inplace=True)
NA_df

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,level_0,index,Morpheme,Counts,upos,xpos,NoSpaceAfter
0,18,390,களை,46,N,NNA-3PN--,0
1,70,28,ஙகளை,14,N,NNA-3PN--,0
2,83,249,ததைய்,11,N,NNA-3SN--,1
3,87,1011,களைய்,11,N,NNA-3PA--,1
4,114,92,களைப்,7,N,NNA-3PN--,0
5,132,2640,ைக்,5,N,NNA-3SN--,0
6,133,1659,களைக்,5,N,NNA-3PN--,0
7,143,108,ஙகளைப்,4,N,NNA-3PN--,0
8,147,1228,ைப்,4,N,NNA-3SN--,0
9,148,1236,யைத்,4,N,NNA-3SN--,0


In [26]:
morphcount(NA_df)
morphs_df = pd.DataFrame(morphs)
columns = ['values']
morphs_values_df = pd.DataFrame(morphs_df.value_counts(), columns = columns)
morphs_values_df.sort_values(by=['values'], ascending=False).head()

NameError: name 'morphs' is not defined

## Noun - Dative Case

In [None]:
ND_df = no_repeat_df[no_repeat_df['xpos'].str.contains(r'(N.D.*)')]
ND_df.reset_index(inplace=True)
ND_df.drop_duplicates(inplace=True)
ND_df

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ND_df.drop_duplicates(inplace=True)


Unnamed: 0,level_0,index,Morpheme,Counts,upos,xpos,NoSpaceAfter
0,15,506,க்கு,49,N,NND-3SN--,0
1,57,169,ுக்கு,18,N,NPDF3PH-A,0
2,64,1281,களுக்கு,16,N,NND-3PN--,0
3,68,258,வுக்கு,14,N,NED-3SN--,0
4,69,948,களுக்க்,14,N,NND-3PN--,1
5,72,1760,க்க்,14,N,NED-3SN--,1
6,84,70,ததுக்கு,11,N,NND-3SN--,0
7,96,1740,ுக்க்,9,N,NND-3SN--,1
8,122,2376,ததுக்க்,6,N,NND-3SN--,1
9,131,1294,ஙகளுக்கு,5,N,NND-3PN--,0


In [None]:
morphcount(ND_df)
morphs_df = pd.DataFrame(morphs)
columns = ['values']
morphs_values_df = pd.DataFrame(morphs_df.value_counts(), columns = columns)
morphs_values_df.sort_values(by=['values'], ascending=False).head()

Unnamed: 0_level_0,values
0,Unnamed: 1_level_1
்,998
இ,576
வ,548
ை,431
த,383


In [None]:
##Next steps
## make a list based on what you see for each tense (noun - stem+plural+case  verb - stem+tense+PNG)
##Read the grammar textbook again make sure theres nothing missing