In [1]:
# Importing dependencies
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Obtaining data tables from SQL and Preprocessing Data for Feature Extraction

In [2]:
# Creating connection string
db_string = f"postgres://postgres:{db_password}@indusscript.cljludlfcgoa.us-east-2.rds.amazonaws.com:5432/postgres"

In [3]:
# Creating engine
engine = create_engine(db_string)

In [4]:
# Function to separate a string and store it into the Morpheme Separated column
def separator(input_df):
    input_df['formSeparated'] = np.nan
    input_df['formSeparated'] = input_df['formSeparated'].astype(object)
    for i in range(len(input_df)):
        word = input_df.loc[i, 'form']
        a = [ch for ch in word]
        b = np.empty(1, dtype=object)
        b[0] = a
        input_df.loc[i, 'formSeparated'] = b

In [5]:
#Setting precision of dataframe as 0
pd.set_option('precision', 0)

#Setting Dataframe display to max
pd.set_option('display.max_rows', None)

In [6]:
# Reading cleaned data from postgreSQL
complete_df = pd.read_sql_table('complete_tamil', con=engine)
#complete_df = pd.read_sql('Select * from complete_tamil limit 5', con=engine)
separator(complete_df)
complete_df.drop(columns="index", inplace = True)
complete_df.reset_index(inplace=True)
complete_df.head()

Unnamed: 0,index,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,MorphemeSeparated,index1,formSeparated
0,0,சென்னை,சென்னை,N,NEN-3SN--,2,,0,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"
1,1,அருகே,அருகே,P,PP-------,18,,0,0,"{அ,ர,ு,க,ே}",1,"[அ, ர, ு, க, ே]"
2,2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,4,,0,0,"{ஸ,்,ர,ீ}",2,"[ஸ, ், ர, ீ]"
3,3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0,136,"{ப,ெ,ர,ு,ம,்,ப,ு,த,ூ,ர,ி,ல,்}",3,"[ப, ெ, ர, ு, ம, ், ப, ு, த, ூ, ர, ி, ல, ்]"
4,4,கிரீன்,கிரீன்,N,NEN-3SN--,6,,0,0,"{க,ி,ர,ீ,ன,்}",4,"[க, ி, ர, ீ, ன, ்]"


In [7]:
# Reading morpheme data from postgreSQL
morpheme_df = pd.read_sql('morphemes_labelled', con=engine)
# morpheme_df = pd.read_sql('Select * from morphemes_labelled limit 5', con=engine)
morpheme_df.head()

Unnamed: 0,index,MorphemeSeparated,Morpheme,Type,Tense,PNG,Case,id
0,0,ை,ை,,,,Accusative,0
1,1,"{க,ு}",கு,,,,Dative,1
2,2,"{க,்,க,ு}",க்கு,,,,Dative,1
3,3,"{க,்,க,்}",க்க்,,,,Dative,1
4,4,"{ா,ல,்}",ால்,"Verb- lexical, conditional",,,,5


In [8]:
# Look at unique morphemes
morpheme1 = morpheme_df['MorphemeSeparated'].unique()

In [9]:
morpheme2 = morpheme_df['Morpheme'].unique()

In [10]:
# Unravelling the letters of each word so they can be used as features for the model
columns = ['index1', 'index2']
letters = pd.Series(complete_df['formSeparated'])
letters = letters.apply(pd.Series).stack()
letters_df = letters.to_frame(name='letters')
letters_df = letters_df.rename_axis(['index', 'index 2'])

In [12]:
# Merging letters with dataframe and filtering for one grammar case to test
features_df = pd.merge(letters_df, complete_df, on="index", how = "right")
features_df.drop(columns = ['head', 'NoSpaceAfter','lemma'], inplace=True)
features_df.reset_index(drop=True, inplace=True)
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]"


In [13]:
#Adding columns in features dataframe for ML model
features_df['prefix'] = ''
features_df['vowel'] = ''
features_df['morpheme boundary'] = ''
features_df['noun'] = 0
features_df['verb'] = 0
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,,,0,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,,,0,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,,,0,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,,,0,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,,,0,0


In [14]:
#Getting all unique letters from the dataset
letters = features_df['letters']
letters.drop_duplicates(inplace=True)
letters.reset_index(drop=True, inplace=True)

In [15]:
# Manually adding vowels in dataset
ak = letters.loc[64]
ee = letters.loc[61]
ai = letters.loc[58]
ai2 = letters.loc[40]
u = letters.loc[57]
oh = letters.loc[45]
ohh = letters.loc[42]
oh2 = letters.loc[37]
oh3 = letters.loc[33]
ooh = letters.loc[30]
ii = letters.loc[26]
aa = letters.loc[25]
aa2 = letters.loc[23]
ee2 = letters.loc[16]
ooh2 = letters.loc[15]
ee3 = letters.loc[11]
oh4 = letters.loc[9]
ooh3 = letters.loc[7]
aa3 = letters.loc[5]
ai3 = letters.loc[4]
nn = letters.loc[3]
e = letters.loc[1]

vowels = [ak, ee, ai, ai2, u, oh, ohh, oh2, oh3, ooh, ii, aa, aa2, ee2, ooh2, ee3, oh4, ooh3, aa3, ai3, nn, e]


In [16]:
#Filling in vowels manually
for i in range(len(features_df)):
    if features_df.loc[i, 'letters'] in vowels:
        features_df.loc[i, 'vowel'] = 1
    else:
        features_df.loc[i, 'vowel'] = 0
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,0,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,0,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,0,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,0,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,0,0


In [17]:
# Filling in Noun and Verb classifiers
for i in range(len(features_df)):
    if features_df.loc[i,'upos'] == 'N':
        features_df.loc[i,'noun'] = 1
    if features_df.loc[i,'upos'] == 'V':
        features_df.loc[i,'verb'] = 1
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0


In [35]:
# #Assigning morpheme boundaries manually
# morpheme = ['ை']

In [36]:
# #Assigning morpheme boundaries manually
# for i in range(len(features_df)):
#     if features_df.loc[i, 'letters'] in morpheme:
#         features_df.loc[i, 'morpheme boundary'] = 1
#     else:
#         features_df.loc[i, 'morpheme boundary'] = 0
# features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,0,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,0,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,0,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,0,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,0,1,0


In [18]:
#Cleaning dataframe
features_df['prefix'] = ''
features_df.rename(columns = {'index': 'key'}, inplace = True)
features_df.head()

Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,1,,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",,0,,1,0


In [19]:
#Changing prefix column to list
features_df.loc[:,"prefix"] = features_df.loc[:,"prefix"].apply(lambda x: [x])
features_df.head()

Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,formSeparated,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",[],0,,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",[],1,,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",[],0,,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",[],1,,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன, ை]",[],0,,1,0


In [20]:
#Creating function to calculate prefix of each letter
list = []
list2 = []
j = 0

for index, row in features_df.iterrows():
    try:
        # if index is same, it is the same word
        if features_df.loc[index, "key"] == features_df.loc[index+1, "key"]:
            j=j+1
            list = row["formSeparated"]
            row["prefix"] = list[0:j-1]
            value = row["prefix"]
            list2.append(value)
        else:
            j=j+1
            row["prefix"] = list[0:j-1]
            value = row["prefix"]
            list2.append(value)
            j=0
            list = []
    except KeyError:
        print("Done")
list2
prefix_col = pd.DataFrame([list2])
prefix_col = prefix_col.T
prefix_col.rename(columns = {0: "prefix"}, inplace = True)
prefix_col.head()

Done


Unnamed: 0,prefix
0,[]
1,[ச]
2,"[ச, ெ]"
3,"[ச, ெ, ன]"
4,"[ச, ெ, ன, ்]"


In [21]:
# Adding prefixes to features datafreame
features_df["prefix"] = prefix_col["prefix"]
features_df.drop(columns="formSeparated", inplace=True)
features_df.head()

Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,[],0,,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,[ச],1,,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ]",0,,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன]",1,,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ்]",0,,1,0


In [21]:
# Morpheme character array
morpheme1

array(['ை', '{க,ு}', '{க,்,க,ு}', '{க,்,க,்}', '{ா,ல,்}', '{த,ு}',
       '{த,்,த,ு}', '{ி,ன}', '{ி,ன,்}', '{ி,ல,்}', '{ி,ட,ம,்}', '{ோ,ட,ு}',
       '{ு,ட,ன,்}', '{க,ள}', '{க,ள,்}', '{க,ள,ு}', '{ே,ன,்}', '{க,ி,ற}',
       '{க,்,க,ி,ற}', '{ண,ட}', '{ன}', '{ீ,ர,்,க,ள,்}', '{ந,்,த,்}',
       '{ந,்,த}', '{ி,ய}', '{த}', '{த,்,த,்}', '{த,்}', '{த,்,த}',
       '{ட,்,ட,்}', '{்,ட}', '{ட,்}', '{ட,்,ட}', '{க}', '{க,்}',
       '{க,்,க}', '{ு,ம,்}', '{ா}', '{ா,ர,்}', '{வ}', '{ோ,ம,்}', '{ப}',
       '{ப,்,ப}', '{ர,்}', '{க,்,க,ி,ன,்,ற}', '{க,ி,ன,்,ற}',
       '{ா,ர,்,க,ள,்}', '{ி}', '{ட,ு}', '{ந,்,த,ு}', '{ட,்,ட,ு}',
       '{ண,ட,ு}', '{ண,ட,்}', '{வ,ர}'], dtype=object)

In [22]:
# Morpheme array
morpheme2

array(['ை', 'கு', 'க்கு', 'க்க்', 'ால்', 'து', 'த்து', 'ின', 'ின்', 'ில்',
       'ிடம்', 'ோடு', 'ுடன்', 'கள', 'கள்', 'களு', 'ேன்', 'கிற', 'க்கிற',
       'ணட', 'ன', 'ீர்கள்', 'ந்த்', 'ந்த', 'ிய', 'த', 'த்த்', 'த்', 'த்த',
       'ட்ட்', '்ட', 'ட்', 'ட்ட', 'க', 'க்', 'க்க', 'ும்', 'ா', 'ார்',
       'வ', 'ோம்', 'ப', 'ப்ப', 'ர்', 'க்கின்ற', 'கின்ற', 'ார்கள்', 'ி',
       'டு', 'ந்து', 'ட்டு', 'ணடு', 'ணட்', 'வர'], dtype=object)

In [22]:
features_df['morpheme boundary'] = 0

In [23]:
all_words = features_df['form'].tolist()

prev_key = -1
for word_idx in range(len(all_words)):
    if(prev_key == features_df['key'][word_idx]):
        continue
    else:
        prev_key = features_df['key'][word_idx]
    
    for morph_idx in range(len(morpheme2)):
        try:
            str_idx = all_words[word_idx].index(morpheme2[morph_idx])
            features_df['morpheme boundary'][word_idx + str_idx] = 1
        except:
            continue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [24]:
features_df.head(35)

Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,MorphemeSeparated,index1,prefix,vowel,morpheme boundary,noun,verb
0,0,ச,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,[],0,0,1,0
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,[ச],1,0,1,0
2,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ]",0,1,1,0
3,0,்,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன]",1,0,1,0
4,0,ன,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ்]",0,0,1,0
5,0,ை,சென்னை,N,NEN-3SN--,,0,"{ச,ெ,ன,்,ன,ை}",0,"[ச, ெ, ன, ், ன]",1,1,1,0
6,1,அ,அருகே,P,PP-------,,0,"{அ,ர,ு,க,ே}",1,[],1,0,0,0
7,1,ர,அருகே,P,PP-------,,0,"{அ,ர,ு,க,ே}",1,[அ],0,0,0,0
8,1,ு,அருகே,P,PP-------,,0,"{அ,ர,ு,க,ே}",1,"[அ, ர]",1,0,0,0
9,1,க,அருகே,P,PP-------,,0,"{அ,ர,ு,க,ே}",1,"[அ, ர, ு]",0,1,0,0


In [25]:
#Changing datatype of prefix column for ease of encoding
features_df['prefix'] = features_df['prefix'].astype(str)

### Encode features

In [26]:
#Using label encoder for now but will use weight of evidence encoding on letters with the final dataset. Prefix column will be a function of letters and their position. 
#Two additional columns of all the letters encountered after the last morpheme boundary as well as the parts of speech will be added as well. These will be encoding similar to the prefix column and using one hot encoding respectively.
features_df['letter_label_encoded'] = LabelEncoder().fit_transform(features_df.letters)
features_df['prefix_label_encoded'] = LabelEncoder().fit_transform(features_df.prefix)
features_df.dropna(inplace=True)

In [27]:
#Creating features and target
y = features_df["morpheme boundary"]
X = features_df.drop(columns=['key', 'letters','form', 'MorphemeSeparated', 'morpheme boundary', 'prefix', 'upos', 'xpos','FormWithoutLemma','Counts'])
y=y.astype('int')


In [39]:
X.head()

Unnamed: 0,index1,vowel,noun,verb,letter_label_encoded,prefix_label_encoded
0,0,0,1,0,40,11374
1,0,1,1,0,65,5375
2,0,0,1,0,47,5318
3,0,1,1,0,71,5065
4,0,0,1,0,47,5064


In [31]:
features_df.to_csv("features.csv")