In [616]:
# Importing dependencies
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [617]:
# Creating connection string.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" scheme and fails when the engine is created from it.
db_string = f"postgresql://postgres:{db_password}@indusscript.cljludlfcgoa.us-east-2.rds.amazonaws.com:5432/postgres"

In [618]:
# Creating engine: builds the SQLAlchemy Engine from db_string. It is the
# `con` handed to pandas when the cleaned table is read.
engine = create_engine(db_string)

In [619]:
# Function to separate a string and store it into the Morpheme Separated column
def separator(input_df):
    """Split every word in ``form`` into a list of its individual characters.

    The result is written to a ``formSeparated`` column on ``input_df`` itself
    (the column is created, or overwritten if it already exists). Each cell
    holds a list of one-character strings, one per Unicode code point of the
    word in the same row, so combining signs end up as separate list items.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``form`` column of strings. Any row index is accepted;
        rows are matched by label, not by position.

    Returns
    -------
    None
        ``input_df`` is modified in place.

    Raises
    ------
    KeyError
        If ``input_df`` has no ``form`` column.
    TypeError
        If a ``form`` value is not iterable (e.g. a missing value).
    """
    # map() keeps the frame's own index, so this also works when the index is
    # not 0..n-1 (the previous range(len(...)) / .loc[i] loop did not).
    input_df['formSeparated'] = input_df['form'].map(lambda word: [ch for ch in word])

In [620]:
# Show floats without decimal places. The fully qualified option name is used
# because newer pandas versions also define "styler.format.precision", so the
# bare pattern 'precision' no longer resolves to a single option.
pd.set_option('display.precision', 0)

# Never truncate the rows of a displayed DataFrame (None = no limit).
pd.set_option('display.max_rows', None)

In [621]:
# Load the cleaned Tamil data from PostgreSQL and attach the per-letter split
# of every word (separator() adds the formSeparated column in place).
complete_df = pd.read_sql_table('completetamildata', con=engine)
separator(complete_df)

# Replace the "index" column stored in the table with a fresh one:
# reset_index() turns the current row index into a new "index" column.
complete_df = complete_df.drop(columns="index").reset_index()
complete_df.head()

Unnamed: 0,index,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,formSeparated
0,0,சென்னை,சென்னை,N,NEN-3SN--,2,,0,0,"[ச, ெ, ன, ், ன, ை]"
1,1,அருகே,அருகே,P,PP-------,18,,0,0,"[அ, ர, ு, க, ே]"
2,2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,4,,0,0,"[ஸ, ், ர, ீ]"
3,3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0,136,"[ப, ெ, ர, ு, ம, ், ப, ு, த, ூ, ர, ி, ல, ்]"
4,4,கிரீன்,கிரீன்,N,NEN-3SN--,6,,0,0,"[க, ி, ர, ீ, ன, ்]"


In [622]:
# Unravel each word into one row per letter so the letters can serve as model
# features. The result is indexed by (word row, position inside the word).
columns = ['index1', 'index2']  # not read anywhere in this cell

# apply(pd.Series) spreads each letter list over columns 0..k-1; stack() then
# folds those columns into a second index level and drops the padding NaNs.
letters = (
    pd.Series(complete_df['formSeparated'])
    .apply(pd.Series)
    .stack()
)
letters_df = letters.to_frame(name='letters').rename_axis(['index', 'index 2'])
letters_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,letters
index,index 2,Unnamed: 2_level_1
0,0,ச
0,1,ெ
0,2,ன
0,3,்
0,4,ன


In [623]:
# Merging letters with dataframe and filtering for one grammar case to test.

# xpos pattern for the case under test: "N", any character, "A", two
# characters, "S", three characters (e.g. NNA-3SN--).
# - Named so that it does not shadow the standard-library `re` module.
# - No capturing parentheses: they are not needed for a containment test and
#   only made str.contains emit a "pattern has match groups" UserWarning.
NAS_XPOS_PATTERN = r'N.A..S...'

# Number of letter rows kept for this first experiment.
N_SAMPLE_ROWS = 60

# One row per letter, carrying the word-level columns alongside it.
NAS_df = (
    pd.merge(letters_df, complete_df, on="index", how="right")
    .drop(columns=['head', 'NoSpaceAfter', 'Counts', 'lemma'])
)

# na=False: a row with a missing xpos tag is simply not selected instead of
# breaking the boolean mask.
is_nas = NAS_df['xpos'].str.contains(NAS_XPOS_PATTERN, na=False)
NAS_df = NAS_df[is_nas].head(N_SAMPLE_ROWS).reset_index(drop=True)
NAS_df.head()

  return func(self, *args, **kwargs)


Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,formSeparated
0,40,வ,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]"
1,40,ள,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]"
2,40,ர,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]"
3,40,்,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]"
4,40,ச,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]"


In [624]:
# Manually adding vowels in dataset.
# The vowel signs are written as explicit code points instead of being read
# from fixed row positions of NAS_df (rows 7, 9 and 41): those positions only
# hold these characters for one particular query result and row order.
ii = '\u0bbf'  # ி  TAMIL VOWEL SIGN I
ai = '\u0bc8'  # ை  TAMIL VOWEL SIGN AI
ee = '\u0bc0'  # ீ  TAMIL VOWEL SIGN II

In [625]:
# Skeleton of the feature table for the ML model. The three columns taken from
# NAS_df are filled here; 'prefix', 'vowel' and 'morpheme boundary' stay empty
# until the cells that compute them.
columns = ['index', 'letters', 'prefix', 'vowel', 'formSeparated', 'morpheme boundary']
feature_df = pd.DataFrame(columns=columns)
for copied_col in ('index', 'letters', 'formSeparated'):
    feature_df[copied_col] = NAS_df[copied_col]
feature_df = feature_df.reset_index(drop=True)
feature_df.head()

Unnamed: 0,index,letters,prefix,vowel,formSeparated,morpheme boundary
0,40,வ,,,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
1,40,ள,,,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
2,40,ர,,,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
3,40,்,,,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
4,40,ச,,,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",


In [626]:
# Collect the three vowel signs into the lookup list used when filling the
# 'vowel' column, and display it as a manual sanity check.
vowels = [ii, ai, ee]
vowels

['ி', 'ை', 'ீ']

In [627]:
# Filling in vowels: 1 when the row's letter is one of the listed vowel signs,
# 0 otherwise.
# Vectorised with isin() instead of a per-row .loc loop: it does not depend on
# the index being 0..n-1 and gives the column an integer dtype.
feature_df['vowel'] = feature_df['letters'].isin(vowels).astype(int)
feature_df.head()

Unnamed: 0,index,letters,prefix,vowel,formSeparated,morpheme boundary
0,40,வ,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
1,40,ள,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
2,40,ர,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
3,40,்,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",
4,40,ச,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",


In [628]:
# Letters treated as morpheme-boundary markers. Only a single vowel sign is
# listed for now; rows whose letter is in this list are labelled 1.
morpheme = ['ை']

In [629]:
# Assigning morpheme boundaries manually: 1 when the row's letter is listed in
# `morpheme`, 0 otherwise. This column is the target the model will learn.
# Vectorised with isin() instead of a per-row .loc loop: it does not depend on
# the index being 0..n-1 and gives the column an integer dtype.
feature_df['morpheme boundary'] = feature_df['letters'].isin(morpheme).astype(int)
feature_df.head()

Unnamed: 0,index,letters,prefix,vowel,formSeparated,morpheme boundary
0,40,வ,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
1,40,ள,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
2,40,ர,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
3,40,்,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
4,40,ச,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0


In [630]:
# Cleaning dataframe: reset the prefix placeholder to an empty string and
# rename the word identifier column from 'index' to 'key'.
feature_df = feature_df.assign(prefix='').rename(columns={'index': 'key'})
feature_df.head()

Unnamed: 0,key,letters,prefix,vowel,formSeparated,morpheme boundary
0,40,வ,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
1,40,ள,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
2,40,ர,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
3,40,்,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
4,40,ச,,0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0


In [631]:
# Wrap every prefix cell in a one-element list so the column holds lists.
feature_df["prefix"] = feature_df["prefix"].map(lambda cell: [cell])
feature_df.head()

Unnamed: 0,key,letters,prefix,vowel,formSeparated,morpheme boundary
0,40,வ,[],0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
1,40,ள,[],0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
2,40,ர,[],0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
3,40,்,[],0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0
4,40,ச,[],0,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",0


In [632]:
# Calculating the prefix of each letter: the letters that come before it in
# its own word (an empty list for the first letter of a word).
#
# This replaces a hand-written look-ahead loop that
#   * never produced a prefix for the very last row (the look-ahead at
#     index + 1 raised KeyError before anything was appended),
#   * failed on a word consisting of a single letter, and
#   * rebound the builtin name `list` to an integer for the rest of the session.

# A new word starts wherever the key differs from the key of the row above.
word_start = feature_df["key"].ne(feature_df["key"].shift())

# 0-based position of each letter inside its run of consecutive equal keys.
letter_position = feature_df.groupby(word_start.cumsum()).cumcount()

# Same index as feature_df so the column can be assigned back row by row.
prefix_col = pd.DataFrame(
    {
        "prefix": [
            letters_in_word[:position]
            for letters_in_word, position in zip(feature_df["formSeparated"], letter_position)
        ]
    },
    index=feature_df.index,
)
prefix_col.head()

    

Done


Unnamed: 0,prefix
0,[]
1,[வ]
2,"[வ, ள]"
3,"[வ, ள, ர]"
4,"[வ, ள, ர, ்]"


In [647]:
# Adding prefixes to features dataframe.
feature_df["prefix"] = prefix_col["prefix"]
# errors="ignore" keeps the cell re-runnable: once formSeparated has been
# dropped, running the cell again would otherwise raise KeyError.
feature_df = feature_df.drop(columns="formSeparated", errors="ignore")
feature_df.head()

Unnamed: 0,key,letters,prefix,vowel,morpheme boundary,letter_label_encoded,prefix_label_encoded
0,40,வ,[],0,0,12,40
1,40,ள,[வ],0,0,10,39
2,40,ர,"[வ, ள]",0,0,8,34
3,40,்,"[வ, ள, ர]",0,0,16,33
4,40,ச,"[வ, ள, ர, ்]",0,0,2,32


In [634]:
# Changing datatype of prefix column for ease of encoding: every prefix is
# replaced by its string form, giving one plain string per row for the
# label encoding applied to this column afterwards.
feature_df['prefix'] = feature_df['prefix'].astype(str)

In [635]:
# Label encoding is a stop-gap: the final dataset will use weight-of-evidence
# encoding for the letters, and the prefix column will be a function of the
# letters and their position.
# Two additional columns are planned as well: all letters encountered after the
# last morpheme boundary (encoded like the prefix column) and the part of
# speech (one-hot encoded).
letter_encoder = LabelEncoder()
prefix_encoder = LabelEncoder()
feature_df['letter_label_encoded'] = letter_encoder.fit_transform(feature_df['letters'])
feature_df['prefix_label_encoded'] = prefix_encoder.fit_transform(feature_df['prefix'])

# Discard any row that still contains a missing value.
feature_df = feature_df.dropna()


In [636]:
# Target: the morpheme-boundary flag as integers.
# Features: every column that is neither the target nor an identifier /
# raw-text column.
non_feature_cols = ['key', 'letters', 'prefix', 'morpheme boundary']
X = feature_df.drop(columns=non_feature_cols)
y = feature_df["morpheme boundary"].astype('int')


In [637]:
# Split features and target into train and test sets. random_state=1 fixes the
# shuffle so the split is reproducible; stratify=y asks for the class balance
# of y to be preserved in both parts. test_size is left at its default.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
# Displayed as a quick check of how many rows/columns went into training.
X_train.shape

(45, 3)

In [638]:
# Support vector classifier with a linear kernel; every other hyperparameter
# is left at its scikit-learn default.
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [639]:
# Train the classifier on the training split. fit() returns the estimator,
# which is why the cell echoes SVC(kernel='linear').
model.fit(X_train, y_train)

SVC(kernel='linear')

In [642]:
# Predict the held-out rows and put the predictions next to the true labels.
y_pred = model.predict(X_test)

# y_test keeps its original row labels; resetting the index gives the
# comparison table a clean 0..n-1 numbering.
comparison = {"Prediction": y_pred, "Actual": y_test}
results = pd.DataFrame(comparison).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,1,1
