In [1]:
# Importing dependencies
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Creating connection string
db_string = f"postgres://postgres:{db_password}@indusscript.cljludlfcgoa.us-east-2.rds.amazonaws.com:5432/postgres"

In [3]:
# Creating engine
engine = create_engine(db_string)

In [4]:
# Function to separate a string and store it into the Morpheme Separated column
def separator(input_df):
    input_df['formSeparated'] = np.nan
    input_df['formSeparated'] = input_df['formSeparated'].astype(object)
    for i in range(len(input_df)):
        word = input_df.loc[i, 'form']
        a = [ch for ch in word]
        b = np.empty(1, dtype=object)
        b[0] = a
        input_df.loc[i, 'formSeparated'] = b

In [5]:
#Setting precision of dataframe as 0
pd.set_option('precision', 0)

#Setting Dataframe display to max
pd.set_option('display.max_rows', None)

In [7]:
# Reading cleaned data from postgreSQL
complete_df = pd.read_sql_table('completetamildata', con=engine)
separator(complete_df)
complete_df.drop(columns="index", inplace = True)
complete_df.reset_index(inplace=True)
complete_df.head()

Unnamed: 0,index,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,formSeparated
0,0,சென்னை,சென்னை,N,NEN-3SN--,2,,0,0,"[ச, ெ, ன, ், ன, ை]"
1,1,அருகே,அருகே,P,PP-------,18,,0,0,"[அ, ர, ு, க, ே]"
2,2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,4,,0,0,"[ஸ, ், ர, ீ]"
3,3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0,136,"[ப, ெ, ர, ு, ம, ், ப, ு, த, ூ, ர, ி, ல, ்]"
4,4,கிரீன்,கிரீன்,N,NEN-3SN--,6,,0,0,"[க, ி, ர, ீ, ன, ்]"


In [8]:
# Reading morpheme data from postgreSQL
morpheme_df = pd.read_sql_table('morphemes', con=engine)
morpheme_df.head()

Unnamed: 0,index,MorphemeSeparated,Morpheme,xpos Regex,Morpheme continued
0,2,ை,ை,(N.A..S...),
1,3,"{க,்,க,்}",க்க்,(N.D..S...),1.0
2,4,"{க,்,க,ு}",க்கு,(N.D..S...),
3,5,"{க,ு}",கு,(N.D..S...),
4,6,"{க,ள,ு}",களு,(N.D..P...),


In [9]:
# Unravelling the letters of each word so they can be used as features for the model
columns = ['index1', 'index2']
letters = pd.Series(complete_df['formSeparated'])
letters = letters.apply(pd.Series).stack()
letters_df = letters.to_frame(name='letters')
letters_df = letters_df.rename_axis(['index', 'index 2'])
letters_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,letters
index,index 2,Unnamed: 2_level_1
0,0,ச
0,1,ெ
0,2,ன
0,3,்
0,4,ன


In [28]:
# Making features dataframe
features_df = pd.merge(letters_df, complete_df, on="index", how = "right")
features_df.drop(columns = ['head', 'NoSpaceAfter','lemma'], inplace=True)
features_df.reset_index(drop=True, inplace=True)
features_df['prefix'] = ''
features_df['vowel'] = ''
features_df['morpheme boundary'] = ''
features_df['current prefix'] = ''
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,formSeparated,prefix,vowel,morpheme boundary,current prefix
0,0,ச,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",,,,
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",,,,
2,0,ன,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",,,,
3,0,்,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",,,,
4,0,ன,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",,,,


In [12]:
#Getting all unique letters from the dataset
letters = features_df['letters']
letters.drop_duplicates(inplace=True)
letters.reset_index(drop=True, inplace=True)
letters.head()

0    ச
1    ெ
2    ன
3    ்
4    ை
Name: letters, dtype: object

In [13]:
# Manually adding vowels in dataset
ak = letters.loc[64]
ee = letters.loc[61]
ai = letters.loc[58]
ai2 = letters.loc[40]
u = letters.loc[57]
oh = letters.loc[45]
ohh = letters.loc[42]
oh2 = letters.loc[37]
oh3 = letters.loc[33]
ooh = letters.loc[30]
ii = letters.loc[26]
aa = letters.loc[25]
aa2 = letters.loc[23]
ee2 = letters.loc[16]
ooh2 = letters.loc[15]
ee3 = letters.loc[11]
oh4 = letters.loc[9]
ooh3 = letters.loc[7]
aa3 = letters.loc[5]
ai3 = letters.loc[4]
nn = letters.loc[3]
e = letters.loc[1]

vowels = [ak, ee, ai, ai2, u, oh, ohh, oh2, oh3, ooh, ii, aa, aa2, ee2, ooh2, ee3, oh4, ooh3, aa3, ai3, nn, e]


In [29]:
#Filling in vowels manually
for i in range(len(features_df)):
    if features_df.loc[i, 'letters'] in vowels:
        features_df.loc[i, 'vowel'] = 1
    else:
        features_df.loc[i, 'vowel'] = 0
features_df.head()

KeyboardInterrupt: 

In [None]:
#Filtering for one noun accusative case
re = '(N.A..S...)'
NAS_df = features_df[features_df['xpos'].str.contains(rf'{re}')]
NAS_df.drop_duplicates(subset=['form', 'letters'], inplace=True)
for index, row in NAS_df.iterrows():
        if row["FormWithoutLemma"] == '':
            NAS_df.drop(index, inplace=True)
NAS_df.head()

  return func(self, *args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,formSeparated,prefix,vowel,morpheme boundary,current prefix
253,40,வ,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,1,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",,0,,
254,40,ள,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,1,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",,0,,
255,40,ர,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,1,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",,0,,
256,40,்,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,1,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",,1,,
257,40,ச,வளர்ச்சியைக்,N,NNA-3SN--,யைக்,1,"[வ, ள, ர, ், ச, ், ச, ி, ய, ை, க, ்]",,0,,


## The code below is from ML model test

In [36]:
#Changing prefix column to list
features_df.loc[:,"prefix"] = features_df.loc[:,"prefix"].apply(lambda x: [x])
features_df.rename(columns = {'index': 'key'}, inplace = True)
features_df.head()

Unnamed: 0,key,letters,form,upos,xpos,FormWithoutLemma,Counts,formSeparated,prefix,vowel,morpheme boundary,current prefix
0,0,ச,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",[[[['']]]],0,,
1,0,ெ,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",[[[['']]]],1,,
2,0,ன,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",[[[['']]]],0,,
3,0,்,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",[[[['']]]],1,,
4,0,ன,சென்னை,N,NEN-3SN--,,0,"[ச, ெ, ன, ், ன, ை]",[[[['']]]],0,,


In [34]:
#Creating function to calculate prefix of each letter. Works with filtered dataframe but giving an 'int' object is not subscriptable with full dataframe
list = 0
list2 = []
j=0
for index, row in features_df.iterrows():
    try:
        if features_df.loc[index, "key"] == features_df.loc[index+1, "key"]:
            j = j+1
            list = row["formSeparated"]
            row["prefix"] = list[0:j-1]
            value = row["prefix"]
            list2.append(value)
        else:
            j=j+1
            row["prefix"] = list[0:j-1]
            value = row["prefix"]
            list2.append(value)
            j=0
            list = 0
    except KeyError:
        print("Done")
list2
prefix_col = pd.DataFrame([list2])
prefix_col = prefix_col.T
prefix_col.rename(columns = {0: "prefix"}, inplace = True)
prefix_col.head()

    

TypeError: 'int' object is not subscriptable

In [20]:
# Adding prefixes to features datafreame
features_df["prefix"] = prefix_col["prefix"]
features_df.drop(columns="formSeparated", inplace=True)
features_df.head()

Unnamed: 0,index,letters,form,upos,xpos,FormWithoutLemma,Counts,prefix,vowel,morpheme boundary,current prefix
0,0,ச,சென்னை,N,NEN-3SN--,,0,,0,,
1,0,ெ,சென்னை,N,NEN-3SN--,,0,,1,,
2,0,ன,சென்னை,N,NEN-3SN--,,0,,0,,
3,0,்,சென்னை,N,NEN-3SN--,,0,,1,,
4,0,ன,சென்னை,N,NEN-3SN--,,0,,0,,


In [21]:
#Changing datatype of prefix column for ease of encoding
features_df['prefix'] = features_df['prefix'].astype(str)

In [22]:
#Using label encoder for now but will use weight of evidence encoding on letters with the final dataset. Prefix column will be a function of letters and their position. 
#Two dditional columns of all the letters encountered after the last morpheme boundary as well as the parts of speech will be added as well. These will be encoding similar to the prefix column and using one hot encoding respectively.
features_df['letter_label_encoded'] = LabelEncoder().fit_transform(features_df.letters)
features_df['prefix_label_encoded'] = LabelEncoder().fit_transform(features_df.prefix)
features_df.dropna(inplace=True)


In [38]:
#Creating features and target
y = features_df["morpheme boundary"]
X = features_df.drop(columns=['key', 'letters', 'prefix', 'morpheme boundary'])
y=y.astype('int')


ValueError: invalid literal for int() with base 10: ''

In [637]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(45, 3)

In [638]:
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [639]:
model.fit(X_train, y_train)

SVC(kernel='linear')

In [642]:
y_pred = model.predict(X_test)
results = pd.DataFrame({
"Prediction": y_pred,
"Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,1,1
