In [253]:
#Importing dependencies
from sqlalchemy import create_engine
from config import db_password
import pandas as pd

In [254]:
#Creating connection string
# The dialect name must be "postgresql": SQLAlchemy 1.4+ rejects the legacy
# "postgres://" scheme when the engine is created.
# NOTE(review): db_password is interpolated verbatim, so a password containing
# URL-reserved characters (e.g. "@", "/", ":") must already be percent-encoded
# in config -- confirm.
db_string = f"postgresql://postgres:{db_password}@indusscript.cljludlfcgoa.us-east-2.rds.amazonaws.com:5432/postgres"

In [255]:
# Lift the row limit on DataFrame display so frames are shown without truncation
pd.options.display.max_rows = None

In [256]:
#Creating engine
# Built from db_string; this object is what the cells below pass as `con=`
# to pd.read_sql_table and DataFrame.to_sql.
engine = create_engine(db_string)

In [257]:
# Load the 'clitics' table from SQL (it was created in Tamil_Morpheme_List.ipynb),
# renumber the rows from zero and discard the stored "index" column.
clitics_data = (
    pd.read_sql_table('clitics', con=engine)
    .reset_index(drop=True)
    .drop(columns="index")
)
clitics_data.head()

Unnamed: 0,form,Counts,upos,xpos,NoSpaceAfter
0,ஆன,69,T,Tg-------,2.0
1,உள்ளார்,32,V,VR-T3SHAA,2.0
2,உம்,234,T,Tv-------,2.0
3,பட்டு,19,V,VT-T---PA,2.0
4,ப்பட,7,V,VU-T---PA,2.0


In [258]:
# Load the 'completetamil' table from SQL and discard its stored "index" column
tamildata = (
    pd.read_sql_table('completetamil', con=engine)
    .drop(columns="index")
)
tamildata.head()

Unnamed: 0,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,formSeparated
0,சென்னை,சென்னை,N,NEN-3SN--,2,,0.0,0,"{ச,ெ,ன,்,ன,ை}"
1,அருகே,அருகே,P,PP-------,18,,0.0,0,"{அ,ர,ு,க,ே}"
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,4,,0.0,0,"{ஸ,்,ர,ீ}"
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0.0,136,"{ப,ெ,ர,ு,ம,்,ப,ு,த,ூ,ர,ி,ல,்}"
4,கிரீன்,கிரீன்,N,NEN-3SN--,6,,0.0,0,"{க,ி,ர,ீ,ன,்}"


In [259]:
# Build the postposition list: rows of tamildata whose upos tag is C, D, P, Q or T,
# restricted to the columns shared with clitics_data (plus lemma), keeping the
# first row seen for each distinct form.
# The frame is produced in one non-mutating chain: calling drop_duplicates(inplace=True)
# on a frame derived from a boolean-mask slice is what triggers pandas'
# SettingWithCopyWarning, and a single .loc selection avoids the intermediate slice.
postposition_tags = ["C", "D", "P", "Q", "T"]
postpositions = (
    tamildata.loc[
        tamildata["upos"].isin(postposition_tags),
        ["form", "lemma", "Counts", "upos", "xpos", "NoSpaceAfter"],
    ]
    .drop_duplicates(subset="form")
    .reset_index(drop=True)
)
postpositions.tail()


Unnamed: 0,form,lemma,Counts,upos,xpos,NoSpaceAfter
117,ஆ,ஆ,0,T,Te-------,0.0
118,காட்டிலும்,காட்டிலும்,0,T,Tb-------,0.0
119,என்பதைய்,என்,1,T,TzAF3SN-A,1.0
120,பின்னர்,பின்னர்,0,P,PP-------,0.0
121,ஒரு,ஒரு,0,D,DD-------,0.0


In [260]:
# Stack the postpositions underneath the clitics and renumber the rows from zero.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index that
# append followed by reset_index(drop=True) produced. Columns present in only one
# frame (here "lemma") are kept and filled with NaN for the other frame's rows.
# NOTE: this cell reassigns clitics_data from itself, so running it twice without
# reloading clitics_data stacks the postpositions a second time.
clitics_data = pd.concat([clitics_data, postpositions], ignore_index=True)
clitics_data.tail()

Unnamed: 0,form,Counts,upos,xpos,NoSpaceAfter,lemma
187,ஆ,0,T,Te-------,0.0,ஆ
188,காட்டிலும்,0,T,Tb-------,0.0,காட்டிலும்
189,என்பதைய்,1,T,TzAF3SN-A,1.0,என்
190,பின்னர்,0,P,PP-------,0.0,பின்னர்
191,ஒரு,0,D,DD-------,0.0,ஒரு


In [261]:
# Give every clitic/postposition form an identifier that can stand in for it as a logogram.
# The id is the form's row label in clitics_data, rendered as text and left-padded
# with zeros to at least three characters. Rows are ordered longest form first,
# which is the order in which the replacement loop further down applies them.
longest_first = clitics_data["form"].str.len().sort_values(ascending=False).index
clitics = clitics_data["form"].to_frame().reindex(longest_first)
clitics["id"] = clitics.index.astype(str).str.zfill(3)
clitics = clitics.reset_index(drop=True)
clitics.head()

Unnamed: 0,form,id
0,இருக்கிறீர்கள்,65
1,இருக்கினறனர்,47
2,காட்டிலும்,188
3,இருக்கின்ற,59
4,இடமிருந்து,186


In [266]:
#Adding clitics, postpositions and identifiers to sql
# if_exists="replace": to_sql defaults to if_exists="fail", which raises ValueError
# as soon as the table exists and so breaks every re-run of this notebook. The table
# is written entirely from `clitics`, so it is rebuilt rather than appended to.
clitics.to_sql(name ='cliticsandpostpositions', con=engine, if_exists="replace")

In [264]:
# Overwrite the "form" column: every occurrence of a clitic/postposition inside a
# word form is replaced by that clitic's id. Replacements are applied one clitic at
# a time, in the row order of `clitics` (longest form first).

for clitic_form, clitic_id in zip(clitics["form"], clitics["id"]):
    # Skip unusable patterns: a missing (NaN) form is not a string, and an empty
    # string would insert the id between every character of every word.
    if not isinstance(clitic_form, str) or not clitic_form:
        continue
    # regex=False: the forms are literal text. Without it, pandas versions before
    # 2.0 interpret the pattern as a regular expression.
    tamildata["form"] = tamildata["form"].str.replace(clitic_form, clitic_id, regex=False)

tamildata.head()

Unnamed: 0,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,formSeparated
0,சென்னை,சென்னை,N,NEN-3SN--,2,,0.0,0,"{ச,ெ,ன,்,ன,ை}"
1,070,அருகே,P,PP-------,18,,0.0,0,"{அ,ர,ு,க,ே}"
2,ஸ்ரீ,ஸ்ரீ,N,NEN-3SN--,4,,0.0,0,"{ஸ,்,ர,ீ}"
3,பெரும்புதூரில்,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0.0,136,"{ப,ெ,ர,ு,ம,்,ப,ு,த,ூ,ர,ி,ல,்}"
4,கிரீன்,கிரீன்,N,NEN-3SN--,6,,0.0,0,"{க,ி,ர,ீ,ன,்}"


In [265]:
# Sending tamildata with converted clitics to sql
# if_exists="replace": to_sql defaults to if_exists="fail", which raises ValueError
# as soon as the table exists and so breaks every re-run of this notebook. The table
# is written entirely from `tamildata`, so it is rebuilt rather than appended to.
tamildata.to_sql(name ='convertedcliticstamil', con=engine, if_exists="replace")