In [1]:
#Importing dependencies
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import math
import re
import numpy as np

In [2]:
# Creating connection string
db_string = f"postgres://postgres:{db_password}@indusscript.cljludlfcgoa.us-east-2.rds.amazonaws.com:5432/postgres"

In [3]:
#Setting Dataframe display to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [4]:
#Creating engine
engine = create_engine(db_string)

In [5]:
# Reading logosyllabic sentences data from postgreSQL
logosyllabic_sentence_df = pd.read_sql_table('logo_syllabic_tamil_sentences_with_names', con=engine)
logosyllabic_sentence_df.head()

Unnamed: 0,index,Sentence
0,0,5000 106 5001 5002-2008 5003 5004 ( 5005 ) 5006 5007-2006-2001-2001-155 5008 5009-3003-2001-3006 5010 85 5011-3004-2008 5012-3005 5013-3006 98 5014 5015 5016-2021-47 .
1,1,"5018 5019-3007 , 5020 5021 5022-2022 5023 : ."
2,2,"5024 11 5006-3009 5025-2008 5026-3010 5027-3006 5028-3004-2000-2025 5029-2008 5030-3011 , 5031 5032-3012-2012-2008 -107 5006 5007-3012-2012-2000 5033-3013-149 , 5034-3007 5035 5006 5007-3012-2012-2000 5036-3014-149 5037 5038 5039 5040-2021-2006 ."
3,3,"5041 , 5042 , 5043 , 5044 , 5000 5045-2020 5006 5007-3012-2012-2000 5046 5047 5048-3012-2012 5049-3017-100 5050-3009 5013-2035-2017 ."
4,4,"5051-2021-2008-149 , 5052-2021-2008-149 5003 5004 5006 5007-3012-2012-2000 5036-2006 5053-2021-2000 5054-3020 5030-3021-2039 5055-2022-2017-2034 ."


In [6]:
#Reading logosyllabic words from postgreSQL
logosyl_data = pd.read_sql_table('logo_syllabic_tamil_with_names', con=engine)
logosyl_data.drop(columns="index", inplace=True)
s = logosyl_data.index1.sort_values().index
logosyl_data = logosyl_data.reindex(s)
logosyl_data.reset_index(drop=True, inplace=True)
logosyl_data.head()

Unnamed: 0,form,lemma,upos,xpos,head,FormWithoutLemma,NoSpaceAfter,Counts,MorphemeSeparated,index1,index2
0,5000,சென்னை,N,NEN-3SN--,2,,0.0,0,"{ச,ெ,ன,்,ன,ை}",0,6
1,106,அருகே,P,PP-------,18,,0.0,0,"{அ,ர,ு,க,ே}",1,5
2,5001,ஸ்ரீ,N,NEN-3SN--,4,,0.0,0,"{ஸ,்,ர,ீ}",2,4
3,5002-2008,பெரும்புதூர்,N,NEL-3SN--,18,ில்,0.0,136,"{ப,ெ,ர,ு,ம,்,ப,ு,த,ூ,ர,ி,ல,்}",3,11
4,5003,கிரீன்,N,NEN-3SN--,6,,0.0,0,"{க,ி,ர,ீ,ன,்}",4,6


In [7]:
#Reading all logograms from sql
all_logograms = pd.read_sql_table('all_logograms', con=engine)
all_logograms["Frequency"] = 0

In [8]:
all_logograms["id"] = all_logograms["id"].str.replace('-',"")

# Calculating frequencies

In [9]:
for i in range(len(all_logograms["id"])):
    letter = all_logograms.loc[i, "id"]
    all_logograms.loc[i, "Frequency"] = logosyllabic_sentence_df["Sentence"].str.count(letter).sum()
    

In [10]:
all_logograms.rename(columns={'id': 'sign'}, inplace=True)
all_logograms.head()

Unnamed: 0,lemma,sign,Frequency
0,துச்சம்,5585,1
1,தானியங்கி,5966,3
2,தகவல்,5330,6
3,்டிற்,3241,2
4,மனிதநேயம்,5858,1


In [11]:
numbers = ['0','1','2','3','4','5','6','7','8','9']


# Calculating initial sign frequencies

In [12]:
# Finding initial signs
inital_sign_df = pd.DataFrame(columns = ['initial sign'])

for i in range(len(logosyllabic_sentence_df)):
    Sentence = logosyllabic_sentence_df.iloc[i]

    #Creating a dataframe with inital signs for each sentence
    Sign = Sentence.str.extractall(r'(?:([^-\d])|(^))(\d+)')
    Sign_df = pd.DataFrame(Sign)
    Sign_df.reset_index(drop=True, inplace=True)
    Sign_df.drop(columns=[0,1], inplace=True)
    Sign_df.rename(columns={2:'initial sign'}, inplace=True)

    # Adding it to inital sign dataframe
    inital_sign_df = inital_sign_df.append(Sign_df)
    inital_sign_df.reset_index(drop=True, inplace=True)


In [13]:
inital_sign_df.head()

Unnamed: 0,initial sign
0,5000
1,106
2,5001
3,5002
4,5003


In [14]:
# Calculating frequency of initial signs
inital_sign_freq_df = pd.DataFrame(inital_sign_df["initial sign"].value_counts())
inital_sign_freq_df.reset_index(inplace=True)
inital_sign_freq_df.rename(columns={'index': 'sign', 'initial sign': 'Initial Frequency'}, inplace=True)
inital_sign_freq_df.head()

Unnamed: 0,sign,Initial Frequency
0,5020,93
1,5164,89
2,5235,80
3,5040,78
4,5154,77


In [15]:
#Merge onto all logograms
all_logograms = all_logograms.merge(inital_sign_freq_df, on="sign", how ="left")

In [16]:
all_logograms.head()

Unnamed: 0,lemma,sign,Frequency,Initial Frequency
0,துச்சம்,5585,1,1.0
1,தானியங்கி,5966,3,3.0
2,தகவல்,5330,6,6.0
3,்டிற்,3241,2,
4,மனிதநேயம்,5858,1,1.0


# Calculating terminal sign frequencies

In [17]:
# Finding terminal signs
terminal_sign_df = pd.DataFrame(columns = ['terminal sign'])

for i in range(len(logosyllabic_sentence_df)):
    Sentence = logosyllabic_sentence_df.iloc[i]

    #Creating a dataframe with inital signs for each sentence
    Sign = Sentence.str.extractall(r'(?:([\d]+[\s]))')
    Sign_df = pd.DataFrame(Sign)
    Sign_df.reset_index(drop=True, inplace=True)
    Sign_df.rename(columns={0:'terminal sign'}, inplace=True)

    # Adding it to inital sign dataframe
    terminal_sign_df = terminal_sign_df.append(Sign_df)
    terminal_sign_df.reset_index(drop=True, inplace=True)


In [18]:
terminal_sign_df.head()

Unnamed: 0,terminal sign
0,5000
1,106
2,5001
3,2008
4,5003


In [19]:
# Calculating frequency of terminal signs
terminal_sign_freq_df = pd.DataFrame(terminal_sign_df["terminal sign"].value_counts())
terminal_sign_freq_df.reset_index(inplace=True)
terminal_sign_freq_df.rename(columns={'index': 'sign', 'terminal sign': 'Terminal Frequency'}, inplace=True)
terminal_sign_freq_df["sign"]=terminal_sign_freq_df["sign"].str.replace(' ','')
terminal_sign_freq_df.head()

Unnamed: 0,sign,Terminal Frequency
0,2008,437
1,2012,246
2,149,225
3,2006,220
4,2000,213


In [20]:
#Merge onto all logograms
all_logograms = all_logograms.merge(terminal_sign_freq_df, on="sign", how ="left")

In [21]:
all_logograms.head()

Unnamed: 0,lemma,sign,Frequency,Initial Frequency,Terminal Frequency
0,துச்சம்,5585,1,1.0,
1,தானியங்கி,5966,3,3.0,3.0
2,தகவல்,5330,6,6.0,3.0
3,்டிற்,3241,2,,2.0
4,மனிதநேயம்,5858,1,1.0,1.0


# Calculating natural logs for each sign

In [22]:
all_logograms["log Initial Frequency"] = ''
all_logograms["log Terminal Frequency"] = ''
all_logograms["log Frequency"] = ''
all_logograms["NPI(i)"] = ''
all_logograms["NPI(f)"] = ''
all_logograms.fillna(0, inplace=True)

In [23]:
all_logograms["log Initial Frequency"] = np.log(all_logograms["Initial Frequency"])
all_logograms["log Terminal Frequency"] = np.log(all_logograms["Terminal Frequency"])
all_logograms["log Frequency"] = np.log(all_logograms["Frequency"])

all_logograms.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,lemma,sign,Frequency,Initial Frequency,Terminal Frequency,log Initial Frequency,log Terminal Frequency,log Frequency,NPI(i),NPI(f)
0,துச்சம்,5585,1,1.0,0.0,0.0,-inf,0.0,,
1,தானியங்கி,5966,3,3.0,3.0,1.098612,1.098612,1.098612,,
2,தகவல்,5330,6,6.0,3.0,1.791759,1.098612,1.791759,,
3,்டிற்,3241,2,0.0,2.0,-inf,0.693147,0.693147,,
4,மனிதநேயம்,5858,1,1.0,1.0,0.0,0.0,0.0,,


In [24]:
#Only calculating NPI(i) if intial frequency is greater than terminal frequency
temp_df = all_logograms[all_logograms["Initial Frequency"]>all_logograms["Terminal Frequency"]]
temp_df["NPI(i)"] = -temp_df["log Initial Frequency"]/temp_df["log Frequency"]
mask = temp_df.index
all_logograms.loc[mask, ["NPI(i)"]] = temp_df.loc[:, ["NPI(i)"]]
all_logograms.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["NPI(i)"] = -temp_df["log Initial Frequency"]/temp_df["log Frequency"]


Unnamed: 0,lemma,sign,Frequency,Initial Frequency,Terminal Frequency,log Initial Frequency,log Terminal Frequency,log Frequency,NPI(i),NPI(f)
0,துச்சம்,5585,1,1.0,0.0,0.0,-inf,0.0,,
1,தானியங்கி,5966,3,3.0,3.0,1.098612,1.098612,1.098612,,
2,தகவல்,5330,6,6.0,3.0,1.791759,1.098612,1.791759,-1.0,
3,்டிற்,3241,2,0.0,2.0,-inf,0.693147,0.693147,,
4,மனிதநேயம்,5858,1,1.0,1.0,0.0,0.0,0.0,,


In [25]:
#Only calculating NPI(f) if terminal frequency is greater than initial frequency
temp_df = all_logograms[all_logograms["Terminal Frequency"]>all_logograms["Initial Frequency"]]
temp_df["NPI(f)"] = -temp_df["log Terminal Frequency"]/temp_df["log Frequency"]
mask = temp_df.index
all_logograms.loc[mask, ["NPI(f)"]] = temp_df.loc[:, ["NPI(f)"]]
all_logograms.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["NPI(f)"] = -temp_df["log Terminal Frequency"]/temp_df["log Frequency"]


Unnamed: 0,lemma,sign,Frequency,Initial Frequency,Terminal Frequency,log Initial Frequency,log Terminal Frequency,log Frequency,NPI(i),NPI(f)
0,துச்சம்,5585,1,1.0,0.0,0.0,-inf,0.0,,
1,தானியங்கி,5966,3,3.0,3.0,1.098612,1.098612,1.098612,,
2,தகவல்,5330,6,6.0,3.0,1.791759,1.098612,1.791759,-1.0,
3,்டிற்,3241,2,0.0,2.0,-inf,0.693147,0.693147,,-1.0
4,மனிதநேயம்,5858,1,1.0,1.0,0.0,0.0,0.0,,


# Calculating initial sign pair frequencies

In [26]:
# Finding initial sign pairs
initial_sign_pair_df = pd.DataFrame(columns = ['initial sign pair'])

for i in range(len(logosyllabic_sentence_df)):
    Sentence = logosyllabic_sentence_df.iloc[i]

    #Creating a dataframe with inital signs for each sentence
    Sign = Sentence.str.extractall(r'(?:([^-\d])|(^))(\d+-\d+)')
    Sign_df = pd.DataFrame(Sign)
    Sign_df.reset_index(drop=True, inplace=True)
    Sign_df.drop(columns=[0,1], inplace=True)
    Sign_df.rename(columns={2:'initial sign pair'}, inplace=True)

    # Adding it to inital sign dataframe
    initial_sign_pair_df = initial_sign_pair_df.append(Sign_df)
    initial_sign_pair_df.reset_index(drop=True, inplace=True)

In [27]:
initial_sign_pair_df.head()

Unnamed: 0,initial sign pair
0,5002-2008
1,5007-2006
2,5009-3003
3,5011-3004
4,5012-3005


In [28]:
# Calculating frequency of initial signs
initial_sign_pair_freq_df = pd.DataFrame(initial_sign_pair_df["initial sign pair"].value_counts())
initial_sign_pair_freq_df.reset_index(inplace=True)
initial_sign_pair_freq_df.rename(columns={'index': 'initial sign pair', 'initial sign pair': 'frequency'}, inplace=True)
initial_sign_pair_freq_df.head()

Unnamed: 0,initial sign pair,frequency
0,5235-2021,54
1,5024-2012,49
2,5017-3037,49
3,5079-3006,48
4,5095-2019,29


# Calculating terminal sign pair frequencies

In [29]:
# Finding terminal sign pairs
terminal_sign_pair_df = pd.DataFrame(columns = ['terminal sign pair'])

for i in range(len(logosyllabic_sentence_df)):
    Sentence = logosyllabic_sentence_df.iloc[i]

    #Creating a dataframe with inital signs for each sentence
    Sign = Sentence.str.extractall(r'(?:([\d]+-[\d]+[^-\d]))')
    Sign_df = pd.DataFrame(Sign)
    Sign_df.reset_index(drop=True, inplace=True)
    Sign_df.rename(columns={0:'terminal sign pair'}, inplace=True)

    # Adding it to inital sign dataframe
    terminal_sign_pair_df = terminal_sign_pair_df.append(Sign_df)
    terminal_sign_pair_df.reset_index(drop=True, inplace=True)

In [30]:
terminal_sign_pair_df.head()

Unnamed: 0,terminal sign pair
0,5002-2008
1,2001-155
2,2001-3006
3,3004-2008
4,5012-3005


In [31]:
# Calculating frequency of terminal signs
terminal_sign_pair_freq_df = pd.DataFrame(terminal_sign_pair_df["terminal sign pair"].value_counts())
terminal_sign_pair_freq_df.reset_index(inplace=True)
terminal_sign_pair_freq_df.rename(columns={'index': 'terminal sign pair', 'terminal sign pair': 'frequency'}, inplace=True)
terminal_sign_pair_freq_df.head()

Unnamed: 0,terminal sign pair,frequency
0,3004-2008,144
1,2012-2000,68
2,2021-2008,60
3,2012-2008,59
4,5079-3006,48


In [32]:
#Calculating Initial Frequencies
for i in range(len(logosyllabic_sentence_df["Sentence"])):
    logo = []
    for j in range(6):
        digit = 0
        if logosyllabic_sentence_df.loc[i, "Sentence"][j] in numbers:
            digit = logosyllabic_sentence_df.loc[i, "Sentence"][j][0]
            logo.append(digit)
        else:
            logogram = ''.join(logo)
    for k in range(len(all_logograms["id"])):
        if all_logograms.loc[k, "id"] == logogram:
            all_logograms.loc[k, "Initial Frequency"] = all_logograms.loc[k, "Initial Frequency"] + 1

KeyError: 'id'

In [None]:
#Calculating Terminal Frequencies
for i in range(len(logosyllabic_sentence_df["Sentence"])):
    logo = []
    length = len(logosyllabic_sentence_df.loc[i, "Sentence"])
    for j in range(length-1, length-6, -1):
        digit = 0
        if logosyllabic_sentence_df.loc[i, "Sentence"][j] in numbers:
            digit = logosyllabic_sentence_df.loc[i, "Sentence"][j][0]
            logo = [digit]+logo
        else:
            logogram = ''.join(logo)
    for k in range(len(all_logograms["id"])):
        if all_logograms.loc[k, "id"] == logogram:
            all_logograms.loc[k, "Terminal Frequency"] = all_logograms.loc[k, "Terminal Frequency"] + 1

In [None]:
#Creating sign pairs
columns = ['Sign pairs']
sign_pairs = pd.DataFrame(columns = columns)
sign_pairs.head()

Unnamed: 0,Sign pairs


In [None]:
# Function to extract sign pairs from sentences
for l in range(len(logosyllabic_sentence_df["Sentence"])):
    indexes = []
    try:
        for i in range(len(logosyllabic_sentence_df.loc[l, "Sentence"])):
            if (logosyllabic_sentence_df.loc[l, "Sentence"][i] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i+1] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i-1] not in numbers):
                indexes.append(i)
    except:
        pass
    words = []
    try:
        for i in range(len(indexes)):
            j = indexes[i]
            letter =[]
            for k in range(6):
                if logosyllabic_sentence_df.loc[l, "Sentence"][j+k] in numbers:
                    letter.append(logosyllabic_sentence_df.loc[l, "Sentence"][j+k])
                else:
                    word = ''.join(letter)
            words.append(word)
    except:
        pass
    for i in range(len(words)):
        try:
            first = words[i]
            second = words[i+1]
            sign_pairs.loc[len(sign_pairs), "Sign pairs"] = [first, second]
        except:
            pass

In [None]:
sign_pairs.head()

Unnamed: 0,Sign pairs
0,"[5000, 106]"
1,"[106, 5001]"
2,"[5001, 5002]"
3,"[5002, 2008]"
4,"[2008, 5003]"


In [None]:
# Calculating frequency of sign pairs
sign_pairs_df = pd.DataFrame(sign_pairs["Sign pairs"].value_counts())
sign_pairs_df.reset_index(inplace=True)
sign_pairs_df.rename(columns={'index': 'Sign pairs', 'Sign pairs': 'frequency'}, inplace=True)
sign_pairs_df.head()


Unnamed: 0,Sign pairs,frequency
0,"[3004, 2008]",155
1,"[2012, 2000]",104
2,"[3012, 2012]",94
3,"[2012, 2008]",68
4,"[2021, 2008]",67


In [None]:
all_logograms.sort_values(by=['Terminal Frequency'], ascending=False, inplace=True)
all_logograms.head()

Unnamed: 0,lemma,id,Initial Frequency,Frequency,Terminal Frequency,Sign Pairs
1607,உள்ளது,78,0,219,66,0
833,உள்ளார்,47,0,129,30,0
703,பட்டது,77,0,149,20,0
818,உள்ளனர்,57,0,514,15,0
1668,இருந்தது,18,0,244,6,0


In [None]:
# Creating initial sign pairs
columns = ['Initial sign pairs']
initial_sign_pairs = pd.DataFrame(columns = columns)
initial_sign_pairs.head()

Unnamed: 0,Initial sign pairs


In [None]:
# Finding initial sign pairs
for l in range(len(logosyllabic_sentence_df["Sentence"])):
    indexes = []
    try:
        for i in range(len(logosyllabic_sentence_df.loc[l, "Sentence"])):
            if (logosyllabic_sentence_df.loc[l, "Sentence"][i] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i+1] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i-1] not in numbers):
                indexes.append(i)
        indexes = indexes[0:2]
    except:
        pass
    words = []
    try:
        for i in range(len(indexes)):
            j = indexes[i]
            letter =[]
            for k in range(6):
                if logosyllabic_sentence_df.loc[l, "Sentence"][j+k] in numbers:
                    letter.append(logosyllabic_sentence_df.loc[l, "Sentence"][j+k])
                else:
                    word = ''.join(letter)
            words.append(word)
    except:
        pass
    for i in range(len(words)):
        try:
            first = words[i]
            second = words[i+1]
            initial_sign_pairs.loc[len(initial_sign_pairs), "Initial sign pairs"] = [first, second]
        except:
            pass

In [None]:
# Calculating frequency of sign pairs
initial_sign_pairs_df = pd.DataFrame(initial_sign_pairs["Initial sign pairs"].value_counts())
initial_sign_pairs_df.reset_index(inplace=True)
initial_sign_pairs_df.rename(columns={'index': 'Initial Sign pairs', 'Initial sign pairs': 'frequency'}, inplace=True)
initial_sign_pairs_df.head()

Unnamed: 0,Initial Sign pairs,frequency
0,"[5154, 5024]",7
1,"[5018, 2008]",6
2,"[121, 5139]",5
3,"[5164, 2031]",5
4,"[5084, 5020]",5


In [None]:
# Calculating terminal sign pairs
columns = ['Terminal sign pairs']
terminal_sign_pairs = pd.DataFrame(columns = columns)
terminal_sign_pairs.head()

Unnamed: 0,Terminal sign pairs


In [None]:
# Finding initial sign pairs
for l in range(len(logosyllabic_sentence_df["Sentence"])):
    indexes = []
    try:
        for i in range(len(logosyllabic_sentence_df.loc[l, "Sentence"])):
            if (logosyllabic_sentence_df.loc[l, "Sentence"][i] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i+1] in numbers) and (logosyllabic_sentence_df.loc[l, "Sentence"][i-1] not in numbers):
                indexes.append(i)
        indexes = indexes[-2:]
    except:
        pass
    words = []
    try:
        for i in range(len(indexes)):
            j = indexes[i]
            letter =[]
            for k in range(6):
                if logosyllabic_sentence_df.loc[l, "Sentence"][j+k] in numbers:
                    letter.append(logosyllabic_sentence_df.loc[l, "Sentence"][j+k])
                else:
                    word = ''.join(letter)
            words.append(word)
    except:
        pass
    for i in range(len(words)):
        try:
            first = words[i]
            second = words[i+1]
            terminal_sign_pairs.loc[len(terminal_sign_pairs), "Terminal sign pairs"] = [first, second]
        except:
            pass

In [None]:
# Calculating frequency of sign pairs
terminal_sign_pairs_df = pd.DataFrame(terminal_sign_pairs["Terminal sign pairs"].value_counts())
terminal_sign_pairs_df.reset_index(inplace=True)
terminal_sign_pairs_df.rename(columns={'index': 'Terminal Sign pairs', 'Terminal sign pairs': 'frequency'}, inplace=True)
terminal_sign_pairs_df.head()

Unnamed: 0,Terminal Sign pairs,frequency
0,"[2014, 2006]",38
1,"[2021, 2029]",36
2,"[2017, 2034]",35
3,"[2007, 2029]",27
4,"[5079, 3006]",17
