In [32]:
import spacy
import pandas as pd
from nltk import edit_distance
import tqdm
from nltk.stem import SnowballStemmer
import nltk

In [33]:
df = pd.read_excel("phonItalia 1.10 - word forms.xlsx")
df = df[['fqTot', 'gramCat', 'lemma', 'word', 'Phones']] # only keep relevant columns
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones
0,50518,P,a,a,a
1,5219,P IN P@,a,a,a
2,1544,P IN B@,a,a,a
3,272,P IN C@,a,a,a
4,234,S,a,a,a
...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli
119997,1,E,zurlini,zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone


In [34]:
phones = dict(zip(df['word'], df['Phones']))
df['lemma_phones'] = df['lemma'].apply(lambda i: phones.get(i, ''))
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
0,50518,P,a,a,a,a
1,5219,P IN P@,a,a,a,a
2,1544,P IN B@,a,a,a,a
3,272,P IN C@,a,a,a,a
4,234,S,a,a,a,a
...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli,Zurli
119997,1,E,zurlini,zurlini,Zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone


In [35]:
# df = df[df['lemma_phones'] != df['Phones']] # remove uninflected words
df = df[df['lemma_phones'] != ''] # remove words with broken lemmas
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
0,50518,P,a,a,a,a
1,5219,P IN P@,a,a,a,a
2,1544,P IN B@,a,a,a,a
3,272,P IN C@,a,a,a,a
4,234,S,a,a,a,a
...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli,Zurli
119997,1,E,zurlini,zurlini,Zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone


In [36]:
stemmer = nltk.stem.SnowballStemmer('italian')
df['nltk_stem'] = df['word'].apply(stemmer.stem)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['nltk_stem'] = df['word'].apply(stemmer.stem)


Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones,nltk_stem
0,50518,P,a,a,a,a,a
1,5219,P IN P@,a,a,a,a,a
2,1544,P IN B@,a,a,a,a,a
3,272,P IN C@,a,a,a,a,a
4,234,S,a,a,a,a,a
...,...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli,zurl
119996,2,E IN E@,zurli',zurli',Zurli,Zurli,zurli'
119997,1,E,zurlini,zurlini,Zurlini,Zurlini,zurlin
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone,zuzzurellon


In [37]:
nlp = spacy.load("it_core_news_lg")

In [38]:
df['morph'] = ''
df['pos'] = ''

for col in ['Gender', 'Number', 'Mood', 'Person', 'Tense', 'VerbForm', 'Clitic', 'PronType', 'Foreign', 'Degree', 'NumType', 'Definite', 'Poss']:
    df[col] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    try:
        doc = nlp(row['word'])
        tok = doc[0]
        df.loc[idx, 'morph'] = str(tok.morph)
        df.loc[idx, 'pos'] = tok.pos_
        for k, v in tok.morph.to_dict().items():
            df.loc[idx, k] = v
    except Exception as e:
        print(e, idx, row)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['morph'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pos'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation

In [39]:
conv = {
    "B": "ADV",
    "C": "CCONJ", # or sconj? unknown
    "E": "NOUN",
    "G": "ADJ",
    "I": "INTJ",
    "N": "PRON",
    "P": "PREP",
    "K": "PUNCT",
    "R": "ART",
    "S": "NOUN", #substantive?
    "V": "VERB",
    "X": "X",
    "Z": "SYM",
    "NU": "NUM",
    "TC": "VERB", #composed verb?
    "VA": "AUX",
    "U": "X",
}

df["realCat"] = df["gramCat"].apply(conv.get)
df["mismarked"] = df["gramCat"].apply(conv.get) != df["pos"]
df["mismarked"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["realCat"] = df["gramCat"].apply(conv.get)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mismarked"] = df["gramCat"].apply(conv.get) != df["pos"]


mismarked
False    66244
True     44704
Name: count, dtype: int64

In [40]:
df = df.fillna('nan')
df['nltk_extra'] = ''
for idx, row in tqdm.tqdm(df.iterrows()):
    w = row["word"]
    l = row["nltk_stem"]
    try:
        df.loc[idx, 'nltk_extra'] = w[len(l):]
    except:
        print(w, l)

110948it [00:19, 5554.12it/s]


In [41]:
# x = df.copy(deep=True)

In [42]:
# x.to_excel("dataset-test-include-uninflected.xlsx")

In [43]:
# df = x.copy(deep=True)

In [44]:
# df = pd.read_excel("dataset-test-include-uninflected.xlsx")

In [45]:
df.to_excel("dataset-test-include-uninflected.xlsx")

In [46]:
import os

def find_actual_lemma(row_number, new_lemma_phones):
    best_cp, shortest_length, best = 0, 1000, ""
    buffer = 10
    for i in range(max(0, row_number - buffer), min(58258, row_number + buffer)):
        row = df.iloc[i]
        l = row["lemma_phones"]
        cp = len(os.path.commonprefix([l, new_lemma_phones]))
        if cp > best_cp or (cp == best_cp and len(l) > shortest_length):
            best_cp = cp
            shortest_length = len(l)
            best = l
        print(new_lemma_phones, cp, best)
    return best

In [47]:
import numpy as np
suffixes = {
    "ADJ": ["an", "os", "iv", "ic", "al", "tori", "in", "eo", "ea", "istic", "asc", "esc", "izi", "oid", "ign", "ace", "ard", "asc", "esc"],
    "NOUN": ["ment", "zion", "ist", "ism", "sim", "agli", "am", "um", "at", "et", "il", "ess", "aggi", "nz"],
    "ADV": ["mente"],
    "VERB": ["izz", "ific", "ell", "arell", "erell", "icchi", "acchi"]
}
# 
df['derivational'] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    w = row['word']
    if row["realCat"] not in suffixes:
        continue
    for s in suffixes[row["realCat"]]:
        new_phones = None
        if row['nltk_extra'] is np.nan or row['nltk_stem'] is np.nan:
            continue
        if s in row['nltk_extra']:
            df.loc[idx, 'derivational'] = s
            if row['lemma_phones'].endswith(s):
                new_phones = row['lemma_phones'][:-len(s)]
            elif row['lemma_phones'][:-1].endswith(s):
                new_phones = row['lemma_phones'][:-1][:-len(s)]
            df.loc[idx, 'lemma_phones'] = row['lemma_phones'][:len(row['nltk_stem'])]
        # elif row['nltk_stem'].endswith(s):
        #     new_phones = row['lemma_phones'][:-len(s)]
        #     df.loc[idx, 'derivational'] = s
        # elif row['nltk_stem'][:-1].endswith(s):
        #     new_phones = row['lemma_phones'][:-1][:-len(s)]
        #     df.loc[idx, 'derivational'] = s

        # if new_phones is not None and row['word'] == 'accettavano':
        #     print(row)
        #     print(new_phones)
        #     print(find_actual_lemma(idx, new_phones))
        #     input()
        
        # if new_phones is not None:
        #     # print(row['word'], row['lemma_phones'], row['nltk_stem'], row['nltk_extra'], s, new_phones)
        #     # print(find_actual_lemma(idx, new_phones))
        #     # input()
        #     lp = find_actual_lemma(idx, new_phones)
        #     df.loc[idx, 'lemma_phones'] = lp
        #     if lp == "":
        #         print(row)
        #         input()
        #     # if idx == 613:
        #     #     print(row)
        #     #     print(df.iloc[613])
        #     #     input()
        #     break

110948it [00:35, 3110.49it/s]


In [48]:
df.to_excel("dataset-test-include-uninflected.xlsx")

In [49]:
df['derivational'].value_counts()

derivational
         100651
at         1963
ic         1878
zion       1155
mente      1075
an          771
iv          720
os          539
am          533
ism         445
nz          418
ist         402
ment        259
il           63
tori         35
ea           21
et           18
sim           2
Name: count, dtype: int64

In [50]:
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones,nltk_stem,morph,pos,Gender,...,PronType,Foreign,Degree,NumType,Definite,Poss,realCat,mismarked,nltk_extra,derivational
0,50518,P,a,a,a,a,a,,ADP,,...,,,,,,,PREP,True,,
1,5219,P IN P@,a,a,a,a,a,,ADP,,...,,,,,,,,True,,
2,1544,P IN B@,a,a,a,a,a,,ADP,,...,,,,,,,,True,,
3,272,P IN C@,a,a,a,a,a,,ADP,,...,,,,,,,,True,,
4,234,S,a,a,a,a,a,,ADP,,...,,,,,,,NOUN,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli,zurl,Clitic=Yes|Gender=Masc|Number=Plur|Person=3|Pr...,VERB,Masc,...,Prs,,,,,,NOUN,True,i,
119996,2,E IN E@,zurli',zurli',Zurli,Zurli,zurli',Clitic=Yes|Gender=Masc|Number=Plur|Person=3|Pr...,VERB,Masc,...,Prs,,,,,,,True,,
119997,1,E,zurlini,zurlini,Zurlini,Zurlini,zurlin,,PROPN,,...,,,,,,,NOUN,True,i,
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone,zuzzurellon,Clitic=Yes|PronType=Prs|VerbForm=Ger,VERB,,...,Prs,,,,,,ADJ,True,e,


In [60]:
df.to_excel("dataset-test-include-uninflected.xlsx")

In [None]:
# import pandas as pd
# df = pd.read_excel("dataset-test-include-uninflected.xlsx")

In [52]:
clitics = {
    "me": "1SI",
    "mi": "1SD",
    "te": "2SI",
    "ti": "2SD",
    "vi": "2PD",
    "lo": "3SD",
    "la": "3SDF",
    "le": "3SIF",
    "gli": "3SDM",
    "l'": "3SIM",
    "li": "3PD",
    "si": "1PD",
    "ci": "1PI",
    "se": "3SDA",
    "ce": "3SIA",
    "ne": "3SDA"
}

for idx, row in tqdm.tqdm(df.iterrows()):
    if row["Clitic"] == "Yes":
        clitic = row["word"][-2:]
        try:
            df.loc[idx, "Clitic"] = clitics[clitic]
        except:
            pass # todo
#             print(row["word"])

110948it [00:02, 45134.35it/s]


In [53]:
df["Clitic"].value_counts()

Clitic
        104023
1PD       1440
3SD        914
3PD        825
3SDF       717
Yes        644
3SDA       535
1SD        464
1PI        457
3SIF       441
2SD        280
2PD        163
2SI         24
3SIA        18
1SI          2
3SIM         1
Name: count, dtype: int64

In [54]:
import tqdm
df["lemma_phones"].value_counts()

lemma_phones
Essere      212
fare        183
avere       176
andare      142
dare        106
           ... 
egeljano      1
assan         1
ammamEt       1
amburGer      1
ZuZZurro      1
Name: count, Length: 42451, dtype: int64

In [55]:
df = df.replace({'': 'nan'})

In [56]:
inp = open("inputs.txt", "w")
out = open("outputs.txt", "w")

df = df.fillna('nan')

use = ["Gender", "Number", "Person", "Tense", "Clitic", "Mood", "VerbForm"]

for idx, row in tqdm.tqdm(df.iterrows()):
    in_phones = " ".join(row["lemma_phones"])
    out_phones = " ".join(row["Phones"])
    
    in_ = ""
    out_ = ""
        
    inflection = ' '.join([row[i]
                          for i in use])
    inflection_add = ''.join([' - ' for i in use])
    derivation = None
    if row["derivational"] != "nan":
        derivation = row["pos"] + " - "
    else:
        derivation = " - " + row["pos"]
    if derivation:
        in_ += " " + derivation
        out_ += " -  - "
    else: # add 6th tag
        in_ += " - "
        out_ += " - "        
    in_ += " " + inflection
    out_ += inflection_add

    in_ += " " + in_phones
    out_ += " " + out_phones

    if in_phones == 'n a n':
        print(row)
        input()
    
#     print(in_, out_)
#     input()
    
    in_ = in_.strip()
    out_ = out_.strip()
    
    inp.write(in_ + "\n")
    out.write(out_ + "\n")

inp.close()
out.close()

110948it [00:04, 26583.66it/s]


In [57]:
import random
with open("inputs.txt") as in_file:
    inputs = in_file.readlines()
    
with open("outputs.txt") as out_file:
    outputs = out_file.readlines()
    
x = list(range(len(inputs)))
random.shuffle(x)

valid_size = 0.1
valid_split = int(len(inputs) * (1 - valid_size))

with open("inputs.txt", "w") as in_file:
    for i in range(valid_split):
        in_file.write(inputs[x[i]])
    
with open("valid_in.txt", "w") as valid_in_file:
    for i in range(valid_split, len(inputs)):
        valid_in_file.write(inputs[x[i]]) 
        
with open("outputs.txt", "w") as out_file:
    for i in range(valid_split):
        out_file.write(outputs[x[i]])
    
with open("valid_out.txt", "w") as valid_out_file:
    for i in range(valid_split, len(inputs)):
        valid_out_file.write(outputs[x[i]])

In [58]:
import random
with open("inputs.txt") as in_file:
    inputs = in_file.readlines()
    
with open("outputs.txt") as out_file:
    outputs = out_file.readlines()

x = list(range(len(inputs)))
random.shuffle(x)

test_size = 500

with open("test_in.txt", "w") as test_in_file:
    for i in range(test_size):
        test_in_file.write(inputs[i])

with open("test_out.txt", "w") as test_out_file:
    for i in range(test_size):
        test_out_file.write(outputs[i])

In [59]:
len(open("valid_out.txt").readlines())

11095