In [1]:
import spacy
import pandas as pd
from nltk import edit_distance
import tqdm
from nltk.stem import SnowballStemmer
import nltk

2023-11-15 19:39:08.550548: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
df = pd.read_excel("phonItalia 1.10 - word forms.xlsx")
df = df[['fqTot', 'gramCat', 'lemma', 'word', 'Phones']] # only keep relevant columns
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones
0,50518,P,a,a,a
1,5219,P IN P@,a,a,a
2,1544,P IN B@,a,a,a
3,272,P IN C@,a,a,a
4,234,S,a,a,a
...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli
119997,1,E,zurlini,zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone


In [9]:
phones = dict(zip(df['word'], df['Phones']))
df['lemma_phones'] = df['lemma'].apply(lambda i: phones.get(i, ''))
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
0,50518,P,a,a,a,a
1,5219,P IN P@,a,a,a,a
2,1544,P IN B@,a,a,a,a
3,272,P IN C@,a,a,a,a
4,234,S,a,a,a,a
...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli,Zurli
119997,1,E,zurlini,zurlini,Zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone


In [10]:
df = df[df['lemma_phones'] != df['Phones']] # remove uninflected words
df = df[df['lemma_phones'] != ''] # remove words with broken lemmas
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
12,1,R,il,a,a,il
16,1,C,e,a,a,e
32,1,VA IN E@,avere,a',a,avere
33,2,R,il,'a,a,il
45,1,S,abate,abati,abati,abate
...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina
119979,2,S,zuffa,zuffe,zuffe,zuffa
119981,1,S,zufolo,zufoli,zufoli,zufolo
119988,6,S,zuppa,zuppe,zuppe,zuppa


In [11]:
stemmer = nltk.stem.SnowballStemmer('italian')
df['nltk_stem'] = df['word'].apply(stemmer.stem)
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones,nltk_stem
12,1,R,il,a,a,il,a
16,1,C,e,a,a,e,a
32,1,VA IN E@,avere,a',a,avere,a'
33,2,R,il,'a,a,il,'a
45,1,S,abate,abati,abati,abate,abat
...,...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina,zucchin
119979,2,S,zuffa,zuffe,zuffe,zuffa,zuff
119981,1,S,zufolo,zufoli,zufoli,zufolo,zufol
119988,6,S,zuppa,zuppe,zuppe,zuppa,zupp


In [2]:
nlp = spacy.load("it_core_news_lg")

In [12]:
# import tqdm, spacy, pandas
# df = pandas.read_excel("dataset.xlsx")
df['morph'] = ''
df['pos'] = ''

for col in ['Gender', 'Number', 'Mood', 'Person', 'Tense', 'VerbForm', 'Clitic', 'PronType', 'Foreign', 'Degree', 'NumType', 'Definite', 'Poss']:
    df[col] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    try:
        doc = nlp(row['word'])
        tok = doc[0]
        df.loc[idx, 'morph'] = str(tok.morph)
        df.loc[idx, 'pos'] = tok.pos_
        for k, v in tok.morph.to_dict().items():
            df.loc[idx, k] = v
    except Exception as e:
        print(e, idx, row)


58258it [04:42, 206.12it/s]


In [13]:
conv = {
    "B": "ADV",
    "C": "CCONJ", # or sconj? unknown
    "E": "NOUN",
    "G": "ADJ",
    "I": "INTJ",
    "N": "PRON",
    "P": "PREP",
    "K": "PUNCT",
    "R": "ART",
    "S": "NOUN", #substantive?
    "V": "VERB",
    "X": "X",
    "Z": "SYM",
    "NU": "NUM",
    "TC": "VERB", #composed verb?
    "VA": "AUX",
    "U": "X",
}

df["mismarked"] = df["gramCat"].apply(conv.get) != df["pos"]
df["mismarked"].value_counts()

False    38454
True     19804
Name: mismarked, dtype: int64

In [14]:
df = df.fillna('nan')
df['nltk_extra'] = ''
for idx, row in tqdm.tqdm(df.iterrows()):
    w = row["word"]
    l = row["nltk_stem"]
    try:
        df.loc[idx, 'nltk_extra'] = w[len(l):]
    except:
        print(w, l)

58258it [00:06, 9121.49it/s]


In [63]:
import os

def find_actual_lemma(row_number, new_lemma_phones):
    best_cp, shortest_length, best = 0, 0, ""
    buffer = 10
    for i in range(max(0, row_number - buffer), min(58258, row_number + buffer)):
        row = df.iloc[i]
        l = row["lemma_phones"]
        cp = len(os.path.commonprefix([l, new_lemma_phones]))
        if cp > best_cp or (cp == best_cp and len(l) < shortest_length):
            best_cp = cp
            shortest_length = len(l)
            best = l
    return best

In [64]:
suffixes = {
    "ADJ": ["an", "os", "iv", "ic", "al", "tori", "in", "eo", "ea", "istic", "asc", "esc", "izi", "oid", "ign", "ace", "ard", "asc", "esc"],
    "NOUN": ["ment", "zion", "ist", "ism", "sim", "agli", "am", "um", "at", "et", "il", "ess", "aggi", "nz"],
    "ADV": ["mente"],
    "VERB": ["izz", "ific", "ell", "arell", "erell", "icchi", "acchi", "ett"]
}
# 
# df['derivational'] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    w = row['word']
    if row["pos"] not in suffixes:
        continue
    for s in suffixes[row["pos"]]:
        new_phones = None
        if s in row['nltk_extra']:
            df.loc[idx, 'derivational'] = s
            if row['lemma_phones'].endswith(s):
                new_phones = row['lemma_phones'][:-len(s)]
            elif row['lemma_phones'][:-1].endswith(s):
                new_phones = row['lemma_phones'][:-1][:-len(s)]
        elif row['nltk_stem'].endswith(s):
            new_phones = row['lemma_phones'][:-len(s)]
            df.loc[idx, 'derivational'] = s
        elif row['nltk_stem'][:-1].endswith(s):
            new_phones = row['lemma_phones'][:-1][:-len(s)]
            df.loc[idx, 'derivational'] = s

        if new_phones is not None:
#             print(row['word'], row['lemma_phones'], row['nltk_stem'], row['nltk_extra'], s, new_phones)
#             print(find_actual_lemma(idx, new_phones))
#             input()
            df.loc[idx, 'lemma_phones'] = find_actual_lemma(idx, new_phones)
            break

58258it [00:14, 3913.75it/s]


In [56]:
df.iloc[585]

fqTot                                                          77
gramCat                                                         V
lemma                                                   accettare
word                                                      accetta
Phones                                                    accEtta
lemma_phones                                            accettare
nltk_stem                                                  accett
morph           Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...
pos                                                          VERB
Gender                                                           
Number                                                       Sing
Mood                                                          Ind
Person                                                          3
Tense                                                        Pres
VerbForm                                                      Fin
Clitic    

In [65]:
df['derivational'].value_counts()

ic       1560
an       1093
at       1000
os        798
ett       695
al        639
izz       567
zion      561
iv        537
in        419
et        404
ist       345
ment      258
ific      237
nz        206
am        183
il        119
ell       108
ess        99
esc        77
ism        59
um         41
ard        40
asc        15
ea         14
sim        14
ign         8
ace         8
eo          6
tori        5
aggi        5
mente       3
acchi       2
oid         2
icchi       1
agli        1
Name: derivational, dtype: int64

In [24]:
df.to_excel("dataset.xlsx")

In [22]:
df = pd.read_excel("dataset.xlsx")

In [23]:
clitics = {
    "me": "1SI",
    "mi": "1SD",
    "te": "2SI",
    "ti": "2SD",
    "vi": "2PD",
    "lo": "3SD",
    "la": "3SDF",
    "le": "3SIF",
    "gli": "3SDM",
    "l'": "3SIM",
    "li": "3PD",
    "si": "1PD",
    "ci": "1PI",
    "se": "3SDA",
    "ce": "3SIA",
    "ne": "3SDA"
}

for idx, row in df.iterrows():
    if row["Clitic"] == "Yes":
        clitic = row["word"][-2:]
        try:
            df.loc[idx, "Clitic"] = clitics[clitic]
        except:
            pass # todo
#             print(row["word"])

abbaglino
abbassera'
abbozzo'
accompagnero'
aderimmo
affosserebbero
aggiusteremo
allontanino
animalia
appassionerai
approfittero'
arghi
arrangeranno
arruolino
arruolo'
assali'
augureremmo
augusta
avverti'
baderanno
balbettavo
battero'
bevvero
boema
brindero'
brontolarono
brontolo'
bussammo
cambiai
cancellerai
c'e'
cedemmo
c'entra
c'entrano
c'entrava
c'entravano
c'entravo
c'entreranno
c'entri
c'entriamo
c'entro
c'era
c'erano
c'eravamo
cercar
cerchero'
c'eri
c'ero
c'hanno
cio'
cio'
comincero'
cominciassimo
comperai
compiangono
concordammo
coniugherai
conprado
contattai
converti'
criminalizzo
cubie
curai
danzino
delimitino
determino'
deviai
diminui'
disprezzavo
distruggero'
diverti'
divorziai
dormimmo
egual
empirea
ereditammo
eroderebbero
escogiteremo
esploravo
fabbricheranno
figliuol
funebri
gelino
giungera'
gonfie
gridammo
importera'
indirizzai
indovino
indovino'
influi'
kenyani
liberino
libitum
liquiderebbe
magos
marciammo
maturero'
mi'
morir
mutua
ne'
nemmen'
neppur
opponemmo
opto'
or

In [26]:
df["Clitic"].value_counts()

nan     52215
1PD      1389
3SD       849
3PD       746
3SDF      617
3SDA      488
1SD       453
3SIF      426
1PI       425
2SD       235
Yes       229
2PD       160
2SI        13
3SIA       12
3SIM        1
Name: Clitic, dtype: int64

In [25]:
inp = open("inputs.txt", "w")
out = open("outputs.txt", "w")

df = df.fillna('nan')

use = ["Gender", "Number", "Person", "Tense", "Clitic", "Mood", "VerbForm"]

for idx, row in tqdm.tqdm(df.iterrows()):
    in_phones = " ".join(row["lemma_phones"])
    out_phones = " ".join(row["Phones"])
    
    in_ = ""
    out_ = ""
        
    inflection = ' '.join([row[i]
                          for i in use])
    inflection_add = ''.join([' - ' for i in use])
    derivation = None
    if row["derivational"] != "nan":
        derivation = row["pos"]
    if derivation:
        in_ += " " + derivation
        out_ += " - "
    else: # add 6th tag
        in_ += " - "
        out_ += " - "        
    in_ += " " + inflection
    out_ += inflection_add

    in_ += " " + in_phones
    out_ += " " + out_phones
    
#     print(in_, out_)
#     input()
    
    in_ = in_.strip()
    out_ = out_.strip()
    
    inp.write(in_ + "\n")
    out.write(out_ + "\n")

inp.close()
out.close()

58258it [00:03, 14720.82it/s]


In [27]:
import random
with open("inputs.txt") as in_file:
    inputs = in_file.readlines()
    
with open("outputs.txt") as out_file:
    outputs = out_file.readlines()
    
x = list(range(len(inputs)))
random.shuffle(x)

valid_size = 0.1
valid_split = int(len(inputs) * (1 - valid_size))

with open("inputs.txt", "w") as in_file:
    for i in range(valid_split):
        in_file.write(inputs[x[i]])
    
with open("valid_in.txt", "w") as valid_in_file:
    for i in range(valid_split, len(inputs)):
        valid_in_file.write(inputs[x[i]]) 
        
with open("outputs.txt", "w") as out_file:
    for i in range(valid_split):
        out_file.write(outputs[x[i]])
    
with open("valid_out.txt", "w") as valid_out_file:
    for i in range(valid_split, len(inputs)):
        valid_out_file.write(outputs[x[i]])