In [1]:
import spacy
import pandas as pd
from nltk import edit_distance
import tqdm
from nltk.stem import SnowballStemmer
import nltk

2023-10-18 17:25:57.292778: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_excel("phonItalia 1.10 - word forms.xlsx")
df = df[['fqTot', 'gramCat', 'lemma', 'word', 'Phones']] # only keep relevant columns
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones
0,50518,P,a,a,a
1,5219,P IN P@,a,a,a
2,1544,P IN B@,a,a,a
3,272,P IN C@,a,a,a
4,234,S,a,a,a
...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli
119997,1,E,zurlini,zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone


In [3]:
phones = dict(zip(df['word'], df['Phones']))
df['lemma_phones'] = df['lemma'].apply(lambda i: phones.get(i, ''))
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
0,50518,P,a,a,a,a
1,5219,P IN P@,a,a,a,a
2,1544,P IN B@,a,a,a,a
3,272,P IN C@,a,a,a,a
4,234,S,a,a,a,a
...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli,Zurli
119997,1,E,zurlini,zurlini,Zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone


In [4]:
df = df[df['lemma_phones'] != df['Phones']] # remove uninflected words
df = df[df['lemma_phones'] != ''] # remove words with broken lemmas
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
12,1,R,il,a,a,il
16,1,C,e,a,a,e
32,1,VA IN E@,avere,a',a,avere
33,2,R,il,'a,a,il
45,1,S,abate,abati,abati,abate
...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina
119979,2,S,zuffa,zuffe,zuffe,zuffa
119981,1,S,zufolo,zufoli,zufoli,zufolo
119988,6,S,zuppa,zuppe,zuppe,zuppa


In [5]:
stemmer = nltk.stem.SnowballStemmer('italian')
df['nltk_stem'] = df['word'].apply(stemmer.stem)
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones,nltk_stem
12,1,R,il,a,a,il,a
16,1,C,e,a,a,e,a
32,1,VA IN E@,avere,a',a,avere,a'
33,2,R,il,'a,a,il,'a
45,1,S,abate,abati,abati,abate,abat
...,...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina,zucchin
119979,2,S,zuffa,zuffe,zuffe,zuffa,zuff
119981,1,S,zufolo,zufoli,zufoli,zufolo,zufol
119988,6,S,zuppa,zuppe,zuppe,zuppa,zupp


In [7]:
# import tqdm, spacy, pandas
# df = pandas.read_excel("dataset.xlsx")
nlp = spacy.load("it_core_news_lg")
df['morph'] = ''
df['pos'] = ''

for col in ['Gender', 'Number', 'Mood', 'Person', 'Tense', 'VerbForm', 'Clitic', 'PronType', 'Foreign', 'Degree', 'NumType', 'Definite', 'Poss']:
    df[col] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    try:
        doc = nlp(row['word'])
        tok = doc[0]
        df.loc[idx, 'morph'] = str(tok.morph)
        df.loc[idx, 'pos'] = tok.pos_
        for k, v in tok.morph.to_dict().items():
            df.loc[idx, k] = v
    except Exception as e:
        print(e, idx, row)


58258it [03:54, 248.61it/s]


In [8]:
conv = {
    "B": "ADV",
    "C": "CCONJ", # or sconj? unknown
    "E": "NOUN",
    "G": "ADJ",
    "I": "INTJ",
    "N": "PRON",
    "P": "PREP",
    "K": "PUNCT",
    "R": "ART",
    "S": "NOUN", #substantive?
    "V": "VERB",
    "X": "X",
    "Z": "SYM",
    "NU": "NUM",
    "TC": "VERB", #composed verb?
    "VA": "AUX",
    "U": "X",
}

df["gramCatStripped"] = df["gramCat"].apply(lambda i: i.split(" ")[0].strip("@"))

df["mismarked"] = df["gramCatStripped"].apply(conv.get) != df["pos"]
df["mismarked"].value_counts()

False    44444
True     13814
Name: mismarked, dtype: int64

In [None]:
# df = df[df["mismarked"] == False]

In [9]:
df = df.fillna('nan')
df['nltk_extra'] = ''
for idx, row in tqdm.tqdm(df.iterrows()):
    w = row["word"]
    l = row["nltk_stem"]
    try:
        df.loc[idx, 'nltk_extra'] = w[len(l):]
    except:
        print(w, l)

58258it [00:06, 9655.91it/s] 


In [11]:
suffixes = {
    "ADJ": ["an", "os", "iv", "ic", "al", "tori", "in", "eo", "ea", "istic", "asc", "esc", "izi", "oid", "ign", "ace", "ard", "asc", "esc"],
    "NOUN": ["ment", "zion", "ist", "ism", "sim", "agli", "am", "um", "at", "et", "il", "ess", "aggi", "nz"],
    "ADV": ["mente"],
    "VERB": ["izz", "ific", "ell", "arell", "erell", "icchi", "acchi", "ett"]
}

df['derivational'] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    w = row['word']
    c = conv[row["gramCatStripped"]]
    if c not in suffixes:
        continue
    for s in suffixes[c]:
        if s in row['nltk_extra']:
            df.loc[idx, 'derivational'] = s
            break
        elif row['nltk_stem'].endswith(s):
            df.loc[idx, 'nltk_stem'] = row['nltk_stem'][:-len(s)]
            df.loc[idx, 'derivational'] = s
            break
        elif row['nltk_stem'][:-1].endswith(s):
            df.loc[idx, 'nltk_stem'] = row['nltk_stem'][:-1][:-len(s)]
            df.loc[idx, 'derivational'] = s
            break 

58258it [00:07, 7589.43it/s]


In [14]:
df['derivational'].value_counts()

         48428
ic        1631
an         960
at         837
ett        771
os         759
izz        621
zion       586
al         550
iv         522
in         453
et         342
ist        282
ment       282
ific       253
nz         199
il         125
ell        111
am         103
esc         85
ess         72
ism         71
ard         38
tori        35
um          32
asc         26
ign         16
sim         16
ea          15
ace         12
aggi         8
eo           6
mente        5
icchi        2
acchi        2
oid          2
Name: derivational, dtype: int64

In [None]:
df.to_excel("dataset.xlsx")

In [None]:
df = pd.read_excel("dataset.xlsx")

In [None]:
inp = open("inputs2.txt", "w")
out = open("outputs2.txt", "w")

df = df.fillna('nan')

orders = {
    "Gender": ["nan", "Fem", "Masc", "Fem,Masc"],
    "Number": ["nan", "Sing", "Plur", "Plur,Sing"],
    "Mood":   ["nan", "Cnd", "Imp", "Ind", "Sub"],
    "Person": ["nan", "1", "1,2", "1,3", "2", "2,3", "3"],
    "Tense":  ["nan", "Fut", "Imp", "Past", "Pres"],
    "VerbForm": ["nan", "Fin", "Inf", "Ger", "Part"],
    "Clitic": ["nan", "Yes"]
}

for idx, row in tqdm.tqdm(df.iterrows()):
    in_ = " ".join(row["lemma_phones"])
    out_ = " ".join(row["Phones"])
        
    inflection = ''.join([str(orders[i].index(row[i]))
                          for i in ["Gender", "Number", "Mood", "Person", "Tense", "VerbForm", "Clitic"]])
    derivation = None
    if row["derivational"] != "nan":
        derivation = row["pos"]
        
    if derivation:
        in_ = derivation + " " + in_
        out_ = "- " + out_
    in_ = inflection + " " + in_
    out_ = "- " + out_
    
    inp.write(in_ + "\n")
    out.write(out_ + "\n")

inp.close()
out.close()