In [14]:
import spacy
import pandas as pd
from nltk import edit_distance
import tqdm
from nltk.stem import SnowballStemmer
import nltk

In [2]:
df = pd.read_excel("phonItalia 1.10 - word forms.xlsx")
df = df[['fqTot', 'gramCat', 'lemma', 'word', 'Phones']] # only keep relevant columns
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones
0,50518,P,a,a,a
1,5219,P IN P@,a,a,a
2,1544,P IN B@,a,a,a
3,272,P IN C@,a,a,a
4,234,S,a,a,a
...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli
119997,1,E,zurlini,zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone


In [3]:
phones = dict(zip(df['word'], df['Phones']))
df['lemma_phones'] = df['lemma'].apply(lambda i: phones.get(i, ''))
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
0,50518,P,a,a,a,a
1,5219,P IN P@,a,a,a,a
2,1544,P IN B@,a,a,a,a
3,272,P IN C@,a,a,a,a
4,234,S,a,a,a,a
...,...,...,...,...,...,...
119995,5,E,zurli,zurli,Zurli,Zurli
119996,2,E IN E@,zurli',zurli',Zurli,Zurli
119997,1,E,zurlini,zurlini,Zurlini,Zurlini
119998,1,G,zuzzurellone,zuzzurellone,ZuZZurellone,ZuZZurellone


In [4]:
df = df[df['lemma_phones'] != df['Phones']] # remove uninflected words
df = df[df['lemma_phones'] != ''] # remove words with broken lemmas
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones
12,1,R,il,a,a,il
16,1,C,e,a,a,e
32,1,VA IN E@,avere,a',a,avere
33,2,R,il,'a,a,il
45,1,S,abate,abati,abati,abate
...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina
119979,2,S,zuffa,zuffe,zuffe,zuffa
119981,1,S,zufolo,zufoli,zufoli,zufolo
119988,6,S,zuppa,zuppe,zuppe,zuppa


In [5]:
stemmer = nltk.stem.SnowballStemmer('italian')
df['nltk_stem'] = df['word'].apply(stemmer.stem)
df

Unnamed: 0,fqTot,gramCat,lemma,word,Phones,lemma_phones,nltk_stem
12,1,R,il,a,a,il,a
16,1,C,e,a,a,e,a
32,1,VA IN E@,avere,a',a,avere,a'
33,2,R,il,'a,a,il,'a
45,1,S,abate,abati,abati,abate,abat
...,...,...,...,...,...,...,...
119970,22,S,zucchina,zucchine,zukkine,zukkina,zucchin
119979,2,S,zuffa,zuffe,zuffe,zuffa,zuff
119981,1,S,zufolo,zufoli,zufoli,zufolo,zufol
119988,6,S,zuppa,zuppe,zuppe,zuppa,zupp


In [1]:
import tqdm, spacy, pandas
df = pandas.read_excel("dataset.xlsx")
nlp = spacy.load("it_core_news_lg")
df['morph'] = ''
df['pos'] = ''

for col in ['Gender', 'Number', 'Mood', 'Person', 'Tense', 'VerbForm', 'Clitic', 'PronType', 'Foreign', 'Degree', 'NumType', 'Definite', 'Poss']:
    df[col] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    try:
        doc = nlp(row['word'])
        tok = doc[0]
        df.loc[idx, 'morph'] = str(tok.morph)
        df.loc[idx, 'pos'] = tok.pos_
        for k, v in tok.morph.to_dict().items():
            df.loc[idx, k] = v
    except Exception as e:
        print(e, idx, row)


2023-10-16 19:55:42.886282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
58258it [03:53, 249.43it/s]


In [7]:
conv = {
    "B": "ADV",
    "C": "CCONJ", # or sconj? unknown
    "E": "NOUN",
    "G": "ADJ",
    "I": "INTJ",
    "N": "PRON",
    "P": "PREP",
    "K": "PUNCT",
    "R": "ART",
    "S": "NOUN", #substantive?
    "V": "VERB",
    "X": "X",
    "Z": "SYM",
    "NU": "NUM",
    "TC": "VERB", #composed verb?
    "VA": "AUX",
    "U": "X",
}

df["mismarked"] = df["gramCat"].apply(conv.get) != df["pos"]
df["mismarked"].value_counts()

False    38454
True     19804
Name: mismarked, dtype: int64

In [None]:
df = df[df["mismarked"] == False]

In [38]:
df = pd.read_excel("dataset.xlsx")

In [50]:
df = df.fillna('nan')
df['nltk_extra'] = ''
for idx, row in tqdm.tqdm(df.iterrows()):
    w = row["word"]
    l = row["nltk_stem"]
    try:
        df.loc[idx, 'nltk_extra'] = w[len(l):]
    except:
        print(w, l)

44444it [00:05, 8665.09it/s]


In [51]:
suffixes = {
    "ADJ": ["an", "os", "iv", "ic", "al", "tori", "in", "eo", "ea", "istic", "asc", "esc", "izi", "oid", "ign", "ace", "ard", "asc", "esc"],
    "NOUN": ["ment", "zion", "ist", "ism", "sim", "agli", "am", "um", "at", "et", "il", "ess", "aggi", "nz"],
    "ADV": ["mente"],
    "VERB": ["izz", "ific", "ell", "arell", "erell", "icchi", "acchi", "ett"]
}

df['derivational'] = ''

for idx, row in tqdm.tqdm(df.iterrows()):
    w = row['word']
    if row["pos"] not in suffixes:
        continue
    for s in suffixes[row["pos"]]:
        if s in row['nltk_extra']:
            df.loc[idx, 'derivational'] = s
            break
        elif row['nltk_stem'].endswith(s):
            df.loc[idx, 'nltk_stem'] = row['nltk_stem'][:-len(s)]
            df.loc[idx, 'derivational'] = s
            break
        elif row['nltk_stem'][:-1].endswith(s):
            df.loc[idx, 'nltk_stem'] = row['nltk_stem'][:-1][:-len(s)]
            df.loc[idx, 'derivational'] = s
            break 

44444it [00:06, 7361.69it/s]


In [52]:
df['derivational'].value_counts()

         36779
ic        1284
an         744
os         670
ett        643
zion       580
izz        517
at         458
iv         447
al         432
et         277
ment       266
ist        235
ific       214
nz         189
in         183
ell         97
am          96
il          70
ism         59
esc         55
ess         45
um          28
ard         26
asc          8
sim          8
ea           7
aggi         4
eo           4
ace          3
tori         3
mente        3
ign          3
acchi        2
agli         2
oid          2
icchi        1
Name: derivational, dtype: int64

In [41]:
df['derivational'].value_counts()

         37495
ic        1334
ett        619
os         560
zion       558
izz        514
an         469
at         436
iv         426
al         413
in         215
ific       214
nz         211
ist        210
am         189
ment       107
ell         95
esc         63
il          60
ism         56
et          50
ess         48
ard         30
um          23
asc         12
istic        9
sim          8
ign          8
mente        3
tori         3
erell        2
ea           2
oid          2
Name: derivational, dtype: int64

In [37]:
df.to_excel("dataset.xlsx")