In [None]:
import spacy

#install Italian language model
!spacy download it_core_news_sm
#install English language model
!spacy download en_core_web_sm



In [2]:
# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.express as px

In [3]:
# Initialize empty lists to store Italian and English texts
texts_it = []
texts_en = []
file_names_it = []
file_names_en = []

# Define paths to folders containing Italian and English texts 
folder_path_it = 'data_italian'
folder_path_en = 'data_english'

# Iterate through each file in the Italian texts folder
for _file_name in os.listdir(folder_path_it):
# Look for only text files
    if _file_name.endswith('.txt'):
    # Append contents of each text file to text list
        texts_it.append(open(folder_path_it + '/' + _file_name, 'r', encoding='utf-8').read())
        # Append name of each file to file name list
        file_names_it.append(_file_name)

# Iterate through each file in the English texts folder
for _file_name in os.listdir(folder_path_en):
# Look for only text files
    if _file_name.endswith('.txt'):
    # Append contents of each text file to text list
        texts_en.append(open(folder_path_en + '/' + _file_name, 'r', encoding='utf-8').read())
        # Append name of each file to file name list
        file_names_en.append(_file_name)


In [4]:
# Create dictionary objects associating each file name with its text, one for Italian texts and one for English
d_it = {'Filename':file_names_it,'Document':texts_it}
d_en = {'Filename':file_names_en,'Document':texts_en}

In [5]:
# Turn dictionaries into dataframes
ungaretti_df = pd.DataFrame(d_it)
warpoets_df = pd.DataFrame(d_en) 

In [6]:
ungaretti_df

Unnamed: 0,Filename,Document
0,In dormiveglia.txt,Assisto la notte violentata\nL’aria è crivella...
1,Veglia.txt,Un’intera nottata Buttato vicino A un compagno...
2,Pellegrinaggio.txt,In agguato in queste budella di macerie ore e ...
3,Fratelli.txt,Di che reggimento siete fratelli?\nParola trem...
4,San Martino del Carso.txt,Di queste case \nNon è rimasto \nChe qualche \...
5,Soldati.txt,Si sta come \nD’autunno \nSugli alberi \nLe fo...
6,Sono una creatura.txt,Come questa pietra Del S. Michele Così fredda ...


In [7]:
warpoets_df

Unnamed: 0,Filename,Document
0,Dulce et Decorum Est.txt,"Dulce et Decorum Est\nBent double, like old be..."
1,Anthem for Doomed Youth.txt,Anthem for Doomed Youth\nWhat passing-bells fo...
2,Suicide in the Trenches.txt,Suicide In The Trenches\nI knew a simple soldi...
3,Break of Day in the Trenches.txt,Break of Day in the Trenches\nThe darkness cru...
4,August 1914.txt,August 1914\nWhat in our lives is burnt\nIn th...
5,Survivors.txt,Survivors\nNo doubt they’ll soon get well; the...


In [8]:
# create a preprocessed "Text" column
ungaretti_df['Text'] = ungaretti_df['Document'].str.replace('[\n\u2028]', ' ', regex=True).str.strip()
ungaretti_df

Unnamed: 0,Filename,Document,Text
0,In dormiveglia.txt,Assisto la notte violentata\nL’aria è crivella...,Assisto la notte violentata L’aria è crivellat...
1,Veglia.txt,Un’intera nottata Buttato vicino A un compagno...,Un’intera nottata Buttato vicino A un compagno...
2,Pellegrinaggio.txt,In agguato in queste budella di macerie ore e ...,In agguato in queste budella di macerie ore e ...
3,Fratelli.txt,Di che reggimento siete fratelli?\nParola trem...,Di che reggimento siete fratelli? Parola trema...
4,San Martino del Carso.txt,Di queste case \nNon è rimasto \nChe qualche \...,Di queste case Non è rimasto Che qualche Br...
5,Soldati.txt,Si sta come \nD’autunno \nSugli alberi \nLe fo...,Si sta come D’autunno Sugli alberi Le foglie.
6,Sono una creatura.txt,Come questa pietra Del S. Michele Così fredda ...,Come questa pietra Del S. Michele Così fredda ...


In [9]:
warpoets_df['Text'] = warpoets_df['Document'].str.replace('[\n\u2028]', ' ', regex=True).str.strip()
warpoets_df

Unnamed: 0,Filename,Document,Text
0,Dulce et Decorum Est.txt,"Dulce et Decorum Est\nBent double, like old be...","Dulce et Decorum Est Bent double, like old beg..."
1,Anthem for Doomed Youth.txt,Anthem for Doomed Youth\nWhat passing-bells fo...,Anthem for Doomed Youth What passing-bells for...
2,Suicide in the Trenches.txt,Suicide In The Trenches\nI knew a simple soldi...,Suicide In The Trenches I knew a simple soldie...
3,Break of Day in the Trenches.txt,Break of Day in the Trenches\nThe darkness cru...,Break of Day in the Trenches The darkness crum...
4,August 1914.txt,August 1914\nWhat in our lives is burnt\nIn th...,August 1914 What in our lives is burnt In the ...
5,Survivors.txt,Survivors\nNo doubt they’ll soon get well; the...,Survivors No doubt they’ll soon get well; the ...


In [42]:
# Load metadata
metadata_en_df = pd.read_csv('extra_columns_en.csv')
metadata_en_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year
0,Dulce et Decorum Est.txt,Dulce et decorum,Wilfred Owen,Poems,1920,,
1,Suicide in the Trenches.txt,Suicide in the trenches,Siegfried Sassoon,Counter-Attacks and other poems,1918,,
2,August 1914.txt,August 1914,Isaac Rosemberg,Collected Works,1922,,
3,Survivors.txt,Survivors,Siegfried Sassoon,Counter-Attacks and other poems,1918,,
4,Break of Day in the Trenches.txt,Break of Day in the Trenches,Isaac Rosemberg,Collected Works,1922,,
5,Anthem for Doomed Youth.txt,Anthem for Doomed Youth,Wilfred Owen,Poems,1920,,


In [44]:
metadata_it_df = pd.read_csv('extra_columns_it2.csv')
metadata_it_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year
0,In dormiveglia.txt,In dormiveglia,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931
1,Veglia.txt,Veglia,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931
2,Pellegrinaggio.txt,Pellegrinaggio,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931
3,Fratelli.txt,Fratelli,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931
4,San Martino del Carso.txt,San Martino del Carso,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931
5,Soldati.txt,Soldati,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931
6,Sono una creatura.txt,Sono una creatura,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931


In [46]:
# Merge metadata and original DataFrames into a new, complete one 
final_ungaretti_df = metadata_it_df.merge(ungaretti_df,on='Filename')
final_ungaretti_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year,Document,Text
0,In dormiveglia.txt,In dormiveglia,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Assisto la notte violentata\nL’aria è crivella...,Assisto la notte violentata L’aria è crivellat...
1,Veglia.txt,Veglia,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,Un’intera nottata Buttato vicino A un compagno...,Un’intera nottata Buttato vicino A un compagno...
2,Pellegrinaggio.txt,Pellegrinaggio,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,In agguato in queste budella di macerie ore e ...,In agguato in queste budella di macerie ore e ...
3,Fratelli.txt,Fratelli,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,Di che reggimento siete fratelli?\nParola trem...,Di che reggimento siete fratelli? Parola trema...
4,San Martino del Carso.txt,San Martino del Carso,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Di queste case \nNon è rimasto \nChe qualche \...,Di queste case Non è rimasto Che qualche Br...
5,Soldati.txt,Soldati,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Si sta come \nD’autunno \nSugli alberi \nLe fo...,Si sta come D’autunno Sugli alberi Le foglie.
6,Sono una creatura.txt,Sono una creatura,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Come questa pietra Del S. Michele Così fredda ...,Come questa pietra Del S. Michele Così fredda ...


In [48]:
final_warpoets_df = metadata_en_df.merge(warpoets_df,on='Filename')
final_warpoets_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year,Document,Text
0,Dulce et Decorum Est.txt,Dulce et decorum,Wilfred Owen,Poems,1920,,,"Dulce et Decorum Est\nBent double, like old be...","Dulce et Decorum Est Bent double, like old beg..."
1,Suicide in the Trenches.txt,Suicide in the trenches,Siegfried Sassoon,Counter-Attacks and other poems,1918,,,Suicide In The Trenches\nI knew a simple soldi...,Suicide In The Trenches I knew a simple soldie...
2,August 1914.txt,August 1914,Isaac Rosemberg,Collected Works,1922,,,August 1914\nWhat in our lives is burnt\nIn th...,August 1914 What in our lives is burnt In the ...
3,Survivors.txt,Survivors,Siegfried Sassoon,Counter-Attacks and other poems,1918,,,Survivors\nNo doubt they’ll soon get well; the...,Survivors No doubt they’ll soon get well; the ...
4,Break of Day in the Trenches.txt,Break of Day in the Trenches,Isaac Rosemberg,Collected Works,1922,,,Break of Day in the Trenches\nThe darkness cru...,Break of Day in the Trenches The darkness crum...
5,Anthem for Doomed Youth.txt,Anthem for Doomed Youth,Wilfred Owen,Poems,1920,,,Anthem for Doomed Youth\nWhat passing-bells fo...,Anthem for Doomed Youth What passing-bells for...


In [None]:
# Load nlp pipeline
nlp_it = spacy.load('it_core_news_sm')
# Check what functions it performs
print(nlp_it.pipe_names)

In [None]:
nlp_en = spacy.load('en_core_web_sm')
print(nlp_en.pipe_names)

In [54]:
# check if code works 
sentence = "This is 'an' example? sentence"

# Call the nlp model on the sentence
doc = nlp_en(sentence)

In [None]:
# Loop through each token in doc object
for token in doc:
    # Print text and part of speech for each
    print(token.text, token.pos_)

In [58]:
# Define a function that runs the nlp pipeline on any given input text, one for each model
def process_texts_it(text):
    return nlp_it(text)

In [60]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each poem
final_ungaretti_df['Text'] = final_ungaretti_df['Text'].apply(process_texts_it)

In [62]:
def process_texts_en(text):
    return nlp_en(text)

In [64]:
final_warpoets_df['Text'] = final_warpoets_df['Text'].apply(process_texts_en)

In [66]:
# Define a function to retrieve tokens from a doc object
# (same for both English and Italian, as the process of tokenization is not language dependent)
def get_token(doc):
    return [(token.text) for token in doc]

In [68]:
# Run the token retrieval function on the doc objects in the dataframe
final_ungaretti_df['Tokens'] = final_ungaretti_df['Text'].apply(get_token)
final_ungaretti_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year,Document,Text,Tokens
0,In dormiveglia.txt,In dormiveglia,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Assisto la notte violentata\nL’aria è crivella...,"(Assisto, la, notte, violentata, L’, aria, è, ...","[Assisto, la, notte, violentata, L’, aria, è, ..."
1,Veglia.txt,Veglia,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,Un’intera nottata Buttato vicino A un compagno...,"(Un’, intera, nottata, Buttato, vicino, A, un,...","[Un’, intera, nottata, Buttato, vicino, A, un,..."
2,Pellegrinaggio.txt,Pellegrinaggio,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,In agguato in queste budella di macerie ore e ...,"(In, agguato, in, queste, budella, di, macerie...","[In, agguato, in, queste, budella, di, macerie..."
3,Fratelli.txt,Fratelli,Giuseppe Ungaretti,Il porto sepolto,1916,L'Allegria,1931,Di che reggimento siete fratelli?\nParola trem...,"(Di, che, reggimento, siete, fratelli, ?, Paro...","[Di, che, reggimento, siete, fratelli, ?, Paro..."
4,San Martino del Carso.txt,San Martino del Carso,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Di queste case \nNon è rimasto \nChe qualche \...,"(Di, queste, case, , Non, è, rimasto, , Che,...","[Di, queste, case, , Non, è, rimasto, , Che,..."
5,Soldati.txt,Soldati,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Si sta come \nD’autunno \nSugli alberi \nLe fo...,"(Si, sta, come, , D’, autunno, , Sugli, albe...","[Si, sta, come, , D’, autunno, , Sugli, albe..."
6,Sono una creatura.txt,Sono una creatura,Giuseppe Ungaretti,Allegria di naufragi,1919,L'Allegria,1931,Come questa pietra Del S. Michele Così fredda ...,"(Come, questa, pietra, Del, S., Michele, Così,...","[Come, questa, pietra, Del, S., Michele, Così,..."


In [70]:
final_warpoets_df['Tokens'] = final_warpoets_df['Text'].apply(get_token)
final_warpoets_df

Unnamed: 0,Filename,Title,Author,First Collection,First Publication Year,Second Collection,Second Publication Year,Document,Text,Tokens
0,Dulce et Decorum Est.txt,Dulce et decorum,Wilfred Owen,Poems,1920,,,"Dulce et Decorum Est\nBent double, like old be...","(Dulce, et, Decorum, Est, Bent, double, ,, lik...","[Dulce, et, Decorum, Est, Bent, double, ,, lik..."
1,Suicide in the Trenches.txt,Suicide in the trenches,Siegfried Sassoon,Counter-Attacks and other poems,1918,,,Suicide In The Trenches\nI knew a simple soldi...,"(Suicide, In, The, Trenches, I, knew, a, simpl...","[Suicide, In, The, Trenches, I, knew, a, simpl..."
2,August 1914.txt,August 1914,Isaac Rosemberg,Collected Works,1922,,,August 1914\nWhat in our lives is burnt\nIn th...,"(August, 1914, What, in, our, lives, is, burnt...","[August, 1914, What, in, our, lives, is, burnt..."
3,Survivors.txt,Survivors,Siegfried Sassoon,Counter-Attacks and other poems,1918,,,Survivors\nNo doubt they’ll soon get well; the...,"(Survivors, No, doubt, they, ’ll, soon, get, w...","[Survivors, No, doubt, they, ’ll, soon, get, w..."
4,Break of Day in the Trenches.txt,Break of Day in the Trenches,Isaac Rosemberg,Collected Works,1922,,,Break of Day in the Trenches\nThe darkness cru...,"(Break, of, Day, in, the, Trenches, The, darkn...","[Break, of, Day, in, the, Trenches, The, darkn..."
5,Anthem for Doomed Youth.txt,Anthem for Doomed Youth,Wilfred Owen,Poems,1920,,,Anthem for Doomed Youth\nWhat passing-bells fo...,"(Anthem, for, Doomed, Youth, What, passing, -,...","[Anthem, for, Doomed, Youth, What, passing, -,..."


In [72]:
tokens = final_ungaretti_df[['Text', 'Tokens']].copy()
tokens

Unnamed: 0,Text,Tokens
0,"(Assisto, la, notte, violentata, L’, aria, è, ...","[Assisto, la, notte, violentata, L’, aria, è, ..."
1,"(Un’, intera, nottata, Buttato, vicino, A, un,...","[Un’, intera, nottata, Buttato, vicino, A, un,..."
2,"(In, agguato, in, queste, budella, di, macerie...","[In, agguato, in, queste, budella, di, macerie..."
3,"(Di, che, reggimento, siete, fratelli, ?, Paro...","[Di, che, reggimento, siete, fratelli, ?, Paro..."
4,"(Di, queste, case, , Non, è, rimasto, , Che,...","[Di, queste, case, , Non, è, rimasto, , Che,..."
5,"(Si, sta, come, , D’, autunno, , Sugli, albe...","[Si, sta, come, , D’, autunno, , Sugli, albe..."
6,"(Come, questa, pietra, Del, S., Michele, Così,...","[Come, questa, pietra, Del, S., Michele, Così,..."


In [74]:
# Define a function to retrieve lemmas from a doc object, one for each model 
def get_lemma_it(text):
    doc = nlp_it(text)
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
final_ungaretti_df['Lemmas'] = final_ungaretti_df['Text'].apply(get_lemma_it)

In [76]:
def get_lemma_en(text):
    doc = nlp_en(text)
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
final_warpoets_df['Lemmas'] = final_warpoets_df['Text'].apply(get_lemma_en)

In [78]:
print(f'"pietra" appears in the text tokens column ' + str(final_ungaretti_df['Tokens'].apply(lambda x: x.count('pietra')).sum()) + ' times.')
print(f'"pietra" appears in the lemmas column ' + str(final_ungaretti_df['Lemmas'].apply(lambda x: x.count('pietra')).sum()) + ' times.')

"pietra" appears in the text tokens column 3 times.
"pietra" appears in the lemmas column 3 times.


In [80]:
# Define a function to retrieve parts of speech from a doc object (same for both English and Italian)
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Run the parts of speech retrieval function on the doc objects in the dataframe
final_ungaretti_df['POS'] = final_ungaretti_df['Text'].apply(get_pos)
final_warpoets_df['POS'] = final_warpoets_df['Text'].apply(get_pos)

In [None]:
# Create a list of part of speech tags
list(final_ungaretti_df['POS'])

In [None]:
list(final_warpoets_df['POS'])

In [None]:
# Get all NE labels and assign to variable, for both language models
labels_it = nlp_it.get_pipe("ner").labels

# Print each label and its description
for label in labels_it:
    print(label + ' : ' + spacy.explain(label))

In [None]:
labels_en = nlp_en.get_pipe("ner").labels

# Print each label and its description
for label in labels_en:
    print(label + ' : ' + spacy.explain(label))

In [None]:
# Define function to extract named entities from doc objects, one for each model 
def extract_named_entities_it(text):
    doc = nlp_it(text)
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
final_ungaretti_df['Named_Entities'] = final_ungaretti_df['Text'].apply(extract_named_entities_it)
final_ungaretti_df['Named_Entities']

In [None]:
def extract_named_entities_en(text):
    doc = nlp_en(text)
    return [ent.label_ for ent in doc.ents]

final_warpoets_df['Named_Entities'] = final_warpoets_df['Text'].apply(extract_named_entities_en)
final_warpoets_df['Named_Entities']

In [None]:
# Define function to extract text tagged with named entities from doc objects, one for each model 
def extract_named_entities_it(text):
    doc = nlp_it(text)
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
final_ungaretti_df['NE_Words'] = final_ungaretti_df['Text'].apply(extract_named_entities_it)
final_ungaretti_df['NE_Words']

In [None]:
def extract_named_entities_en(text):
    doc = nlp_en(text)
    return [ent for ent in doc.ents]

# Apply function to Doc column and store resulting text in new column
final_warpoets_df['NE_Words'] = final_warpoets_df['Text'].apply(extract_named_entities_en)
final_warpoets_df['NE_Words']

In [98]:
# Merge dataframes
final_df = pd.concat([final_ungaretti_df, final_warpoets_df], ignore_index=True)

In [100]:
# Turn DataFrame into .csv file
final_df.to_csv('war_poems.csv')

In [148]:
# Small analysis: predominance of personal/ possessive pronouns and first person verbs in the Italian vs. English 
# texts. How impactful is personal experience on the authors' writing, based on word frequencies


# calculate total words for each DataFrame
total_words_it = final_ungaretti_df['Tokens'].apply(len).sum()

# define function for word frequencies
def word_frequencies_it(word, total_words_it):
    single_word_count = final_ungaretti_df['Tokens'].apply(lambda x: x.count(word)).sum()
    frequency = single_word_count / total_words_it
    result = f'"{word}" appears {single_word_count} times. Frequency: {frequency}'
    return result

print(total_words_it)
print(word_frequencies_it('mio', total_words_it)) #'mio' = my (masculine)

print(word_frequencies_it('mia', total_words_it)) #'mia' = my (feminine)

print(word_frequencies_it('mi', total_words_it)) #'mi' = reflexive pronoun

print(word_frequencies_it('io', total_words_it)) #'io' = I

print(word_frequencies_it('ho', total_words_it)) #'ho' = (I) have

print(word_frequencies_it('sono', total_words_it))#'sono' = (I) am

print(word_frequencies_it('noi', total_words_it)) #'noi' = we




256
"mio" appears 3 times. Frequency: 0.01171875
"mia" appears 1 times. Frequency: 0.00390625
"mi" appears 1 times. Frequency: 0.00390625
"io" appears 1 times. Frequency: 0.00390625
"ho" appears 1 times. Frequency: 0.00390625
"sono" appears 1 times. Frequency: 0.00390625
"noi" appears 0 times. Frequency: 0.0


In [150]:
total_words_en = final_warpoets_df['Tokens'].apply(len).sum()

def word_frequencies_en(word, total_words_en):
    # Use the original case for 'I', lowercase for all other words
    search_word = word if word == 'I' else word.lower()
    single_word_count = final_warpoets_df['Tokens'].apply(lambda x: x.count(search_word)).sum()
    frequency = single_word_count / total_words_en
    result = f'"{word}" appears {single_word_count} times. Frequency: {frequency}'
    return result

print(total_words_en)
print(word_frequencies_en('my', total_words_en))
print(word_frequencies_en('I', total_words_en))
print(word_frequencies_en('am', total_words_en))
print(word_frequencies_en('we', total_words_en))

911
"my" appears 5 times. Frequency: 0.005488474204171241
"I" appears 3 times. Frequency: 0.003293084522502744
"am" appears 0 times. Frequency: 0.0
"we" appears 4 times. Frequency: 0.0043907793633369925
