In [None]:
import spacy

# Install the Italian and English language models into the environment of the
# running kernel. A bare `!spacy download ...` runs whichever `spacy`
# executable is first on the shell PATH, which may belong to a different
# environment than the kernel. Models that are already installed are skipped
# so that "Restart & Run All" does not download them again.
for _model_name in ('it_core_news_sm', 'en_core_web_sm'):
    if not spacy.util.is_package(_model_name):
        spacy.cli.download(_model_name)



In [3]:
# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
# Set the chained-assignment mode to None (the pandas default is 'warn')
pd.set_option('mode.chained_assignment', None)

# Import graphing package
import plotly.express as px

In [9]:
# Define paths to folders containing Italian and English texts
folder_path_it = 'data_italian'
folder_path_en = 'data_english'


def _read_text_files(folder_path):
    """Read every '.txt' file found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Folder to scan; sub-folders are not visited.

    Returns
    -------
    tuple of (list of str, list of str)
        The file names and the UTF-8 decoded file contents. Both lists have
        the same length and the same order.

    Raises
    ------
    OSError
        If the folder cannot be listed or a file cannot be opened.
    """
    file_names = []
    texts = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order identical on every run and on every machine.
    for _file_name in sorted(os.listdir(folder_path)):
        # Look for only text files
        if _file_name.endswith('.txt'):
            # The with-statement closes the file handle as soon as it is read
            with open(os.path.join(folder_path, _file_name), 'r', encoding='utf-8') as _file:
                texts.append(_file.read())
            file_names.append(_file_name)
    return file_names, texts


# Load the Italian and the English texts together with their file names
file_names_it, texts_it = _read_text_files(folder_path_it)
file_names_en, texts_en = _read_text_files(folder_path_en)


In [11]:
# Pair the file names with their texts under the column names 'Filename' and
# 'Document': one dictionary for the Italian texts and one for the English
d_it = dict(Filename=file_names_it, Document=texts_it)
d_en = dict(Filename=file_names_en, Document=texts_en)

In [13]:
# Build one DataFrame per corpus from the two dictionaries
ungaretti_df, warpoets_df = (pd.DataFrame(data=_columns) for _columns in (d_it, d_en))

In [None]:
# Display the Italian DataFrame (file names and raw documents)
ungaretti_df

In [None]:
# Display the English DataFrame (file names and raw documents)
warpoets_df

In [None]:
# Create the preprocessed "Text" column: every line feed and U+2028 line
# separator is replaced by a space, then surrounding whitespace is stripped
_documents_it = ungaretti_df['Document']
_flattened_it = _documents_it.str.replace('[\n\u2028]', ' ', regex=True)
ungaretti_df['Text'] = _flattened_it.str.strip()
ungaretti_df

In [None]:
# Create the preprocessed "Text" column: every line feed and U+2028 line
# separator is replaced by a space, then surrounding whitespace is stripped
_documents_en = warpoets_df['Document']
_flattened_en = _documents_en.str.replace('[\n\u2028]', ' ', regex=True)
warpoets_df['Text'] = _flattened_en.str.strip()
warpoets_df

In [None]:
# Read the metadata for the English texts from its .csv file
_metadata_en_path = 'extra_columns_en.csv'
metadata_en_df = pd.read_csv(_metadata_en_path)
metadata_en_df

In [None]:
# Read the metadata for the Italian texts from its .csv file
_metadata_it_path = 'extra_columns_it2.csv'
metadata_it_df = pd.read_csv(_metadata_it_path)
metadata_it_df

In [None]:
# Combine the Italian metadata with the text DataFrame into a new, complete
# one, matching rows on the shared 'Filename' column
final_ungaretti_df = pd.merge(metadata_it_df, ungaretti_df, on='Filename')
final_ungaretti_df

In [None]:
# Combine the English metadata with the text DataFrame into a new, complete
# one, matching rows on the shared 'Filename' column
final_warpoets_df = pd.merge(metadata_en_df, warpoets_df, on='Filename')
final_warpoets_df

In [None]:
# Load the Italian nlp pipeline
_model_name_it = 'it_core_news_sm'
nlp_it = spacy.load(_model_name_it)
# Show the names of the components the pipeline runs
print(nlp_it.pipe_names)

In [None]:
# Load the English nlp pipeline
_model_name_en = 'en_core_web_sm'
nlp_en = spacy.load(_model_name_en)
# Show the names of the components the pipeline runs
print(nlp_en.pipe_names)

In [107]:
# Sanity check: run the English model on a short sample sentence and keep
# the resulting doc object for inspection
sentence = "This is 'an' example? sentence"
doc = nlp_en(sentence)

In [None]:
# Print the text of every token in the sample doc next to its part of speech
for _token_text, _token_pos in ((_tok.text, _tok.pos_) for _tok in doc):
    print(_token_text, _token_pos)

In [111]:
# One wrapper function per model: each runs its nlp pipeline on a given input text
def process_texts_it(text):
    """Run the Italian pipeline ``nlp_it`` on ``text`` and return its result."""
    processed = nlp_it(text)
    return processed

In [113]:
# Call the Italian nlp pipeline on each poem; the "Text" column is overwritten
# with the objects returned by process_texts_it
_texts_it = final_ungaretti_df['Text']
final_ungaretti_df['Text'] = _texts_it.map(process_texts_it)

In [115]:
def process_texts_en(text):
    """Run the English pipeline ``nlp_en`` on ``text`` and return its result."""
    processed = nlp_en(text)
    return processed

In [117]:
# Call the English nlp pipeline on each poem; the "Text" column is overwritten
# with the objects returned by process_texts_en
_texts_en = final_warpoets_df['Text']
final_warpoets_df['Text'] = _texts_en.map(process_texts_en)

In [119]:
# Token retrieval shared by both languages: the function only reads tokens
# from a doc object that has already been processed
def get_token(doc):
    """Return the ``text`` of every token in ``doc`` as a list, in iteration order."""
    token_texts = []
    for tok in doc:
        token_texts.append(tok.text)
    return token_texts

In [None]:
# Store the token texts of each Italian doc object in a new "Tokens" column
_docs_it = final_ungaretti_df['Text']
final_ungaretti_df['Tokens'] = _docs_it.map(get_token)
final_ungaretti_df

In [None]:
# Store the token texts of each English doc object in a new "Tokens" column
_docs_en = final_warpoets_df['Text']
final_warpoets_df['Tokens'] = _docs_en.map(get_token)
final_warpoets_df

In [None]:
# Copy the "Text" and "Tokens" columns of the Italian DataFrame into their own frame
tokens = final_ungaretti_df.loc[:, ['Text', 'Tokens']].copy()
tokens

In [127]:
# Define a function to retrieve lemmas, one for each model
def get_lemma_it(text):
    """Return the lemma of every token of an Italian text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_it`` first. Any other value is
        treated as an already processed doc and iterated directly, so the
        pipeline is not run a second time on the doc objects stored in the
        "Text" column.

    Returns
    -------
    list of str
        ``token.lemma_`` for each token, in iteration order.
    """
    doc = nlp_it(text) if isinstance(text, str) else text
    return [token.lemma_ for token in doc]

# Store the lemmas of each Italian doc object in a new "Lemmas" column
_docs_it = final_ungaretti_df['Text']
final_ungaretti_df['Lemmas'] = _docs_it.map(get_lemma_it)

In [129]:
def get_lemma_en(text):
    """Return the lemma of every token of an English text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_en`` first. Any other value is
        treated as an already processed doc and iterated directly, so the
        pipeline is not run a second time on the doc objects stored in the
        "Text" column.

    Returns
    -------
    list of str
        ``token.lemma_`` for each token, in iteration order.
    """
    doc = nlp_en(text) if isinstance(text, str) else text
    return [token.lemma_ for token in doc]

# Store the lemmas of each English doc object in a new "Lemmas" column
_docs_en = final_warpoets_df['Text']
final_warpoets_df['Lemmas'] = _docs_en.map(get_lemma_en)

In [None]:
# Compare how often "pietra" occurs among the token texts and among the lemmas
_pietra_in_tokens = final_ungaretti_df['Tokens'].apply(lambda items: items.count('pietra')).sum()
print(f'"pietra" appears in the text tokens column ' + str(_pietra_in_tokens) + ' times.')
_pietra_in_lemmas = final_ungaretti_df['Lemmas'].apply(lambda items: items.count('pietra')).sum()
print(f'"pietra" appears in the lemmas column ' + str(_pietra_in_lemmas) + ' times.')

In [133]:
# Part-of-speech retrieval shared by both languages: the function only reads
# from a doc object that has already been processed
def get_pos(doc):
    """Return a ``(pos_, tag_)`` tuple for every token in ``doc``.

    ``pos_`` is the coarse-grained and ``tag_`` the fine-grained part of
    speech of the token; the tuples are returned as a list in iteration order.
    """
    pos_pairs = []
    for tok in doc:
        pos_pairs.append((tok.pos_, tok.tag_))
    return pos_pairs

# Store the part-of-speech pairs of each doc object in a new "POS" column,
# first for the Italian and then for the English DataFrame
for _corpus_df in (final_ungaretti_df, final_warpoets_df):
    _corpus_df['POS'] = _corpus_df['Text'].apply(get_pos)

In [None]:
# Show the Italian part-of-speech pairs as a plain list (one entry per row)
final_ungaretti_df['POS'].tolist()

In [None]:
# Show the English part-of-speech pairs as a plain list (one entry per row)
final_warpoets_df['POS'].tolist()

In [None]:
# Get all named-entity labels of the Italian model and assign them to a variable
labels_it = nlp_it.get_pipe("ner").labels

# Print each label and its description. spacy.explain returns None for a
# label it has no description for; the fallback keeps the string
# concatenation from raising a TypeError in that case.
for label in labels_it:
    print(label + ' : ' + (spacy.explain(label) or 'no description available'))

In [None]:
# Get all named-entity labels of the English model and assign them to a variable
labels_en = nlp_en.get_pipe("ner").labels

# Print each label and its description. spacy.explain returns None for a
# label it has no description for; the fallback keeps the string
# concatenation from raising a TypeError in that case.
for label in labels_en:
    print(label + ' : ' + (spacy.explain(label) or 'no description available'))

In [None]:
# Define function to extract named-entity labels, one for each model
# NOTE: a later cell defines another function with this same name that returns
# the entities themselves instead of their labels; once that cell has run,
# this version is no longer reachable by name.
def extract_named_entities_it(text):
    """Return the label of every named entity found in an Italian text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_it`` first. Any other value is
        treated as an already processed doc whose ``ents`` are read directly,
        so the pipeline is not run a second time on the doc objects stored in
        the "Text" column.

    Returns
    -------
    list of str
        ``ent.label_`` for each entity in ``doc.ents``, in iteration order.
    """
    doc = nlp_it(text) if isinstance(text, str) else text
    return [ent.label_ for ent in doc.ents]

# Apply the function to the "Text" column and store the resulting
# named-entity labels in a new column
_docs_it = final_ungaretti_df['Text']
final_ungaretti_df['Named_Entities'] = _docs_it.map(extract_named_entities_it)
final_ungaretti_df['Named_Entities']

In [None]:
# NOTE: a later cell defines another function with this same name that returns
# the entities themselves instead of their labels; once that cell has run,
# this version is no longer reachable by name.
def extract_named_entities_en(text):
    """Return the label of every named entity found in an English text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_en`` first. Any other value is
        treated as an already processed doc whose ``ents`` are read directly,
        so the pipeline is not run a second time on the doc objects stored in
        the "Text" column.

    Returns
    -------
    list of str
        ``ent.label_`` for each entity in ``doc.ents``, in iteration order.
    """
    doc = nlp_en(text) if isinstance(text, str) else text
    return [ent.label_ for ent in doc.ents]

# Apply the function to the "Text" column and store the resulting
# named-entity labels in a new column
_docs_en = final_warpoets_df['Text']
final_warpoets_df['Named_Entities'] = _docs_en.map(extract_named_entities_en)
final_warpoets_df['Named_Entities']

In [None]:
# Define function to extract the text tagged as named entities, one for each model
# NOTE: this redefines extract_named_entities_it from an earlier cell, which
# returned the entity labels; from here on the name refers to this version.
def extract_named_entities_it(text):
    """Return every named entity found in an Italian text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_it`` first. Any other value is
        treated as an already processed doc whose ``ents`` are read directly,
        so the pipeline is not run a second time on the doc objects stored in
        the "Text" column.

    Returns
    -------
    list
        The items of ``doc.ents``, in iteration order.
    """
    doc = nlp_it(text) if isinstance(text, str) else text
    return list(doc.ents)

# Apply the function to the "Text" column and store the resulting entities in
# a new column
_docs_it = final_ungaretti_df['Text']
final_ungaretti_df['NE_Words'] = _docs_it.map(extract_named_entities_it)
final_ungaretti_df['NE_Words']

In [None]:
# NOTE: this redefines extract_named_entities_en from an earlier cell, which
# returned the entity labels; from here on the name refers to this version.
def extract_named_entities_en(text):
    """Return every named entity found in an English text.

    Parameters
    ----------
    text : str or already processed doc
        A raw string is run through ``nlp_en`` first. Any other value is
        treated as an already processed doc whose ``ents`` are read directly,
        so the pipeline is not run a second time on the doc objects stored in
        the "Text" column.

    Returns
    -------
    list
        The items of ``doc.ents``, in iteration order.
    """
    doc = nlp_en(text) if isinstance(text, str) else text
    return list(doc.ents)

# Apply the function to the "Text" column and store the resulting entities in
# a new column
_docs_en = final_warpoets_df['Text']
final_warpoets_df['NE_Words'] = _docs_en.map(extract_named_entities_en)
final_warpoets_df['NE_Words']

In [151]:
# Stack the Italian and the English DataFrame into one, renumbering the rows
_corpus_frames = [final_ungaretti_df, final_warpoets_df]
final_df = pd.concat(_corpus_frames, ignore_index=True)

In [153]:
# Write the combined DataFrame to a .csv file
_output_csv_path = 'war_poems.csv'
final_df.to_csv(_output_csv_path)