# National Library ALTO XML Text Mining Examples (Digitalia-Notebook)

Version history
* 0.2  30.9.2021 Downloading data using api. Fixes to code and translations (Erno Liukkonen)
* 0.1 23.4.2019 Small text corrections
* 0.02 Digitalia-project, minor fixes
* 0.01 15.1.2019 (Mika Koistinen)


# Thanks



<table><tr><td>
<img src="https://blogs.helsinki.fi/digitalia/files/2018/10/sosiaali_fi_90p.jpg" style="height:100px;width:100%">
    </td><td>
<img src="https://blogs.helsinki.fi/digitalia/files/2018/10/fi_EU_rgb_90p.jpg" style="height:100px;width:100%">
    </td><td>
<img src="https://blogs.helsinki.fi/digitalia/files/2015/10/digitalia_pien_512.png" style="height:70px;width:100%">
    </td></tr></table>

# Downloading data using application programming interface (API)

The API is used to download the data. Search the collections at https://digi.kansalliskirjasto.fi/search and copy the address of the results page into the searchResultsUrl variable below (the default value is https://digi.kansalliskirjasto.fi/search?startDate=1870-10-01&endDate=1870-12-31&pages=1-50&title=1457-4721&formats=NEWSPAPER).

The bindingSearchQuery function downloads the information related to the search results using the API. The address of the search results page is given to the function as a parameter.

In [None]:
import urllib
from urllib.error import URLError, HTTPError
from socket import timeout
import simplejson as json

# Address of a search results page; bindingSearchQuery forwards its query
# string to the API.
searchResultsUrl = "https://digi.kansalliskirjasto.fi/search?startDate=1870-10-01&endDate=1870-12-31&pages=1-50&title=1457-4721&formats=NEWSPAPER"

# HTTP headers attached to every request made by this notebook.
request_headers = {
    "User-Agent": "Notebook dam-rajapinta",
    "Referer": "Notebook dam-rajapinta",
    "Connection": "keep-alive",
}

# URL templates; they start empty and are overwritten by bindingSearchQuery.
pageImageTemplate = altoXmlTemplate = altoTxtTemplate = ""

def bindingSearchQuery(digiResultsUrl):
    """Fetch all result rows of a digi.kansalliskirjasto.fi search via the API.

    The query string of ``digiResultsUrl`` (everything from the first ``?``)
    is appended to the binding-search endpoint. The first response supplies a
    ``scrollId``; the following pages are requested with that id until a
    response arrives whose ``rows`` list is empty.

    Side effects: every successful response overwrites the module-level
    ``pageImageTemplate``, ``altoXmlTemplate`` and ``altoTxtTemplate``.

    Args:
        digiResultsUrl: Address of a search results page; must contain ``?``.

    Returns:
        The last successfully decoded response with its ``rows`` entry
        replaced by the rows collected from all fetched pages. If no request
        succeeded, a dict holding only an empty ``rows`` list.

    Raises:
        ValueError: If ``digiResultsUrl`` contains no ``?``.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    global pageImageTemplate
    global altoXmlTemplate
    global altoTxtTemplate

    parameters = digiResultsUrl[digiResultsUrl.index('?'):]
    digiBindingSearchURL = 'https://digi.kansalliskirjasto.fi/api/dam/binding-search' + parameters

    currentRows = []
    result = {}
    isFirstSearch = True

    while True:
        req = urllib.request.Request(digiBindingSearchURL, headers=request_headers)
        try:
            # The context manager closes the connection after the body is read.
            with urllib.request.urlopen(req, timeout=30) as response:
                result = json.loads(response.read())
        except HTTPError as e:
            # Stop instead of re-sending the same failing request forever.
            print("Server returned HTTP error " + str(e.code) + "!")
            break
        except (ConnectionError, URLError, TimeoutError, timeout):
            print("No connection to the server!")
            break

        pageImageTemplate = result["pageImageTemplate"]
        altoXmlTemplate = result["altoXmlTemplate"]
        altoTxtTemplate = result["altoTxtTemplate"]

        # An empty page marks the end of the scroll.
        if not result["rows"]:
            break
        currentRows.extend(result["rows"])

        if isFirstSearch:
            # Later pages are addressed by the scroll id of the first response.
            digiBindingSearchURL = 'https://digi.kansalliskirjasto.fi/api/dam/binding-search/' + result["scrollId"]
            isFirstSearch = False

    result["rows"] = currentRows

    return result

Next, the ALTO and text files are downloaded using the urlretrieve function; the download addresses are built from the information that was retrieved using the bindingSearchQuery function. Files are downloaded to the "downloads" folder, which is located in the folder where the notebook was started.

In [None]:
import sys
import os
import urllib
from urllib.error import URLError, HTTPError
from socket import timeout

def urlretrieve(url, localfile):
    """Download ``url`` and store the response body in ``localfile``.

    The request is sent with the module-level ``request_headers`` and a
    30 second timeout. The body is read completely before the local file is
    opened, so a failed download does not leave an empty file behind.

    Args:
        url: Address of the resource to download.
        localfile: Path of the file to write (opened in binary mode).

    Returns:
        1 when the file was written, 0 when the download failed.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    req = urllib.request.Request(url, headers=request_headers)
    try:
        # The context manager closes the connection after the body is read.
        with urllib.request.urlopen(req, timeout=30) as response:
            payload = response.read()
    except HTTPError as e:
        # The server answered, so report the status instead of "no connection".
        print("Download failed (HTTP " + str(e.code) + "): " + url)
        return 0
    except (URLError, ConnectionError, TimeoutError, timeout):
        print("No connection to the server!")
        return 0

    with open(localfile, "wb") as fl:
        fl.write(payload)

    return 1

# The literal name "__file__" is resolved against the current working
# directory, so dirname() yields that directory.
notebookPath = os.path.dirname(os.path.realpath("__file__"))

result = bindingSearchQuery(searchResultsUrl)

rows = result["rows"]

# Path of the first ALTO file that was actually downloaded, and its folder
# (the folder path ends with "/"). Both stay empty if nothing was downloaded.
isFirstResultRow = True
firstResultAlto = ""
firstResultPath = ""

resultCounter = 1

for row in rows:
    # A path separator inside the title would split the folder and file names
    # into unintended sub-folders, so neutralise it.
    bindingTitle = row["bindingTitle"].replace("/", "_").replace(os.sep, "_")
    bindingId = str(row["bindingId"])
    pageNumber = str(row["pageNumber"])
    baseUrl = row["baseUrl"]

    altoUrl = baseUrl + altoXmlTemplate.replace("{{page}}", pageNumber)
    txtUrl = baseUrl + altoTxtTemplate.replace("{{page}}", pageNumber)

    bindingFolder = notebookPath + "/downloads/" + bindingTitle + "_" + bindingId
    altoPath = bindingFolder + "/alto/"
    txtPath = bindingFolder + "/txt/"

    os.makedirs(altoPath, exist_ok=True)
    os.makedirs(txtPath, exist_ok=True)

    filePrefix = bindingTitle + "_" + bindingId + "_page_" + pageNumber
    downloadPathAlto = altoPath + filePrefix + ".xml"
    downloadPathTxt = txtPath + filePrefix + ".txt"

    print("Downloading result: " + str(resultCounter) + "/" + str(len(rows)))

    altoDownloaded = urlretrieve(altoUrl, downloadPathAlto)
    urlretrieve(txtUrl, downloadPathTxt)

    resultCounter += 1

    # Only remember a file that really exists on disk; later cells open it.
    if isFirstResultRow and altoDownloaded:
        firstResultAlto = downloadPathAlto
        firstResultPath = altoPath
        isFirstResultRow = False

# Load single file to DataFrame

In [None]:
import os

#ALTO folder path of the first binding that was downloaded using the API.
path = firstResultPath
print(path)



In [None]:
#you can check what files are in the given path:
sorted(os.listdir(path)) 

In [None]:
from bs4 import BeautifulSoup as bs

#path of the ALTO file that was downloaded first
filename = firstResultAlto
    

In [None]:
# Open in binary mode so the parser detects the encoding from the document
# itself instead of relying on the platform default; the context manager
# closes the file as soon as it has been parsed.
with open(filename, "rb") as alto_file:
    xmlsoup = bs(alto_file, "lxml")
xmldata = str(xmlsoup)


In [None]:
import xml.etree.ElementTree as ET
import csv
import pandas as pd

In [None]:
#converts an XML structure to a pandas.DataFrame
#the original function can be found at:
#http://www.austintaylor.io/lxml/python/pandas/xml/dataframe/2016/07/08/convert-xml-to-pandas-dataframe/
class XML2DataFrame:
    """Flatten every direct child of an XML root into one DataFrame row."""

    def __init__(self, xml_data):
        """Parse ``xml_data`` and keep the resulting root element."""
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        """Return one flattened dict per direct child of ``root``."""
        rows = []
        for child in root:
            rows.append(self.parse_element(child))
        return rows

    def parse_element(self, element, parsed=None):
        """Merge ``element`` and all its descendants into a single dict.

        Attributes are stored under their own names and non-empty element
        text under the element's tag; later values overwrite earlier ones
        that share a key. ``parsed`` is updated in place when given.
        """
        record = {} if parsed is None else parsed
        record.update(element.attrib)
        text = element.text
        if text:
            record[element.tag] = text
        for child in element:
            self.parse_element(child, record)
        return record

    def process_data(self):
        """Build a DataFrame from the flattened children of the stored root."""
        return pd.DataFrame(self.parse_root(self.root))

In [None]:
#creates data structure
xml2df = XML2DataFrame(xmldata)
xml_dataframe = xml2df.process_data()
#shows data structures columns
xml_dataframe.columns



In [None]:
#creates pandas.DataFrame from files texts
strings=xmlsoup.find_all(["string","hyp"])


In [None]:
strings

In [None]:
#builds xmldata2: every matched element is serialized on its own line
#inside a minimal wrapper document
xmldata2 = (
    '<?xml version="1.0" encoding="UTF-8"?><html><body>'
    + "".join(str(s) + "\n" for s in strings)
    + '</body></html>'
)

In [None]:
xmldata2

In [None]:
tree = ET.XML(xmldata2)


In [None]:
#collects the attributes of every element found two levels below the root,
#one dict per element, then builds and displays the pd.DataFrame
data = [
    dict(node.items())
    for parent in tree.iterfind('./*')
    for node in parent.iterfind('*')
]
df = pd.DataFrame(data)
df

In [None]:
#prints the full text: every content value followed by a space; the " - "
#left by a separate hyphen entry is removed so the two word halves rejoin
contents = "".join(word + " " for word in df['content'])
contents2 = contents.replace(" - ", "")
print(contents2)

In [None]:
#creates word tokens
#NLTK
#pip install nltk
import nltk 
tokens=nltk.word_tokenize(contents2)
tokens[0:10]

# Word Frequencies

In [None]:
#removing stopwords
from nltk.corpus import stopwords
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#stop words
stop_words = set(stopwords.words('finnish'))

#tokens with the stop words removed
filtered_content = [tok for tok in tokens if tok not in stop_words]

#stop words, words of at most 1 character and words of 20 or more
#characters are removed
filtered_content2 = [
    tok for tok in tokens
    if tok not in stop_words and 1 < len(tok) < 20
]

In [None]:
#prints different word tokens
freqz=nltk.FreqDist(tokens)
freqz2=nltk.FreqDist(filtered_content)
freqz3=nltk.FreqDist(filtered_content2)

print("full text")
print(freqz.most_common()[0:15])
print("stopwords removed")
print(freqz2.most_common()[0:15])



# WordClouds

In [None]:
#Example can be found here:
#https://github.com/amueller/word_cloud
#!pip install wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Join with spaces: WordCloud.generate() splits the text into words itself,
# and ''.join() would fuse neighbouring tokens into one long string.
tokens_str = ' '.join(tokens)
fig = plt.figure(figsize=(24, 14))
wordcloud = WordCloud().generate(tokens_str)
plt.subplot(221)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("1 document (all tokens)")
plt.show()


fig = plt.figure(figsize=(24, 14))
content_str2 = ' '.join(filtered_content)
wordcloud2 = WordCloud().generate(content_str2)
plt.subplot(222)
plt.imshow(wordcloud2, interpolation="bilinear")
plt.title("1 document (removed stopwords)")
plt.axis("off")
plt.show()


# Report token counts; len() of the joined strings would count characters.
print("words count:", len(tokens), len(filtered_content))




# Multiple files from folders


In [None]:
# loading all XML files from the folders
import os
global firstResultPath

#ALTO folder path of first binding that was downloaded using API.
path = firstResultPath 
files=os.listdir(path)
#sorted(os.listdir(path))


In [None]:
#!this can take some time
strings2 = []
for file1 in files:
    filename = path + file1
    # Binary mode lets the parser detect the encoding from the document; the
    # context manager closes each file before the next one is opened.
    with open(filename, "rb") as alto_file:
        xmlsoup = bs(alto_file, "lxml")
    strings2.extend(xmlsoup.find_all("string"))

print("words in all pages in the folder:",len(strings2))
    
    

In [None]:
content22=[stri2['content'].lower() for stri2 in strings2[0:]]
len(content22)

In [None]:
#removes words under 2 characters
content2=[stri2 for stri2 in content22 if len(stri2)>1]
len(content2)

In [None]:
#removes stopwords
filtered_content3 = [r for r in content2 if r not in stop_words]
# The following cells read the folder-wide list under the name
# filtered_content2, so rebind it instead of appending to the single-page
# list (appending mixed both corpora and duplicated entries on every re-run).
filtered_content2 = filtered_content3
len(filtered_content2)

In [None]:
freqz3=nltk.FreqDist(content2)
freqz3.most_common()[0:15]

freqz4=nltk.FreqDist(filtered_content2)

print(freqz3.most_common()[0:10])
print(freqz4.most_common()[0:10])



# WordClouds (all pages in a folder)

In [None]:
content_str2 = ' '.join(content2)


fig = plt.figure(figsize=(16, 10))
plt.subplot(223)
wordcloud2 = WordCloud(max_font_size=50,min_font_size=5).generate(content_str2)

plt.imshow(wordcloud2, interpolation="bilinear")
plt.title("all "+str(len(files))+" documents, and "+str(len(content2))+" words, from folder")

plt.axis("off")


content_str3 = ' '.join(filtered_content2)

plt.subplot(224)
wordcloud3 = WordCloud(max_font_size=50,min_font_size=5).generate(content_str3)
plt.imshow(wordcloud3, interpolation="bilinear")

plt.title("all "+str(len(files))+" documents, and "+str(len(filtered_content2))+" words, from folder (removed stopwords)")

plt.axis("off")
plt.show()








# Topic Modeling

In [None]:
#based on 
#https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
#read all docs into the corpora
import os
files = os.listdir(path)
FILES = []          # full path of every file that was read
doc_complete = []   # one list of lower-cased words per file
for file1 in files:
    filename = path + file1
    FILES.append(filename)
    # Binary mode lets the parser detect the encoding from the document; the
    # context manager closes each file before the next one is opened.
    with open(filename, "rb") as alto_file:
        xmlsoup = bs(alto_file, "xml")
    strings = xmlsoup.find_all("String")
    strings2 = [str1['CONTENT'].lower() for str1 in strings]
    doc_complete.append(strings2)

len(doc_complete),len(content2),len(filtered_content2)



In [None]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('finnish'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    """Return the words of ``doc`` as one cleaned, space-separated string.

    Words of three characters or fewer and words found in ``stop`` are
    dropped, every character found in ``exclude`` is removed, and each
    remaining word is passed through ``lemma.lemmatize``.
    """
    kept_words = [word for word in doc if len(word) > 3 and word not in stop]
    joined = " ".join(kept_words)
    punc_free = "".join(filter(lambda ch: ch not in exclude, joined))
    lemmas = [lemma.lemmatize(word) for word in punc_free.split()]
    return " ".join(lemmas)

    
doc_clean = [clean(doc).split() for doc in doc_complete]        

In [None]:
#doc_clean=docs2#filtered_content2#map(unicode,filtered_content2)
# Importing Gensim
import gensim
from gensim import corpora
dictionary = corpora.Dictionary(doc_clean )
#dictionary
# Creating the term dictionary of our corpus, where every unique term is assigned an index: dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]



In [None]:
# Creating the object for LDA model using gensim library

Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=100, id2word = dictionary, passes=50)

In [None]:
#Results
print(ldamodel.print_topics(num_topics=100, num_words=3))


In [None]:
print(ldamodel.print_topics(num_topics=10, num_words=5))

In [None]:
len(doc_complete),len(doc_clean)

Topic modeling 2, based on https://de.dariah.eu/tatom/topic_model_python.html

In [None]:
#FILES
import os

# TXT folder of the first binding that was downloaded using the API. Only the
# last "/alto/" folder component is swapped for "/txt/"; a plain
# replace("alto", "txt") would also rewrite any other "alto" in the path
# (for example inside a directory name or a binding title).
alto_head, alto_sep, alto_tail = firstResultPath.rpartition("/alto/")
if alto_sep:
    path = alto_head + "/txt/" + alto_tail
else:
    # No "/alto/" component found: keep the previous substitution behaviour.
    path = firstResultPath.replace("alto", "txt")

# Full paths of all .txt files in that folder.
FILES2 = [path + file for file in os.listdir(path) if file.endswith(".txt")]


In [None]:
############

import numpy as np
stop = list(set(stopwords.words('finnish')))

import sklearn.feature_extraction.text as text
# Document-term matrix: one row per file in FILES2, one column per word.
vectorizer = text.CountVectorizer(input='filename', stop_words=stop, min_df=1)
dtm = vectorizer.fit_transform(FILES2).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = np.array(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())


In [None]:
dtm.shape,len(vocab)

In [None]:
from sklearn import decomposition
num_topics = 10
num_top_words = 10
clf = decomposition.NMF(n_components=num_topics, random_state=1)



In [None]:
doctopic = clf.fit_transform(dtm)

In [None]:
# For every topic, the num_top_words vocabulary entries with the largest
# component values, highest first.
topic_words = [
    [vocab[i] for i in np.argsort(topic)[::-1][0:num_top_words]]
    for topic in clf.components_
]

In [None]:
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

In [None]:
# turn the file list into an array so we can use NumPy functions
novel_names = np.asarray(FILES2)
doctopic_orig = doctopic.copy()

# average the topic shares of all rows that carry the same name
unique_names = sorted(set(novel_names))
num_groups = len(unique_names)
doctopic_grouped = np.zeros((num_groups, num_topics))

for i, name in enumerate(unique_names):
    rows_for_name = doctopic[novel_names == name, :]
    doctopic_grouped[i, :] = rows_for_name.mean(axis=0)

doctopic = doctopic_grouped

In [None]:
novels = sorted(set(novel_names))

print("Top NMF topics in...")

# For each row of doctopic, list the indices of its three largest topic shares.
for i, shares in enumerate(doctopic):
    top_topics = np.argsort(shares)[::-1][0:3]
    print("{}: {}".format(novels[i], ' '.join(str(t) for t in top_topics)))
    


In [None]:
# Print at most 15 of the stored words for every topic.
for t, words in enumerate(topic_words):
    print("Topic {}: {}".format(t, ' '.join(words[:15])))