# Processing the meetings files
This notebook contains all the necessary steps needed to process the meetings files. It turns the data into a dataframe containing every document and the items they are referenced in, as well as which items are references in which meetings.

In [1]:
# importing the necessary packages
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET
import pickle
import math

# setting the directory containing the meetings
meetings_dir = "Data/20210415/"

# setting the directory for where to save the document files
documents_dir = "Data/Documents/"

In [15]:
# retrieve all meeting files
meetings = [s for s in os.listdir(meetings_dir) if "meetings_" in s]
# if you want to see the meeting files then print the meetings variable

# !!!! Important note !!!!
The way this method works is that it does overwrite old information with newer information if the ids are the same. However, if we assume that newer is more complete and that ids are more than likely to still be unique than this might be okay. Just wanted to mention that this method assumes unique IDs

# Extracting meeting, item, and document information

In [16]:
# dictionary version
# first create a list containing all meetings and their corresponding information
# list containing information about meetings
meetingsInformation = {}

# list containing all the item information
itemsInformation = {}

# list containing all the document information
documentsInformation = {}

for file in meetings:
    
    # reading only the second to last xml file
    tree = ET.parse(meetings_dir + file)
    root = tree.getroot()


    # loop through all meetings
    for meeting in root:
        #print(meeting.tag)

        # intermediary information for the meetings
        meetingInformation = {}


        meetingInformation["meeting id"] = meeting.attrib["id"]
        
        # list to store the items in
        items = []
        # Go through all children of the meeting
        for child in meeting:
            if child.text == "\n      " and not child.tag == "item":
                meetingInformation[child.tag] = child.attrib
            elif not child.tag == "item":
                meetingInformation[child.tag] = child.text





            # if child is an item go through all the children of the item and store their information
            if child.tag == "item":
                
                # append item to items
                items.append(child.attrib)
                
                # intermediary information for the items
                itemInformation = {}
                itemInformation["item id"] = child.attrib["id"]
                
                # list for saving all documents attached to this item
                documents = []
                
                # loop through all items
                for itemChild in child:
                    
                    
                    if itemChild.text == "\n        " and not child.tag == "document":
                        #print(itemChild.text)
                        itemInformation[itemChild.tag] = itemChild.attrib
                    elif not child.tag == "document":
                        itemInformation[itemChild.tag] = itemChild.text

                        
                        
                        
                    # now repeat once more to find the documents
                    if itemChild.tag == "document":
                        
                        # append this document to list of documents
                        documents.append(itemChild.attrib)
                        
                        # intermediary information for the document
                        documentInformation = {}
                        documentInformation["document id"] = itemChild.attrib["id"]

                        for documentChild in itemChild:
                            if itemChild.text == "\n     ":
                                documentInformation[documentChild.tag] = documentChild.attrib
                            else :
                                documentInformation[documentChild.tag] = documentChild.text


                        # store all information for this document
                        documentsInformation[itemChild.attrib["id"]] =  documentInformation

                    # attach all documents belonging to this item
                    itemInformation["Documents"] = documents
                # store all information for this item 
                itemsInformation[child.attrib["id"]] = itemInformation
            
            # save the items attached to this meeting
            meetingInformation["items"] = items
         # store all information for this meeting
        meetingsInformation[meeting.attrib["id"]] = meetingInformation

In [4]:
len(documentsInformation)

29229

## Saving the retrieved information

## Reading those same files again

In [2]:
with open("meetingsInformation.txt", "rb") as fp:   
    meetingsInformation = pickle.load(fp)
with open("itemsInformation.txt", "rb") as fp:   
    itemsInformation = pickle.load(fp)
with open("documentsInformation.txt", "rb") as fp:   
    documentsInformation = pickle.load(fp)

In [3]:
documentsInformation["0aab1c4d-3bb0-4593-8336-815704ad500a"]

{'document id': '0aab1c4d-3bb0-4593-8336-815704ad500a',
 'displayname': 'Bijlage 1 Rekenkamerrapport "Gericht en geregeld".pdf',
 'filename': 'Bijlage 1 Rekenkamerrapport Gericht en geregeld.pdf',
 'filesize': '1392841',
 'publicdownloadurl': 'https://api1.ibabs.eu/publicdownload.aspx?site=Utrecht&id=0aab1c4d-3bb0-4593-8336-815704ad500a'}

# Filtering duplicate documents based on file names

In [4]:
# dictionary of documents containing the most recent versions of documents with the same filename
nonDuplicateIDS = {}

# dictionary of filenames with the multiple ids linked to the filename
duplicate_files = {}

# a dictionary containing 

#[l for l in f if something in l]
for key in documentsInformation:
    
    doc = documentsInformation[key]
    filename = doc["filename"]
    
    # if this filename already exist then add the new id to the list of ids
    if filename in duplicate_files:
        duplicate_files.setdefault(filename, []).append(key)
    else:
        duplicate_files[filename] = [key]

    nonDuplicateIDS[filename] = key
        

In [16]:
duplicate_files['Bijlage 1 Verzoek van de fracties van VVD, CDA en D66 tot het mogen houden van een debat over het feit dat het College de NOUW 3 niet heeft opgenomen in het Actieplan Luchtkwaliteit.pdf']

['72cc4b22-0de8-4a83-ae9c-b4961b74050e']

In [14]:
len(nonDuplicateIDS)

23704

In [12]:
len(documentsInformation)

29229

In [5]:
# here we find a match between the file name and a non-duplicate id, which is necessary for constructing the links
name_id_match = {}

setValues = set(nonDuplicateIDS.values())
# remove every element with only one match
for key in duplicate_files:
    doc = duplicate_files[key]
    
    # if only one matching key was then skip it
    if len(doc) > 1:
        #print(len(doc))
        
        # loop through all keys and store only the ids that are not duplicates
        for id in doc:
            if id in setValues:
                name_id_match[key] = id

In [6]:
# creating a new documents information variable with only the nonDuplicate values
nonDuplicateDocumentsInformation = {}

setValues = set(nonDuplicateIDS.values())
# recreate the same document information but now without duplicates
for key in documentsInformation:
    if key in setValues:
        nonDuplicateDocumentsInformation[key] = documentsInformation[key]
 

In [None]:
# save this information
#with open("nonDuplicateDocumentsInformation.txt", "wb") as fp:
#    pickle.dump(nonDuplicateDocumentsInformation, fp) 

In [7]:
len(name_id_match)

3563

In [11]:
len(nonDuplicateDocumentsInformation)

23704

# Move duplicate pdfs to a seperate folder

In [8]:
# creating a list of all duplicate document versions
duplicateIDS = []
setValues = set(nonDuplicateIDS.values())

# first retrieve all ids of duplicate documents
for key in documentsInformation:
    if key not in setValues:
        duplicateIDS.append(key)
len(duplicateIDS)

5525

# now moving the duplicate documents
# folder to move the duplicate files to
duplicate_documents_dir = "Data/DuplicateDocuments/"
for id in duplicateIDS:
    os.rename(documents_dir + id + ".pdf", duplicate_documents_dir + id + ".pdf")

# Extracting links
These result in the documentItemMatch and itemMeetingMatch files. 

The way everything is handled means that in documentItemMatch every document is unique and each column consists of a new matching agenda point. This is also the same for itemMeetingMatch, where every item is unique and has a list of attached meetings (although this seems to be a 1-1 match).

In [9]:
# Now we are going to loop through every item 
# and link the non-duplicate documents to that item based on the name_id_match variable
# Document ids will be used as keys and item ids will be appended to a list
document_item_match = {}

setValues = set(nonDuplicateIDS.values())
i = 0
for key in itemsInformation:
    item = itemsInformation[key]    
    docs = item["Documents"]
    
    # only look at the docs if there is at least one
    if len(docs) >= 1:
        
        # now loop through every document attachted to the item
        for docID in docs:
            
            # if the document is a non-duplicate then no further actions are required
            if docID["id"] in setValues:
                #print("id is in setValues")
                nonDuplicateID = docID["id"]
            # otherwise find the right id using the name of the document
            else :
                filename = documentsInformation[docID["id"]]["filename"]
                
                # double check if the filename is in name_id_match (only 1 was found)
                if filename in name_id_match:
                    nonDuplicateID = name_id_match[filename]
                else :
                    #print(f"Could not find filename [{filename}] in name_id_match")
                    i += 1
                    nonDuplicateID = docID["id"]
            
            # this is another double 
            if nonDuplicateID is None:
                print("nonDuplicateID = None")
            
            # check if the key already exists in the dictionary. If not add it otherwise append item key
            if nonDuplicateID in document_item_match:
                #print(document_item_match[nonDuplicateID])
                document_item_match.setdefault(nonDuplicateID, []).append(key)
            else :
                document_item_match[nonDuplicateID] = [key]
            
# make a dataframe out of it with the item id as row and every column consisting of meeting ids            
document_item_match = pd.DataFrame.from_dict(document_item_match, orient = "index").reset_index()
document_item_match.columns = ["Document_id"] + ["item_id_"+ str(s) for s in document_item_match.columns[1:]]
document_item_match.to_csv("document_item_match.csv", index=False)
document_item_match

Unnamed: 0,Document_id,item_id_0,item_id_1,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6,item_id_7,item_id_8,...,item_id_99,item_id_100,item_id_101,item_id_102,item_id_103,item_id_104,item_id_105,item_id_106,item_id_107,item_id_108
0,72cc4b22-0de8-4a83-ae9c-b4961b74050e,f0ba2c10-26ce-4441-8541-0c63ab65215f,,,,,,,,,...,,,,,,,,,,
1,501dca51-6764-47e5-9eff-9072ea71b758,dae7f992-a782-4c04-ac58-62c5272318cc,,,,,,,,,...,,,,,,,,,,
2,6c3b5566-8ecc-40ff-8ec3-71d0ff26470c,be31f268-86db-42f8-985e-45d4a1147887,9b3af5fd-e90f-48a7-b90a-a5e5399ac70c,,,,,,,,...,,,,,,,,,,
3,5ab63e8f-f46e-4495-98bd-cd486c1f6445,5dab871f-fd96-416f-ada3-0d17de8d9a9e,,,,,,,,,...,,,,,,,,,,
4,d7f0e1f5-c293-4c01-b805-89ae823e0639,5dab871f-fd96-416f-ada3-0d17de8d9a9e,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23699,fab436ca-85c5-40ff-8364-7014a7a86adc,7a56ee03-b9bb-4b92-9b5c-9d4920f6590a,,,,,,,,,...,,,,,,,,,,
23700,7f00ca9d-6345-4433-96e1-ac7487228f0d,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,...,,,,,,,,,,
23701,8d7f4434-af9e-4ca4-b0cb-1e0e70bec4e0,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,...,,,,,,,,,,
23702,62d8dd40-7660-4478-9425-60473cb35efb,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,...,,,,,,,,,,


In [27]:
len(document_item_match)

23704

In [90]:
# Finding the documents that are linked often
test = document_item_match.isnull().sum(axis=1)
test[test < 50]

1388    39
8766     0
dtype: int64

In [10]:
document_item_match.iloc[[8766, 1388], :]

Unnamed: 0,Document_id,item_id_0,item_id_1,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6,item_id_7,item_id_8,...,item_id_99,item_id_100,item_id_101,item_id_102,item_id_103,item_id_104,item_id_105,item_id_106,item_id_107,item_id_108
8766,49a4869c-8976-43ce-981d-7dcf7dfef87a,c2fba4db-be9d-434d-8181-0f301ecbbfe0,e6c34f29-4a00-4ea3-a244-722b125903a7,,,,,,,,...,,,,,,,,,,
1388,3e119ce6-800b-4113-aee7-54b0e1f3f01b,7b2417e1-9b0a-4124-b2d8-3a8927b77dc4,,,,,,,,,...,,,,,,,,,,


In [98]:
documentsInformation["18ad66f4-9ea5-4427-a542-e229d71f9aa1"]

{'document id': '18ad66f4-9ea5-4427-a542-e229d71f9aa1',
 'displayname': 'NL.IMRO.0344.BPVELDHUIZENDEMEER-VA01-Kader_01-000',
 'filename': 'NL.IMRO.0344.pdf',
 'filesize': '4604386',
 'publicdownloadurl': 'https://api1.ibabs.eu/publicdownload.aspx?site=Utrecht&id=18ad66f4-9ea5-4427-a542-e229d71f9aa1'}

In [99]:
itemsInformation["e5fb5bd3-51ec-48e7-bf91-32d0396e2de9"]

{'item id': 'e5fb5bd3-51ec-48e7-bf91-32d0396e2de9',
 'title': 'Vaststelling bestemmingsplan Prinses Máxima Centrum, De Uithof',
 'Documents': [{'id': '76652811-0309-42c1-8821-68bc8b7baf3f'},
  {'id': '0e8b7552-dc64-420e-bc2d-dfa080e153d1'},
  {'id': '85777a2b-c1a0-482e-8887-8ab1f8dee497'},
  {'id': '1e41fba9-52a9-496c-bba6-1eb5863b561b'},
  {'id': 'c4bca5cf-7d83-4d98-b3d3-520f1a4740e2'},
  {'id': 'd7bf9fbc-af74-47b6-881b-51bf13683667'},
  {'id': '55ade1f4-9c5d-445d-a096-cd816dce9b64'},
  {'id': 'ce5e7ca0-a204-4a34-9413-3e24e51bef5e'},
  {'id': 'c370cb76-d3f3-48e4-bd82-3dcda2a32660'}],
 'features': '4',
 'document': {'id': 'c370cb76-d3f3-48e4-bd82-3dcda2a32660'}}

# Creating a network

Start by creating an edge list

In [28]:
sources = []
targets = []
# loop through all document matches to create an edgelist with
# a source item and target item
for index, row in document_item_match.iterrows():
    # remove the None values
    row = [x for x in row.values[1:] if x is not None]
    
    # only loop through the rows if there is more than one match
    if len(row) > 1:
        # check every item for every item
        for i in range(0,len(row) - 1):            
            for j in range(0,len(row) - 1):
                
                # do not add a link between the same two values
                if i != j:
                    sources.append(row[i])
                    targets.append(row[j])


In [29]:
df = pd.DataFrame()
df["source"] = sources
df["target"] = targets
df["weight"] = 1
df

Unnamed: 0,source,target,weight
0,1017fda6-e0eb-4a69-9400-b6b6445d2edb,51409ca8-cb19-4da1-aaa7-49e4222dfa98,1
1,1017fda6-e0eb-4a69-9400-b6b6445d2edb,35196b95-5364-4a4d-805b-57014c69b196,1
2,51409ca8-cb19-4da1-aaa7-49e4222dfa98,1017fda6-e0eb-4a69-9400-b6b6445d2edb,1
3,51409ca8-cb19-4da1-aaa7-49e4222dfa98,35196b95-5364-4a4d-805b-57014c69b196,1
4,35196b95-5364-4a4d-805b-57014c69b196,1017fda6-e0eb-4a69-9400-b6b6445d2edb,1
...,...,...,...
48925,5864c077-614b-4078-8e2e-cd1a9e8840bb,128c8418-b29f-4258-89d0-3ec4e710a7b6,1
48926,f8661b19-37d8-490b-959e-c83176ea0839,f85fd3be-3a1e-4c86-9f61-c69871fa183e,1
48927,f85fd3be-3a1e-4c86-9f61-c69871fa183e,f8661b19-37d8-490b-959e-c83176ea0839,1
48928,f8661b19-37d8-490b-959e-c83176ea0839,f85fd3be-3a1e-4c86-9f61-c69871fa183e,1


In [30]:
# save the edgelist
df.to_csv("Item_Edgelist.csv")

In [31]:
# create a graph from the edgelist using networkx package
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

graph = nx.from_pandas_edgelist(df, 
                                source = "source", 
                                target = "target",
                                edge_attr = "weight")

In [32]:
# create a dataframe with every item id set out against every other item id
shortest_paths = pd.DataFrame()
shortest_paths["items"] = itemsInformation.keys()
shortest_paths = shortest_paths.set_index("items").T
shortest_paths["items"] = itemsInformation.keys()
shortest_paths = shortest_paths.set_index("items")
shortest_paths.head(5)

items,f0607435-a601-4a06-85d4-63dd17ca3ec6,c6b49408-1158-495b-b69e-6137da9da1e7,0d47334d-7b3a-4b62-9ab6-7ff63fe49ebf,f804c38b-59f3-40c1-a9f3-c28c8b6078cc,400094ee-0e1f-432f-b1e4-ba41abdd516b,a9a3fed6-91dd-4885-9c0c-f4d9bad37ef1,2784c42c-760c-4d5a-9f45-2539604a8c6b,3c5a1fa5-2720-465a-b8ad-c8be74c5556a,e76d67a8-5f07-4e47-bbc5-29b1ed7ae7a0,b664cb20-7552-405e-957e-af7fec97f490,...,5b43d8ec-65b5-4d97-aee5-5050a921687c,76bc2df4-336a-4743-90ee-ba505a13ef8e,f46ad339-fecc-4228-9e48-b7ca8a83d93c,7a56ee03-b9bb-4b92-9b5c-9d4920f6590a,137f4d35-f4c8-497f-b149-b758276e2ecc,8c20e0fa-9227-4e77-90f5-1de4b96696f1,a6a6d28b-0715-419a-93ba-b95ce6ab6d50,9e63ca83-e8a7-48be-abfa-227b0f9e6116,8f199b1f-7e90-4007-b082-c8142e58ce35,7ef6e7b6-1082-4eb0-a150-46b5ec50e61d
items,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f0607435-a601-4a06-85d4-63dd17ca3ec6,,,,,,,,,,,...,,,,,,,,,,
c6b49408-1158-495b-b69e-6137da9da1e7,,,,,,,,,,,...,,,,,,,,,,
0d47334d-7b3a-4b62-9ab6-7ff63fe49ebf,,,,,,,,,,,...,,,,,,,,,,
f804c38b-59f3-40c1-a9f3-c28c8b6078cc,,,,,,,,,,,...,,,,,,,,,,
400094ee-0e1f-432f-b1e4-ba41abdd516b,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# now add the shortest paths two the dataframe
# by looping through every match
for rowID in itemsInformation:
    for columnID in itemsInformation:
        
        # if they are a match set distance to 0 and skip
        if rowID == columnID:
            shortest_paths.loc[rowID, columnID] = 0
            continue
        
        # else if they exist and have a path calculate the distance, otherwise set to -1
        else :
            if rowID in graph and columnID in graph:
                if nx.has_path(graph, rowID, columnID):
                     shortest_paths.loc[rowID, columnID] = nx.shortest_path_length(graph, rowID, columnID)
                else :
                    shortest_paths.loc[rowID, columnID] = np.nan
                


In [35]:
# save the shortest paths file
shortest_paths.to_csv("shortest_paths.csv")

In [11]:
# read the file
shortest_paths = pd.read_csv("shortest_paths.csv")

In [37]:
shortest_paths.max().max()

9.0

# Returning items

In [36]:
def return_related_items(df, id, threshold):
    return df.loc[id][df.loc[id] > threshold]


In [38]:
return_related_items(shortest_paths, "1aa5b71f-f370-492e-8fdf-95d83f4dcefe", 0)

items
1e072b46-409f-4f8a-9b58-0fe32768a2d9    1.0
69fa1234-d770-480e-bf10-b71da4248512    1.0
6fe2ba0b-df45-4a14-870f-9b9d4b4a33b1    1.0
caaff37f-18ac-4d3a-923c-66148aa7d719    1.0
Name: 1aa5b71f-f370-492e-8fdf-95d83f4dcefe, dtype: float64

In [19]:
# original agenda point
itemsInformation["1aa5b71f-f370-492e-8fdf-95d83f4dcefe"]

{'item id': '1aa5b71f-f370-492e-8fdf-95d83f4dcefe',
 'title': 'Utrecht Circulair 2020-2023',
 'Documents': [{'id': '5c7abeb6-41f1-4875-8e70-7e0f4fbc5622'},
  {'id': '853cc5b5-f973-4a05-8045-b4e3319e3c91'},
  {'id': 'e638d5f8-f0e8-44b0-9168-bde292150c73'}],
 'features': '1',
 'document': {'id': 'e638d5f8-f0e8-44b0-9168-bde292150c73'}}

In [39]:
itemsInformation["1e072b46-409f-4f8a-9b58-0fe32768a2d9"]

{'item id': '1e072b46-409f-4f8a-9b58-0fe32768a2d9',
 'title': 'Raadsvoorstel Utrecht Circulair 2020-2023 ',
 'Documents': [{'id': '397ac64b-af1b-4bb0-9039-d188aecb431b'},
  {'id': '14a2161b-6f93-452d-945e-66bd2b0701c6'},
  {'id': 'e5d5a300-fe75-4528-88da-4799053c22b8'},
  {'id': '3e953295-bc00-4744-b3be-5ab2de0134e9'}],
 'features': '6',
 'explanation': 'Indicatieve aanvang: 14.00 uur; Digitaal\r\nIndicatie behandeltijd:120 min.\r\nBeleidsveld: Circulaire economie - wethouder Verschuure\r\nUtrecht wil in 2050 een circulaire stad zijn. De nota Utrecht Circulair 2030-2023 schetst een tijdspad met tussenstappen in 2023, 2030. Tot 2023 zet de gemeente vooral in op leren en experimenteren, na 2023 wil de gemeente versnellen. Er zijn vijf ambities met prioriteit benoemd. Utrecht Circulair creëert een klimaat waar duurzame en circulaire bedrijven zich welkom voelen, Utrecht versterkt het investeringsklimaat en koopt circulair in, Utrecht stimuleert circulair gebiedsontwikkeling en circulair b

In [40]:
itemsInformation["69fa1234-d770-480e-bf10-b71da4248512"]

{'item id': '69fa1234-d770-480e-bf10-b71da4248512',
 'title': 'Raadsvoorstel Utrecht Circulair 2020-2023 ',
 'Documents': [{'id': '51be0d46-9c46-406d-b733-17c7774f3943'},
  {'id': '14a2161b-6f93-452d-945e-66bd2b0701c6'},
  {'id': 'e5d5a300-fe75-4528-88da-4799053c22b8'},
  {'id': '3e953295-bc00-4744-b3be-5ab2de0134e9'},
  {'id': 'a4c90b14-3baa-4247-ba51-94903f4b4fd9'}],
 'features': '13',
 'explanation': "Wethouder Verschuure\r\nStatus: B\r\nIndicatieve tijd: 60 min\r\n\r\nHet college van burgemeester en wethouders stelt de raad voor het volgende te besluiten:\r\n1 Bij de ambitie om in 2050 circulaire stad te zijn volgende prioriteiten in samenhang te\r\n\xa0hanteren:\r\na) Versterken vestigingsklimaat, b) Versterken investeringsklimaat en circulaire\xa0\xa0inkoop, c) Stimuleren circulaire gebiedsontwikkeling en circulair bouwen, d) Afvalvrij\xa0\xa0worden (van afval naar grondstof) en e) Opleiden voor circulair bouwen en circulair\r\nondernemen en werken aan een circulaire kennisagenda

In [41]:
itemsInformation["6fe2ba0b-df45-4a14-870f-9b9d4b4a33b1"]

{'item id': '6fe2ba0b-df45-4a14-870f-9b9d4b4a33b1',
 'title': 'Raadsvoorstel Utrecht Circulair 2020-2023',
 'Documents': [{'id': '3976546a-58c6-42d3-96af-7e0643038b85'}],
 'features': '13.2',
 'document': {'id': '3976546a-58c6-42d3-96af-7e0643038b85'}}

In [42]:
itemsInformation["caaff37f-18ac-4d3a-923c-66148aa7d719"]

{'item id': 'caaff37f-18ac-4d3a-923c-66148aa7d719',
 'title': 'Raadsvoorstel Utrecht Circulair 2020-2023 ',
 'Documents': [{'id': '4c8b15d5-74a7-4f4a-8d8d-9721ac3db4c7'},
  {'id': '14a2161b-6f93-452d-945e-66bd2b0701c6'},
  {'id': 'e5d5a300-fe75-4528-88da-4799053c22b8'},
  {'id': '3e953295-bc00-4744-b3be-5ab2de0134e9'}],
 'features': '1.1',
 'document': {'id': '3e953295-bc00-4744-b3be-5ab2de0134e9'}}

# I do not think this old code is valuable anymore

## Other version for extracting information 
This version works better for finding matches, but is not necessary anymore, since the results have already been saved.

In [172]:
# first create a list containing all meetings and their corresponding information
# list containing information about meetings
meetingsInformation = []

# list containing all the item information
itemsInformation = []

# list containing all the document information
documentsInformation = []

for file in meetings:
    
    # reading only the second to last xml file
    tree = ET.parse(dir + file)
    root = tree.getroot()


    # loop through all meetings
    for meeting in root:
        #print(meeting.tag)

        # intermediary information for the meetings
        meetingInformation = []


        meetingInformation.append(("meeting id", meeting.attrib["id"]))
        # Go through all children of the meeting
        for child in meeting:
            #print(child.tag)
            if child.text == "\n      ":
                meetingInformation.append((child.tag, child.attrib))
            else :
                meetingInformation.append((child.tag, child.text))





            # if child is an item go through all the children of the item and store their information
            if child.tag == "item":

                # intermediary information for the items
                itemInformation = []
                itemInformation.append(("item id", child.attrib["id"]))

                for itemChild in child:
                    if itemChild.text == "\n        ":
                        itemInformation.append((itemChild.tag, itemChild.attrib))
                    else :
                        itemInformation.append((itemChild.tag, itemChild.text))

                    # now repeat once more to find the documents
                    if itemChild.tag == "document":
                        # intermediary information for the document
                        documentInformation = []
                        documentInformation.append(("document id", itemChild.attrib["id"]))

                        for documentChild in itemChild:
                            if itemChild.text == "\n     ":
                                documentInformation.append((documentChild.tag, documentChild.attrib))
                            else :
                                documentInformation.append((documentChild.tag, documentChild.text))


                        # store all information for this document
                        documentsInformation.append(documentInformation)


                # store all information for this item 
                itemsInformation.append(itemInformation)

         # store all information for this meeting    
        meetingsInformation.append(meetingInformation)

#meetingsInformation
len(documentsInformation)

38216

In [142]:
len(meetingsInformation

1696

In [68]:

len(itemsInformation)

15716

# Extracting matches 
These result in the documentItemMatch and itemMeetingMatch files. 

The way everything is handled means that in documentItemMatch every document is unique and each column consists of a new matching agenda point. This is also the same for itemMeetingMatch, where every item is unique and has a list of attached meetings (although this seems to be a 1-1 match).

In [173]:
# now we can extract which items correspond to which meeting

# dictionairy containing items as keys and matching meetings as values
itemMeetingMatch = {}

for meeting in meetingsInformation:
    
    for i in range(len(meeting)):
        #print(meeting[i])
        if meeting[i][0] == "item":
            key = meeting[i][1]["id"]
            # check if the key already exists in the dictionairy
            if key in itemMeetingMatch:
                itemMeetingMatch[key] = itemMeetingMatch[key].append(meeting[0][1])
            else :
                itemMeetingMatch[key] = [meeting[0][1]]
            
# make a dataframe out of it with the item id as row and every column consisting of meeting ids            
itemMeetingMatch = pd.DataFrame(itemMeetingMatch).T.reset_index()
itemMeetingMatch.columns = ["Items_id", "meeting_ids"]
#itemMeetingMatch.to_csv("itemMeetingMatch.csv", index=False)
itemMeetingMatch

Unnamed: 0,Items_id,meeting_ids
0,f0607435-a601-4a06-85d4-63dd17ca3ec6,dfda7a32-ea7a-4ef5-b187-dd11853e230e
1,c6b49408-1158-495b-b69e-6137da9da1e7,dfda7a32-ea7a-4ef5-b187-dd11853e230e
2,0d47334d-7b3a-4b62-9ab6-7ff63fe49ebf,dfda7a32-ea7a-4ef5-b187-dd11853e230e
3,f804c38b-59f3-40c1-a9f3-c28c8b6078cc,dfda7a32-ea7a-4ef5-b187-dd11853e230e
4,400094ee-0e1f-432f-b1e4-ba41abdd516b,dfda7a32-ea7a-4ef5-b187-dd11853e230e
...,...,...
15309,8c20e0fa-9227-4e77-90f5-1de4b96696f1,e4791ada-7a35-4d9b-b31c-8205d2c96566
15310,a6a6d28b-0715-419a-93ba-b95ce6ab6d50,e4791ada-7a35-4d9b-b31c-8205d2c96566
15311,9e63ca83-e8a7-48be-abfa-227b0f9e6116,e4791ada-7a35-4d9b-b31c-8205d2c96566
15312,8f199b1f-7e90-4007-b082-c8142e58ce35,e4791ada-7a35-4d9b-b31c-8205d2c96566


In [174]:
# Repeat the same steps but now for the matches between items 

# dictionairy containing items as keys and matching meetings as values
documentItemMatch = {}

for item in itemsInformation:
    #print(item[0][1])
    for i in range(len(item)):
        if item[i][0] == "document":
            key = item[i][1]["id"]
            
            # check if the key already exists in the dictionairy
            if key in documentItemMatch:
                documentItemMatch.setdefault(key, []).append(item[0][1])
            else :
                documentItemMatch[key] = [item[0][1]]
            
# make a dataframe out of it with the item id as row and every column consisting of meeting ids            
documentItemMatch = pd.DataFrame.from_dict(documentItemMatch, orient = "index").reset_index()
documentItemMatch.columns = ["document_id", "item_id_1", "item_id_2", "item_id_3", "item_id_4", "item_id_5"
                            , "item_id_6", "item_id_7", "item_id_8", "item_id_9", "item_id_10", "item_id_11"]
# documentItemMatch.to_csv("documentItemMatch.csv", index=False)
documentItemMatch

Unnamed: 0,document_id,item_id_1,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6,item_id_7,item_id_8,item_id_9,item_id_10,item_id_11
0,72cc4b22-0de8-4a83-ae9c-b4961b74050e,f0ba2c10-26ce-4441-8541-0c63ab65215f,,,,,,,,,,
1,501dca51-6764-47e5-9eff-9072ea71b758,dae7f992-a782-4c04-ac58-62c5272318cc,,,,,,,,,,
2,916ea22e-4016-45a2-ac43-2e0099706d72,be31f268-86db-42f8-985e-45d4a1147887,,,,,,,,,,
3,5ab63e8f-f46e-4495-98bd-cd486c1f6445,5dab871f-fd96-416f-ada3-0d17de8d9a9e,,,,,,,,,,
4,d7f0e1f5-c293-4c01-b805-89ae823e0639,5dab871f-fd96-416f-ada3-0d17de8d9a9e,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
29224,fab436ca-85c5-40ff-8364-7014a7a86adc,7a56ee03-b9bb-4b92-9b5c-9d4920f6590a,,,,,,,,,,
29225,7f00ca9d-6345-4433-96e1-ac7487228f0d,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,,
29226,8d7f4434-af9e-4ca4-b0cb-1e0e70bec4e0,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,,
29227,62d8dd40-7660-4478-9425-60473cb35efb,8c20e0fa-9227-4e77-90f5-1de4b96696f1,,,,,,,,,,
