Global Variables

In [1]:
# Source PDF of the chapter under analysis (not referenced by the cells shown in this notebook).
file_path = "ITIL Books/ITIL 3/ITIL3 Service Operation chapter 4.pdf"
# Previously used text extract, kept for reference:
# extracted_text_file_path = "ITIL Books/ITIL 3/Continual service improvement chapter from notebook.txt"
# Plain-text extract that the notebook reads and tokenises.
extracted_text_file_path = "ITIL Books/ITIL 3/Service operation chapter 4/Service operation chapter 4 - 4.txt"
# NOTE(review): the name says "Continual Service Improvement" while the inputs above are
# Service Operation chapter 4 - confirm the intended output file before writing to it.
output_file_path = "output/ITIL3 Continual Service Improvement.txt"

Imports

In [2]:
import pdfminer
import nltk
import re

Load extracted text

In [3]:
# Read the whole plain-text chapter extract into memory; every later cell works on this string.
# NOTE(review): no encoding is given, so the platform default is used. The text contains
# non-ASCII characters (e.g. "¦", "–"), so confirm the file's encoding and pass it explicitly.
with open(extracted_text_file_path, 'r') as file:
    extracted_text = file.read()
    
# extracted_text = "Particle dynamics involves the study of physics and chemistry"

# tokens = nltk.word_tokenize(extracted_text)
# print(tokens)

# ### Part of speech tagging ###
# part_of_speech_array = nltk.pos_tag(tokens)
# print(part_of_speech_array)


Text sanitization and word tokenizing

In [4]:
### Grab sections from text ###
# A candidate section title is any line that starts with one or more digits.
title_pattern = re.compile(r"^\d+.*$", re.MULTILINE)

# Keep only the numbered lines that contain neither "%" nor ")"
# (presumably those are body text rather than headings - confirm).
# The filter builds a new list instead of calling sections.remove() while
# looping over sections: removing during iteration shifts the remaining items
# left, so the element right after every removed one was never examined.
sections = [
    section
    for section in title_pattern.findall(extracted_text)
    if "%" not in section and ")" not in section
]

### Sanitise extracted text ###
# Strip the "¦" and "–" characters in one chained pass before tokenising.
extracted_text_sanitised = extracted_text.replace("¦", "").replace("–", "")

### Tokenise extracted text ###
# Word-level tokens; these feed the part-of-speech tagger in the next cell.
tokens = nltk.word_tokenize(extracted_text_sanitised)


Part-of-speech tagging

In [5]:
### Part of speech tagging ###
# One (word, tag) pair per token.
part_of_speech_array = nltk.pos_tag(tokens)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Same pairs with the word lemmatised and the tag kept unchanged.
# lemmatize() is called with the word only, i.e. without a part-of-speech hint.
part_of_speech_array_lemmatized = [
    (lemmatizer.lemmatize(tagged_word[0]), tagged_word[1])
    for tagged_word in part_of_speech_array
]


Term extraction

In [6]:
### Term Extraction (NNP next to each other) ###
def extract_terms(part_of_speech_array_array, tags_to_use):
    """Join runs of consecutive tokens whose tag is in `tags_to_use` into terms.

    Args:
        part_of_speech_array_array: sequence of (word, tag) pairs.
        tags_to_use: collection of tags that count as part of a term.

    Returns:
        List of strings, one per maximal run of matching tokens, with the
        words of a run joined by single spaces, in order of appearance.
    """
    terms_array = []
    term_phrase = []
    for part in part_of_speech_array_array:
        if part[1] in tags_to_use:
            term_phrase.append(part[0])
        elif term_phrase:
            # First non-matching token after a run: the run is complete.
            terms_array.append(" ".join(term_phrase))
            term_phrase = []
    # A run that reaches the end of the input has no following token to close
    # it; the previous look-ahead at index + 1 raised IndexError in that case.
    if term_phrase:
        terms_array.append(" ".join(term_phrase))
    return terms_array

# Run the extractor once per tag group: proper nouns, common nouns, then both together.
major_named_concepts, other_concepts, all_noun_phrases = (
    extract_terms(part_of_speech_array, tag_group)
    for tag_group in (
        {"NNP", "NNPS"},
        {"NN", "NNS"},
        {"NNP", "NNPS", "NN", "NNS"},
    )
)

print(f"major named concepts: \n{major_named_concepts}")
print(f"\nother concepts: \n{other_concepts}")
print(f"\nall noun phrases: \n{all_noun_phrases}")


major named concepts: 
['Service Operation', 'Please', 'Chapters', 'Management', 'IT', 'Incident Management', 'Problem Management', 'Known Error', 'Incident', 'Problem Records', '’', 'SLA', '’', 'Request Fulfilment', 'Incident Management', 'Request Fulfilment', 'Request Fulfilment', 'Request Fulfilment', 'IT', 'SLA', 'Access', 'HR', 'Access Management', 'Identity', 'Rights', 'Service Operation', 'Service Management Lifecycle', 'Change Management', 'Configuration Management', 'Release Management', 'Service Transition', 'Capacity', 'Availability Management', 'Service Design', 'Financial Management', 'Service Strategy', 'Knowledge Management', 'Service Transition', 'IT Service Continuity', 'Service Design', 'Service Reporting', 'Measurement', 'Continual Service Improvement']

other concepts: 
['processes', 'paragraph', 'detail', 'chapter', 'reference', 'structure', 'processes', 'detail', 'chapter', 'note', 'roles', 'process', 'tools', 'process', 'process', 'monitors', 'events', 'infrastru

Major/common concept extraction

In [7]:
# concept_relationships = extract_terms(part_of_speech_array, {"VP"})
# print(concept_relationships)
# print(all_noun_phrases)

### Perform frequency analysis ###
### Concept Extraction Frequency analysis ###
# One frequency distribution per term list, built in the same order as the lists.
(
    major_named_concept_frequency_distribution,
    other_concept_frequency_distribution,
    all_noun_phrases_frequency_distribution,
) = map(nltk.FreqDist, (major_named_concepts, other_concepts, all_noun_phrases))

# Show the 50 most frequent entries of each distribution.
print(f"major named concepts: \n{major_named_concept_frequency_distribution.most_common(50)}")
print(f"\nother concepts: \n{other_concept_frequency_distribution.most_common(50)}")
print(f"\nall noun phrases: \n{all_noun_phrases_frequency_distribution.most_common(50)}")

major named concepts: 
[('Request Fulfilment', 4), ('Service Operation', 2), ('IT', 2), ('Incident Management', 2), ('’', 2), ('SLA', 2), ('Service Transition', 2), ('Service Design', 2), ('Please', 1), ('Chapters', 1), ('Management', 1), ('Problem Management', 1), ('Known Error', 1), ('Incident', 1), ('Problem Records', 1), ('Access', 1), ('HR', 1), ('Access Management', 1), ('Identity', 1), ('Rights', 1), ('Service Management Lifecycle', 1), ('Change Management', 1), ('Configuration Management', 1), ('Release Management', 1), ('Capacity', 1), ('Availability Management', 1), ('Financial Management', 1), ('Service Strategy', 1), ('Knowledge Management', 1), ('IT Service Continuity', 1), ('Service Reporting', 1), ('Measurement', 1), ('Continual Service Improvement', 1)]

other concepts: 
[('process', 7), ('publication', 7), ('incidents', 6), ('users', 5), ('processes', 4), ('requests', 4), ('detail', 3), ('chapter', 3), ('service', 3), ('organizations', 3), ('events', 2), ('Incidents', 

Concept relationship extraction

In [91]:
def get_sentence_at_index(part_of_speech_array, index):
    """Return slice bounds for the sentence surrounding the token at `index`.

    A token whose tag is "." is treated as a sentence terminator.
    Note: this notebook defines another function with the same name in a later
    cell; whichever definition was executed last is the one in effect.

    Args:
        part_of_speech_array: sequence of (word, tag) pairs.
        index: position of a token inside `part_of_speech_array`.

    Returns:
        (start, end) suitable for `part_of_speech_array[start:end]`.
        `start` is the position of the nearest "." at or before `index`
        (searched down to position 1), or 0 if there is none, so the slice
        includes that preceding terminator. `end` is one past the nearest "."
        at or after `index`, or `len(part_of_speech_array)` if there is none.
    """
    token_count = len(part_of_speech_array)

    ### Get sentence start index ###
    sentence_starting_index = 0
    for position in range(index, 0, -1):
        if part_of_speech_array[position][1] == ".":
            sentence_starting_index = position
            break

    ### Get sentence end index ###
    # Stop at the last token: the previous scan kept going past the end of
    # the array and raised IndexError when no "." followed `index`.
    sentence_end_index = token_count
    for position in range(index, token_count):
        if part_of_speech_array[position][1] == ".":
            sentence_end_index = position + 1
            break

    return (sentence_starting_index, sentence_end_index)

### Term Extraction (NNP next to each other) ###
def extract_terms(part_of_speech_array_array, tags_to_use):
    """Merge runs of consecutive matching tokens into single phrase entries.

    Note: this redefines the `extract_terms` from an earlier cell of this
    notebook with a different return value (tagged pairs instead of strings).

    Args:
        part_of_speech_array_array: sequence of (word, tag) pairs.
        tags_to_use: collection of tags whose consecutive tokens are merged.

    Returns:
        List of (text, tag) pairs in input order. Tokens whose tag is not in
        `tags_to_use` are copied through unchanged. Each run of matching
        tokens becomes one pair: the words joined by spaces, tagged "NPhrase"
        when the run has more than one word, otherwise keeping the token's
        own tag.
    """
    part_of_speech_array_with_terms = []
    term_phrase = []
    last_index = len(part_of_speech_array_array) - 1

    for index, part in enumerate(part_of_speech_array_array):
        if part[1] not in tags_to_use:
            part_of_speech_array_with_terms.append((part[0], part[1]))
            continue

        term_phrase.append(part[0])

        # Only look ahead when a next token exists; the unguarded index + 1
        # lookup raised IndexError when the final token had a matching tag.
        run_continues = (
            index < last_index
            and part_of_speech_array_array[index + 1][1] in tags_to_use
        )
        if run_continues:
            continue

        phrase_tag = "NPhrase" if len(term_phrase) > 1 else part[1]
        part_of_speech_array_with_terms.append((" ".join(term_phrase), phrase_tag))
        term_phrase = []

    return part_of_speech_array_with_terms

# Tagged tokens with consecutive noun tokens merged into single entries.
part_of_speech_array_with_terms = extract_terms(part_of_speech_array, {"NNP", "NNPS", "NN", "NNS"})

# Split the tagged tokens into sentences: a token tagged "." closes the current one.
sentences = []
temp_sentence = []
for word in part_of_speech_array_with_terms:
    temp_sentence.append(word)
    # Compare by value. `is "."` tested object identity, which only works by
    # accident of string interning and triggers a SyntaxWarning on Python 3.8+.
    if word[1] == ".":
        sentences.append(temp_sentence)
        temp_sentence = []
# Tokens after the last "." stay in temp_sentence and are not added to sentences.

for sentence in sentences:
    print(sentence)
    print("-" * 100)
    


[('4', 'CD'), ('Service Operation', 'NPhrase'), ('processes', 'VBZ'), ('The', 'DT'), ('processes', 'NNS'), ('listed', 'VBN'), ('in', 'IN'), ('paragraph', 'NN'), ('2.4.5', 'CD'), ('are', 'VBP'), ('discussed', 'VBN'), ('in', 'IN'), ('detail', 'NN'), ('in', 'IN'), ('this', 'DT'), ('chapter', 'NN'), ('.', '.')]
----------------------------------------------------------------------------------------------------
[('As', 'IN'), ('a', 'DT'), ('reference', 'NN'), (',', ','), ('the', 'DT'), ('overall', 'JJ'), ('structure', 'NN'), ('is', 'VBZ'), ('briefly', 'JJ'), ('described', 'VBN'), ('here', 'RB'), ('and', 'CC'), ('then', 'RB'), ('each', 'DT'), ('of', 'IN'), ('the', 'DT'), ('processes', 'NNS'), ('is', 'VBZ'), ('described', 'VBN'), ('in', 'IN'), ('more', 'RBR'), ('detail', 'NN'), ('later', 'RB'), ('in', 'IN'), ('the', 'DT'), ('chapter', 'NN'), ('.', '.')]
----------------------------------------------------------------------------------------------------
[('Please note', 'NPhrase'), ('that', 'I

In [10]:
# (concept, count) pairs for the 50 most frequent major named concepts, most frequent first.
most_common_major_concepts = major_named_concept_frequency_distribution.most_common(50)
# print(most_common_major_concepts)
# print(tokens.index(most_common_major_concepts[0][0]))
# print(part_of_speech_array[206])
## Get indices of all common concepts
# Positions in part_of_speech_array whose word equals the single most frequent concept.
# NOTE(review): part_of_speech_array presumably holds one token per entry, so when the top
# concept is a multi-word phrase (the frequency output in this notebook lists
# 'Request Fulfilment' first) no x[0] can equal it and indices ends up empty - confirm
# whether part_of_speech_array_with_terms was the intended input here.
indices = [i for i, x in enumerate(part_of_speech_array) if x[0] == most_common_major_concepts[0][0]]
# print(indices)
# print(most_common_major_concepts[1][0])

def get_sentence_at_index(part_of_speech_array, index):
    """Return slice bounds for the sentence containing the token at `index`.

    A token whose tag is "." is treated as a sentence terminator.

    Args:
        part_of_speech_array: sequence of (word, tag) pairs.
        index: position of a token inside `part_of_speech_array`.

    Returns:
        (start, end) suitable for `part_of_speech_array[start:end]`.
        `start` is one past the nearest "." at or before `index`, or 0 when
        the token belongs to the first sentence. `end` is one past the nearest
        "." at or after `index`, or `len(part_of_speech_array)` when the text
        is not terminated. If the token at `index` is itself a ".", the
        bounds describe an empty slice.
    """
    token_count = len(part_of_speech_array)

    ### Get sentence start index ###
    # Default 0 (not 1): without a preceding "." the sentence starts at the
    # first token, which the previous unconditional "+ 1" dropped.
    sentence_starting_index = 0
    for position in range(index, -1, -1):
        if part_of_speech_array[position][1] == ".":
            sentence_starting_index = position + 1
            break

    ### Get sentence end index ###
    # Scan to the end of the array. The previous scan looked only `index`
    # tokens ahead, so it could miss the terminator or run past the end
    # (IndexError).
    sentence_end_index = token_count
    for position in range(index, token_count):
        if part_of_speech_array[position][1] == ".":
            sentence_end_index = position + 1
            break

    return (sentence_starting_index, sentence_end_index)

def does_list_contain_verb_pos(part_of_speech_array):
    """Return True if any (word, tag) pair carries one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return any(tagged_word[1] in verb_tags for tagged_word in part_of_speech_array)
    
# sentence_index = get_sentence_at_index(part_of_speech_array, indices[0])
# print(sentence_index)

# One "<concept>::<tokens between>::<concept>" string for every pair of consecutive
# major named concepts that occur in the same sentence.
concept_relations = []

for concept_index in indices:
    # Locate the sentence once and reuse both bounds.
    sentence_start, sentence_end = get_sentence_at_index(part_of_speech_array, concept_index)
    sentence_pos_containing_concept = part_of_speech_array[sentence_start:sentence_end]
    print(sentence_pos_containing_concept)

    last_concept = ()
    last_concept_index = -1
    # `position` is relative to the sentence. It has its own name so it no
    # longer overwrites the loop variable of the enclosing loop.
    for position, word_pos in enumerate(sentence_pos_containing_concept):
        if word_pos[0] in major_named_concepts:
            # Optional stricter rule (currently disabled): also require a verb between the two
            # concepts, via does_list_contain_verb_pos(sentence_pos_containing_concept[last_concept_index + 1:position]).
            if last_concept_index != -1:
                concept_relations.append(f"{last_concept}::{sentence_pos_containing_concept[last_concept_index + 1:position]}::{word_pos}")

            last_concept = word_pos
            last_concept_index = position

# for concept_relation in concept_relations:
#     print(concept_relation)


Accuracy metrics for term extraction (chapter 4, first section)

In [9]:
automatic_concepts_file_path = "ITIL Books/ITIL 3/Service operation chapter 4/Automated concepts extracted/4/Automated concepts extracted 4.txt"
manual_concepts_file_path = "ITIL Books/ITIL 3/Service operation chapter 4/Automated concepts extracted/4/Manual concepts extracted 4.txt"

# Raw contents of the automatically extracted concepts file.
with open(automatic_concepts_file_path, 'r') as file:
    automatic_concepts = file.read()

# Raw contents of the manually annotated concepts file.
with open(manual_concepts_file_path, 'r') as file:
    manual_concepts = file.read()

# One manual concept per line, lower-cased for case-insensitive matching.
manual_concepts_list = [line.lower() for line in manual_concepts.split('\n')]

# print("Manual concepts")
# print(list(dict.fromkeys(manual_concepts_list)))
# print()

# Hard-coded list of automatically extracted noun phrases to score against the manual list.
# NOTE(review): this looks pasted from an earlier extraction run rather than derived from
# `automatic_concepts` (read from file above but not used in the visible cells) - confirm
# which source is authoritative.
automatic_concepts_list = ['Service Operation', 'processes', 'paragraph', 'detail', 'chapter', 'reference', 'structure', 'processes', 'detail', 'chapter', 'Please note', 'roles', 'process', 'tools', 'process', 'Chapters', 'Management', 'process', 'monitors', 'events', 'IT infrastructure', 'operation', 'exception conditions', 'Incident Management', 'service', 'users', 'order', 'business impact', 'Problem Management', 'root-cause analysis', 'cause', 'events', 'incidents', 'activities', 'problems/incidents', 'Known Error subprocess', 'quicker diagnosis', 'resolution', 'incidents', 'NOTE', 'distinction', 'incidents', 'problems', 'Incident', 'Problem Records', 'danger', 'Incidents', 'support cycle', 'actions', 'recurrence', 'incidents', 'Incidents', 'root cause analysis', 'visibility', 'user ’ s service', 'SLA targets', 'service', 'users', 'expectations', 'results', 'number', 'incidents', '‘ purge ’', 'visibility', 'issues', 'Request Fulfilment', 'management', 'customer', 'user requests', 'incident', 'service delay', 'disruption', 'organizations', 'requests', 'category ’', 'incidents', 'information', 'Incident Management system', 'others', 'volumes', 'business priority', 'requests', 'provision', 'Request Fulfilment', 'Request Fulfilment process', 'practice', 'Request Fulfilment process', 'customer', 'user requests', 'types', 'requests', 'facilities', 'moves', 'supplies', 'IT services', 'requests', 'SLA measures', 'records', 'process flow', 'practice', 'organizations', 'Access Management', 'process', 'users', 'right', 'service', 'access', 'users', 'users', 'ability', 'access services', 'stages', 'resources', 'HR', 'lifecycle', 'Access Management', 'Identity', 'Rights Management', 'organizations', 'addition', 'processes', 'Service Operation', 'phases', 'Service Management Lifecycle', 'aspects', 'processes', 'part', 'chapter', 'include', 'Change Management', 'process', 'Configuration Management', 'Release Management', 'topics', 'Service Transition publication', 'Capacity', 
'Availability Management', 'aspects', 'publication', 'detail', 'Service Design publication', 'Financial Management', 'Service Strategy publication', 'Knowledge Management', 'Service Transition publication', 'IT Service Continuity', 'Service Design publication', 'Service Reporting', 'Measurement', 'Continual Service Improvement publication']
# Lower-case every entry so the comparison with the manual concepts ignores case.
automatic_concepts_list = [x.lower() for x in automatic_concepts_list]

# print("all noun phrases")
# print(list(dict.fromkeys(automatic_concepts_list)))

# Manual concepts that appear verbatim in the automatic list (duplicates counted each time).
number_of_fully_correct_manual_concepts = sum(
    1 for concept in manual_concepts_list if concept in automatic_concepts_list
)
print(number_of_fully_correct_manual_concepts)

# Total number of manual concepts.
number_of_manual_concepts = len(manual_concepts_list)
print(number_of_manual_concepts)

# Automatic concepts that appear verbatim in the manual list (duplicates counted each time).
number_of_fully_correct_automatic_concepts = sum(
    1 for concept in automatic_concepts_list if concept in manual_concepts_list
)
print(number_of_fully_correct_automatic_concepts)

# Total number of automatic concepts.
number_of_automatic_concepts = len(automatic_concepts_list)
print(number_of_automatic_concepts)

def _count_concepts_sharing_a_word(concepts_as_words, reference_concepts):
    """Count the concepts that share at least one word with the reference concepts.

    Args:
        concepts_as_words: list of concepts, each given as a list of words.
        reference_concepts: list of concept strings to compare against.

    Returns:
        Number of entries in `concepts_as_words` that have at least one word
        occurring in any of the reference concepts.
    """
    # Build the reference vocabulary once; it used to be rebuilt for every
    # single word inside the nested loops.
    reference_words = set(' '.join(reference_concepts).split())
    return sum(
        1
        for concept_words in concepts_as_words
        if any(word in reference_words for word in concept_words)
    )


# Lists to words for partial matches
automatic_concepts_list_single_words = [x.split() for x in automatic_concepts_list]
manual_concepts_list_single_words = [x.split() for x in manual_concepts_list]

# Manual concepts with at least one word found among the automatic concepts.
number_of_full_and_partial_correct_manual_concepts = _count_concepts_sharing_a_word(
    manual_concepts_list_single_words, automatic_concepts_list
)
print(number_of_full_and_partial_correct_manual_concepts)

# Automatic concepts with at least one word found among the manual concepts.
number_of_full_and_partial_correct_automatic_concepts = _count_concepts_sharing_a_word(
    automatic_concepts_list_single_words, manual_concepts_list
)
print(number_of_full_and_partial_correct_automatic_concepts)

74
100
80
150
98
107
