In [1]:
from transformers import pipeline

triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
# We need to use the tokenizer manually since we need special tokens.
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor("Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic", return_tensors=True, return_text=False)[0]["generated_token_ids"]])
print(extracted_text[0])
# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<s><triplet> Punta Cana <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> Higuey <subj> La Altagracia Province <obj> located in the administrative territorial entity <subj> Dominican Republic <obj> country <triplet> La Altagracia Province <subj> Dominican Republic <obj> country <triplet> Dominican Republic <subj> La Altagracia Province <obj> contains administrative territorial entity</s>
[{'head': 'Punta Cana', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Punta Cana', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Higuey', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Higuey', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'La Altagracia Province', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Dominican Republic', 'type': 'contains administrative 

In [2]:
print(extracted_triplets)

[{'head': 'Punta Cana', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Punta Cana', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Higuey', 'type': 'located in the administrative territorial entity', 'tail': 'La Altagracia Province'}, {'head': 'Higuey', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'La Altagracia Province', 'type': 'country', 'tail': 'Dominican Republic'}, {'head': 'Dominican Republic', 'type': 'contains administrative territorial entity', 'tail': 'La Altagracia Province'}]


In [3]:
import spacy


In [4]:

def extract_relationships_concept_rebel(txt):
    extracted_triplets=[]
    try:
        txt=txt.replace('\nu\nu','').replace('\n',' ').lower().replace("Not For Distribution, Sale or Reproduction".lower(),'.')

        extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(txt, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets
    except:
        pass
    return extracted_triplets    
  

In [15]:
import pandas as pd
concept=pd.read_csv('first_concepts.csv')

In [16]:
concept

Unnamed: 0,Concept,Content,Parent_process,Parent_relationship,n_gram,ngram,key_words,rake_keyword,processed,synonym
0,11.1.1.1 PROJECT CHARTER,11.1.1.1 PROJECT CHARTER\nDescribed in Section...,PLAN RISK MANAGEMENT,INPUTS,"['11.1.1.1', 'PROJECT', 'CHARTER', '\n', 'Desc...","['project', 'charter', 'described', 'section',...","['or reproduction 403', 'requirements and risk...","['high-level project description', 'high- leve...",project charter described section project char...,"['guide •', 'charter plan', '• project', 'ente..."
1,11.1.1.2 PROJECT MANAGEMENT PLAN,11.1.1.2 PROJECT MANAGEMENT PLAN\nDescribed in...,PLAN RISK MANAGEMENT,INPUTS,"['11.1.1.2', 'PROJECT', 'MANAGEMENT', 'PLAN', ...","['project', 'management', 'plan', 'described',...","['plan components might', 'other project manag...","['approved subsidiary management plans', 'risk...",project management plan described section plan...,"['• organizational process', 'data flow diagra..."
2,11.1.1.3 PROJECT DOCUMENTS,11.1.1.3 PROJECT DOCUMENTS\nProject documents ...,PLAN RISK MANAGEMENT,INPUTS,"['11.1.1.3', 'PROJECT', 'DOCUMENTS', '\n', 'Pr...","['project', 'documents', 'project', 'documents...","['project as well', 'is useful in', 'this is u...","['3 project documents project documents', 'set...",project documents project documents considered...,"['documents project', 'assumption log']"
3,11.1.1.4 ENTERPRISE ENVIRONMENTAL FACTORS,11.1.1.4 ENTERPRISE ENVIRONMENTAL FACTORS\nThe...,PLAN RISK MANAGEMENT,INPUTS,"['11.1.1.4', 'ENTERPRISE', 'ENVIRONMENTAL', 'F...","['enterprise', 'environmental', 'factors', 'en...","['enterprise environmental factors', 'or key s...","['plan risk management process include', 'risk...",enterprise environmental factors enterprise en...,"['factors enterprise environmental', 'environm..."
4,11.1.1.5 ORGANIZATIONAL PROCESS ASSETS,11.1.1.5 ORGANIZATIONAL PROCESS ASSETS\nThe or...,PLAN RISK MANAGEMENT,INPUTS,"['11.1.1.5', 'ORGANIZATIONAL', 'PROCESS', 'ASS...","['organizational', 'process', 'assets', 'organ...","['organizational process assets', '404 part gu...","['plan risk management process include', 'risk...",organizational process assets organizational p...,"['process assets organizational', 'process ass..."
...,...,...,...,...,...,...,...,...,...,...
77,11.7.3.1 WORK PERFORMANCE INFORMATION,11.7.3.1 WORK PERFORMANCE INFORMATION\nDescrib...,MONITOR RISKS,OUTPUTS,"['11.7.3.1', 'WORK', 'PERFORMANCE', 'INFORMATI...","['work', 'performance', 'information', 'descri...","['work performance information', 'response imp...",['work performance information includes inform...,work performance information described section...,"['described section work', 'section work perfo..."
78,11.7.3.2 CHANGE REQUESTS,11.7.3.2 CHANGE REQUESTS\nDescribed in Section...,MONITOR RISKS,OUTPUTS,"['11.7.3.2', 'CHANGE', 'REQUESTS', '\n', 'Desc...","['change', 'requests', 'described', 'section',...","['risk or to', 'project risk or', 'preventive ...","['perform integrated change control process', ...",change requests described section monitor risk...,"['schedule baselines', 'baselines components',..."
79,11.7.3.3 PROJECT MANAGEMENT PLAN UPDATES,11.7.3.3 PROJECT MANAGEMENT PLAN UPDATES\nAny ...,MONITOR RISKS,OUTPUTS,"['11.7.3.3', 'PROJECT', 'MANAGEMENT', 'PLAN', ...","['project', 'management', 'plan', 'updates', '...","['project management plan', 'the project manag...","['3 project management plan updates', 'project...",project management plan updates change project...,"['perform integrated change control', 'change ..."
80,11.7.3.4 PROJECT DOCUMENTS UPDATES,11.7.3.4 PROJECT DOCUMENTS UPDATES\nProject do...,MONITOR RISKS,OUTPUTS,"['11.7.3.4', 'PROJECT', 'DOCUMENTS', 'UPDATES'...","['project', 'documents', 'updates', 'project',...","['the monitor risks', 'during the monitor', 'c...","['individual project risks generated', 'major...",project documents updates project documents ma...,"['documents updates •', 'updates • assumption'..."


In [17]:
extract_relationships_concept_rebel(concept['Content'][80].replace('\nu\nu','').replace('\n',' '))

[{'head': 'risk register', 'type': 'subclass of', 'tail': 'project documents'}]

In [77]:
string='Cost management plan. Described in Section 7.1.3.1. Changes to the cost management plan, such as changes to cost accounting, tracking, and reports, as well as updates to the budget strategy and how contingency reserves are consumed, are incorporated.'

In [78]:
extract_relationships_concept_rebel(string)

[{'head': 'Cost management plan',
  'type': 'facet of',
  'tail': 'cost accounting'}]

In [23]:
concept_liste = concept['Concept'].tolist()
concept_liste = list(map(clean_indexation, concept_liste))

def new_concepts(concept,concept_liste):
    new_concept=[]
    for rabel in concept:
        for i in rabel:
            if in_liste_bool(concept_liste,i['head'].lower())==False:
                new_concept.append(i['head'])
            if in_liste_bool(concept_liste,i['tail'].lower())==False:
                new_concept.append(i['tail'])     
    return set(new_concept)  


In [24]:
import re
def clean_indexation(text) :
    pattern = r'^(\d+(\.\d+)+\s+)?'  

    
    cleaned_subtitles = re.sub(pattern, '', text)
    return cleaned_subtitles


def in_liste_bool(liste,term):
    
    statue=False
    i=0
    while statue==False and i<len(liste):
        if term.lower() in liste[i].lower():
            statue=True
            break
        else:
            i=i+1
    return statue   

            
        

### in_liste_bool(['fadiss','karimss','azizzz'],'aziz')

In [25]:
def clean_for_rebel(content,concept):
    content=content.replace(concept,'').replace('\nu\nu',',').replace('\n',' ')
    content=content.lower()
    print(content,'****')
    return content
    
concept['clean_rebel_2']=concept.apply(lambda row : clean_for_rebel(row['Content'],row['Concept']), axis=1)

 described in section 4.1.3.1. the project charter documents the high-level project description and boundaries, high- level requirements, and risks. not for distribution, sale or reproduction. 403  ****
 described in section 4.2.3.1. in planning project risk management, all approved subsidiary management plans  should be taken into consideration in order to make the risk management plan consistent with them. the methodology  outlined in other project management plan components might inﬂuence the plan risk management process.  ****
 project documents that can be considered as inputs for this process include but are not limited to the stakeholder  register as described in section 13.1.3.1. the stakeholder register contains details of the project’s stakeholders and  provides an overview of their project roles and their attitude toward risk on this project. this is useful in determining  roles and responsibilities for managing risk on the project, as well as setting risk thresholds for the

In [21]:
concept['relationship_entites']=concept['clean_rebel_2'].apply(lambda txt:extract_relationships_concept_rebel(txt))


Token indices sequence length is longer than the specified maximum sequence length for this model (2380 > 1024). Running this sequence through the model will result in indexing errors


In [26]:
concept_liste = concept['Concept'].tolist()
concept_liste = list(map(clean_indexation, concept_liste))
new_concept_2=new_concepts(concept['relationship_entites'],concept_liste)

In [28]:
print(len(new_concept_2))

74


In [29]:
print(len(new_concept_2))

74


In [30]:
def extract_relationships_concept_rebel_with_extrat_cleaning(txt):
    extracted_triplets=[]
    try:
        bullet_points=txt.split('\nu\nu')
        for point in bullet_points:
            print(point)
            point=point.replace('\nu\nu','').replace('\n',' ').replace("Not For Distribution, Sale or Reproduction",'.')
            extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(point, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
            extracted_triplets =extracted_triplets  + extract_triplets(extracted_text[0])
        return extracted_triplets
    except:
        pass
    return extracted_triplets  

In [109]:
string=concept['Content'][50].replace(concept['Concept'][20],'')

In [31]:
extract_relationships_concept_rebel_with_extrat_cleaning(string)

NameError: name 'string' is not defined

In [32]:
concept['relationship_entites_clean']=concept.apply(lambda row:extract_relationships_concept_rebel_with_extrat_cleaning(row['Content'].replace(row['Concept'],'')),axis=1)


Described in Section 4.1.3.1. The project charter documents the high-level project description and boundaries, high-
level requirements, and risks.
Not For Distribution, Sale or Reproduction.
403


Described in Section 4.2.3.1. In planning Project Risk Management, all approved subsidiary management plans 
should be taken into consideration in order to make the risk management plan consistent with them. The methodology 
outlined in other project management plan components might inﬂuence the Plan Risk Management process.


Project documents that can be considered as inputs for this process include but are not limited to the stakeholder 
register as described in Section 13.1.3.1. The stakeholder register contains details of the project’s stakeholders and 
provides an overview of their project roles and their attitude toward risk on this project. This is useful in determining 
roles and responsibilities for managing risk on the project, as well as setting risk thresholds for the project.


 Reporting formats. Reporting formats deﬁne how the outcomes of the Project Risk Management process will 
be documented, analyzed, and communicated. This section of the risk management plan describes the content 
and format of the risk register and the risk report, as well as any other required outputs from the Project Risk 
Management processes.
 Tracking. Tracking documents how risk activities will be recorded and how risk management processes will 
be audited.
Probability
Probability
0.05
0.04
0.03
0.02
0.01
Very Low
0.05
Very High
0.90
High
0.70
Medium
0.50
Low
0.30
Very Low
0.10
Very High
0.90
High
0.70
Medium
0.50
Low
0.30
Very Low
0.10
0.09
0.07
0.05
0.03
0.01
Low
0.10
0.18
0.14
0.10
0.06
0.02
Moderate
0.20
0.36
0.28
0.20
0.12
0.04
High
0.40
0.72
0.56
0.40
0.24
0.08
Very High
0.80
Threats
0.05
0.04
0.03
0.02
0.01
Very Low
0.05
0.09
0.07
0.05
0.03
0.01
Low
0.10
0.18
0.14
0.10
0.06
0.02
Moderate
0.20
0.36
0.28
0.20
0.12
0.04
High
0.40
0.72
0.56
0.40
0.24
0.08
Very High
0.80
Opport

 Duration estimates. Described in Section 6.4.3.1. Duration estimates provide quantitative assessments of 
project durations, ideally expressed as a range, indicating the degree of risk, where a structured review of the 
documents may indicate that the current estimate is insufﬁcient and poses a risk to the project.
 Issue log. Described in Section 4.3.3.3. Issues recorded in the issue log may give rise to individual project risks 
and may also inﬂuence the level of overall project risk.
 Lessons learned register. Described in Section 4.4.3.1. Lessons learned about risk identiﬁed from earlier phases 
of the project are reviewed to determine whether similar risks might recur during the remainder of the project.
 Requirements documentation. Described in Section 5.2.3.1. Requirements documentation lists the project 
requirements and allows the team to identify those that could be at risk.
Not For Distribution, Sale or Reproduction.
413
 Resource requirements. Described in Section 9.2.3.1.


A prompt list is a predetermined list of risk categories that might give rise to individual project risks and that could also 
act as sources of overall project risk. The prompt list can be used as a framework to aid the project team in idea generation 
when using risk identiﬁcation techniques. The risk categories in the lowest level of the risk breakdown structure can be 
used as a prompt list for individual project risks. Some common strategic frameworks are more suitable for identifying 
sources of overall project risk, for example PESTLE (political, economic, social, technological, legal, environmental), TECOP 
(technical, environmental, commercial, operational, political), or VUCA (volatility, uncertainty, complexity, ambiguity).


To undertake risk identiﬁcation, the project team may conduct a specialized meeting (often called a risk workshop). 
Most risk workshops include some form of brainstorming (see Section 4.1.2.2), but other risk identiﬁcation techniques 
may be included 


Described in Section 4.2.3.1. Project management plan components include the risk management plan as 
described in Section 11.1.3.1. Of particular interest in this process are the roles and responsibilities for conducting risk 
management, budgets for risk management, schedule activities for risk management, risk categories (often deﬁned in 
a risk breakdown structure), deﬁnitions of probability and impact, the probability and impact matrix, and stakeholders’ 
risk thresholds. These inputs are usually tailored to the project during the Plan Risk Management process. If they are 
not available, they may be developed during the Perform Qualitative Risk Analysis process and presented to the project 
sponsor for approval before use.


Project documents that can be considered as inputs for this process include but are not limited to:
 Assumption log. Described in Section 4.1.3.2. The assumption log is used for identifying, managing, and 
monitoring key assumptions and constraints that may a


Data representation techniques that can be used during this process include but are not limited to:
 Probability and impact matrix. A probability and impact matrix is a grid for mapping the probability of each 
risk occurrence and its impact on project objectives if that risk occurs. This matrix speciﬁes combinations of 
probability and impact that allow individual project risks to be divided into priority groups (see Figure 11-5). Risks 
can be prioritized for further analysis and planning of risk responses based on their probability and impacts. 
The probability of occurrence for each individual project risk is assessed as well as its impact on one or more 
project objectives if it does occur, using deﬁnitions of probability and impact for the project as speciﬁed in the 
risk management plan. Individual project risks are assigned to a priority level based on the combination of their 
assessed probability and impact, using a probability and impact matrix.
An organization can assess a


Described in Section 4.2.3.1. Project management plan components include but are not limited to:
 Risk management plan. Described in Section 11.1.3.1. The risk management plan speciﬁes whether quantitative 
risk analysis is required for the project. It also details the resources available for the analysis and the expected 
frequency of analyses.
 Scope baseline. Described in Section 5.4.3.1. The scope baseline describes the starting point from which the 
effect of individual project risks and other sources of uncertainty are evaluated.
 Schedule baseline. Described in Section 6.5.3.1. The schedule baseline describes the starting point from which 
the effect of individual project risks and other sources of uncertainty can be evaluated.
 Cost baseline. Described in Section 7.3.3.1. The cost baseline describes the starting point from which the effect 
of individual project risks and other sources of uncertainty can be evaluated.


Project documents that can be considered as inputs for th

 Sensitivity analysis. Sensitivity analysis helps to determine which individual project risks or other sources of 
uncertainty have the most potential impact on project outcomes. It correlates variations in project outcomes with 
variations in elements of the quantitative risk analysis model.
One typical display of sensitivity analysis is the tornado diagram, which presents the calculated correlation 
coefﬁcient for each element of the quantitative risk analysis model that can inﬂuence the project outcome. 
This can include individual project risks, project activities with high degrees of variability, or speciﬁc sources 
of ambiguity. Items are ordered by descending strength of correlation, giving the typical tornado appearance.  
An example tornado diagram is shown in Figure 11-14.
Figure 11-14. Example Tornado Diagram
Activity or Risk Driving
Projection Duration
Correlation with Project Duration
-0.2          -0.1          0             0.1           0.2           0.3          0.4   


Described in Section 4.2.3.1. Project management plan components include but are not limited to:
 Resource management plan. Described in Section 9.1.3.1. The resource management plan is used to help 
determine how resources allocated to agreed-upon risk responses will be coordinated with other project resources.
 Risk management plan. Described in Section 11.1.3.1. Risk management roles and responsibilities and risk 
thresholds are used in this process.
 Cost baseline. Described in Section 7.3.3.1. The cost baseline has information on the contingency fund that is 
allocated to respond to risks.
Not For Distribution, Sale or Reproduction.
440 
Part 1 - Guide


Project documents that can be considered as inputs for this process include but are not limited to:
 Lessons learned register. Described in Section 4.4.3.1. Lessons learned about effective risk responses used in 
earlier phases of the project are reviewed to determine if similar responses might be useful during the remainder 
of 

 Accept. Risk acceptance acknowledges the existence of a threat, but no proactive action is taken. This strategy 
may be appropriate for low-priority threats, and it may also be adopted where it is not possible or cost-effective 
to address a threat in any other way. Acceptance can be either active or passive. The most common active 
acceptance strategy is to establish a contingency reserve, including amounts of time, money, or resources to 
handle the threat if it occurs. Passive acceptance involves no proactive action apart from periodic review of the 
threat to ensure that it does not change signiﬁcantly.
Not For Distribution, Sale or Reproduction.
444 
Part 1 - Guide


Five alternative strategies may be considered for dealing with opportunities, as follows:
 Escalate. This risk response strategy is appropriate when the project team or the project sponsor agrees that 
an opportunity is outside the scope of the project or that the proposed response would exceed the project 
manager’s


A number of alternative risk response strategies may be considered. Data analysis techniques that can be used to 
select a preferred risk response strategy include but are not limited to:
 Alternatives analysis. A simple comparison of the characteristics and requirements of alternative risk response 
options can lead to a decision on which response is most appropriate.
 Cost-benefit analysis. If the impact of an individual project risk can be quantiﬁed in monetary terms, then the 
cost-effectiveness of alternative risk response strategies can be determined using cost-beneﬁt analysis (see 
Section 8.1.2.3). The ratio of (change in impact level) divided by (implementation cost) gives the cost effectiveness 
of the response strategy, with a higher ratio indicating a more effective response.


Decision-making techniques that can be used to select a risk response strategy include but are not limited to 
multicriteria decision analysis (described in Section 8.1.2.4). One or more risk respon


Described in Section 4.2.3.1. Project management plan components include but are not limited to the risk 
management plan. Described in Section 11.1.3.1, the risk management plan lists the roles and responsibilities of 
project team members and other stakeholders for risk management. This information is used when allocating owners 
for agreed-upon risk responses. The risk management plan also deﬁnes the level of detail for the risk management 
methodology for the project. It also speciﬁes risk thresholds for the project based on the risk appetite of key stakeholders, 
which deﬁne the acceptable target that the implementation of risk responses is required to achieve.


Project documents that can be considered as inputs for this process include but are not limited to:
 Lessons learned register. Described in Section 4.4.3.1. Lessons learned earlier in the project with regard 
to implementing risk responses can be applied to later phases in the project to improve the effectiveness 
of thi

 Lessons learned register. Described in Section 4.4.3.1. Risk-related lessons from earlier in the project can be 
applied to later phases in the project.
 Risk register. Described in Section 11.2.3.1. The risk register has key inputs that include identiﬁed individual 
project risks, risk owners, agreed-upon risk responses, and speciﬁc implementation actions. It may also provide 
signs of risk, residual and secondary risks, and a watch list of low-priority risks.
 Risk report. Described in Section 11.2.3.2. The risk report includes an assessment of the current overall project 
risk exposure as well as the agreed-upon risk response strategy. It also describes the major individual risks with 
planned responses and risk owners.
Not For Distribution, Sale or Reproduction.
456 
Part 1 - Guide


Described in Section 4.3.3.2. Work performance data contains data on project status such as risk responses that 
have been implemented, risks that have occurred, risks that are active and those that h

In [34]:
concept_liste = concept['Concept'].tolist()
concept_liste = list(map(clean_indexation, concept_liste))
new_concept_3=new_concepts(concept['relationship_entites_clean'],concept_liste)

In [35]:
len(list(new_concept_3)+list(new_concept_2))

331

In [36]:
all_concept=list(new_concept_3)+list(new_concept_2)

In [37]:
len(all_concept)

331

In [38]:
def clean_single_word(text):
    if len(text.split(' '))<=1:
        return False
    else:
        return True



In [39]:
all_concept_v=list(filter(clean_single_word,all_concept))    

In [40]:
len(all_concept_v)

219

In [41]:
all_concept_v

['root cause analysis',
 'Data representation techniques',
 'Qualitative Risk Analysis',
 'Cost forecasts',
 'Risk categories',
 'schedule targets',
 'risk response strategy',
 'Risk statement',
 'Milestone list',
 'Plan Risk Management',
 'Project Risk Management',
 'actual data',
 'Section 4.1.2.3.',
 'issue log',
 'procurement management',
 'dealing with threats',
 'Qualitative risk analysis',
 'cost and schedule',
 'Requirements management plan',
 'quality management',
 'earned value data',
 'Threat response',
 'Tornado Diagram',
 'lessons learned register',
 'Risk Analysis',
 'joint venture',
 'areas of expertise',
 'Lessons learned repository',
 'World Heritage Site',
 'Plan Risk Responses',
 'change control',
 'quantitative risk analysis',
 'project process control',
 'dealing with opportunities',
 'project status',
 'response plan',
 'schedule strategy',
 'EXTERNAL RISK',
 'contingency plan',
 'Academic studies',
 'Probability and impact matrix',
 'Residual risk',
 'project cos

In [42]:
import pickle as pk
file_path = 'evalution_metric.pkl'

with open(file_path,'wb') as f :
    pk.dump(all_concept_v,f)

In [47]:
import spacy

nlp = spacy.load('en_core_web_sm')
def extract_definition(dictionary,text):
    doc = nlp(text)
    defintion=[]
    for sent in doc.sents:
        
        if any(normalize_text(token.lemma_) in dictionary for token in sent): 
            
            defintion.append(sent)
            break
            
    return defintion  

dic="define definition setting identify defining set establishing classifying specifying means meaning is"
doc = nlp(dic)
liste=[]
for token in doc:
    liste.append(str(token.lemma_))
definition_dic=list(set(liste))

In [44]:
concept.to_csv('first_concepts_2.csv',header=True,index=False)

In [45]:
new_concept_df=pd.DataFrame(all_concept_v,columns=['Concept'])

NameError: name 'new_concept_df' is not defined

In [48]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

In [49]:
import faiss
import numpy as np
encoded_data = model.encode(concept.Content.tolist())
encoded_data = np.asarray(encoded_data.astype('float32'))
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, np.array(range(0, len(concept))))
faiss.write_index(index, 'content.index')

In [3]:
def fetch_content_info(dataframe_idx):
    info = concept.iloc[dataframe_idx]
    meta_dict = dict()
    meta_dict['Content'] = info['Content']
    return meta_dict
import time    
def search(query, top_k, index, model):
    t=time.time()
    query_vector = model.encode([query])
    top_k = index.search(query_vector, top_k)
    print('>>>> Results in Total Time: {}'.format(time.time()-t))
    top_k_ids = top_k[1].tolist()[0]
    top_k_ids = list(np.unique(top_k_ids))
    results =  [fetch_content_info(idx) for idx in top_k_ids]
    return results

In [4]:
from pprint import pprint
query="what is requirements management plan ?"
results=search(query, top_k=1, index=index, model=model)
print("\n")
for result in results:
    print('\t',result)


NameError: name 'index' is not defined

In [54]:
def document_search(concept,index,model):
    query=f"what is {concept}?"
    results=search(query, top_k=1, index=index, model=model)
    for result in results:
        return result['Content']


In [55]:
new_concept_df['Content']=new_concept_df['Concept'].apply(lambda txt:document_search(txt,index,model))

>>>> Results in Total Time: 0.09840130805969238
>>>> Results in Total Time: 0.05814194679260254
>>>> Results in Total Time: 0.06402778625488281
>>>> Results in Total Time: 0.054141998291015625
>>>> Results in Total Time: 0.05340886116027832
>>>> Results in Total Time: 0.054326534271240234
>>>> Results in Total Time: 0.05077171325683594
>>>> Results in Total Time: 0.050595760345458984
>>>> Results in Total Time: 0.050469398498535156
>>>> Results in Total Time: 0.059110164642333984
>>>> Results in Total Time: 0.054439544677734375
>>>> Results in Total Time: 0.051282644271850586
>>>> Results in Total Time: 0.050875186920166016
>>>> Results in Total Time: 0.05752110481262207
>>>> Results in Total Time: 0.05809926986694336
>>>> Results in Total Time: 0.05202460289001465
>>>> Results in Total Time: 0.04831695556640625
>>>> Results in Total Time: 0.05548501014709473
>>>> Results in Total Time: 0.06090354919433594
>>>> Results in Total Time: 0.054543495178222656
>>>> Results in Total Time: 0.0

>>>> Results in Total Time: 0.07097935676574707
>>>> Results in Total Time: 0.06407546997070312
>>>> Results in Total Time: 0.06085920333862305
>>>> Results in Total Time: 0.05993151664733887
>>>> Results in Total Time: 0.06441879272460938
>>>> Results in Total Time: 0.059902191162109375
>>>> Results in Total Time: 0.059378862380981445
>>>> Results in Total Time: 0.0631248950958252
>>>> Results in Total Time: 0.06012678146362305
>>>> Results in Total Time: 0.05501246452331543
>>>> Results in Total Time: 0.05770063400268555
>>>> Results in Total Time: 0.05906558036804199
>>>> Results in Total Time: 0.05583333969116211
>>>> Results in Total Time: 0.05864214897155762
>>>> Results in Total Time: 0.05515623092651367
>>>> Results in Total Time: 0.05837702751159668
>>>> Results in Total Time: 0.057883262634277344
>>>> Results in Total Time: 0.05600142478942871
>>>> Results in Total Time: 0.05573534965515137
>>>> Results in Total Time: 0.05836844444274902
>>>> Results in Total Time: 0.05810308

In [85]:
f


In [86]:
new_concept_df['definition']=new_concept_df.apply(lambda row:find_definion_in_string(row['Concept'],row['Content'].lower().replace('\n',' ')),axis=1)

1
1
1
1
1


In [87]:
new_concept_df['reference']=new_concept_df.apply(lambda row:Extract_refenrence(row['Concept'],row['Content'].lower().replace('\n',' ')),axis=1)

yes
yes
yes
yes
yes
yes
yes
yes
yes


In [89]:
result=new_concept_df.copy()

In [121]:
result

Unnamed: 0,Concept,Content,definition,reference
0,root cause analysis,11.2.2.3 DATA ANALYSIS\nData analysis techniqu...,,[root cause analysis. root cause analysis (see...
1,Data representation techniques,11.1.2.2 DATA ANALYSIS\nData analysis techniqu...,,[]
2,Qualitative Risk Analysis,11.4.2.5 DATA ANALYSIS\nData analysis techniqu...,,[]
3,Cost forecasts,11.4.2.4 REPRESENTATIONS OF UNCERTAINTY\nQuant...,,[]
4,Risk categories,11.3.2.5 RISK CATEGORIZATION\nRisks to the pro...,,[]
...,...,...,...,...
214,quantitative risk analysis,11.4.2.2 DATA GATHERING\nInterviews (see Secti...,,[]
215,project process control,11.7.3.3 PROJECT MANAGEMENT PLAN UPDATES\nAny ...,,[]
216,fallback plan,11.5.3.1 CHANGE REQUESTS\nDescribed in Section...,,[]
217,risk responses,11.6.2.2 INTERPERSONAL AND TEAM SKILLS\nInterp...,,[]


In [109]:
def add_column_relationships(column1,column2):
    print(column2+column1)
    column3=column1+column2
    return column3
    


def remove_unnecessary_relationships(entite_relationship_liste,concept_liste):
    
    for i in entite_relationship_liste:
        if not i['head'] in concept_liste or not i['tail'] in concept_liste :
            entite_relationship_liste.remove(i)
    return entite_relationship_liste        

In [114]:
concept['relationship_entites_final']=concept.apply(lambda row:add_column_relationships(row['relationship_entites'],row['relationship_entites_clean']),axis=1)

[{'head': 'requirements', 'type': 'part of', 'tail': 'project'}, {'head': 'project description', 'type': 'part of', 'tail': 'project charter'}]
[{'head': 'Project Risk Management', 'type': 'subclass of', 'tail': 'risk management'}, {'head': 'project management plan', 'type': 'facet of', 'tail': 'project risk management'}]
[{'head': 'risk thresholds', 'type': 'facet of', 'tail': 'risk'}, {'head': 'risk thresholds', 'type': 'facet of', 'tail': 'risk'}]
[{'head': 'Risk Management', 'type': 'studies', 'tail': 'risk'}, {'head': 'risk', 'type': 'studied by', 'tail': 'Risk Management'}, {'head': 'risk management', 'type': 'studies', 'tail': 'risk threshold'}, {'head': 'risk threshold', 'type': 'studied by', 'tail': 'risk management'}]
[{'head': 'Plan Risk Management', 'type': 'instance of', 'tail': 'organizational process'}, {'head': 'Organizational risk policy', 'type': 'subclass of', 'tail': 'risk policy'}, {'head': 'Risk categories', 'type': 'part of', 'tail': 'risk breakdown structure'}, 

In [115]:
concept['relationship_entites_final']=concept.apply(lambda row:remove_unnecessary_relationships(row['relationship_entites_final'],concept_liste+all_concept_v),axis=1)

In [116]:
concept['relationship_entites_final']

0     [{'head': 'requirements', 'type': 'part of', '...
1     [{'head': 'Project Risk Management', 'type': '...
2     [{'head': 'risk thresholds', 'type': 'facet of...
3     [{'head': 'risk threshold', 'type': 'studied b...
4     [{'head': 'Plan Risk Management', 'type': 'ins...
                            ...                        
77    [{'head': 'response planning', 'type': 'facet ...
78    [{'head': 'Monitor Risks', 'type': 'facet of',...
79    [{'head': 'change control', 'type': 'facet of'...
80    [{'head': 'project risk', 'type': 'part of', '...
81    [{'head': 'risk management plan', 'type': 'has...
Name: relationship_entites_final, Length: 82, dtype: object

In [117]:
def parent_child_relationship(parent,relation,child):
    return {'head':parent,'type':relation,'trail':child}

def clean_indexation(text) :
    pattern = r'^(\d+(\.\d+)+\s+)?'  

    
    cleaned_subtitles = re.sub(pattern, '', text)
    return cleaned_subtitles

In [118]:
concept['parent_child_relationship']=concept.apply(lambda row:parent_child_relationship(row['Parent_process'].lower(),row['Parent_relationship'].lower(),clean_indexation(row['Concept'].lower())),axis=1)

In [119]:
concept['parent_child_relationship']

0     {'head': 'plan risk management', 'type': ' inp...
1     {'head': 'plan risk management', 'type': ' inp...
2     {'head': 'plan risk management', 'type': ' inp...
3     {'head': 'plan risk management', 'type': ' inp...
4     {'head': 'plan risk management', 'type': ' inp...
                            ...                        
77    {'head': 'monitor risks', 'type': ' outputs', ...
78    {'head': 'monitor risks', 'type': ' outputs', ...
79    {'head': 'monitor risks', 'type': ' outputs', ...
80    {'head': 'monitor risks', 'type': ' outputs', ...
81    {'head': 'monitor risks', 'type': ' outputs', ...
Name: parent_child_relationship, Length: 82, dtype: object

In [120]:
concept.to_csv('first_concepts_2.csv',header=True,index=False)

In [122]:
result.to_csv('new_concepts.csv',header=True,index=False)