# Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import spacy
import itertools
import pickle
import torch
import pandas as pd
import numpy as np
from collections import Counter
from collections import Counter
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import CamembertModel, CamembertTokenizer
from transformers import pipeline
from itertools import groupby, combinations

# Camembert vectors

In [None]:
# Load the French POS-tagging CamemBERT checkpoint: tokenizer and token classifier
POS_CHECKPOINT = "gilf/french-camembert-postag-model"
tokenizer = AutoTokenizer.from_pretrained(POS_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(POS_CHECKPOINT)

# Example sentence processed by the rest of this cell
texts = "L’innovation principale réside dans l’aspect transverse de l’échange de données qu’elle permet, et ce sans données perte de qualité ni d’intégrité, entre tous les acteurs du métier du SLI, de l’analyse de la maintenance, en passant par la documentation technique, la gestion des approvisionnements et ainsi le maintien en condition opérationnelle du système tout au long de sa durée de vie."

# Tokenizer pieces of the sentence (their list positions are used as token indexes below)
tokens = tokenizer.tokenize(texts)

# POS-tag the sentence; the code below reads 'word' and 'entity_group' from each result
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)
list_tokens = nlp_token_class(texts)

#########################################################################################################
#########################################################################################################

# Normalise the surface forms so that pipeline words and tokenizer pieces can
# be compared with ==. `str.startswith` is used instead of indexing `[0]`, so
# an empty string can never raise IndexError.

# Strip a leading typographic apostrophe from the words returned by the pipeline
# (a word that is only the apostrophe is left untouched)
for entity in list_tokens:
    word = entity.get('word')
    if len(word) > 1 and word.startswith("’"):
        entity['word'] = word[1:]

# Strip the leading '▁' marker from the tokenizer pieces (in place)
for position, piece in enumerate(tokens):
    if piece.startswith('▁'):
        tokens[position] = piece[1:]

# Create a list of indexed tokens: (position in `tokens`, POS tag, surface form).
# A piece is paired with EVERY pipeline entry that has the same surface form,
# so a word occurring several times in the sentence yields duplicate tuples.
final_list = [
    (position, entity.get('entity_group'), piece)
    for position, piece in enumerate(tokens)
    for entity in list_tokens
    if entity.get('word') == piece
]


#########################################################################################################
#########################################################################################################

# Create a list of nouns: entries tagged NC or NPP.
# The tuples are de-duplicated with a set and then sorted, so the noun order
# (and every ordering derived from it further down) is identical on every run;
# the iteration order of a bare set of tuples containing strings is not.
noun_list = [
    entry for entry in sorted(set(final_list))
    if entry[1] in ('NC', 'NPP')
]

# Keep only purely alphabetic tokens (drops pieces with digits or punctuation)
filterd_noun_list = [entry for entry in noun_list if entry[2].isalpha()]

# Surface forms only, without index and POS tag
just_nouns = [entry[2] for entry in filterd_noun_list]

# Entries whose surface form occurs more than once, sorted by surface form (descending)
noun_counter = Counter(just_nouns)
repeated_noun = [
    entry for entry in filterd_noun_list
    if noun_counter[entry[2]] > 1
]
sorted_repeated_noun = sorted(repeated_noun, key=lambda tup: tup[2], reverse=True)

# Dictionary of nouns: key = surface form, value = list of token indexes
noun_dictionary = {}
for idx_token, dep_tok, tok in filterd_noun_list:
    noun_dictionary.setdefault(tok, []).append(idx_token)
        
        
# Extract the last hidden layer of CamemBERT for the example sentence.
# NOTE: `model` is rebound here from the POS tagger to the base encoder.
model = CamembertModel.from_pretrained('camembert-base')

# Encode with special tokens and add a leading batch dimension
input_ids = torch.tensor(tokenizer.encode(texts, add_special_tokens=True)).unsqueeze(0)

# Inference only: no gradients are needed, so skip the autograd bookkeeping
with torch.no_grad():
    output = model(input_ids)
final_output_camembert_list = output.last_hidden_state



# One list of hidden-state vectors per noun, one vector per occurrence.
# Positions are shifted by +1, presumably to skip the special token that
# `encode(..., add_special_tokens=True)` puts at the front of the sequence.
final_list_ = [
    [final_output_camembert_list[0][position + 1] for position in positions]
    for positions in noun_dictionary.values()
]

# Same structure, converted from torch tensors to numpy arrays
final_list_np = [
    [vector.detach().numpy() for vector in vectors]
    for vectors in final_list_
]

# Mean value of the occurrences of each noun: one vector per noun
final_list_mean = [np.mean(vectors, axis=0) for vectors in final_list_np]

# Cosine similarity for every unordered pair of nouns
cos_sim_final = [
    np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))
    for left, right in combinations(final_list_mean, 2)
]
    
# Remove tokens that do not exist in Lexique383 (this also discards incorrectly
# split tokens before the similarities are recalculated below).
# The table is downloaded and filtered once only.
lex = pd.read_csv('http://www.lexique.org/databases/Lexique383/Lexique383.tsv', sep='\t')
df1 = lex[lex['cgram'] == 'NOM']
list_lex = df1['ortho'].to_list()

# Set view of the Lexique nouns: constant-time membership tests instead of
# scanning the whole list for every candidate noun
lexique_nouns = set(list_lex)

# Keep only the nouns present in the Lexique dictionary (original order preserved)
flitered_dict_noun = [x for x in noun_dictionary if x in lexique_nouns]


# Token indexes of the nouns that survived the Lexique filter
flitered_dict = {}
for idx_token, dep_tok, tok in filterd_noun_list:
    if tok not in flitered_dict_noun:
        continue
    flitered_dict.setdefault(tok, []).append(idx_token)

# Hidden-state vectors of every occurrence of each kept noun
# (positions shifted by +1, as for the unfiltered vectors)
flitered_final_list_ = [
    [final_output_camembert_list[0][position + 1] for position in positions]
    for positions in flitered_dict.values()
]

# Torch tensors -> numpy arrays
filtered_final_list_np = [
    [vector.detach().numpy() for vector in vectors]
    for vectors in flitered_final_list_
]

# One averaged vector per kept noun
filtered_final_list_mean = [
    np.mean(vectors, axis=0) for vectors in filtered_final_list_np
]

# Cosine similarity for every unordered pair of kept nouns
filtered_cos_sim_final = [
    np.dot(left, right) / (np.linalg.norm(left) * np.linalg.norm(right))
    for left, right in combinations(filtered_final_list_mean, 2)
]
    
    
# Bin every pair of kept nouns by cosine similarity.
# Bins are half-open ([0.5, 0.6), [0.6, 0.7), ...), so a similarity that falls
# exactly on an edge is kept instead of being silently dropped.
# Non-positive similarities are still left out of every bin.
filtered_list_0_50 = []
filtered_list_50_60 = []
filtered_list_60_70 = []
filtered_list_70_80 = []
filtered_list_80_90 = []
filtered_list_90_100 = []

# Noun pairs in the same order as the similarities in filtered_cos_sim_final
filtered_comb_noun_dictionary = list(combinations(list(flitered_dict.keys()), 2))
for pair, item in zip(filtered_comb_noun_dictionary, filtered_cos_sim_final):
    if 0 < item < 0.5:
        filtered_list_0_50.append((pair, item))
    elif 0.5 <= item < 0.6:
        filtered_list_50_60.append((pair, item))
    elif 0.6 <= item < 0.7:
        filtered_list_60_70.append((pair, item))
    elif 0.7 <= item < 0.8:
        filtered_list_70_80.append((pair, item))
    elif 0.8 <= item < 0.9:
        filtered_list_80_90.append((pair, item))
    elif item >= 0.9:
        filtered_list_90_100.append((pair, item))

# All binned pairs, lowest bin first
ls_cos_sim_all_filtered = (
    filtered_list_0_50
    + filtered_list_50_60
    + filtered_list_60_70
    + filtered_list_70_80
    + filtered_list_80_90
    + filtered_list_90_100
)

# Flatten ((word_1, word_2), similarity) into (word_1, word_2, similarity)
ls_cos_sim_all_filtered_final_0 = [
    (pair[0], pair[1], similarity) for pair, similarity in ls_cos_sim_all_filtered
]

# Sort by first word, then by similarity, both descending
ls_cos_sim_all_filtered_final_1 = sorted(
    ls_cos_sim_all_filtered_final_0,
    key=lambda tup: (tup[0], tup[2]),
    reverse=True,
)

# Since our data is private, we are not allowed to publish it, so only one sentence is used in this file as an example. The code that produces all the variables used in the knowledge-graph file is in the next block, but because the example input is too small, running it will raise an error. You can obtain these values by running the code on your own text.

In [None]:
# Group the pairs with cosine similarity > 0.7 under their first word:
# first word -> [(second word, similarity), ...]
ls_cos_sim_all_filtered_final_2 = {}
for first_word, second_word, similarity in ls_cos_sim_all_filtered_final_1:
    if not similarity > 0.70:
        continue
    ls_cos_sim_all_filtered_final_2.setdefault(first_word, []).append((second_word, similarity))

# Delete the lists that are less than 10 in length.
# Each surviving neighbour list is stored wrapped in a one-element list.
ls_cos_sim_all_filtered_final_3 = {
    word: [neighbours]
    for word, neighbours in ls_cos_sim_all_filtered_final_2.items()
    if len(neighbours) >= 10
}

# One word list per surviving head word: the head word followed by its neighbours
best_words = [
    [word] + [neighbour for neighbour, _ in wrapped[0]]
    for word, wrapped in ls_cos_sim_all_filtered_final_3.items()
]
#################################################################################
# Size of every word list (head word + neighbours)
len_list = [len(words) for words in best_words]

# Summary statistics of the list sizes
avg_l = np.mean(len_list)
max_l = np.max(len_list)
min_l = np.min(len_list)
print(f'mean is {avg_l}, max is {max_l}, min is {min_l}')

# Longest lists first (in place), then every unordered pair of lists
best_words.sort(key=len, reverse=True)
com_best_words = list(combinations(best_words, 2))


# Words shared by the two lists of each pair, ordered as in the pair's first list
list_common_words_sorted = []
for first, second in com_best_words:
    shared = set(first) & set(second)
    list_common_words_sorted.append(sorted(shared, key=first.index))

# The second list of a pair is dropped when more than 40% of its length
# is shared with the first list
list_to_drop = [
    second
    for (first, second), common in zip(com_best_words, list_common_words_sorted)
    if len(common) > len(second) * 0.40
]

# Create the final list of sub-lists: every word list that was not dropped
final_list_50 = [element for element in best_words if element not in list_to_drop]

# Associate the CamemBERT vector of each word in the final sub-lists.
# filtered_final_list_mean is ordered like the keys of flitered_dict, so the
# position of a noun among those keys selects its averaged vector.
Noun_1 = list(flitered_dict.keys())
noun_position = {noun: position for position, noun in enumerate(Noun_1)}
cam_vec_final_list_50 = [
    [filtered_final_list_mean[noun_position[word]] for word in words]
    for words in final_list_50
]

# Copy the vectors sub-list by sub-list. The source must be
# cam_vec_final_list_50: looping over the freshly created (empty) target list
# would leave it empty and no training data would ever be produced.
total_cam_vec_final_list_50 = [list(vectors) for vectors in cam_vec_final_list_50]

# Label every vector with the index of the sub-list it belongs to
labeled_cam_vector_data = [
    (list(vector), label)
    for label, vectors in enumerate(total_cam_vec_final_list_50)
    for vector in vectors
]
        

# Train a LR model

In [None]:
# Feature matrix: one row per labelled vector, 768 columns
# (hard-coded width; presumably the hidden size of the CamemBERT model used above)
X = np.zeros((len(labeled_cam_vector_data), 768))

# Target labels: the sub-list index stored next to each vector
y = [label for _, label in labeled_cam_vector_data]

# Copy each vector into its row, component by component
for row, (vector, _) in enumerate(labeled_cam_vector_data):
    for column, value in enumerate(vector):
        X[row, column] = value

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (default hyper-parameters) on the
# labelled vectors; `fit` returns the fitted estimator itself
logisticRegr_2 = LogisticRegression().fit(X, y)