In [8]:
import pandas as pd
import numpy as np
import json
import re
import unidecode
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval
import json

In [9]:
# --- Inputs -----------------------------------------------------------------
# CSV file holding the articles read by the notebook.
ARTICLES_PATH = '../data/input/articles.csv'

# --- Outputs ----------------------------------------------------------------
# Root output folder (distances.json is written here).
OUTPUT_DIST_PATH = '../data/output/'
# Folder holding keys.txt and one data_<key>.csv file per key.
OUTPUT_FORMULE_PATH = '../data/output/secondFormule/'
def loader(base_path=None):
    '''
    Rebuild a dictionary of dataframes that was saved as one keys.txt file (holding the
    keys) plus one data_<key>.csv file per key.

    Params:
        base_path: Folder that contains keys.txt and the data_<key>.csv files. When omitted
                   (or None), the module-level OUTPUT_FORMULE_PATH is used.
    Returns:
        A dictionary mapping every key listed in keys.txt to the dataframe read from the
        matching data_<key>.csv file.
    Raises:
        ValueError / SyntaxError: if keys.txt does not contain a plain Python literal
                                  (e.g. a list of strings).
    '''
    # Local import: the notebook's import cell does not bring in `os`.
    import os

    if base_path is None:
        base_path = OUTPUT_FORMULE_PATH

    with open(os.path.join(base_path, 'keys.txt'), 'r') as f:
        # literal_eval only accepts Python literals, so (unlike eval) a tampered
        # keys.txt cannot execute arbitrary code when the notebook is run.
        keys = literal_eval(f.read().strip())

    return {
        key: pd.read_csv(os.path.join(base_path, "data_{}.csv".format(str(key))))
        for key in keys
    }

In [10]:
# Dataframes previously saved on disk, one per key listed in keys.txt.
dict_res = loader()

# keep_default_na=False is passed so that pandas does not apply its default
# NaN markers to the article fields.
corpuses = pd.read_csv(ARTICLES_PATH, keep_default_na=False)

# The searchable corpus of an article is its title, a newline, then its text.
corpuses['corpus'] = corpuses.apply(
    lambda article: '%s\n%s' % (article['title'], article['text']),
    axis=1,
)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/output/secondFormule/keys.txt'

In [18]:
def search(corpuses_list, ngram_list):
    '''Find the corpuses that contain at least one of the listed ngrams.

    The match is case-insensitive and on whole ngrams only: the ngram must not be directly
    preceded or followed by a word character. Ngrams are treated as literal text (regex
    metacharacters such as "+", "." or "(" are escaped) and are transliterated with
    unidecode exactly like the corpuses, so accented ngrams can match.

    Params:
        corpuses_list: A list of the corpuses (strings) where to search the presence of
                       the ngrams.

        ngram_list: A list of the ngrams (strings) to search in the corpuses. Empty ngrams
                    are ignored.
    Returns:
        matchs: A set with the positions (0-based, in corpuses_list) of the corpuses where
                at least one of the ngrams appears. Empty when ngram_list has no usable ngram.'''
    # Normalise the ngrams once, the same way the corpuses are normalised below;
    # otherwise an accented ngram could never match an unidecoded corpus.
    alternatives = []
    seen = set()
    for ngram in ngram_list:
        normalized = unidecode.unidecode(ngram)
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)
        # Escape so that ngrams are matched literally instead of being read as regex syntax.
        alternatives.append(re.escape(normalized))

    if not alternatives:
        return set()

    # A single compiled alternation replaces one re.search per (corpus, ngram) pair.
    # The lookarounds behave like \b for ngrams that start/end with a word character,
    # and still work for ngrams that start/end with punctuation (e.g. "C++").
    pattern = re.compile(r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)', flags=re.I)

    matchs = set()
    for index, corpus in enumerate(corpuses_list):
        # Corpuses are transliterated to ASCII to avoid problems with accents.
        if pattern.search(unidecode.unidecode(corpus)) is not None:
            matchs.add(index)
    return matchs

#Compute distance for each combination of parameters
def get_distance_all(corpuses, combinations, treshold):
    '''
    Compute the distance (as an accuracy) for each combination of parameters, given a specific
    threshold. After every evaluated combination the results gathered so far are rewritten to
    distances.json (under OUTPUT_DIST_PATH), so if the computation is stopped in the middle the
    already measured distances are not lost.

    For every company, the set of corpuses linked to it by the ground truth is compared with the
    set of corpuses in which at least one of its pertinent ngrams is found. The score is
    accuracy_score(..., normalize=True) over the companies, computed on the multilabel
    indicator matrices built by MultiLabelBinarizer from those sets.

    Params:
        corpuses: A dataframe that contains the used corpuses (column 'corpus': title + text of
                  articles) and the company with which each one is linked according to
                  ground-truth (column 'name').
        combinations: A dictionary that maps a weight combination of the pertinence function to
                      the dataframe with all the calculated pertinences (columns 'pertinence',
                      'name' and 'ngram' are read). Keys end up in the JSON file, so they are
                      presumably strings - confirm against the caller.
        treshold: The value of the threshold from which to take into account the ngrams. The
                  scores that are strictly above are considered in the distance calculation,
                  those that are below or equal are excluded. (The parameter name keeps its
                  historical spelling so that keyword callers keep working.)
    Returns:
        res: A dictionary that contains a list of the weight combinations evaluated ('weights'),
             and the list of corresponding measured accuracies ('accuracy').
    '''
    res = {'weights': [], 'accuracy': []}

    # Loop-invariant data, computed once instead of once per combination and company.
    index_labels = corpuses.index.values
    corpus_texts = corpuses['corpus'].tolist()
    enterprises = corpuses['name'].unique().tolist()

    mlb = MultiLabelBinarizer(sparse_output=True)
    mlb.fit([index_labels])

    #Compute ground truth of corpuses for each company (independent from the weights)
    truths = [corpuses.loc[corpuses['name'] == company].index.values for company in enterprises]
    y_correct = mlb.transform(truths)

    for key, df in combinations.items():
        founds = []
        for company in enterprises:
            #Filter only pertinent ngrams
            is_pertinent = (df['pertinence'] > treshold) & (df['name'] == company)
            ngrams = df.loc[is_pertinent, 'ngram'].tolist()
            #search() returns positions in corpus_texts; translate them to index labels so
            #they are comparable with the ground truth even when the index is not 0..n-1
            positions = search(corpus_texts, ngrams)
            founds.append([index_labels[position] for position in sorted(positions)])

        y_pred = mlb.transform(founds)
        res['weights'].append(key)
        #Accuracy against ground truth
        res['accuracy'].append(float(accuracy_score(y_correct, y_pred, normalize=True)))
        #Rewrite the whole file each time so that partial results survive an interruption
        with open(OUTPUT_DIST_PATH + 'distances.json', 'w') as file:
            file.write(json.dumps(res))
    return res

In [None]:
# Weight combinations (keys of dict_res) that are going to be evaluated.
required_fields = ['0.2,0.4,0.4']

# Keep only the requested combinations, in the order they appear in dict_res.
dict_eval = {
    weights: pertinences
    for weights, pertinences in dict_res.items()
    if weights in required_fields
}

# Evaluate them, keeping only the ngrams whose pertinence is above 0.2.
distances = get_distance_all(corpuses, dict_eval, 0.2)

In [19]:
#Test
# Tiny hand-made fixture: three corpuses, one of them linked to the company 'IPSEN'.
corpus_t = pd.DataFrame({
    'corpus': ['IPSEN n\'est pas', 'Ipsadoil y a', 'Voilà ipsen'],
    'name': ['IPSEN', 'x', 'x'],
})

# A single ngram for 'IPSEN', scored above the threshold used below.
df_t = pd.DataFrame({
    'pertinence': [0.9],
    'name': ['IPSEN'],
    'ngram': ['IPSEN'],
})

treshold_t = 0.5
combination_t = {'xyz': df_t}

# Note: get_distance_all also rewrites distances.json under OUTPUT_DIST_PATH.
res_t = get_distance_all(corpus_t, combination_t, treshold_t)