In [1]:
from generator import TagGenerator
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import glob
import os
import nltk.stem.wordnet as lemmatizer
import time

In [2]:
document_keys = []

for file in glob.glob("fao/docs/*"):
    #print(file)
    filename = os.path.basename(file)
    #print(filename)
    
    doc_file = open(file, "r")
    keys_file = open(os.path.join("fao/keys", filename.split(".")[0] + ".key"), "r")
    
    doc = doc_file.read()
    doc = " ".join(RegexpTokenizer(r'[a-zA-Z]{2,}').tokenize(doc)).lower()
    
    
    keys = keys_file.read()
    keys = ",".join(keys.split("\n"))
    #print(keys)
    
    #print(doc[:10])
    #print(keys[:10])
    document_keys.append([doc, keys])

In [3]:
document_keys[0]

['issn fao animal production and health proceedings the dynamics of sanitary and technical requirements assisting the poor to cope expert consultation rome june food and agriculture organization of the united nations rome the designations employed and the presentation of material in this information product do not imply the expression of any opinion whatsoever on the part of the food and agriculture organization of the united nations concerning the legal or development status of any country territory city or area or of its authorities or concerning the delimitation of its frontiers or boundaries isbn all rights reserved reproduction and dissemination of material in this information product for educational or other non commercial purposes are authorized without any prior written permission from the copyright holders provided the source is fully acknowledged reproduction of material in this information product for resale or other commercial purposes is prohibited without written permissi

In [4]:
dataset= pd.DataFrame(document_keys, columns=["text", "tags"])

In [5]:
dataset

Unnamed: 0,text,tags
0,issn fao animal production and health proceedi...,"animal health,animal production,animal product..."
1,cover illustration by emanuela antoni model pl...,"asia and the pacific,cooperation,delinquent be..."
2,fao fisheries circular no firm en issn manual ...,"bioassays,biomass,caribbean,case studies,data ..."
3,the legal framework for the management of anim...,"agriculture,animal breeding,animal genetic res..."
4,introducing the international bioenergy platfo...,"agricultural development,bioenergy,biofuels,bi..."
5,guidelines for soil description guidelines for...,"carbonates,climate,colour,fao,genetic soil typ..."
6,time for action changing the gender situation ...,"armenia,baltic states,canada,croatia,economic ..."
7,pour for producteurs et exporters from west af...,"agricultural products,agriculture,biological c..."
8,the role of local institutions in reducing vul...,"capacity building,case studies,community invol..."
9,the growing global obesity problem some policy...,"developed countries,developing countries,diet,..."


In [6]:
import time

start = time.time()
a, b, c = TagGenerator(semantic_field_size=40, stemmer = "porter", generate_bigrams=True).generate(dataset["text"].tolist(), 2, root="a", expand_doc_tags=True, max_additions = 3)
print(time.time() - start)

51.46257543563843


In [7]:
#a

In [8]:
def score(estimated, expected, lemmatize=False):
    if lemmatize:
        estimated = [lemmatizer.lemmatize(tag) for tag in estimated]
        expected = [lemmatizer.lemmatize(tag) for tag in expected]
        
    evaluation = [1 if tag in expected else 0 for tag in estimated]
    
    true_positives = sum(evaluation)
    
    precision = true_positives / len(estimated)
    recall = true_positives / len(expected)
    
    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1

In [9]:
#for i in range(len(a)):
#    precision, recall, f1 = score(a[i], dataset.iloc[i]["tags"].split(","))
#    print(precision, recall, f1)


score(["a","b"], ["d","c","e",])

(0.0, 0.0, 0.0)

In [10]:
def evaluate(results, lemmatize=False):
    precisions = []
    recalls = []
    f1s = []
    
    for i in range(len(results)):
        precision, recall, f1 = score(results[i], dataset.iloc[i]["tags"].split(","), lemmatize)
        #print(precision, recall, f1)
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        
    return sum(precisions)/len(precisions), sum(recalls)/len(recalls), sum(f1s)/len(f1s)

In [11]:
avg_scores = evaluate(a)

In [12]:
print(avg_scores)

(0.05810997025179357, 0.12067895922047653, 0.07725925655970779)


In [13]:
print(avg_scores)

(0.05810997025179357, 0.12067895922047653, 0.07725925655970779)


In [14]:
#doc_tag_sets = {"abstract": a, "summary": b, "differential": c}
#expand_doc_tags = [True, False]
#max_additions = [3, 5, 7]

In [15]:
def experiment(methods, root, expand_doc_tags, max_additions, lemmatized_scoring):
    results = []
    
    for method in methods:
        if method == 1:
            print(method)
            start_time = time.time()
            abstract, summary, differential = TagGenerator(
                semantic_field_size=40, 
                stemmer = "porter", 
                generate_bigrams=True,
                use_tfidf=True
            ).generate(
                dataset["text"].tolist(), 
                method, 
                root=root, 
            )
            total_time = time.time() - start_time
            
            for lemmatize in lemmatized_scoring:
                abst_avg_precision, abst_avg_recall, abst_avg_f1 = evaluate(abstract)
                diff_avg_precision, diff_avg_recall, diff_avg_f1 = evaluate(differential)
                
                results.append([method, "abstract", "", "", lemmatize, abst_avg_precision, abst_avg_recall, abst_avg_f1, total_time])
                results.append([method, "differential", "", "", lemmatize, diff_avg_precision, diff_avg_recall, diff_avg_f1, total_time])
            
        if method == 2:
            for expand in expand_doc_tags:
                run_once = False
                
                if not expand:
                    run_once = True
                    
                for max_addition in max_additions:
                    print(method, expand, max_addition) # doc_tag_sets[doc_tag_set]
                    
                    start_time = time.time()
                    abstract, summary, differential = TagGenerator(
                        semantic_field_size=40, 
                        stemmer = "porter", 
                        generate_bigrams=True,
                        use_tfidf=True
                    ).generate(
                        dataset["text"].tolist(), 
                        method, 
                        root="", 
                        expand_doc_tags=expand, 
                        max_additions = max_addition
                    )
                    total_time = time.time() - start_time
                    
                    for lemmatize in lemmatized_scoring:
                        abst_avg_precision, abst_avg_recall, abst_avg_f1 = evaluate(abstract)
                        diff_avg_precision, diff_avg_recall, diff_avg_f1 = evaluate(differential)
                        
                        results.append([method, "abstract", expand, max_addition, lemmatize, abst_avg_precision, abst_avg_recall, abst_avg_f1, total_time])
                        results.append([method, "differential", expand, max_addition, lemmatize, diff_avg_precision, diff_avg_recall, diff_avg_f1, total_time])
                    
                    if run_once:
                        break
                    
    return results

In [16]:
results = experiment(
    methods=[1, 2],
    root="",
    expand_doc_tags=[False, True],
    max_additions=[3, 5, 7, 10],
    lemmatized_scoring=[False, True]
)

1
2 False 3
2 True 3
2 True 5
2 True 7
2 True 10


In [17]:
len(results)

24

In [18]:
results

[[1,
  'abstract',
  '',
  '',
  False,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.136765241622925],
 [1,
  'differential',
  '',
  '',
  False,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.136765241622925],
 [1,
  'abstract',
  '',
  '',
  True,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.136765241622925],
 [1,
  'differential',
  '',
  '',
  True,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.136765241622925],
 [2,
  'abstract',
  False,
  3,
  False,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.183427810668945],
 [2,
  'differential',
  False,
  3,
  False,
  0.08561643850233266,
  0.09085667217095227,
  0.08680166995412333,
  50.183427810668945],
 [2,
  'abstract',
  False,
  3,
  True,
  0.09500000000000003,
  0.11960369040327223,
  0.10483837635142365,
  50.183427810668945],
 [2,
  'differential',
  False,
  3,
  True,
  0.085616438502

In [19]:
results_df = pd.DataFrame(results, columns=["method", "type", "expand", "max_addition", "lemmatized", "avg_precision", "avg_recall", "avg_f1", "time"])

In [20]:
results_df

Unnamed: 0,method,type,expand,max_addition,lemmatized,avg_precision,avg_recall,avg_f1,time
0,1,abstract,,,False,0.095,0.119604,0.104838,50.136765
1,1,differential,,,False,0.095,0.119604,0.104838,50.136765
2,1,abstract,,,True,0.095,0.119604,0.104838,50.136765
3,1,differential,,,True,0.095,0.119604,0.104838,50.136765
4,2,abstract,False,3.0,False,0.095,0.119604,0.104838,50.183428
5,2,differential,False,3.0,False,0.085616,0.090857,0.086802,50.183428
6,2,abstract,False,3.0,True,0.095,0.119604,0.104838,50.183428
7,2,differential,False,3.0,True,0.085616,0.090857,0.086802,50.183428
8,2,abstract,True,3.0,False,0.05811,0.120679,0.077259,50.379144
9,2,differential,True,3.0,False,0.056426,0.101663,0.071485,50.379144


In [21]:
results_df.to_csv("results/test_results_bigger.csv", sep=",")

In [22]:
results_df = results_df[results_df["lemmatized"] == False]

In [23]:
results_df

Unnamed: 0,method,type,expand,max_addition,lemmatized,avg_precision,avg_recall,avg_f1,time
0,1,abstract,,,False,0.095,0.119604,0.104838,50.136765
1,1,differential,,,False,0.095,0.119604,0.104838,50.136765
4,2,abstract,False,3.0,False,0.095,0.119604,0.104838,50.183428
5,2,differential,False,3.0,False,0.085616,0.090857,0.086802,50.183428
8,2,abstract,True,3.0,False,0.05811,0.120679,0.077259,50.379144
9,2,differential,True,3.0,False,0.056426,0.101663,0.071485,50.379144
12,2,abstract,True,5.0,False,0.053874,0.120679,0.073309,51.450159
13,2,differential,True,5.0,False,0.053155,0.103544,0.069188,51.450159
16,2,abstract,True,7.0,False,0.053093,0.120679,0.072502,50.984283
17,2,differential,True,7.0,False,0.050821,0.100782,0.066484,50.984283


In [24]:
_ = TagGenerator(
    semantic_field_size=40, 
    stemmer = "porter", 
    generate_bigrams=True,
    use_tfidf=True
).generate_tag_cloud(
    dataset["text"].tolist(), 
    1, 
    root="agriculture", 
    outputdir="results/tcmethod1"
)

In [25]:
test = TagGenerator(
    semantic_field_size=40, 
    stemmer = "porter", 
    generate_bigrams=True,
    use_tfidf=True
)
res = test.generate_tag_cloud(
    dataset["text"].tolist(), 
    2, 
    root="", 
    outputdir="results/tcmethod2",
    expand_doc_tags=True, 
    max_additions = 5
)

In [26]:
res

({'area': 0.35838665900847,
  'food': 1.0,
  'price': 0.4864679244992557,
  'export': 0.4575236827134709,
  'data': 0.30054823397764374,
  'growth': 0.27778034178161304,
  'sector': 0.3284727252015498,
  'fisherman': 0.005542376989333922,
  'plan': 0.2258319375684825,
  'function': 0.08693390705840857,
  'level': 0.42390054314657083,
  'states': 0.005542376989333922,
  'country': 0.005542376989333922,
  'outputs': 0.005542376989333922,
  'nations': 0.005542376989333922,
  'production': 0.005542376989333922,
  'marketplace': 0.005542376989333922,
  'grocery store': 0.005542376989333922,
  'markets': 0.005542376989333922,
  'requirements': 0.005542376989333922,
  'crops': 0.005542376989333922,
  'monetary value': 0.005542376989333922,
  'consequences': 0.005542376989333922,
  'designations': 0.005542376989333922,
  'craw': 0.005542376989333922,
  'residential district': 0.005542376989333922,
  'community': 0.005542376989333922,
  'value': 0.005542376989333922,
  'processes': 0.0055423769