### Helper: get all necessary data

In [24]:
import json

def get_data( filepath_query, filepath_results ):
    """Read the query file and the results file and flatten them for analysis.

    Parameters:
        filepath_query: path of a JSON file holding the query and its
            'nlp_scores'.
        filepath_results: path of a JSON file holding the list of result
            documents.

    Returns:
        A tuple ( query_data, results_data ): a dict with the query text and
        its scores, and a list with one flat dict per result document.
    """
    with open( filepath_query, 'r' ) as handle:
        raw_query = json.load( handle )

    # Pull the nested query fields out one level at a time.
    text = raw_query['query']['multi_match']['query']
    scores = raw_query['nlp_scores']
    bias_score = scores['bias_score']
    stylo = scores['stylo_scores']
    vocab_richness = stylo['vocab_richness']
    hapax_legomena = stylo['hepax_legomena']   # key is spelled this way in the data
    readability = stylo['readability_measures']
    query_data = {
        'query_text' : text,
        'bias_score' : bias_score,
        'vocab_richness' : vocab_richness,
        'hapax_legomena' : hapax_legomena,
        'wordlength' : readability['average_wordlength'],
        'sentlength' : readability['average_sentlength'],
        'spelling_errors' : stylo['spelling_errors'],
        'topics' : scores['topics']
    }

    with open( filepath_results ) as handle:
        raw_results = json.load( handle )

    # Fields that are copied from each result document under the same key.
    copied_keys = (
        'bias_distance', 'stylo_distance', 'topic_match_count',
        'old_score', 'new_score', 'scoring_distance',
        'old_rank', 'new_rank'
    )

    results_data = []
    for hit in raw_results:
        entry = { 'premise' : hit['_source']['premise'] }
        hit_readability = hit['nlp_scores']['stylo_scores']['readability_measures']
        entry['wordlength'] = hit_readability['average_wordlength']
        entry['sentlength'] = hit_readability['average_sentlength']
        for key in copied_keys:
            entry[key] = hit[key]
        results_data.append( entry )

    return ( query_data, results_data )
    
    

### Helper: extract notable documents

In [25]:
def extract_notable_docs( documents, min_jump_distance ):
    """Collect the documents whose rank moved by more than `min_jump_distance`.

    The rank distance of a document is `old_rank - new_rank`. Every document
    with `abs( rank_distance ) > min_jump_distance` is returned with an extra
    'remarks' entry:
        'jump'          - True for a non-negative distance, False for a negative one
        'jump_distance' - the signed rank distance
        'max_jump'      - 1 if the distance equals the largest positive distance
                          found in `documents`, -1 if it equals the largest
                          negative one, 0 otherwise

    The returned entries are shallow copies. The input dicts are left
    untouched, so calling this function on overlapping document lists cannot
    overwrite the remarks of an earlier call.

    Parameters:
        documents: iterable of dicts with 'old_rank' and 'new_rank'.
        min_jump_distance: threshold the absolute rank distance must exceed.

    Returns:
        A list of annotated copies, in the order of `documents`.
    """
    notable_docs = []
    max_rank_distance_up = 0
    max_rank_distance_down = 0

    for doc in documents:
        rank_distance = doc['old_rank'] - doc['new_rank']

        # Track the extremes over ALL documents, not only the notable ones.
        max_rank_distance_up = max( max_rank_distance_up, rank_distance )
        max_rank_distance_down = min( max_rank_distance_down, rank_distance )

        if abs( rank_distance ) > min_jump_distance:
            # Annotate a copy so the caller's document is not modified.
            annotated = dict( doc )
            annotated['remarks'] = {
                'jump' : rank_distance >= 0,
                'jump_distance' : rank_distance,
                'max_jump' : 0
            }
            notable_docs.append( annotated )

    # The extremes are only known after the full pass, so flag them afterwards.
    for doc in notable_docs:
        remarks = doc['remarks']
        if remarks['jump_distance'] == max_rank_distance_up:
            remarks['max_jump'] = 1
        elif remarks['jump_distance'] == max_rank_distance_down:
            remarks['max_jump'] = -1

    return notable_docs
    
    

### Helper: focus setter

In [26]:
def first_5_documents( documents ):
    """Return the first 5 entries of `documents` (fewer if it is shorter)."""
    return documents[:5]

def first_10_documents( documents ):
    """Return the first 10 entries of `documents` (fewer if it is shorter)."""
    return documents[:10]

def first_20_documents( documents ):
    """Return the first 20 entries of `documents` (fewer if it is shorter)."""
    return documents[:20]

def first_50_documents( documents ):
    """Return the first 50 entries of `documents` (fewer if it is shorter)."""
    return documents[:50]
    
    

## Evaluate rank changes

In [62]:
from extract_notable_docs import extract_notable_docs


def rank_changes( documents, min_jump_distance ):
    """Summarise how far the documents moved between the old and new ranking.

    The rank distance of a document is `old_rank - new_rank`; a positive
    distance is counted as a move up, a negative one as a move down.

    Parameters:
        documents: list of dicts with 'old_rank' and 'new_rank'.
        min_jump_distance: threshold passed on to `extract_notable_docs`.

    Returns:
        A tuple of
            average_rank_changes  - mean absolute rank distance (0.0 for an
                                    empty list instead of a ZeroDivisionError)
            max_rank_changes_up   - largest positive distance, 0 if none
            max_rank_changes_down - absolute value of the largest negative
                                    distance, 0 if none
            tendency              - ratio of the larger to the smaller of the
                                    up/down counts; 0 if the counts are equal
                                    or the smaller count is 0
            notable_docs          - result of `extract_notable_docs`
    """
    rank_distances = [ doc['old_rank'] - doc['new_rank'] for doc in documents ]

    count_changes = sum( abs( distance ) for distance in rank_distances )
    count_rank_up = sum( 1 for distance in rank_distances if distance > 0 )
    count_rank_down = sum( 1 for distance in rank_distances if distance < 0 )

    # Guard against an empty document list: there is nothing to average.
    document_count = len( documents )
    average_rank_changes = count_changes / document_count if document_count else 0.0

    # Seeding with 0 keeps "no move in this direction" reported as 0.
    max_rank_changes_up = max( [0] + rank_distances )
    max_rank_changes_down = abs( min( [0] + rank_distances ) )

    tendency = 0
    if count_rank_up > count_rank_down and count_rank_down != 0:
        tendency = count_rank_up / count_rank_down
    elif count_rank_down > count_rank_up and count_rank_up != 0:
        tendency = count_rank_down / count_rank_up

    notable_docs = extract_notable_docs( documents, min_jump_distance )

    return ( average_rank_changes, max_rank_changes_up, max_rank_changes_down, tendency, notable_docs )
    
    

## Investigate notable documents

In [63]:
def _average_without_extreme( values, pick_extreme ):
    """Return the mean of `values` after removing one extreme element.

    `pick_extreme` is `max` or `min` and selects the element to leave out.
    With fewer than two values nothing remains to average, so 0.0 is returned
    instead of dividing by zero.
    """
    if len( values ) < 2:
        return 0.0
    return ( sum( values ) - pick_extreme( values ) ) / ( len( values ) - 1 )


def _report_direction( direction, docs, pick_extreme ):
    """Print the documents of one direction and return their (bias, stylo) averages."""
    print( '\n' + str( len(docs) ) + ' ' + direction + ' ranked documents:' )

    for doc in docs:
        rank = doc['new_rank']
        jump = doc['remarks']['jump_distance']
        bias = doc['bias_distance']
        stylo = doc['stylo_distance']
        print( f'rank: {rank} \tjump distance: {jump} \tbias distance: {bias:.2f} \tstylo distance: {stylo:.2f}' )

    average_bias = _average_without_extreme( [ doc['bias_distance'] for doc in docs ], pick_extreme )
    average_stylo = _average_without_extreme( [ doc['stylo_distance'] for doc in docs ], pick_extreme )
    print( f'average bias distance: {average_bias:.2f} \taverage stylo distance: {average_stylo:.2f}' )

    return average_bias, average_stylo


def investigate_notable_docs( documents ):
    """Print and average the bias/stylo distances of up- and down-ranked documents.

    Documents are split by `doc['remarks']['jump']` (truthy = up ranked). For
    each group every document is printed, followed by the group's average bias
    and stylo distance. The average leaves out one extreme value per measure:
    the maximum for up-ranked documents and the minimum for down-ranked ones.

    Compared with the earlier implementation:
      * the down-ranked minimum is the real minimum of the data (it used to
        start at 0, and the stylo minimum was updated with `>`),
      * a group with zero or one document yields 0.0 instead of -0.0 or a
        ZeroDivisionError.

    Parameters:
        documents: list of dicts with 'new_rank', 'bias_distance',
            'stylo_distance' and a 'remarks' dict holding 'jump' and
            'jump_distance'.

    Returns:
        ( average_bias_rank_up, average_stylo_rank_up,
          average_bias_rank_down, average_stylo_rank_down )
    """
    rank_up = [ doc for doc in documents if doc['remarks']['jump'] ]
    rank_down = [ doc for doc in documents if not doc['remarks']['jump'] ]

    average_bias_rank_up, average_stylo_rank_up = _report_direction( 'up', rank_up, max )
    average_bias_rank_down, average_stylo_rank_down = _report_direction( 'down', rank_down, min )

    notable_documents_analysis = ( average_bias_rank_up, average_stylo_rank_up, average_bias_rank_down, average_stylo_rank_down )
    return notable_documents_analysis
    
    

### Topic 1
## Evaluation: first 5, 10, 20, 50, 100 ranked documents / notable documents (rank jump > 9)

In [64]:
from load_data import get_data
from setter import *
#from evaluade_rank_changes import rank_changes
#from extract_notable_docs import extract_notable_docs
#from investigate_notable_docs import investigate_notable_docs

# Input files for topic 1: the query with its scores and the re-ranked results.
filepath_query = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/query_topic1.json'
filepath_results = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/results_custom_topic1.json'
data_tuple = get_data( filepath_query, filepath_results )

query = data_tuple[0]['query_text']

# Focus sets: leading slices of the result list plus the complete list.
ranked5, ranked10, ranked20, ranked50 = (
    first_5_documents( data_tuple[1] ),
    first_10_documents( data_tuple[1] ),
    first_20_documents( data_tuple[1] ),
    first_50_documents( data_tuple[1] ),
)
ranked100 = data_tuple[1]

# Rank statistics per focus set; documents that moved more than 9 ranks are notable.
rank_tuple5 = rank_changes( ranked5, 9 )
rank_tuple10 = rank_changes( ranked10, 9 )
rank_tuple20 = rank_changes( ranked20, 9 )
rank_tuple50 = rank_changes( ranked50, 9 )
rank_tuple100 = rank_changes( ranked100, 9 )

# Unpack each result tuple into its named parts.
( average_rank_changes5, max_rank_changes_up5, max_rank_changes_down5,
  tendency5, notable_docs5 ) = rank_tuple5
( average_rank_changes10, max_rank_changes_up10, max_rank_changes_down10,
  tendency10, notable_docs10 ) = rank_tuple10
( average_rank_changes20, max_rank_changes_up20, max_rank_changes_down20,
  tendency20, notable_docs20 ) = rank_tuple20
( average_rank_changes50, max_rank_changes_up50, max_rank_changes_down50,
  tendency50, notable_docs50 ) = rank_tuple50
( average_rank_changes100, max_rank_changes_up100, max_rank_changes_down100,
  tendency100, notable_docs100 ) = rank_tuple100


print( f'Topic 1: {query}')
print( 'Rank changes:' )
print(
    f'FIRST 005: average: {average_rank_changes5} \tmaximum up the ranking: {max_rank_changes_up5}'
    f'\tmaximum down the ranking: {max_rank_changes_down5} \ttendency: {tendency5:.2f}'
)
print(
    f'FIRST 010: average: {average_rank_changes10} \tmaximum up the ranking: {max_rank_changes_up10}'
    f'\tmaximum down the ranking: {max_rank_changes_down10} \ttendency: {tendency10:.2f}'
)
print(
    f'FIRST 020: average: {average_rank_changes20} \tmaximum up the ranking: {max_rank_changes_up20}'
    f'\tmaximum down the ranking: {max_rank_changes_down20} \ttendency: {tendency20:.2f}'
)
print(
    f'FIRST 050: average: {average_rank_changes50} \tmaximum up the ranking: {max_rank_changes_up50}'
    f'\tmaximum down the ranking: {max_rank_changes_down50} \ttendency: {tendency50:.2f}'
)
print(
    f'FIRST 100: average: {average_rank_changes100} \tmaximum up the ranking: {max_rank_changes_up100}'
    f'\tmaximum down the ranking: {max_rank_changes_down100} \ttendency: {tendency100:.2f}'
)


# Detailed report for the notable documents of the complete result list.
notable_count = str( len(notable_docs100) )
print( '\n' + notable_count + ' notable documents which jumped > 9 ranks:' )
investigate_notable_docs( notable_docs100 )

print(' ')

Topic 1: Should Teachers Get Tenure?
Rank changes:
FIRST 005: average: 0.0 	maximum up the ranking: 0	maximum down the ranking: 0 	tendency: 0.00
FIRST 010: average: 0.6 	maximum up the ranking: 3	maximum down the ranking: 1 	tendency: 3.00
FIRST 020: average: 1.35 	maximum up the ranking: 3	maximum down the ranking: 4 	tendency: 1.80
FIRST 050: average: 4.68 	maximum up the ranking: 35	maximum down the ranking: 7 	tendency: 1.41
FIRST 100: average: 8.7 	maximum up the ranking: 35	maximum down the ranking: 38 	tendency: 1.07

32 notable documents which jumped > 9 ranks:

16 up ranked documents:
rank: 36 	jump distance: 35 	bias distance: 0.04 	stylo distance: 3.02
rank: 43 	jump distance: 21 	bias distance: 0.83 	stylo distance: 8.69
rank: 45 	jump distance: 24 	bias distance: 0.83 	stylo distance: 8.69
rank: 46 	jump distance: 26 	bias distance: 0.65 	stylo distance: 6.93
rank: 48 	jump distance: 29 	bias distance: 0.69 	stylo distance: 7.20
rank: 52 	jump distance: 24 	bias distance:

### Topic 5
## Evaluation: first 5, 10, 20, 50, 100 ranked documents / notable documents (rank jump > 9)

In [65]:
from load_data import get_data
from setter import *
#from evaluade_rank_changes import rank_changes
#from extract_notable_docs import extract_notable_docs
#from investigate_notable_docs import investigate_notable_docs

# Input files for topic 5: the query with its scores and the re-ranked results.
filepath_query = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/query_topic5.json'
filepath_results = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/results_custom_topic5.json'
data_tuple = get_data( filepath_query, filepath_results )

query = data_tuple[0]['query_text']

# Focus sets: leading slices of the result list plus the complete list.
ranked5, ranked10, ranked20, ranked50 = (
    first_5_documents( data_tuple[1] ),
    first_10_documents( data_tuple[1] ),
    first_20_documents( data_tuple[1] ),
    first_50_documents( data_tuple[1] ),
)
ranked100 = data_tuple[1]

# Rank statistics per focus set; documents that moved more than 9 ranks are notable.
rank_tuple5 = rank_changes( ranked5, 9 )
rank_tuple10 = rank_changes( ranked10, 9 )
rank_tuple20 = rank_changes( ranked20, 9 )
rank_tuple50 = rank_changes( ranked50, 9 )
rank_tuple100 = rank_changes( ranked100, 9 )

# Unpack each result tuple into its named parts.
( average_rank_changes5, max_rank_changes_up5, max_rank_changes_down5,
  tendency5, notable_docs5 ) = rank_tuple5
( average_rank_changes10, max_rank_changes_up10, max_rank_changes_down10,
  tendency10, notable_docs10 ) = rank_tuple10
( average_rank_changes20, max_rank_changes_up20, max_rank_changes_down20,
  tendency20, notable_docs20 ) = rank_tuple20
( average_rank_changes50, max_rank_changes_up50, max_rank_changes_down50,
  tendency50, notable_docs50 ) = rank_tuple50
( average_rank_changes100, max_rank_changes_up100, max_rank_changes_down100,
  tendency100, notable_docs100 ) = rank_tuple100


print( f'Topic 5: {query}')
print( 'Rank changes:' )
print(
    f'FIRST 005: average: {average_rank_changes5} \tmaximum up the ranking: {max_rank_changes_up5}'
    f'\tmaximum down the ranking: {max_rank_changes_down5} \ttendency: {tendency5:.2f}'
)
print(
    f'FIRST 010: average: {average_rank_changes10} \tmaximum up the ranking: {max_rank_changes_up10}'
    f'\tmaximum down the ranking: {max_rank_changes_down10} \ttendency: {tendency10:.2f}'
)
print(
    f'FIRST 020: average: {average_rank_changes20} \tmaximum up the ranking: {max_rank_changes_up20}'
    f'\tmaximum down the ranking: {max_rank_changes_down20} \ttendency: {tendency20:.2f}'
)
print(
    f'FIRST 050: average: {average_rank_changes50} \tmaximum up the ranking: {max_rank_changes_up50}'
    f'\tmaximum down the ranking: {max_rank_changes_down50} \ttendency: {tendency50:.2f}'
)
print(
    f'FIRST 100: average: {average_rank_changes100} \tmaximum up the ranking: {max_rank_changes_up100}'
    f'\tmaximum down the ranking: {max_rank_changes_down100} \ttendency: {tendency100:.2f}'
)


# Detailed report for the notable documents of the complete result list.
notable_count = str( len(notable_docs100) )
print( '\n' + notable_count + ' notable documents which jumped > 9 ranks:' )
investigate_notable_docs( notable_docs100 )

print(' ')

Topic 5: Should Social Security Be Privatized?
Rank changes:
FIRST 005: average: 5.8 	maximum up the ranking: 9	maximum down the ranking: 0 	tendency: 0.00
FIRST 010: average: 5.9 	maximum up the ranking: 9	maximum down the ranking: 3 	tendency: 8.00
FIRST 020: average: 7.8 	maximum up the ranking: 15	maximum down the ranking: 17 	tendency: 2.80
FIRST 050: average: 10.76 	maximum up the ranking: 24	maximum down the ranking: 31 	tendency: 2.00
FIRST 100: average: 10.64 	maximum up the ranking: 24	maximum down the ranking: 31 	tendency: 1.18

53 notable documents which jumped > 9 ranks:

27 up ranked documents:
rank: 19 	jump distance: 15 	bias distance: 0.11 	stylo distance: 1.22
rank: 21 	jump distance: 15 	bias distance: 0.00 	stylo distance: 1.80
rank: 23 	jump distance: 15 	bias distance: 0.03 	stylo distance: 1.92
rank: 25 	jump distance: 17 	bias distance: 0.10 	stylo distance: 3.01
rank: 26 	jump distance: 13 	bias distance: 0.25 	stylo distance: 3.09
rank: 29 	jump distance: 11 

### Topic 10
## Evaluation: first 5, 10, 20, 50, 100 ranked documents / notable documents (rank jump > 9)

In [68]:
from load_data import get_data
from setter import *
#from evaluade_rank_changes import rank_changes
#from extract_notable_docs import extract_notable_docs
#from investigate_notable_docs import investigate_notable_docs

# Input files for topic 10: the query with its scores and the re-ranked results.
filepath_query = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/query_topic10.json'
filepath_results = 'C:/Users/simon/programming/python/ElasticSearch/latest_version/git_version/results/results_custom_topic10.json'
data_tuple = get_data( filepath_query, filepath_results )

query = data_tuple[0]['query_text']

# Focus sets: leading slices of the result list plus the complete list.
ranked5, ranked10, ranked20, ranked50 = (
    first_5_documents( data_tuple[1] ),
    first_10_documents( data_tuple[1] ),
    first_20_documents( data_tuple[1] ),
    first_50_documents( data_tuple[1] ),
)
ranked100 = data_tuple[1]

# Rank statistics per focus set; documents that moved more than 9 ranks are notable.
rank_tuple5 = rank_changes( ranked5, 9 )
rank_tuple10 = rank_changes( ranked10, 9 )
rank_tuple20 = rank_changes( ranked20, 9 )
rank_tuple50 = rank_changes( ranked50, 9 )
rank_tuple100 = rank_changes( ranked100, 9 )

# Unpack each result tuple into its named parts.
( average_rank_changes5, max_rank_changes_up5, max_rank_changes_down5,
  tendency5, notable_docs5 ) = rank_tuple5
( average_rank_changes10, max_rank_changes_up10, max_rank_changes_down10,
  tendency10, notable_docs10 ) = rank_tuple10
( average_rank_changes20, max_rank_changes_up20, max_rank_changes_down20,
  tendency20, notable_docs20 ) = rank_tuple20
( average_rank_changes50, max_rank_changes_up50, max_rank_changes_down50,
  tendency50, notable_docs50 ) = rank_tuple50
( average_rank_changes100, max_rank_changes_up100, max_rank_changes_down100,
  tendency100, notable_docs100 ) = rank_tuple100


print( f'Topic 10: {query}')
print( 'Rank changes:' )
print(
    f'FIRST 005: average: {average_rank_changes5} \tmaximum up the ranking: {max_rank_changes_up5}'
    f'\tmaximum down the ranking: {max_rank_changes_down5} \ttendency: {tendency5:.2f}'
)
print(
    f'FIRST 010: average: {average_rank_changes10} \tmaximum up the ranking: {max_rank_changes_up10}'
    f'\tmaximum down the ranking: {max_rank_changes_down10} \ttendency: {tendency10:.2f}'
)
print(
    f'FIRST 020: average: {average_rank_changes20} \tmaximum up the ranking: {max_rank_changes_up20}'
    f'\tmaximum down the ranking: {max_rank_changes_down20} \ttendency: {tendency20:.2f}'
)
print(
    f'FIRST 050: average: {average_rank_changes50} \tmaximum up the ranking: {max_rank_changes_up50}'
    f'\tmaximum down the ranking: {max_rank_changes_down50} \ttendency: {tendency50:.2f}'
)
print(
    f'FIRST 100: average: {average_rank_changes100} \tmaximum up the ranking: {max_rank_changes_up100}'
    f'\tmaximum down the ranking: {max_rank_changes_down100} \ttendency: {tendency100:.2f}'
)


# Detailed report for the notable documents of the complete result list.
notable_count = str( len(notable_docs100) )
print( '\n' + notable_count + ' notable documents which jumped > 9 ranks:' )
investigate_notable_docs( notable_docs100 )

print(' ')

Topic 10: Should Any Vaccines Be Required for Children?
Rank changes:
FIRST 005: average: 0.8 	maximum up the ranking: 2	maximum down the ranking: 1 	tendency: 2.00
FIRST 010: average: 2.3 	maximum up the ranking: 10	maximum down the ranking: 3 	tendency: 1.67
FIRST 020: average: 3.85 	maximum up the ranking: 13	maximum down the ranking: 4 	tendency: 1.57
FIRST 050: average: 7.06 	maximum up the ranking: 28	maximum down the ranking: 21 	tendency: 1.30
FIRST 100: average: 9.32 	maximum up the ranking: 32	maximum down the ranking: 47 	tendency: 1.16

37 notable documents which jumped > 9 ranks:

19 up ranked documents:
rank: 10 	jump distance: 10 	bias distance: 0.66 	stylo distance: 10.21
rank: 15 	jump distance: 10 	bias distance: 5.39 	stylo distance: 11.98
rank: 17 	jump distance: 13 	bias distance: 2.54 	stylo distance: 11.32
rank: 24 	jump distance: 10 	bias distance: 3.83 	stylo distance: 13.21
rank: 25 	jump distance: 17 	bias distance: 0.41 	stylo distance: 7.78
rank: 36 	jump d