In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import cochrans_q
import numpy as np
import scikit_posthocs as sp

In [2]:
THRESHOLD = 0.05 #alpha significance level
tasks = {
    'binary': [0, 1], 
    'type': [0, 1, 2, 3]
}
representations = ['bow', 'freq', 'tfidf']
models = ['decision-tree', 'svm', 'naive-bayes', 'naive-bayes-multinomial']

In [3]:
#find the mode of a numpy array
def mode(array):
    freq = np.bincount(array)
    return np.argmax(freq)

In [4]:
#combine columns in a dataframe whose names contain certain substrings
def combine_columns(df, substrings):
    '''
    df: the input dataframe that has many column names that contain certain common substrings
    substrings: the list of substrings used for combination
    
    returns: `output_df`, a DataFrame with `substrings` as its columns. Each row is the mode
    of the value for that row of all of the columns in `df` that contained that substring.
    '''
    output_df = pd.DataFrame()
    
    for column_substring in substrings:
        temp_df = pd.DataFrame()
        to_be_combined = [column for column in df.columns.values if column_substring in column]
        
        for column in to_be_combined:
            temp_df[column] = df[column]
            
        array = np.array(temp_df).astype(int)
        array = [mode(row) for row in array]
            
        output_df[column_substring] = array
    
    return output_df

In [5]:
#find which models performed best using basic accuracy score
def evaluate_models(df, substrings):
    output_df = pd.DataFrame()
    temp_df = combine_columns(df, substrings)
    for column in substrings:
        temp_df[column] = temp_df[column] == df['Actual']
        output_df[column] = [sum(temp_df[column])/len(temp_df),]
    output_df.index = ['Score']
    if 'index' in output_df.columns.values: del output_df['index']
    return output_df

#test model significance difference w/ Friedman Chi^2 while combining certain columns
def test_model_difference(df, substrings):
    df = combine_columns(df, substrings)
    columns = [df[substring] for substring in substrings]
    cochran_result = cochrans_q(columns)
    return cochran_result

def post_hoc_tests(df, substrings):
    df = combine_columns(df, substrings)
    columns = [df[substring] for substring in substrings]
    p_values = sp.posthoc_dunn(columns)
    return p_values

In [20]:
#get a list of binary dfs of whether or not they fit with a particular classification
def get_dfs(df, classifications):
    dfs = []
    for classification in classifications:
        temp_df = df[df['Actual'] == classification]
        temp_df = temp_df.reset_index()
        dfs.append(temp_df)
    return dfs

#find specific rankings for each model
def overall_evaluation(task, classifications):
    df = pd.read_csv(f'{task}_predictions.csv')
    score_df = pd.DataFrame()
    dfs = get_dfs(df, classifications)
        
    for (i, temp_df) in enumerate(dfs):
        temp_df = evaluate_models(temp_df, temp_df.columns.values).transpose()
        temp_df = temp_df.sort_values('Score', ascending=False)
        
        #get results of hypothesis tests (cochran's q with post-hoc analysis if significant)
        cochran_result = test_model_difference(temp_df, temp_df.columns.values)
        post_hoc_p = []
        if cochran_result.pvalue <= THRESHOLD:
            post_hoc_p = post_hoc_tests(temp_df, temp_df.columns.values)
            
        score_df[f'Score {i}'] = temp_df['Score']
    
    score_df['Score'] = score_df.mean(axis=1)
    score_df = score_df.sort_values('Score', ascending = False)
    
    print(f'Task: {task}.')
    print(f'Cochran q test p-value: {cochran_result.pvalue}')
    if post_hoc_p:
        print(f'Post-hoc p-values: {post_hoc_p}')
    
    print(score_df)
    print(f'Mean: {score_df["Score"].mean()}')
    print('\n\n')

In [21]:
def evaluate_rep_model(grouping, dfs):
    pvalues = []
    final_df = pd.DataFrame()
    all_post_hocs = []
    
    for i, df in enumerate(dfs):
        temp_df = evaluate_models(df, grouping).transpose()
        temp_df = temp_df.sort_values('Score', ascending=False)
        pvalue = test_model_difference(df, grouping).pvalue
        
        post_hoc_ps = post_hoc_tests(df, grouping)
        all_post_hocs.append(post_hoc_ps)
        
        pvalues.append(pvalue)
        temp_df.columns = ['Score']
        final_df[f'Score {i}'] = temp_df['Score']
        
    return final_df, pvalues, post_hoc_ps

#evaluate algorithms and representations, find significance level
def algorithm_and_rep_eval(task, classifications):
    df = pd.read_csv(f'{task}_predictions.csv')
    dfs = []
    for classification in classifications:
        temp_df = df[df['Actual'] == classification]
        temp_df = temp_df.reset_index()
        dfs.append(temp_df)
    for grouping in [representations, models]:
        score_df, pvalues, post_hocs = evaluate_rep_model(grouping, dfs)
        score_df['Score'] = score_df.mean(axis=1)
        score_df = score_df.sort_values('Score', ascending = False)
        
        print(f'Task: {task}. Grouping: {grouping}.')
        print(f'Cochran q test p-values: {pvalues}')
        print(f'Post-hoc Dunn test p-values: \n{post_hocs}')
        print(f"Scores: \n{score_df}")
        print(f'Mean: {score_df["Score"].mean()}')
        print('\n\n')

In [22]:
for task in tasks:
    print(f'\nTask: {task.title()}\n')
    overall_evaluation(task, tasks[task]) #find the rankings for all models
    algorithm_and_rep_eval(task, tasks[task]) #group representations and algorithms together, evaluate


Task: Binary

Task: binary.
Cochran q test p-value: 0.44567964136461097
                                               Score 0   Score 1     Score
Actual                                        1.000000  1.000000  1.000000
decision-tree-freq-classification             0.848174  0.488372  0.668273
decision-tree-tfidf-classification            0.839041  0.482558  0.660800
svm-bow-classification                        0.921233  0.398256  0.659744
naive-bayes-multinomial-bow-classification    0.855023  0.418605  0.636814
decision-tree-bow-classification              0.827626  0.444767  0.636197
svm-freq-classification                       0.965753  0.235465  0.600609
naive-bayes-multinomial-tfidf-classification  0.696347  0.462209  0.579278
naive-bayes-bow-classification                0.730594  0.375000  0.552797
naive-bayes-freq-classification               0.676941  0.404070  0.540505
naive-bayes-tfidf-classification              0.688356  0.383721  0.536039
svm-tfidf-classification   