In [1]:
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
tasks = ['binary', 'type']
representations = ['bow', 'freq', 'tfidf']
models = ['decision-tree', 'svm', 'naive-bayes']

In [3]:
#combine columns in a dataframe whose names contain certain substrings
def combine_columns(df, substrings):
    '''
    df: the input dataframe that has many column names that contain certain common substrings
    substrings: the list of substrings used for combination
    
    returns: `output_df`, a DataFrame with `substrings` as its columns. Each row is the mean
    of the value for that row of all of the columns in `df` that contained that substring.
    '''
    output_df = pd.DataFrame()
    
    for column_substring in substrings:
        temp_array = np.zeros(len(df))
        to_be_combined = [column for column in df.columns.values if column_substring in column]
        
        for column in to_be_combined:
            temp_array += np.array(df[column])
            
        output_df[column_substring] = temp_array
    
    return output_df

In [4]:
#find which models performed best using basic accuracy score
def evaluate_models(df, substrings):
    output_df = pd.DataFrame()
    temp_df = combine_columns(df, substrings)
    for column in substrings:
        temp_df[column] = temp_df[column] == df['Actual']
        output_df[column] = [sum(temp_df[column])/len(temp_df),]
    return output_df

#test model significance difference w/ Friedman Chi^2 while combining certain columns
def test_model_difference(df, substrings):
    df = combine_columns(df, substrings)
    friedman_result = stats.friedmanchisquare(
        df[substrings[0]],
        df[substrings[1]],
        df[substrings[2]],
        )
    return friedman_result

In [5]:
for task in tasks:
    df = pd.read_csv(f'{task}_predictions.csv')
    temp_df = evaluate_models(df, df.columns.values).transpose()
    temp_df = temp_df.sort_values(0, ascending=False)
    friedman_result = test_model_difference(df, df.columns.values[1:])
    print(f'Task: {task}.')
    print(f'Friedman chi square test p-value: {friedman_result.pvalue}')
    print(temp_df)
    print()

Task: binary.
Friedman chi square test p-value: 6.302286079316834e-89
                                           0
Actual                              1.000000
svm-bow-classification              0.769672
decision-tree-tfidf-classification  0.760656
svm-freq-classification             0.757377
decision-tree-freq-classification   0.756557
svm-tfidf-classification            0.731967
decision-tree-bow-classification    0.722131
naive-bayes-bow-classification      0.638525
naive-bayes-tfidf-classification    0.599180
naive-bayes-freq-classification     0.591803

Task: type.
Friedman chi square test p-value: 2.5197389103373674e-29
                                                 0
Actual                                    1.000000
svm-bow-type_of_antisemitism              0.582353
decision-tree-bow-type_of_antisemitism    0.576471
decision-tree-freq-type_of_antisemitism   0.567647
decision-tree-tfidf-type_of_antisemitism  0.555882
naive-bayes-freq-type_of_antisemitism     0.538235
naive-ba

In [6]:
for task in tasks:
    df = pd.read_csv(f'{task}_predictions.csv')
    for grouping in [representations, models]:
        temp_df = evaluate_models(df, grouping).transpose()
        temp_df = temp_df.sort_values(0, ascending=False)
        friedman_result = test_model_difference(df, grouping)
        
        print(f'Task: {task}. Grouping: {grouping[:3]}.')
        print(f'Friedman chi square test p-value: {friedman_result.pvalue}')
        print(temp_df)
        print()

Task: binary. Grouping: ['bow', 'freq', 'tfidf'].
Friedman chi square test p-value: 5.20681817543709e-10
              0
tfidf  0.550000
freq   0.516393
bow    0.506557

Task: binary. Grouping: ['decision-tree', 'svm', 'naive-bayes'].
Friedman chi square test p-value: 4.3991847899599944e-73
                      0
svm            0.713934
decision-tree  0.568033
naive-bayes    0.466393

Task: type. Grouping: ['bow', 'freq', 'tfidf'].
Friedman chi square test p-value: 8.366475064253767e-08
              0
tfidf  0.397059
bow    0.347059
freq   0.338235

Task: type. Grouping: ['decision-tree', 'svm', 'naive-bayes'].
Friedman chi square test p-value: 1.936914965049763e-23
                      0
svm            0.479412
naive-bayes    0.320588
decision-tree  0.300000



It can be concluded that both the model and text representation have significant results on the predictions, especially the model.