# Setup

## Function Setup

In [23]:
from scipy.stats import spearmanr
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import pandas as pd
from scipy import stats
import requests
import json
import random
from sklearn.metrics import accuracy_score
from scipy.stats import norm
import math
from scipy.stats import kendalltau
from statsmodels.stats.multicomp import MultiComparison

In [24]:
def encode_categorical_variables(df, categorical_vars):
    """One-hot encode the listed categorical columns of ``df``.

    Every level of each listed column gets its own indicator column
    (``drop_first=False``), so no reference level is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    categorical_vars : list of str
        Names of the columns to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pd.get_dummies``.
    """
    return pd.get_dummies(df, columns=categorical_vars, drop_first=False)

from sklearn.linear_model import LinearRegression

def calculate_feature_importances_and_coef(df, variables, target):
    """Fit a linear regression and print each coefficient with its sign.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    variables : list of str
        Names of the feature columns, in the order they are reported.
    target : str
        Name of the column to regress on ``variables``.

    Returns
    -------
    None
        Results are printed, one line per feature.
    """
    # Restrict the frame to the columns that take part in the regression
    subset = df[variables + [target]]
    features = subset[variables]
    response = subset[target]

    model = LinearRegression()
    model.fit(features, response)

    # The fitted coefficients act as the importance measure here
    for variable, importance in zip(variables, model.coef_):
        if importance > 0:
            direction = 'positive'
        elif importance < 0:
            direction = 'negative'
        else:
            # A coefficient of exactly zero has no direction; it used to be
            # reported as 'negative'.
            direction = 'none'
        print(f"{variable}: {importance} (influence direction: {direction})")

def calculate_feature_importances(df, variables, target):
    """Print cross-validation scores and random-forest feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    variables : list of str
        Names of the feature columns, in the order they are reported.
    target : str
        Name of the column to predict from ``variables``.

    Returns
    -------
    None
        Scores and importances are printed.
    """
    data = df[variables + [target]]
    features, response = data[variables], data[target]

    # Fixed random_state so repeated runs give the same forest
    forest = RandomForestRegressor(random_state=0)

    # 5-fold cross-validation (estimator's default scorer) before the final fit
    fold_scores = cross_val_score(forest, features, response, cv=5)
    print(f"Cross validation scores: {fold_scores}")
    print(f"Average cross-validation score: {np.mean(fold_scores)}")

    # Final fit on all rows; the importances are read from this model
    forest.fit(features, response)

    for name, weight in zip(variables, forest.feature_importances_):
        print(f"{name}: {weight}")

def perform_anova(df, cat_vars, target_var):
    """Run a one-way ANOVA of ``target_var`` against each categorical column.

    For every column in ``cat_vars`` the rows are split by the column's
    levels and ``scipy.stats.f_oneway`` is applied to the resulting groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns and the target column.
    cat_vars : list of str
        Names of the categorical columns to test, one ANOVA per column.
    target_var : str
        Name of the numeric column whose group means are compared.

    Returns
    -------
    tuple
        ``(anova_results, anova_df)`` where ``anova_results`` maps each
        column name to ``{"F": F, "p": p}`` and ``anova_df`` is a DataFrame
        with columns ``['Feature', 'F', 'p']`` holding the same values.
    """
    anova_results = {}
    rows = []
    for cat in cat_vars:
        # Missing labels (NaN/None) are removed before enumerating levels:
        # ``df[cat] == NaN`` never matches, so a missing label would produce an
        # empty group and turn the whole test into NaN.
        levels = df[cat].dropna().unique()
        groups = [df[target_var][df[cat] == level].values for level in levels]
        F, p = stats.f_oneway(*groups)
        anova_results[cat] = {"F": F, "p": p}
        rows.append((cat, F, p))
    anova_df = pd.DataFrame(rows, columns=['Feature', 'F', 'p'])

    return anova_results, anova_df

def compare_rankings_spearman(rank1, rank2):
    """Return Spearman's rank correlation coefficient of two rankings.

    The p-value that ``spearmanr`` also reports is discarded.
    """
    result = spearmanr(rank1, rank2)
    return result[0]

def compare_rankings_kendall(rank1, rank2):
    """Return Kendall's tau of two rankings.

    The p-value that ``kendalltau`` also reports is discarded.
    """
    result = kendalltau(rank1, rank2)
    return result[0]

def check_anova_assumptions(df, input_vars, target_var):
    """Print normality and equal-variance diagnostics for each factor.

    For every column in ``input_vars`` the target values are grouped by the
    column's levels. A Shapiro-Wilk test is run on each group and a Levene
    test across all groups; each verdict uses a 0.05 threshold on the p-value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the factor columns and the target column.
    input_vars : list of str
        Names of the grouping columns to check.
    target_var : str
        Name of the numeric column being tested.

    Returns
    -------
    None
        All results are printed.
    """
    for var in input_vars:
        print(f"\nChecking assumptions for {var}:")

        # One list of target values per level of the factor
        groups = df.groupby(var)[target_var].apply(list)

        # Shapiro-Wilk per group; groups are reported by position, not by label
        for i, group in enumerate(groups):
            normality_p = stats.shapiro(group)[1]
            message = (
                f'Group {i} looks Gaussian (fail to reject H0)'
                if normality_p > 0.05
                else f'Group {i} does not look Gaussian (reject H0)'
            )
            print(message)
            print(normality_p)

        # Levene's test across all groups of this factor
        variance_p = stats.levene(*groups)[1]
        print(
            'The variances look equal (fail to reject H0)'
            if variance_p > 0.05
            else 'The variances do not look equal (reject H0)'
        )

def calulate_range(df,target_name,sample):
    """Add a 95% normal-approximation interval around a proportion column.

    For each row the standard error is ``sqrt(p * (1 - p) / sample)`` with
    ``p`` taken from ``target_name``; the bounds are ``p -/+ z * se``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the proportion column. It is modified in place:
        columns ``'lower'`` and ``'upper'`` are added or overwritten.
    target_name : str
        Name of the column holding the proportion for each row.
    sample : int or float
        Sample size used in the standard error (the same for every row).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    confidence_level = 0.95  # for a 95% confidence interval
    z = norm.ppf(1 - (1 - confidence_level) / 2)

    def interval(p):
        # Half-width of the interval for a single proportion
        half_width = z * math.sqrt((p * (1 - p)) / (sample))
        return p - half_width, p + half_width

    bounds = [interval(row[target_name]) for _, row in df.iterrows()]
    df['lower'] = [low for low, _ in bounds]
    df['upper'] = [high for _, high in bounds]
    return df
from statsmodels.stats.multicomp import MultiComparison

## Data Setup


In [25]:
# Load the run results that every later cell works on
df = pd.read_csv('data/data_files/wandb_results/wandb_finalrun.csv')

# Experiment factors that are treated as categorical inputs
categorical_vars = ["articles", "filter", "labeling", "pos", "resolved"]

# Indicator-column names, one per factor level
# (presumably what one-hot encoding `categorical_vars` yields -- verify)
encoded_cats = [
    'articles_excellent',
    'articles_protected',
    'articles_random',
    'articles_readworthy',
    'filter_filter.csv',
    'filter_nofilter.csv',
    'labeling_link',
    'labeling_namelink',
    'labeling_quot',
    'pos_masked',
    'pos_nopos',
    'pos_pos',
    'resolved_nonresolved',
    'resolved_resolved',
]

# Both rankings show the same columns and differ only in the sort key
_ranking_columns = categorical_vars + ['transfer_acc', 'test_acc', 'fileindex']
test_rank = df[_ranking_columns].sort_values(by='test_acc', ascending=False)
trans_rank = df[_ranking_columns].sort_values(by='transfer_acc', ascending=False)

### POS-Speedup

In [26]:
# Mean runtime of the runs with pos == 'pos' versus all remaining runs
pos_performance = df.loc[df.pos == 'pos', 'Runtime'].mean()
rest_performance = df.loc[df.pos != 'pos', 'Runtime'].mean()

# Relative runtime reduction of the 'pos' runs, as a fraction of the rest
pos_decrease = rest_performance - pos_performance
percent_speedup = pos_decrease / rest_performance
print(f"POS masking has a computational speed up of: {percent_speedup*100} % ")


POS masking has a computational speed up of: 33.02631578947369 % 


# Results

## Top 10 

### By Test Accuracy:

In [27]:
# Ten best configurations by test accuracy (test_rank is sorted by test_acc, descending)
test_rank.head(10)

Unnamed: 0,articles,filter,labeling,pos,resolved,transfer_acc,test_acc,fileindex
22,protected,nofilter.csv,namelink,pos,resolved,0.628644,0.6641,20
21,random,nofilter.csv,namelink,pos,resolved,0.641318,0.663,98
5,protected,nofilter.csv,namelink,pos,nonresolved,0.690748,0.6596,115
0,readworthy,nofilter.csv,namelink,pos,resolved,0.760456,0.6584,135
6,excellent,nofilter.csv,namelink,pos,nonresolved,0.685678,0.6583,16
8,random,nofilter.csv,namelink,pos,nonresolved,0.675539,0.657,74
32,random,filter.csv,link,pos,nonresolved,0.608365,0.655,32
1,readworthy,nofilter.csv,namelink,pos,nonresolved,0.757921,0.6523,58
13,readworthy,filter.csv,namelink,pos,nonresolved,0.661597,0.6493,89
15,excellent,filter.csv,namelink,pos,nonresolved,0.65526,0.649,130


### By Transfer Accuracy:

In [28]:
# Ten best configurations by transfer accuracy (trans_rank is sorted by transfer_acc, descending)
trans_rank.head(10)

Unnamed: 0,articles,filter,labeling,pos,resolved,transfer_acc,test_acc,fileindex
0,readworthy,nofilter.csv,namelink,pos,resolved,0.760456,0.6584,135
1,readworthy,nofilter.csv,namelink,pos,nonresolved,0.757921,0.6523,58
2,protected,nofilter.csv,link,pos,resolved,0.745247,0.6301,24
3,excellent,nofilter.csv,namelink,pos,resolved,0.736375,0.6384,123
4,protected,nofilter.csv,link,nopos,nonresolved,0.697085,0.6114,21
5,protected,nofilter.csv,namelink,pos,nonresolved,0.690748,0.6596,115
6,excellent,nofilter.csv,namelink,pos,nonresolved,0.685678,0.6583,16
7,random,nofilter.csv,link,pos,resolved,0.679341,0.6382,65
8,random,nofilter.csv,namelink,pos,nonresolved,0.675539,0.657,74
9,readworthy,nofilter.csv,link,pos,resolved,0.667934,0.639,0


## Feature importance

### ANOVA

#### ANOVA prerequisites Transfer Accuracy

In [29]:
# Print the Shapiro-Wilk and Levene diagnostics of transfer_acc for every categorical factor
check_anova_assumptions(df,categorical_vars,'transfer_acc')


Checking assumptions for articles:
Group 0 does not look Gaussian (reject H0)
0.0001191661722259596
Group 1 does not look Gaussian (reject H0)
0.0018771820468828082
Group 2 does not look Gaussian (reject H0)
0.00802804995328188
Group 3 looks Gaussian (fail to reject H0)
0.19863443076610565
The variances look equal (fail to reject H0)

Checking assumptions for filter:
Group 0 does not look Gaussian (reject H0)
0.0005169930518604815
Group 1 does not look Gaussian (reject H0)
2.686220977921039e-05
The variances do not look equal (reject H0)

Checking assumptions for labeling:
Group 0 does not look Gaussian (reject H0)
0.04491310566663742
Group 1 does not look Gaussian (reject H0)
0.0018110014498233795
Group 2 does not look Gaussian (reject H0)
0.0002676627191249281
The variances do not look equal (reject H0)

Checking assumptions for pos:
Group 0 does not look Gaussian (reject H0)
1.0775072951219045e-05
Group 1 does not look Gaussian (reject H0)
0.00031883417977951467
Group 2 looks Gauss

#### Results ANOVA Transfer Accuracy

In [30]:
# One-way ANOVA of transfer_acc against each categorical factor; one row (F, p) per factor
anova_dict, anova_df = perform_anova(df,categorical_vars,'transfer_acc')
print(anova_df)

    Feature          F             p
0  articles   1.290157  2.802434e-01
1    filter   0.608140  4.367868e-01
2  labeling   4.071418  1.909758e-02
3       pos  50.363434  3.129971e-17
4  resolved   0.026183  8.716836e-01


#### Results ANOVA Test Accuracy

In [31]:
# One-way ANOVA of test_acc against each categorical factor; one row (F, p) per factor
anova_dict_test, anova_df_test = perform_anova(df,categorical_vars,'test_acc')
print(anova_df_test)

    Feature          F             p
0  articles   1.132583  3.381400e-01
1    filter   9.186926  2.897609e-03
2  labeling  50.871189  2.329070e-17
3       pos  15.051441  1.189340e-06
4  resolved   0.280503  5.971990e-01


### Tukey HSD

#### Tukey HSD of 'labeling'

In [32]:
# Tukey HSD: pairwise comparison of mean transfer_acc between the levels of 'labeling'
mc = MultiComparison(df['transfer_acc'], df['labeling'])
result = mc.tukeyhsd()
print(result)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
    link namelink    0.015 0.6325 -0.0238 0.0538  False
    link     quot  -0.0309  0.147 -0.0697 0.0079  False
namelink     quot  -0.0458 0.0161 -0.0846 -0.007   True
-------------------------------------------------------


#### Tukey HSD of pos/mask

In [33]:
# Tukey HSD: pairwise comparison of mean transfer_acc between the levels of 'pos'
mc = MultiComparison(df['transfer_acc'], df['pos'])
result = mc.tukeyhsd()
print(result)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
masked  nopos   0.0198 0.2778 -0.0107 0.0502  False
masked    pos   0.1204    0.0  0.0899 0.1509   True
 nopos    pos   0.1007    0.0  0.0702 0.1311   True
---------------------------------------------------


In [34]:
# NOTE(review): this cell repeats the 'labeling' Tukey HSD on transfer_acc that is
# already run under the "Tukey HSD of 'labeling'" heading; the output is identical.
mc = MultiComparison(df['transfer_acc'], df['labeling'])
result = mc.tukeyhsd()
print(result)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1   group2  meandiff p-adj   lower  upper  reject
-------------------------------------------------------
    link namelink    0.015 0.6325 -0.0238 0.0538  False
    link     quot  -0.0309  0.147 -0.0697 0.0079  False
namelink     quot  -0.0458 0.0161 -0.0846 -0.007   True
-------------------------------------------------------


#### Extra Calculation Test Accuracy: Tukey HSD of 'filter'

In [35]:

# Tukey HSD: comparison of mean test_acc between the two levels of 'filter'
mc = MultiComparison(df['test_acc'], df['filter'])
result = mc.tukeyhsd()
print(result)

    Multiple Comparison of Means - Tukey HSD, FWER=0.05     
  group1      group2    meandiff p-adj  lower  upper  reject
------------------------------------------------------------
filter.csv nofilter.csv   0.0193 0.0029 0.0067 0.0319   True
------------------------------------------------------------
