# F-test

## Import packages

In [1]:
'''
Credit to GitHub user Jaimin09
Link: https://github.com/Jaimin09/Coding-Lane-Assets/tree/main/Logistic%20Regression%20in%20Python%20from%20Scratch
Last accessed: 28/10/2021
'''
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import portablelogresmodel as model
import scipy

from scipy import stats



## Import data and set seed 

In [2]:
# ! Get dataset
filepath = 'Datasets\dec_sep_MPHWA.csv' # Data for Specialized athletes i.e. competing in one sport
df = pd.read_csv(filepath)
df = df.reset_index() # Resets index for dataset 

dec_path = 'Datasets\dec_MPHWA.csv' # Data for decathlon athletes 
dec_df = pd.read_csv(dec_path)
dec_df = dec_df.reset_index() # Resets index for dataset 

# ! Set seed and seed calling function
rng = np.random.default_rng(1) 

# List of variables we are interested in 
X_list = ['ID',  
        'PreviousMedals',
        'Height_div_avg', 
        'Weight_div_avg', 
        'Age_div_avg'
        ]

# List containing medals earned for each athlete  
Y_list = ['ID', 'MedalEarned']

# Import model weights after portableregressionmodel.py has been run 
W_array = np.genfromtxt('Parameters/W.csv', delimiter=',')# Len of array is equal to the number of iterations of the model
B_array = np.genfromtxt('Parameters/B.csv', delimiter=',')

# F-test

# Wrangle data

In [3]:
# Produce weight for model
def ProduceWeights(df, W, B):

    # Get data for model
    X_model_df = df[X_list] # Dataframe contains X_list columns 
    Y_model_df = df[Y_list]

    X_array, Y_array = model.Reshape(X_model_df, Y_model_df) # Drops ID and transforms each column to a numpy array

    # Get model guesses 
    
    # Use dot product on the four weights and the variable values for each athlete i.e. Ath1 = w1*var1 + w2*var2(..) + B
    lin_func = np.dot(W.T, X_array) + B
    
    # Use linear expression in sigmoid function to get model guess for each athlete 
    sf = model.Sigmoid(lin_func) 
    
    return sf



In [4]:
# Generate model predictions in two cases. 
# 0. Model weights remain unchanged 
# 1. ith weight is set to 0
# Append model guesses to lists dependent on  the number of variables i.e. 4 variables creates a list of 4 arrays. 

def f_test_loop(W, B):
    
    # Create list to store model predictions in two cases
    ### DEFINE SIG0 BEFORE LOOP + CHANGE VARIABLE NAMES
    sig_0_store = [] 
    sig_1_store = []  
    drop_ID = X_list[1:]
    
    for index,element in enumerate(drop_ID):
        
        # Print output for each iteration
        print(f'The current index is {index}')
        print(f'The current element is {element}')
        
        # Change weights back to values found in W
        W1 = copy.deepcopy(W)
        
        W1[index] = 0 # index weight set to 0
        
        # Get list of model guesses with unchanged weights 
        sig_0 = ProduceWeights(df, W, B)
        print(sig_0)
        
        # Get a list of model guesses where weights are changed
        sig_1 = ProduceWeights(df, W1, B)
        print(sig_1)
        
        # Append array of model guesses to list 
        sig_0_store.append(sig_0)
        sig_1_store.append(sig_1)
    
    return sig_0_store, sig_1_store

# Call F_test

In [5]:
# Get first weight in CSV file of weights 
W_par = np.array([W_array[0][0], W_array[1][0], W_array[2][0], W_array[3][0]], ndmin= 0)
B_par = B_array[0]

# Create list of arrays of model guesses 
p_vector_0, p_vector_1 = f_test_loop(W_par,B_par) 

vores i er 0
vores element er PreviousMedals
vores len er 4
[0.92524216 0.20173657 0.43566097 ... 0.55253714 0.37581016 0.29995489]
[0.92524216 0.20173657 0.43566097 ... 0.55253714 0.37581016 0.29995489]
vores i er 1
vores element er Height_div_avg
vores len er 4
[0.92524216 0.20173657 0.43566097 ... 0.55253714 0.37581016 0.29995489]
[0.92213526 0.19923577 0.41987367 ... 0.55507281 0.36356257 0.31676768]
vores i er 2
vores element er Weight_div_avg
vores len er 4
[0.92524216 0.20173657 0.43566097 ... 0.55253714 0.37581016 0.29995489]
[0.32566492 0.33365407 0.46003613 ... 0.46465944 0.4624237  0.44020105]
vores i er 3
vores element er Age_div_avg
vores len er 4
[0.92524216 0.20173657 0.43566097 ... 0.55253714 0.37581016 0.29995489]
[0.94741978 0.25644853 0.39402305 ... 0.48648425 0.33168873 0.25306099]


# Test for normality

In [6]:
# Test each array in sig0_store and sig1_store for normality 
# 0. Model weights remain unchanged 
# 1. ith weight is set to 0
# Append result to list 

def normality_loop(sig_0_probabilities, sig_1_probabilities):
    
    # Create lists to store result of normality test 
    x_norm = [] ### CHANGE VARIABLE NAMES 
    y_norm = []
    drop_ID = X_list[1:] # Only used for print statement 
    
    # Normality test is done for each array in the two lists of model guesses 
    for i in range(len(sig_0_probabilities)): # Ensure 
        x_norm.append(scipy.stats.shapiro(sig_0_probabilities[i])) ### CREATES TOO MANY TESTS, WE JUST NEED 1
        y_norm.append(scipy.stats.shapiro(sig_1_probabilities[i]))
        print(f' vores variabel er: {drop_ID[i]} \n vores x_norm er {x_norm[i]}, \n vores y_norm er {y_norm[i]}')
        
    return x_norm,y_norm

shapiro_0, shapiro_1 = normality_loop(p_vector_0, p_vector_1)



#z = scipy.stats.probplot(df_sig_0["Prob"], dist = 'norm', plot = plt)
#q = scipy.stats.probplot(df_sig_1["Prob"], dist = 'norm', plot = plt)

#plt.show(z)
#plt.show(q)

# Test for normality showed that including previous medals, caused the distribution to be non-normal. PreviousMedals is therefore excluded. 

 vores variabel er: PreviousMedals 
 vores x_norm er ShapiroResult(statistic=0.8946238160133362, pvalue=0.0), 
 vores y_norm er ShapiroResult(statistic=0.8800283074378967, pvalue=0.0)
 vores variabel er: Height_div_avg 
 vores x_norm er ShapiroResult(statistic=0.8946238160133362, pvalue=0.0), 
 vores y_norm er ShapiroResult(statistic=0.8890291452407837, pvalue=0.0)
 vores variabel er: Weight_div_avg 
 vores x_norm er ShapiroResult(statistic=0.8946238160133362, pvalue=0.0), 
 vores y_norm er ShapiroResult(statistic=0.8226956129074097, pvalue=0.0)
 vores variabel er: Age_div_avg 
 vores x_norm er ShapiroResult(statistic=0.8946238160133362, pvalue=0.0), 
 vores y_norm er ShapiroResult(statistic=0.8803070187568665, pvalue=0.0)




# F-test

In [7]:
# F-test: https://link.springer.com/book/10.1007%2F978-3-319-46162-5 
# Code adapted from: https://www.statology.org/f-test-python/

# F test arrays of model guesses 
# 0. Model weights remain unchanged 
# 1. ith weight is set to 0

def f_test(sig_0_probabilities,sig_1_probabilities):
    f_test_list = []
    drop_ID = X_list[1:]
    
    for i in range(len(x)):
        f = np.var(sig_0_probabilities[i], ddof=1)/np.var(sig_1_probabilities[i], ddof=1) #calculate F test statistic 
        dfn = sig_0_probabilities[i].size-1 #define degrees of freedom numerator 
        dfd = sig_1_probabilities[i].size-1 #define degrees of freedom denominator 
        p = 1-scipy.stats.f.cdf(f, dfn, dfd) #find p-value of F test statistic
        f_test_list.append(p)
        
        # Print current variable and p value 
        print(f'vores variabel er {drop_ID[i]} \n vores p er {p}')
        
        # Define alpha and print conclusion of hypothesis test 
        if p <= 0.05:
            print(f'nulhypotesen forkastes da p-værdien = {p}. Dermed er sandsynligheden for at de to fordelinger har samme varians lav. Variablen har stor betydning for modellens output.')
        else:
            print(f'nulhypotesen kan ikke forkastes da p-værdien = {p}. Dermed er der høj sandsynlighed for at de to fordelinger har samme varians. Variablen kan have stor betydning for modellens output.')
    
    return f_test_list

f_test(p_vector_0, p_vector_1)





vores variabel er PreviousMedals 
 vores p er 5.223256799169462e-05
nulhypotesen forkastes da p-værdien = 5.223256799169462e-05. Dermed er sandsynligheden for at de to fordelinger har samme varians lav. Variablen har stor betydning for modellens output.
vores variabel er Height_div_avg 
 vores p er 0.02825330415181948
nulhypotesen forkastes da p-værdien = 0.02825330415181948. Dermed er sandsynligheden for at de to fordelinger har samme varians lav. Variablen har stor betydning for modellens output.
vores variabel er Weight_div_avg 
 vores p er 1.1102230246251565e-16
nulhypotesen forkastes da p-værdien = 1.1102230246251565e-16. Dermed er sandsynligheden for at de to fordelinger har samme varians lav. Variablen har stor betydning for modellens output.
vores variabel er Age_div_avg 
 vores p er 0.9737840882713829
nulhypotesen kan ikke forkastes da p-værdien = 0.9737840882713829. Dermed er der høj sandsynlighed for at de to fordelinger har samme varians. Variablen kan have stor betydning f

[5.223256799169462e-05,
 0.02825330415181948,
 1.1102230246251565e-16,
 0.9737840882713829]