In [108]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
import statsmodels.api as sm


In [109]:
#Scales dataset and metric to evaluate

scales = pd.read_excel("filled_scales_BEBRASK_RETOS.xlsx")

In [110]:
trend_data = pd.read_excel("trend_dataset.xlsx")

trend_data["Together_Match"] = trend_data["AVG_Match"] + trend_data["Trend_Match"]*27
trend_data["Together_No_Match"] = trend_data["AVG_No_Match"] + trend_data["Trend_No_Match"]*18

baseline_data = pd.read_excel("RETOS_BEBRASK_Baseline.xlsx")

In [111]:
features_dataset = pd.merge(left = baseline_data,right=trend_data, left_on="Subject",right_on="Subject ID").drop("Subject ID",axis=1)

In [112]:
scaler = StandardScaler()
scaler.fit(features_dataset.drop("Subject",axis=1))
features_dataset_scaled = scaler.transform(features_dataset.drop("Subject",axis=1))

In [113]:
features_dataset_scaled = pd.DataFrame(features_dataset_scaled)
features_dataset_scaled.insert(0, 'Subject', features_dataset['Subject'])

In [114]:
features_dataset_scaled.columns = features_dataset.columns

In [115]:
features_dataset_scaled

Unnamed: 0,Subject,Mean_Rating0,Dif_Match,Cor_Pred_Like,Trend_Match,Trend_No_Match,AVG_Match,AVG_No_Match,Together_Match,Together_No_Match
0,PREDWELL_RETOS-1-1,1.993171,0.632545,-0.926969,-0.658026,-1.132586,1.656228,1.562985,0.832344,-0.232422
1,PREDWELL_RETOS-10-1,0.502669,-0.852105,-0.534772,-1.059467,0.813927,0.833654,-0.149700,-0.734860,1.305163
2,PREDWELL_RETOS-1001-1,0.968451,0.274406,-0.064176,0.687980,-0.762995,0.085340,0.915671,1.186616,-0.317060
3,PREDWELL_RETOS-1002-1,1.247920,-0.253036,0.379229,1.113034,-1.280423,-0.223126,1.684357,1.510865,-0.359379
4,PREDWELL_RETOS-1003-1,-0.894676,0.066034,-1.081741,-0.539956,0.641452,0.011080,-1.039758,-0.842943,-0.049041
...,...,...,...,...,...,...,...,...,...,...
144,PREDWELL_RETOS-5-1,-0.149425,-0.305129,-0.493753,0.876893,-1.452898,-0.794358,1.252814,0.490081,-1.177544
145,PREDWELL_RETOS-6-1,-0.615207,-0.747919,0.568578,1.845073,-0.294846,-1.794014,0.187442,0.892390,-0.317060
146,PREDWELL_RETOS-7-1,0.130044,-0.148850,0.286641,-0.421885,-0.516601,0.370956,0.484128,-0.248486,-0.373485
147,PREDWELL_RETOS-8-1,0.595826,-0.852105,-0.645545,-0.091287,-0.393404,0.131038,0.956128,0.003708,0.402361


In [116]:
scales.drop(["SUBJECT_CODE","Age"],axis=1,inplace=True)
metrics_columns = scales.drop("EPRIME_CODE",axis=1).columns.values

In [117]:
feature_scales = pd.merge(left = features_dataset_scaled,right=scales, left_on="Subject",right_on="EPRIME_CODE").drop(["EPRIME_CODE","Subject"],axis=1)

In [118]:
features_columns = features_dataset.drop("Subject",axis=1).columns
dic_features_scales = {}
for metric in metrics_columns:
    target = feature_scales[metric]
    dic_features_scales[metric] = {}
    for feature in features_columns:
        
        X = feature_scales[feature]
        X2 = sm.add_constant(X)
        est = sm.OLS(target, X2)
        est2 = est.fit()
        p_values = est2.summary2().tables[1]['P>|t|']
        dic_features_scales[metric].update({feature:p_values.iloc[1]})

In [119]:
df_features_scales = pd.DataFrame(dic_features_scales)
df_features_scales.mean(axis=1)


Mean_Rating0         0.467371
Dif_Match            0.455141
Cor_Pred_Like        0.606172
Trend_Match          0.441686
Trend_No_Match       0.634640
AVG_Match            0.540235
AVG_No_Match         0.535258
Together_Match       0.453699
Together_No_Match    0.616108
dtype: float64

In [139]:
# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_85 = {}

for metric in metrics_columns:
    dic_mean_comparison_85[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.04][metric] #Per sobre 85%
        low_values = feature_scales[feature_scales[feature] <= 1.04][metric] #Per sota 85%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_85[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_85 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_85[i][j] 
                                             for i in dic_mean_comparison_85.keys() 
                                             for j in dic_mean_comparison_85[i].keys()},
                                            orient='index')
print("85%")
df_mean_comparison_85[df_mean_comparison_85["p_value"]<0.10]

85%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Trend_No_Match,35.631579,19,32.892308,130,0.061146
PA,Together_No_Match,35.190476,21,32.921875,128,0.068908
NA.,Dif_Match,18.714286,21,20.515625,128,0.073243
NA.,AVG_Match,22.666667,21,19.867188,128,0.012831
ERQ_CR,Dif_Match,5.412222,21,4.80737,128,0.000245
ERQ_CR,AVG_Match,5.214603,21,4.839792,128,0.056315
UPPSP_NU,AVG_No_Match,10.12,25,8.637097,124,0.016544
UPPSP_PU,AVG_No_Match,10.56,25,9.709677,124,0.06112
UPPSP_SS,Dif_Match,12.666667,21,10.554688,128,0.000973
UPPSP_PMD,Cor_Pred_Like,6.758621,29,7.561712,120,0.063913


In [140]:
df_mean_comparison_85.groupby(level=1)['p_value'].mean()


AVG_Match            0.440441
AVG_No_Match         0.516895
Cor_Pred_Like        0.546297
Dif_Match            0.351162
Mean_Rating0         0.537520
Together_Match       0.572697
Together_No_Match    0.501403
Trend_Match          0.585963
Trend_No_Match       0.494199
Name: p_value, dtype: float64

In [141]:


# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_90 = {}

for metric in metrics_columns:
    dic_mean_comparison_90[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.28][metric] #Per sobre 90%
        low_values = feature_scales[feature_scales[feature] <= 1.28][metric] #Per sota 90%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_90[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_90 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_90[i][j] 
                                             for i in dic_mean_comparison_90.keys() 
                                             for j in dic_mean_comparison_90[i].keys()},
                                            orient='index')
print("90%")
df_mean_comparison_90[df_mean_comparison_90["p_value"]<0.1]

90%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Trend_No_Match,36.090909,11,33.014493,138,0.064588
PA,Together_Match,36.384615,13,32.941176,136,0.093602
NA.,AVG_Match,23.066667,15,19.947761,134,0.026458
ERQ_CR,Dif_Match,5.416429,14,4.838296,135,0.003555
ERQ_CR,AVG_Match,5.445333,15,4.830746,134,0.007456
ERQ_ES,Together_Match,2.134615,13,3.119485,136,0.002806
UPPSP_NU,Mean_Rating0,10.7,10,8.755396,139,0.0446
UPPSP_PU,Mean_Rating0,10.8,10,9.784173,139,0.009083
UPPSP_SS,Dif_Match,13.071429,14,10.622222,135,0.001381
UPPSP_PMD,Cor_Pred_Like,6.7,20,7.514771,129,0.078667


In [142]:
df_mean_comparison_90.groupby(level=1)['p_value'].mean()

AVG_Match            0.395466
AVG_No_Match         0.589679
Cor_Pred_Like        0.595405
Dif_Match            0.328443
Mean_Rating0         0.568058
Together_Match       0.482768
Together_No_Match    0.555243
Trend_Match          0.614176
Trend_No_Match       0.444896
Name: p_value, dtype: float64

In [143]:
dic_mean_comparison = {}

for metric in metrics_columns:
    dic_mean_comparison[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.6][metric] #Per sobre 95%
        low_values = feature_scales[feature_scales[feature] <= 1.6][metric] #Per sota 95%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_95 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison[i][j] 
                                             for i in dic_mean_comparison.keys() 
                                             for j in dic_mean_comparison[i].keys()},
                                            orient='index')
print("95%")
df_mean_comparison_95[df_mean_comparison_95["p_value"]<0.1]



95%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Together_Match,39.125,8,32.907801,141,0.021389
NA.,Together_Match,16.5,8,20.475177,141,0.003203
ERQ_CR,Dif_Match,5.642381,7,4.855657,142,0.011265
ERQ_CR,AVG_Match,5.480417,8,4.859267,141,0.050537
ERQ_ES,AVG_No_Match,2.527778,9,3.066071,140,0.099975
ERQ_ES,Together_Match,1.75,8,3.106383,141,4e-05
UPPSP_NU,Mean_Rating0,10.777778,9,8.764286,140,0.060888
UPPSP_PU,Mean_Rating0,10.666667,9,9.8,140,0.023263
UPPSP_PU,AVG_Match,10.75,8,9.801418,141,0.0833
UPPSP_PU,Together_No_Match,8.777778,9,9.921429,140,0.015626


In [144]:
df_mean_comparison_95.groupby(level=1)['p_value'].mean()


AVG_Match            0.421083
AVG_No_Match         0.523863
Cor_Pred_Like        0.435817
Dif_Match            0.415787
Mean_Rating0         0.533614
Together_Match       0.319518
Together_No_Match    0.507024
Trend_Match          0.620207
Trend_No_Match       0.538885
Name: p_value, dtype: float64

In [145]:
dic_mean_comparison_5 = {}

for metric in metrics_columns:
    dic_mean_comparison_5[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] < -1.6][metric] #Per sota 5%
        low_values = feature_scales[feature_scales[feature] >= -1.6][metric] #Per sobre 5%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_5[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
print("5%")
df_mean_comparison_5 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_5[i][j] 
                                             for i in dic_mean_comparison_5.keys() 
                                             for j in dic_mean_comparison_5[i].keys()},
                                            orient='index')

df_mean_comparison_5[df_mean_comparison_5["p_value"]<0.1]

  res = hypotest_fun_out(*samples, **kwds)


5%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,28.2,5,33.416667,144,0.04771396
NA.,Trend_No_Match,17.6,10,20.453237,139,0.08950952
ERQ_ES,Mean_Rating0,1.9,5,3.072917,144,0.003894219
UPPSP_SS,Together_Match,9.181818,11,10.985507,138,0.07622709
UPPSP_PSV,AVG_No_Match,11.5,2,6.986395,147,0.03701629
BIS,Mean_Rating0,25.0,5,21.506944,144,0.02677734
BIS,Dif_Match,19.142857,7,21.746479,142,0.01370842
BAS_RR,AVG_Match,19.166667,6,17.398601,143,0.041818
BAS_RR,AVG_No_Match,20.0,2,17.435374,147,8.388936e-35
BAS_D,AVG_No_Match,13.0,2,11.857143,147,8.810011e-09


In [146]:
df_mean_comparison_5.groupby(level=1)['p_value'].mean()


AVG_Match            0.544163
AVG_No_Match         0.507565
Cor_Pred_Like        0.561616
Dif_Match            0.529052
Mean_Rating0         0.512888
Together_Match       0.434878
Together_No_Match    0.386037
Trend_Match          0.569449
Trend_No_Match       0.438827
Name: p_value, dtype: float64

In [149]:
dic_mean_comparison_10 = {}

for metric in metrics_columns:
    dic_mean_comparison_10[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] < -1.28][metric] #Per sota 10%
        low_values = feature_scales[feature_scales[feature] >= -1.28][metric] #Per sobre 10%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_10[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_10 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_10[i][j] 
                                             for i in dic_mean_comparison_10.keys() 
                                             for j in dic_mean_comparison_10[i].keys()},
                                            orient='index')
print("10%")

df_mean_comparison_10[df_mean_comparison_10["p_value"]<0.1]

10%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,29.533333,15,33.656716,134,0.042053
NA.,Dif_Match,22.0,12,20.109489,137,0.096622
ERQ_CR,Dif_Match,4.444722,12,4.931849,137,0.040691
ERQ_ES,Mean_Rating0,2.566667,15,3.085821,134,0.056328
ERQ_ES,Together_Match,2.464286,14,3.092593,135,0.047859
ERQ_ES,Together_No_Match,2.539474,19,3.105769,130,0.071601
UPPSP_PU,AVG_Match,11.05,20,9.666667,129,0.041523
UPPSP_SS,AVG_No_Match,8.888889,9,10.978571,140,0.087425
UPPSP_SS,Together_Match,9.285714,14,11.014815,135,0.036179
BIS,Mean_Rating0,23.0,15,21.470149,134,0.073733


In [150]:
df_mean_comparison_10.groupby(level=1)['p_value'].mean()


AVG_Match            0.570490
AVG_No_Match         0.418113
Cor_Pred_Like        0.489073
Dif_Match            0.602982
Mean_Rating0         0.385582
Together_Match       0.472063
Together_No_Match    0.494912
Trend_Match          0.532096
Trend_No_Match       0.514168
Name: p_value, dtype: float64

In [151]:
# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_15 = {}

for metric in metrics_columns:
    dic_mean_comparison_15[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] < -1.04][metric] #Per sota 15%
        low_values = feature_scales[feature_scales[feature] >= -1.04][metric] #Per sobre 15%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_15[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_15 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_15[i][j] 
                                             for i in dic_mean_comparison_15.keys() 
                                             for j in dic_mean_comparison_15[i].keys()},
                                            orient='index')
print("85%")
df_mean_comparison_15[df_mean_comparison_15["p_value"]<0.10]

85%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,29.9,20,33.75969,129,0.024928
PA,Dif_Match,35.823529,17,32.909091,132,0.076193
PA,AVG_Match,30.954545,22,33.637795,127,0.069265
ERQ_CR,Mean_Rating0,4.542,20,4.946977,129,0.095583
ERQ_CR,Together_No_Match,5.284444,24,4.817387,125,0.030373
ERQ_ES,Mean_Rating0,2.4875,20,3.118217,129,0.010794
ERQ_ES,Together_Match,2.663043,23,3.10119,126,0.088016
UPPSP_PU,AVG_Match,11.0,22,9.653543,127,0.032401
UPPSP_PU,Together_Match,9.130435,23,9.984127,126,0.066813
UPPSP_SS,Trend_Match,9.92,25,11.040323,124,0.07787


In [152]:
df_mean_comparison_15.groupby(level=1)['p_value'].mean()

AVG_Match            0.494690
AVG_No_Match         0.440877
Cor_Pred_Like        0.524054
Dif_Match            0.554948
Mean_Rating0         0.340630
Together_Match       0.410309
Together_No_Match    0.583480
Trend_Match          0.416062
Trend_No_Match       0.601684
Name: p_value, dtype: float64