In [57]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats
import statsmodels.api as sm


In [58]:
#Scales dataset and metric to evaluate

scales = pd.read_excel("filled_scales_BEBRASK_RETOS.xlsx")

In [59]:
trend_data = pd.read_excel("trend_dataset.xlsx")
baseline_data = pd.read_excel("RETOS_BEBRASK_Baseline.xlsx")

In [60]:
features_dataset = pd.merge(left = baseline_data,right=trend_data, left_on="Subject",right_on="Subject ID").drop("Subject ID",axis=1)

In [61]:
scaler = StandardScaler()
scaler.fit(features_dataset.drop("Subject",axis=1))
features_dataset_scaled = scaler.transform(features_dataset.drop("Subject",axis=1))

In [62]:
features_dataset_scaled = pd.DataFrame(features_dataset_scaled)
features_dataset_scaled.insert(0, 'Subject', features_dataset['Subject'])

In [63]:
features_dataset_scaled.columns = features_dataset.columns

In [64]:
features_dataset_scaled

Unnamed: 0,Subject,Mean_Rating0,Dif_Match,Cor_Pred_Like,Trend_Match,Trend_No_Match,Intercept_Match,Intercept_No_Match,Match_Final_Value,No_Match_Final_Value
0,PREDWELL_RETOS-1-1,1.993171,0.632545,-0.926969,-0.658026,-1.132586,1.656228,1.562985,0.832344,-0.232422
1,PREDWELL_RETOS-10-1,0.502669,-0.852105,-0.534772,-1.059467,0.813927,0.833654,-0.149700,-0.734860,1.305163
2,PREDWELL_RETOS-1001-1,0.968451,0.274406,-0.064176,0.687980,-0.762995,0.085340,0.915671,1.186616,-0.317060
3,PREDWELL_RETOS-1002-1,1.247920,-0.253036,0.379229,1.113034,-1.280423,-0.223126,1.684357,1.510865,-0.359379
4,PREDWELL_RETOS-1003-1,-0.894676,0.066034,-1.081741,-0.539956,0.641452,0.011080,-1.039758,-0.842943,-0.049041
...,...,...,...,...,...,...,...,...,...,...
144,PREDWELL_RETOS-5-1,-0.149425,-0.305129,-0.493753,0.876893,-1.452898,-0.794358,1.252814,0.490081,-1.177544
145,PREDWELL_RETOS-6-1,-0.615207,-0.747919,0.568578,1.845073,-0.294846,-1.794014,0.187442,0.892390,-0.317060
146,PREDWELL_RETOS-7-1,0.130044,-0.148850,0.286641,-0.421885,-0.516601,0.370956,0.484128,-0.248486,-0.373485
147,PREDWELL_RETOS-8-1,0.595826,-0.852105,-0.645545,-0.091287,-0.393404,0.131038,0.956128,0.003708,0.402361


In [65]:
scales.drop(["SUBJECT_CODE","Age"],axis=1,inplace=True)
metrics_columns = scales.drop("EPRIME_CODE",axis=1).columns.values

In [66]:
feature_scales = pd.merge(left = features_dataset_scaled,right=scales, left_on="Subject",right_on="EPRIME_CODE").drop(["EPRIME_CODE","Subject"],axis=1)

In [67]:
features_columns = features_dataset.drop("Subject",axis=1).columns
dic_features_scales = {}
for metric in metrics_columns:
    target = feature_scales[metric]
    dic_features_scales[metric] = {}
    for feature in features_columns:
        
        X = feature_scales[feature]
        X2 = sm.add_constant(X)
        est = sm.OLS(target, X2)
        est2 = est.fit()
        p_values = est2.summary2().tables[1]['P>|t|']
        dic_features_scales[metric].update({feature:p_values.iloc[1]})

In [68]:
df_features_scales = pd.DataFrame(dic_features_scales)
df_features_scales.mean(axis=1)


Mean_Rating0            0.468428
Dif_Match               0.458331
Cor_Pred_Like           0.598672
Trend_Match             0.435313
Trend_No_Match          0.626407
Intercept_Match         0.535498
Intercept_No_Match      0.527571
Match_Final_Value       0.446971
No_Match_Final_Value    0.614784
dtype: float64

In [82]:
df_features_scales <0.1

Unnamed: 0,PA,NA.,ERQ_CR,ERQ_ES,UPPSP_NU,UPPSP_PU,UPPSP_SS,UPPSP_PMD,UPPSP_PSV,BIS,...,RRQ_Rum,RRQ_Ref,ASI_P,ASI_C,ASI_S,SPQ,SPQ_IR,MSSB_POS,MSSB_NEG,MSSB_DES
Mean_Rating0,True,False,False,False,False,False,False,True,True,False,...,False,False,True,True,False,False,False,False,False,False
Dif_Match,False,False,True,False,False,False,False,False,False,False,...,False,True,True,True,False,False,False,False,False,False
Cor_Pred_Like,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Trend_Match,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
Trend_No_Match,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Intercept_Match,True,False,True,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
Intercept_No_Match,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Match_Final_Value,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
No_Match_Final_Value,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [69]:
# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_85 = {}

for metric in metrics_columns:
    dic_mean_comparison_85[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.04][metric] #Per sobre 85%
        low_values = feature_scales[feature_scales[feature] <= 1.04][metric] #Per sota 85%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_85[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_85 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_85[i][j] 
                                             for i in dic_mean_comparison_85.keys() 
                                             for j in dic_mean_comparison_85[i].keys()},
                                            orient='index')
print("85%")
df_mean_comparison_85[df_mean_comparison_85["p_value"]<0.10]

85%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Trend_No_Match,35.631579,19,32.892308,130,0.061146
PA,No_Match_Final_Value,35.190476,21,32.921875,128,0.068908
NA.,Dif_Match,18.714286,21,20.515625,128,0.073243
NA.,Intercept_Match,22.666667,21,19.867188,128,0.012831
ERQ_CR,Dif_Match,5.412222,21,4.80737,128,0.000245
ERQ_CR,Intercept_Match,5.214603,21,4.839792,128,0.056315
UPPSP_NU,Intercept_No_Match,10.12,25,8.637097,124,0.016544
UPPSP_PU,Intercept_No_Match,10.56,25,9.709677,124,0.06112
UPPSP_SS,Dif_Match,12.666667,21,10.554688,128,0.000973
UPPSP_PMD,Cor_Pred_Like,6.758621,29,7.561712,120,0.063913


In [70]:
df_mean_comparison_85.groupby(level=1)['p_value'].mean()


Cor_Pred_Like           0.543168
Dif_Match               0.352697
Intercept_Match         0.433860
Intercept_No_Match      0.515300
Match_Final_Value       0.569008
Mean_Rating0            0.533957
No_Match_Final_Value    0.502348
Trend_Match             0.583559
Trend_No_Match          0.496184
Name: p_value, dtype: float64

In [71]:


# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_90 = {}

for metric in metrics_columns:
    dic_mean_comparison_90[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.28][metric] #Per sobre 90%
        low_values = feature_scales[feature_scales[feature] <= 1.28][metric] #Per sota 90%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_90[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_90 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_90[i][j] 
                                             for i in dic_mean_comparison_90.keys() 
                                             for j in dic_mean_comparison_90[i].keys()},
                                            orient='index')
print("90%")
df_mean_comparison_90[df_mean_comparison_90["p_value"]<0.1]

90%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Trend_No_Match,36.090909,11,33.014493,138,0.064588
PA,Match_Final_Value,36.384615,13,32.941176,136,0.093602
NA.,Intercept_Match,23.066667,15,19.947761,134,0.026458
ERQ_CR,Dif_Match,5.416429,14,4.838296,135,0.003555
ERQ_CR,Intercept_Match,5.445333,15,4.830746,134,0.007456
ERQ_ES,Match_Final_Value,2.134615,13,3.119485,136,0.002806
UPPSP_NU,Mean_Rating0,10.7,10,8.755396,139,0.0446
UPPSP_PU,Mean_Rating0,10.8,10,9.784173,139,0.009083
UPPSP_SS,Dif_Match,13.071429,14,10.622222,135,0.001381
UPPSP_PMD,Cor_Pred_Like,6.7,20,7.514771,129,0.078667


In [72]:
df_mean_comparison_90.groupby(level=1)['p_value'].mean()

Cor_Pred_Like           0.594869
Dif_Match               0.328914
Intercept_Match         0.386691
Intercept_No_Match      0.577079
Match_Final_Value       0.487590
Mean_Rating0            0.570846
No_Match_Final_Value    0.556509
Trend_Match             0.609838
Trend_No_Match          0.444956
Name: p_value, dtype: float64

In [73]:
dic_mean_comparison = {}

for metric in metrics_columns:
    dic_mean_comparison[metric] = {}

    for feature in features_columns:
        
        high_values = feature_scales[feature_scales[feature] > 1.6][metric] #Per sobre 95%
        low_values = feature_scales[feature_scales[feature] <= 1.6][metric] #Per sota 95%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_95 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison[i][j] 
                                             for i in dic_mean_comparison.keys() 
                                             for j in dic_mean_comparison[i].keys()},
                                            orient='index')
print("95%")
df_mean_comparison_95[df_mean_comparison_95["p_value"]<0.1]



95%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Match_Final_Value,39.125,8,32.907801,141,0.021389
NA.,Match_Final_Value,16.5,8,20.475177,141,0.003203
ERQ_CR,Dif_Match,5.642381,7,4.855657,142,0.011265
ERQ_CR,Intercept_Match,5.480417,8,4.859267,141,0.050537
ERQ_ES,Intercept_No_Match,2.527778,9,3.066071,140,0.099975
ERQ_ES,Match_Final_Value,1.75,8,3.106383,141,4e-05
UPPSP_NU,Mean_Rating0,10.777778,9,8.764286,140,0.060888
UPPSP_PU,Mean_Rating0,10.666667,9,9.8,140,0.023263
UPPSP_PU,Intercept_Match,10.75,8,9.801418,141,0.0833
UPPSP_PU,No_Match_Final_Value,8.777778,9,9.921429,140,0.015626


In [74]:
df_mean_comparison_95.groupby(level=1)['p_value'].mean()


Cor_Pred_Like           0.434481
Dif_Match               0.414979
Intercept_Match         0.426829
Intercept_No_Match      0.500906
Match_Final_Value       0.326208
Mean_Rating0            0.537251
No_Match_Final_Value    0.506402
Trend_Match             0.617706
Trend_No_Match          0.538348
Name: p_value, dtype: float64

In [91]:
dic_mean_comparison_5 = {}

for metric in metrics_columns:
    dic_mean_comparison_5[metric] = {}

    for feature in features_columns:
        
        low_values = feature_scales[feature_scales[feature] < -1.6][metric] #Per sota 5%
        high_values = feature_scales[feature_scales[feature] >= -1.6][metric] #Per sobre 5%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_5[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': float(f"{p_value:.6f}")}


# Convert results to a DataFrame for easier visualization and analysis
print("5%")
df_mean_comparison_5 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_5[i][j] 
                                             for i in dic_mean_comparison_5.keys() 
                                             for j in dic_mean_comparison_5[i].keys()},
                                            orient='index')

df_mean_comparison_5[df_mean_comparison_5["p_value"]<0.1]

  res = hypotest_fun_out(*samples, **kwds)


5%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,33.416667,144,28.2,5,0.047714
NA.,Trend_No_Match,20.453237,139,17.6,10,0.08951
ERQ_ES,Mean_Rating0,3.072917,144,1.9,5,0.003894
UPPSP_SS,Match_Final_Value,10.985507,138,9.181818,11,0.076227
UPPSP_PSV,Intercept_No_Match,6.986395,147,11.5,2,0.037016
BIS,Mean_Rating0,21.506944,144,25.0,5,0.026777
BIS,Dif_Match,21.746479,142,19.142857,7,0.013708
BAS_RR,Intercept_Match,17.398601,143,19.166667,6,0.041818
BAS_RR,Intercept_No_Match,17.435374,147,20.0,2,0.0
BAS_D,Intercept_No_Match,11.857143,147,13.0,2,0.0


In [84]:
df_mean_comparison_5.groupby(level=1)['p_value'].mean()


Cor_Pred_Like           0.565897
Dif_Match               0.520647
Intercept_Match         0.543534
Intercept_No_Match      0.507676
Match_Final_Value       0.434343
Mean_Rating0            0.508384
No_Match_Final_Value    0.388247
Trend_Match             0.566450
Trend_No_Match          0.428442
Name: p_value, dtype: float64

In [85]:
dic_mean_comparison_10 = {}

for metric in metrics_columns:
    dic_mean_comparison_10[metric] = {}

    for feature in features_columns:
        
        low_values = feature_scales[feature_scales[feature] < -1.28][metric] #Per sota 10%
        high_values = feature_scales[feature_scales[feature] >= -1.28][metric] #Per sobre 10%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_10[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_10 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_10[i][j] 
                                             for i in dic_mean_comparison_10.keys() 
                                             for j in dic_mean_comparison_10[i].keys()},
                                            orient='index')
print("10%")

df_mean_comparison_10[df_mean_comparison_10["p_value"]<0.1]

10%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,33.656716,134,29.533333,15,0.042053
NA.,Dif_Match,20.109489,137,22.0,12,0.096622
ERQ_CR,Dif_Match,4.931849,137,4.444722,12,0.040691
ERQ_ES,Mean_Rating0,3.085821,134,2.566667,15,0.056328
ERQ_ES,Match_Final_Value,3.092593,135,2.464286,14,0.047859
ERQ_ES,No_Match_Final_Value,3.105769,130,2.539474,19,0.071601
UPPSP_PU,Intercept_Match,9.666667,129,11.05,20,0.041523
UPPSP_SS,Intercept_No_Match,10.978571,140,8.888889,9,0.087425
UPPSP_SS,Match_Final_Value,11.014815,135,9.285714,14,0.036179
BIS,Mean_Rating0,21.470149,134,23.0,15,0.073733


In [86]:
df_mean_comparison_10.groupby(level=1)['p_value'].mean()


Cor_Pred_Like           0.486707
Dif_Match               0.608162
Intercept_Match         0.566303
Intercept_No_Match      0.410399
Match_Final_Value       0.470943
Mean_Rating0            0.384771
No_Match_Final_Value    0.487380
Trend_Match             0.529675
Trend_No_Match          0.504408
Name: p_value, dtype: float64

In [87]:
# Assuming feature_scales is your DataFrame with features as columns and metrics as rows
dic_mean_comparison_15 = {}

for metric in metrics_columns:
    dic_mean_comparison_15[metric] = {}

    for feature in features_columns:
        
        low_values = feature_scales[feature_scales[feature] < -1.04][metric] #Per sota 15%
        high_values = feature_scales[feature_scales[feature] >= -1.04][metric] #Per sobre 15%
        t_stat, p_value = stats.ttest_ind(high_values, low_values, equal_var=False)  # Assuming unequal variances

                # Calculate means
        high_mean = np.mean(high_values)
        low_mean = np.mean(low_values)

        dic_mean_comparison_15[metric][feature] = {'high_mean': high_mean,'size_high': len(high_values), 'low_mean': low_mean,'size_low': len(low_values), 'p_value': p_value}


# Convert results to a DataFrame for easier visualization and analysis
df_mean_comparison_15 = pd.DataFrame.from_dict({(i,j): dic_mean_comparison_15[i][j] 
                                             for i in dic_mean_comparison_15.keys() 
                                             for j in dic_mean_comparison_15[i].keys()},
                                            orient='index')
print("85%")
df_mean_comparison_15[df_mean_comparison_15["p_value"]<0.10]

85%


Unnamed: 0,Unnamed: 1,high_mean,size_high,low_mean,size_low,p_value
PA,Mean_Rating0,33.75969,129,29.9,20,0.024928
PA,Dif_Match,32.909091,132,35.823529,17,0.076193
PA,Intercept_Match,33.637795,127,30.954545,22,0.069265
ERQ_CR,Mean_Rating0,4.946977,129,4.542,20,0.095583
ERQ_CR,No_Match_Final_Value,4.817387,125,5.284444,24,0.030373
ERQ_ES,Mean_Rating0,3.118217,129,2.4875,20,0.010794
ERQ_ES,Match_Final_Value,3.10119,126,2.663043,23,0.088016
UPPSP_PU,Intercept_Match,9.653543,127,11.0,22,0.032401
UPPSP_PU,Match_Final_Value,9.984127,126,9.130435,23,0.066813
UPPSP_SS,Trend_Match,11.040323,124,9.92,25,0.07787


In [88]:
df_mean_comparison_15.groupby(level=1)['p_value'].mean()

Cor_Pred_Like           0.519340
Dif_Match               0.558772
Intercept_Match         0.490992
Intercept_No_Match      0.441447
Match_Final_Value       0.409998
Mean_Rating0            0.339512
No_Match_Final_Value    0.572271
Trend_Match             0.409891
Trend_No_Match          0.594007
Name: p_value, dtype: float64