In [1]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import pandas as pd
import numpy as np
import copy
import os

In [2]:
def agg_feat_imp(ls_dicts):
    """Average per-feature importances over a collection of importance dicts.

    Parameters
    ----------
    ls_dicts : iterable of dict
        Each dict maps a feature name to a numeric importance.

    Returns
    -------
    tuple of (dict, dict)
        The first dict maps every feature to the mean of its importances over
        the dicts that contain it. The second dict maps every feature to the
        number of dicts that contain it.
    """
    totals = dict()
    counts = dict()
    for importances in ls_dicts:
        for feat, value in importances.items():
            # Running sum of the importance for this feature.
            if feat in totals:
                totals[feat] = totals[feat] + value
            else:
                totals[feat] = value
            # Number of input dicts in which the feature appeared.
            counts[feat] = counts.get(feat, 0) + 1

    # Turn each running sum into a mean over the dicts containing the feature.
    averages = {feat: totals[feat] / counts[feat] for feat in totals}

    return averages, counts

In [3]:
def to_dict(ls_str):
    """Parse the text form of a feature-importance mapping into a dict.

    The input is expected to look like ``"{'feat_a': 0.12, 'feat_b': 0.03}"``:
    one enclosing character on each side, entries separated by ``,`` and each
    entry written as ``<quoted name>: <number>``.

    Parameters
    ----------
    ls_str : str
        Text representation of the mapping.

    Returns
    -------
    dict
        Maps each feature name (leading whitespace stripped) to its importance
        as a ``float``. An empty mapping such as ``"{}"`` yields ``{}``.

    Raises
    ------
    ValueError, SyntaxError
        If a feature name is not a plain literal or a value is not a number.
    """
    # Local import so the function stays self-contained in its notebook cell.
    import ast

    dict_output = dict()
    # Drop the first and last character (the enclosing brackets).
    body = ls_str[1: len(ls_str) - 1]
    for str_split in body.split(','):
        # Blank segments come from an empty mapping or a trailing comma.
        if not str_split.strip():
            continue
        feat_split = str_split.split(':')
        # literal_eval only accepts literals, so text read from a file can
        # never execute code here (the previous eval() could).
        feat_name = ast.literal_eval(feat_split[0].strip()).lstrip()
        feat_imp = float(feat_split[1])
        dict_output[feat_name] = feat_imp

    return dict_output

In [4]:
def comm_feat(ls_xgb_dict, ls_rf_dict):
    """Rank the features that both XGB and RF select often and weight highly.

    For each model the importance dicts are aggregated with ``agg_feat_imp``.
    A feature is kept for a model when both its frequency and its mean
    importance are at or above the respective median over that model's
    features. Features kept by both models are returned.

    Parameters
    ----------
    ls_xgb_dict : list of dict
        Feature-importance dicts from the XGB runs.
    ls_rf_dict : list of dict
        Feature-importance dicts from the RF runs.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Frequency_XGB``, ``Importance_XGB``,
        ``Frequency_RF``, ``Importance_RF`` and ``Ranking Score`` (the mean of
        the two importances), sorted by ``Ranking Score`` in descending order
        with a fresh 0..n-1 index.
    """
    def _as_frame(dict_values, value_col):
        # One row per feature: [name, value].
        rows = [[feat, dict_values[feat]] for feat in dict_values]
        return pd.DataFrame(rows, columns=['Feature', value_col])

    def _at_or_above_median(df_values, value_col):
        # Keep the rows whose value reaches the median of the column.
        threshold = np.median(df_values[value_col].tolist())
        return df_values[df_values[value_col] >= threshold]

    def _select(ls_dicts, out_cols):
        # Features of one model passing both the frequency and importance cut.
        dict_imp, dict_freq = agg_feat_imp(ls_dicts)
        df_freq_sub = _at_or_above_median(_as_frame(dict_freq, 'Frequency'), 'Frequency')
        df_imp_sub = _at_or_above_median(_as_frame(dict_imp, 'Importance'), 'Importance')
        df_sub = pd.merge(df_freq_sub, df_imp_sub, on='Feature')
        df_sub.columns = out_cols
        return df_sub

    df_sub_xgb = _select(ls_xgb_dict, ['Feature', 'Frequency_XGB', 'Importance_XGB'])
    df_sub_rf = _select(ls_rf_dict, ['Feature', 'Frequency_RF', 'Importance_RF'])

    # Inner join: only features selected by both models survive.
    df_feat_agg = pd.merge(df_sub_xgb, df_sub_rf, on='Feature')
    df_feat_agg['Ranking Score'] = [
        np.mean([imp_xgb, imp_rf])
        for imp_xgb, imp_rf in zip(df_feat_agg['Importance_XGB'], df_feat_agg['Importance_RF'])
    ]

    return df_feat_agg.sort_values(by='Ranking Score', ascending=False).reset_index(drop=True)

In [5]:
def keep_feat(ls_keep_feat, pers, df, df_corr):
    """Keep only the listed features and attach their correlation with a trait.

    Parameters
    ----------
    ls_keep_feat : list
        Feature names to keep.
    pers : str
        Row label in ``df_corr`` identifying the personality trait.
    df : pandas.DataFrame
        Table with a ``Feature`` column. It is not modified. Any index is
        accepted (rows are selected by mask, not by integer label).
    df_corr : pandas.DataFrame
        Correlation matrix; ``df_corr.loc[pers, feature]`` must exist for every
        kept feature.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``Feature`` is in ``ls_keep_feat``, re-indexed
        from 0, with an added ``Corr`` column.

    Raises
    ------
    KeyError
        If ``pers`` or a kept feature is missing from ``df_corr``.
    """
    # Select by boolean mask so the function does not depend on df having a
    # default 0..n-1 index.
    is_kept = df['Feature'].isin(ls_keep_feat)
    df_keep = df[is_kept].reset_index(drop=True)
    df_keep['Corr'] = [df_corr.loc[pers, feat] for feat in df_keep['Feature']]

    return df_keep

In [6]:
def match_abbr(df_def, df):
    """Replace the full feature names in ``df`` with their abbreviations.

    Parameters
    ----------
    df_def : pandas.DataFrame
        Lookup table with ``Feature`` and ``Abbreviation`` columns.
    df : pandas.DataFrame
        Table with a ``Feature`` column and integer row labels 0..n-1.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Feature`` holding abbreviations.
    """
    for row_label in range(len(df)):
        full_name = df.loc[row_label, 'Feature']
        candidates = df_def.loc[df_def['Feature'] == full_name, 'Abbreviation']
        # The first matching definition row wins; no match raises IndexError.
        df.loc[row_label, 'Feature'] = candidates.tolist()[0]

    return df

## Heatmap Features

In [7]:
# Work from the results directory; the relative file names below resolve against it.
os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results')
# Definitions and abbreviations of the features.
df_feat_defi = pd.read_csv('feat_def.csv')
# Modelling input; the first CSV column becomes the index.
df_ml_input = pd.read_csv('ml_input.csv', index_col=0)
# Flip the neuroticism score (4 - x); this trait is labelled 'Emotional Stability'
# in the output names. NOTE(review): presumably the scale runs 0-4 - confirm.
df_ml_input['neoNeuroticism'] = 4 - df_ml_input['neoNeuroticism'].to_numpy()

In [8]:
# Trait column names (used to index df_corr and to build result file names).
pers_names = [
    'catAntagonism',
    'catDetachment',
    'catDisinhibition',
    'catNegativeAffectivity',
    'catPsychoticism',
    'neoAgreeableness',
    'neoExtraversion',
    'neoConscientiousness',
    'neoNeuroticism',
    'neoOpenness',
]
# Display labels, matched to pers_names by position.
pers_names_output = [
    'Antagonism',
    'Detachment',
    'Disinhibition',
    'Negative Affectivity',
    'Psychoticism',
    'Agreeableness',
    'Extraversion',
    'Conscientiousness',
    'Emotional Stability',
    'Openness',
]
# Pairwise correlations between the columns of the modelling input.
df_corr = df_ml_input.corr()
# Empty table with one row per trait label; its columns are added later.
df_score_corr = pd.DataFrame(index=pers_names_output)

In [9]:
# For every trait, parse the raw XGB and RF feature-importance files and keep
# the table of features that both models rank highly (see comm_feat).
# ls_df_comm_feat ends up with one DataFrame per entry of pers_names, in order.
ls_df_comm_feat = list()
for pers_name in pers_names:
    os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/xgb_results/')
    df_xgb_feat = pd.read_csv('xgb_' + pers_name + '_feats_raw.csv')
    # Each 'Feats' entry is the text form of one importance dict.
    ls_xgb_dict = [to_dict(feats) for feats in df_xgb_feat['Feats']]

    os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/rf_results/')
    df_rf_feat = pd.read_csv('rf_' + pers_name + '_feats_raw.csv')
    ls_rf_dict = [to_dict(feats) for feats in df_rf_feat['Feats']]

    # comm_feat aggregates the dicts itself, so no separate agg_feat_imp call
    # is needed here.
    df_comm_feat = comm_feat(ls_xgb_dict, ls_rf_dict)

    ls_df_comm_feat.append(df_comm_feat)

In [10]:
# Collect the ranking score of every defined feature in every trait table that
# contains it; the extremes are used to min-max normalise the scores.
ls_all_feat_imp = [
    df.loc[df['Feature'] == feat_name, 'Ranking Score'].iloc[0]
    for feat_name in df_feat_defi['Feature']
    for df in ls_df_comm_feat
    if feat_name in df['Feature'].tolist()
]

norm_min = np.min(ls_all_feat_imp)
norm_max = np.max(ls_all_feat_imp)

In [11]:
# Add two columns per defined feature to df_score_corr:
#   '<abbr>(Rank)' - ranking score min-max normalised with norm_min/norm_max,
#   '<abbr>(Corr)' - correlation between the feature and the trait,
# both rounded to two decimals. A trait whose common-feature table does not
# contain the feature gets NaN in both columns.
for feat_idx in range(len(df_feat_defi)):
    feat_abbr = df_feat_defi.loc[feat_idx, 'Abbreviation']
    feat_name = df_feat_defi.loc[feat_idx, 'Feature']
    ls_col_feat_rank = list()
    ls_col_feat_corr = list()
    # pers_idx is kept distinct from feat_idx: ls_df_comm_feat is ordered like
    # pers_names, so the position identifies the trait.
    for pers_idx, df in enumerate(ls_df_comm_feat):
        if feat_name in df['Feature'].tolist():
            rank_score = df[df['Feature'] == feat_name]['Ranking Score'].iloc[0]
            ls_col_feat_rank.append(round((rank_score-norm_min)/(norm_max-norm_min),2))
            ls_col_feat_corr.append(round(df_corr.loc[feat_name, pers_names[pers_idx]],2))
        else:
            ls_col_feat_rank.append(np.nan)
            ls_col_feat_corr.append(np.nan)

    df_score_corr[feat_abbr + '(Rank)'] = ls_col_feat_rank
    df_score_corr[feat_abbr + '(Corr)'] = ls_col_feat_corr

In [12]:
# Switch to the heatmap output directory and write the rank/correlation table
# there. The working directory stays changed for the cells that follow.
os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/heatmap')
df_score_corr.to_csv('feat_rank_corr_new.csv')

### Statistical Analysis

In [13]:
import os
import copy
import pandas as pd
import numpy as np

In [14]:
# All relative paths from here on resolve against the heatmap directory,
# including the CSV written at the end of this section.
os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/heatmap')
# "Old" holds the day-level modeling results; "new" holds the person-level
# modeling results. The first CSV column of each file becomes the index.
df_score_old = pd.read_csv('feat_rank_corr_old.csv', index_col = 0)
df_score_new = pd.read_csv('feat_rank_corr_new.csv', index_col = 0)
# Feature definitions read from the heatmap directory, indexed by the first column.
# NOTE(review): later cells look up 'Definition' by abbreviation, so the index
# is presumably the abbreviation column - confirm against the file.
df_feat = pd.read_csv('feat_def.csv', index_col=0)

In [15]:
# Reassign the abbreviations of the new results (person-level modeling).
# Keys are the abbreviations found in the column names of df_score_new; values
# are the abbreviations they are renamed to.
# NOTE(review): a few entries map to themselves (e.g. 'HR2', 'Bat2') while their
# neighbours are shifted - presumably intentional; confirm against feat_def.csv.
dict_feat_new = {'HR1':'HR4',
                 'HR2':'HR2',
                 'HR3':'HR5',
                 'SL1':'SL4',
                 'SL2':'SL5',
                 'SL3':'SL6',
                 'SL4':'SL7',
                 'SL5':'SL8',
                 'SL6':'SL9',
                 'SL7':'SL10',
                 'ST1':'ST1',
                 'ST2':'ST3',
                 'ST3':'ST4',
                 'Bat1':'Bat3',
                 'Bat2':'Bat2',
                 'BL1':'BL1',
                 'Call1':'Call1',
                 'Call2':'Call3',
                 'Call3':'Call4',
                 'Call4':'Call5',
                 'Call5':'Call6',
                 'Call6':'Call7',
                 'Call7':'Call8',
                 'Call8':'Call9',
                 'Aud1':'Aud4',
                 'Aud2':'Aud5',
                 'Aud3':'Aud6',
                 'Loc1':'Loc7',
                 'Loc2':'Loc8',
                 'SR1':'SR1',
                 'SR2':'SR2',
                 'SR3':'SR3',
                 'SR4':'SR4',
                 'Wifi1':'Wifi1',
                 'Wifi2':'Wifi2'
                }

In [16]:
# Rename every '<abbr>(<kind>)' column of df_score_new through dict_feat_new,
# keeping the first four characters after '(' as the kind.
# Note: the columns are replaced in place, so running this cell a second time
# would apply the mapping to the already-renamed abbreviations.
col_name_new = []
for old_col in df_score_new.columns.tolist():
    parts = old_col.split('(')
    col_name_new.append('{}({})'.format(dict_feat_new[parts[0]], parts[1][0:4]))

df_score_new.columns = col_name_new

In [17]:
# Give both score tables the same columns in the same order: for every feature
# in df_feat, a '(Rank)' column followed by a '(Corr)' column.
feat_names = df_feat.index.tolist()
ls_comm_cols = list()
for feat_abbr in feat_names:
    ls_comm_cols.append(feat_abbr + '(Rank)')
    ls_comm_cols.append(feat_abbr + '(Corr)')

# reindex adds any missing column filled with NaN and orders the columns as
# requested, whatever the number of rows in each table.
df_score_old = df_score_old.reindex(columns=ls_comm_cols)
df_score_new = df_score_new.reindex(columns=ls_comm_cols)

In [18]:
# Lay out the merged table: one row per feature abbreviation and, per trait,
# four columns (day-level Rank/Corr, then person-level Rank/Corr).
col_names = df_score_old.columns.tolist()
pers_names = df_score_old.index.tolist()

# Feature abbreviations in first-seen order, without repeats.
feat_names = list(dict.fromkeys(name.split('(')[0] for name in col_names))
index_names_sep = feat_names

# Working column names; 'D:' reads from df_score_old, 'P:' from df_score_new.
col_names_sep = [
    prefix + ':' + trait + '(' + kind + ')'
    for trait in pers_names
    for prefix in ('D', 'P')
    for kind in ('Rank', 'Corr')
]
# Display headers that later replace col_names_sep (four per trait).
col_names_sep_merge = ['Rank   ', 'Corr   '] * (2 * len(pers_names))

df_score_sep_merge = pd.DataFrame(index=index_names_sep, columns=col_names_sep)

In [19]:
# Copy every value into the merged table. A column named
# '<model>:<trait>(<kind>)' is filled from df_score_old when <model> is 'D'
# and from df_score_new when it is 'P'.
source_by_model = {'D': df_score_old, 'P': df_score_new}
for col_name in df_score_sep_merge.columns.tolist():
    name_parts = col_name.split(':')
    model = name_parts[0]
    pers_name = name_parts[1].split('(')[0]
    categ = name_parts[1].split('(')[1][0:4]
    df_source = source_by_model.get(model)
    if df_source is None:
        # Columns with any other prefix are left untouched.
        continue
    for index_name in df_score_sep_merge.index.tolist():
        df_score_sep_merge.loc[index_name, col_name] = df_source.loc[pers_name, index_name + '(' + categ + ')']

In [20]:
# Label each row '<definition>:<abbreviation>', swap the working column names
# for the display headers, and write the table to the current working directory.
# After this cell the column labels are no longer unique.
ls_index_new = [
    df_feat.loc[feat_abbr, 'Definition'] + ':' + feat_abbr
    for feat_abbr in df_score_sep_merge.index.tolist()
]
df_score_sep_merge.columns = col_names_sep_merge
df_score_sep_merge.index = ls_index_new
df_score_sep_merge.to_csv('feat_rank_corr_final.csv')

## Pair Features

In [21]:
# Return to the results directory; the relative file names below resolve against it.
os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results')
# Load the definitions and abbreviations of the features.
df_feat_defi = pd.read_csv('feat_def.csv')
# Modelling input; the first CSV column becomes the index.
df_ml_input = pd.read_csv('ml_input.csv', index_col=0)
# Flip the neuroticism score (4 - x); this trait is labelled 'Emotional Stability'
# in the output names. NOTE(review): presumably the scale runs 0-4 - confirm.
df_ml_input['neoNeuroticism'] = 4 - df_ml_input['neoNeuroticism'].to_numpy()

In [22]:
# Display labels of the ten traits.
pers_names_output = [
    'Antagonism',
    'Detachment',
    'Disinhibition',
    'Negative Affectivity',
    'Psychoticism',
    'Agreeableness',
    'Extraversion',
    'Conscientiousness',
    'Emotional Stability',
    'Openness',
]
# Trait pairs analysed together, each given as [cat trait, neo trait].
pers_pairs = [
    ['catAntagonism', 'neoAgreeableness'],
    ['catDetachment', 'neoExtraversion'],
    ['catDisinhibition', 'neoConscientiousness'],
    ['catNegativeAffectivity', 'neoNeuroticism'],
    ['catPsychoticism', 'neoOpenness'],
]
# Pairwise correlations between the columns of the modelling input.
df_corr = df_ml_input.corr()

In [23]:
# Running bounds of the XGB/RF importances over all trait pairs. They start at
# +/- infinity so the first observed value always replaces them, whatever the
# scale of the importances.
norm_min = np.inf
norm_max = -np.inf
ls_feat_keep = df_feat_defi['Feature'].tolist()
ls_dfs = list()
# Importance columns produced by merging the cat and neo tables below.
ls_imp_cols = ['Importance_RF_cat', 'Importance_RF_neo', 'Importance_XGB_cat', 'Importance_XGB_neo']


def _load_feat_dicts(csv_name):
    """Read ``csv_name`` from the current working directory and parse every
    'Feats' entry into an importance dict with ``to_dict``."""
    df_raw = pd.read_csv(csv_name)
    return [to_dict(feats) for feats in df_raw['Feats']]


# For each [cat, neo] trait pair, build the table of features that are common
# to both traits (and to both models), and track the importance bounds.
for pers_pair in pers_pairs:
    pers_cat = pers_pair[0]
    pers_neo = pers_pair[1]
    os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/xgb_results/')
    ls_xgb_cat_dict = _load_feat_dicts('xgb_' + pers_cat + '_feats_raw.csv')
    ls_xgb_neo_dict = _load_feat_dicts('xgb_' + pers_neo + '_feats_raw.csv')

    os.chdir('/Users/ry4jr/Library/Mobile Documents/com~apple~CloudDocs/AAPEX/results/rf_results/')
    ls_rf_cat_dict = _load_feat_dicts('rf_' + pers_cat + '_feats_raw.csv')
    ls_rf_neo_dict = _load_feat_dicts('rf_' + pers_neo + '_feats_raw.csv')

    df_comm_cat_feat = keep_feat(ls_feat_keep, pers_cat, comm_feat(ls_xgb_cat_dict, ls_rf_cat_dict), df_corr)
    df_comm_neo_feat = keep_feat(ls_feat_keep, pers_neo, comm_feat(ls_xgb_neo_dict, ls_rf_neo_dict), df_corr)
    # Inner join on the feature name, then switch to abbreviations.
    df_comm_feat = match_abbr(df_feat_defi, pd.merge(df_comm_cat_feat, df_comm_neo_feat, suffixes=('_cat', '_neo'), on='Feature'))

    for imp_col in ls_imp_cols:
        col_max = np.max(df_comm_feat[imp_col])
        col_min = np.min(df_comm_feat[imp_col])
        # Explicit comparisons: a NaN (empty column) leaves the bounds as is.
        if col_max > norm_max:
            norm_max = col_max
        if col_min < norm_min:
            norm_min = col_min

    ls_dfs.append(df_comm_feat)