In [1]:
from param import ml_names, pers_names, col_infor, data_name, data_path, results_path
from pre_process import remove_missing
from feature_extraction import feature_extraction
from imputation import impute_min, impute_iter
from scipy import stats
import pandas as pd
import numpy as np
import os

## Person-level feature reliability (first week vs. second week)

In [2]:
# Make the data folder the working directory so that relative file names
# used for reading and writing resolve against it.
os.chdir(data_path)
# Raw input table; the file name comes from param.data_name.
df_data = pd.read_csv(data_name)
# Project helper from pre_process; only the flags passed here are visible.
# NOTE(review): presumably remove_par=True drops participants with too much
# missing data while columns are kept (remove_col=False) - confirm in pre_process.
df_miss = remove_missing(df=df_data, proc_call = False, remove_col = False, remove_par = True)

In [3]:
# Drop every participant whose phone-data-yield column is null on all of
# that participant's rows.
yield_col = 'phone_data_yield_rapids_ratiovalidyieldedminutes'

# Row labels belonging to participants without a single non-null yield value.
index_del = [
    row_label
    for _pid, participant_rows in df_miss.groupby('participantID')
    if participant_rows[yield_col].isnull().all()
    for row_label in participant_rows.index.tolist()
]

df_miss_new = df_miss.drop(index_del)

In [4]:
# Split each participant's rows into two consecutive blocks of seven rows
# ("first week" / "second week"). Participants with fewer than 14 rows are
# left out of both blocks.
# NOTE(review): assumes one row per day in chronological order - confirm upstream.
first_week = []
second_week = []
for _pid, participant_rows in df_miss_new.groupby('participantID'):
    if len(participant_rows) < 14:
        continue
    # Positional slices: rows 0-6 and rows 7-13, each re-indexed from 0.
    first_week.append(participant_rows.iloc[0:7].reset_index(drop=True))
    second_week.append(participant_rows.iloc[7:14].reset_index(drop=True))

# Stack the per-participant blocks; the 0-6 index repeats for every participant.
df_first_week = pd.concat(first_week)
df_second_week = pd.concat(second_week)

In [5]:
# Re-enter the data folder so the relative file name below resolves there.
os.chdir(data_path)
# Person-level ML input table; the first CSV column becomes the row index.
# Its column names select the features analysed in the following cells.
df_ml_input = pd.read_csv('ml_input_person.csv', index_col=0)

In [6]:
# Run the project feature extractor on each week block separately.
# NOTE(review): presumably returns one row of aggregated features per
# participant - confirm in feature_extraction.feature_extraction.
df_feat_first = feature_extraction(df_first_week)
df_feat_second = feature_extraction(df_second_week)

In [7]:
# Restrict both week-level feature tables to the columns of the person-level
# ML input table, in that table's column order.
ml_columns = df_ml_input.columns.tolist()
df_feat_first_ml = df_feat_first[ml_columns]
df_feat_second_ml = df_feat_second[ml_columns]

In [8]:
# Fill missing values with the project helper impute_min, separately per week block.
# NOTE(review): presumably imputes each column with its minimum - confirm in
# imputation.impute_min.
df_feat_first_com = impute_min(df_feat_first_ml)
df_feat_second_com = impute_min(df_feat_second_ml)

In [9]:
# Columns to correlate: every ML-input column except those listed in
# param.pers_names. list.remove raises ValueError if a name is absent.
feat_names = df_ml_input.columns.tolist()
for pers_name in pers_names:
    feat_names.remove(pers_name)

# One [feature, Pearson r] row per column, correlating the first-week values
# with the second-week values row by row.
# NOTE(review): relies on both tables listing participants in the same order.
ls_relia = [
    [
        feat_name,
        stats.pearsonr(
            df_feat_first_com[feat_name].tolist(),
            df_feat_second_com[feat_name].tolist(),
        )[0],
    ]
    for feat_name in feat_names
]

In [10]:
# Tabulate the [feature, correlation] pairs collected in ls_relia.
df_relia = pd.DataFrame(ls_relia, columns = ['feat', 'correlation'])
# Relative path: the file lands in the current working directory, which the
# earlier os.chdir(data_path) call set. The row index is written as well
# (pandas default).
df_relia.to_csv('feat_reliability_person.csv')

## Day-level feature reliability (first week vs. second week)

In [2]:
# Make the data folder the working directory so that relative file names
# resolve against it.
os.chdir(data_path)
# Reload the raw input table; the file name comes from param.data_name.
df_data = pd.read_csv(data_name)

In [3]:
# Two passes through the project helper remove_missing with different flags.
# NOTE(review): presumably the first pass filters rows and participants
# (remove_row/remove_par) and the second pass filters columns (remove_col)
# on the already reduced table - confirm in pre_process.remove_missing.
df_miss = remove_missing(df=df_data, remove_col=False, remove_row = True, remove_par=True)
df_miss_new = remove_missing(df=df_miss, remove_truth = False, remove_android = False, proc_call = False, remove_col = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
# Split each participant's rows into two consecutive blocks of seven rows
# ("first week" / "second week"). Participants with fewer than 14 rows are
# left out of both blocks.
# NOTE(review): assumes one row per day in chronological order - confirm upstream.
first_week = []
second_week = []
for _pid, participant_rows in df_miss_new.groupby('participantID'):
    if len(participant_rows) < 14:
        continue
    # Positional slices: rows 0-6 and rows 7-13, each re-indexed from 0.
    first_week.append(participant_rows.iloc[0:7].reset_index(drop=True))
    second_week.append(participant_rows.iloc[7:14].reset_index(drop=True))

# Stack the per-participant blocks; the 0-6 index repeats for every participant.
df_first_week = pd.concat(first_week)
df_second_week = pd.concat(second_week)

In [5]:
# Re-enter the data folder so the relative file name below resolves there.
os.chdir(data_path)
# Day-level ML input table; the first CSV column becomes the row index.
# Its column names select the features analysed in the following cells.
df_ml_input = pd.read_csv('ml_input_day.csv', index_col=0)

In [6]:
# Columns to analyse: every ML-input column except those listed in
# param.pers_names. list.remove raises ValueError if a name is absent.
feat_names = df_ml_input.columns.tolist()
# Accumulator for [feature, correlation] rows; appended to in the correlation cell.
ls_relia = list()
for pers_name in pers_names:
    feat_names.remove(pers_name)

# Impute the selected feature columns of each week block with the project
# helper impute_iter.
# NOTE(review): the third argument (1) is presumably the number of imputations
# to generate - confirm in imputation.impute_iter.
df_first_week_com = impute_iter(df_first_week, feat_names, 1)
df_second_week_com = impute_iter(df_second_week, feat_names, 1)

Imputation:  0
Imputation:  0


In [7]:
# Start from an empty result list inside this cell, so that re-running the
# cell on its own does not append a second copy of every row.
ls_relia = []

# The imputed table is taken from position [0][0] of impute_iter's return value.
# NOTE(review): presumably [imputation index][...] - confirm the return
# structure in imputation.impute_iter.
df_first_week_imp = df_first_week_com[0][0]
df_second_week_imp = df_second_week_com[0][0]

# Correlate each feature's first-week values with its second-week values,
# row by row.
for feat_name in feat_names:
    ls_first_val = df_first_week_imp[feat_name].tolist()
    ls_second_val = df_second_week_imp[feat_name].tolist()
    corr_results = stats.pearsonr(ls_first_val, ls_second_val)
    # pearsonr returns NaN when the coefficient is undefined (e.g. a constant
    # column); such features are left out of the result table.
    if not pd.isna(corr_results[0]):
        ls_relia.append([feat_name, corr_results[0]])



In [8]:
# Tabulate the [feature, correlation] pairs collected in ls_relia.
df_relia = pd.DataFrame(ls_relia, columns = ['feat', 'correlation'])
# Relative path: the file lands in the current working directory, which the
# earlier os.chdir(data_path) call set. The row index is written as well
# (pandas default).
df_relia.to_csv('feat_reliability_day_new.csv')