# In [3]:
"""
Script to analyse every feature.
"""
import os
import sys

# Put the parent of the current working directory at the front of sys.path
# so that the `src.*` imports further down resolve when this is run from a
# sub-folder of the project.
cwd = os.getcwd()
module_path = os.path.abspath(os.pardir)
parent_of_cwd = os.path.join(cwd, "../")
sys.path.insert(0, parent_of_cwd)

import os
import pandas as pd
import numpy as np

from src.utils import student_utils
from src.definitions import MINIMAL_PROCESSED_DATA_PATH, \
    FEATURE_CONFIG_FILE_PATH, STUDENT_FOLDER_NAME_PREFIX, BINNED_ON_VAR_FREQ_DATA_PATH, STUDENT_RAW_DATA_ANALYSIS_ROOT
from src.utils.read_utils import read_yaml
from src.utils.write_utils import df_to_csv
from src.utils import student_utils
from src.data_processing import helper

# Reading Configs.
# The YAML file is parsed once and every section is taken from that mapping.
_CONFIG = read_yaml(FEATURE_CONFIG_FILE_PATH)
FEATURE_CONFIG = _CONFIG['features']
AVAILABLE_FEATURE = FEATURE_CONFIG.keys()
COVARIATES = _CONFIG['covariates']
STUDENT_CONFIG = _CONFIG['students']

# Optional list of students from the config, used to restrict the analysis.
students = STUDENT_CONFIG['student_list']

# Students analysed by default. This must be defined *before* the filter
# below: previously the filter referenced AVAILABLE_STUDENTS before its first
# assignment (NameError) and the result was then overwritten by this list.
AVAILABLE_STUDENTS = [2, 53, 46, 7, 49, 24, 22, 35]

if students:
    # Keep only the configured students, preserving the default ordering.
    _configured_students = set(students)
    AVAILABLE_STUDENTS = [student for student in AVAILABLE_STUDENTS
                          if student in _configured_students]

# Bin width used when counting observations per time window (60 minutes).
# NOTE(review): the 'T' alias is deprecated in pandas >= 2.2; '60min' is the
# equivalent spelling if this value is not consumed elsewhere as a string.
resampling_freq = '60T'

############## Main Loop To Process Data ##################

# Global pandas display option: set once rather than on every loop iteration.
# The fully-qualified key is used because the bare 'max_rows' pattern matches
# more than one option on newer pandas versions and is rejected as ambiguous.
pd.set_option('display.max_rows', 100)


def _time_delta_extremes_minutes(timestamps):
    """Return the largest and smallest gap between consecutive timestamps.

    Args:
        timestamps: A pandas DatetimeIndex, in the order it appears in the file
            (it is not sorted here).

    Returns:
        A ``(max_gap, min_gap)`` tuple of floats expressed in minutes. Both
        values are NaN when there are fewer than two timestamps, because no
        gap can be computed.
    """
    # Dividing by a one-minute timedelta yields float minutes regardless of
    # the underlying datetime resolution.
    gaps_minutes = np.diff(timestamps.values) / np.timedelta64(1, 'm')
    if gaps_minutes.size == 0:
        return float('nan'), float('nan')
    return gaps_minutes.max().tolist(), gaps_minutes.min().tolist()


for student_id in AVAILABLE_STUDENTS:
    # One row per statistic; a column is added for every analysed feature column.
    statistic_dict = {"statistic": ['mean_occurence_very_hour', 'mean',
                                    'stdev', 'max',
                                    'min', 'median',
                                    'max_time_delta_minutes', 'min_time_delta_minutes']}
    student_folder = STUDENT_FOLDER_NAME_PREFIX + str(student_id)

    for feature in AVAILABLE_FEATURE:
        feature_data_path = os.path.join(MINIMAL_PROCESSED_DATA_PATH,
                                         student_folder,
                                         feature + ".csv")
        # The first CSV column is used as the index and parsed as timestamps.
        feature_data = pd.read_csv(feature_data_path, index_col=[0])
        feature_data.index = pd.to_datetime(feature_data.index)

        # Average number of observations per resampling window.
        feature_mean_count = feature_data.resample(resampling_freq).count().mean()
        feature_mean = feature_data.mean()
        feature_stdev = feature_data.std()
        feature_max = feature_data.max()
        feature_min = feature_data.min()
        feature_median = feature_data.median()

        # The first data column is skipped.
        # NOTE(review): presumably an identifier column - confirm against the data.
        feature_names = feature_max.index.values[1:]

        # Gaps are per file, so every column of this feature shares them.
        time_delta_max, time_delta_min = _time_delta_extremes_minutes(feature_data.index)

        for name in feature_names:
            statistic_dict[name] = [feature_mean_count[name], feature_mean[name],
                                    feature_stdev[name], feature_max[name],
                                    feature_min[name], feature_median[name],
                                    time_delta_max, time_delta_min]

    # Build the table once per student, after all features have been collected.
    student_data = pd.DataFrame(statistic_dict)
    file_path = os.path.join(STUDENT_RAW_DATA_ANALYSIS_ROOT, "student_{}.csv".format(str(student_id)))
    student_data.to_csv(file_path)
        