In [1]:
# Debug switch: when True, the "Repeated CV" configuration cell below overrides
# n_repeats, n_splitss and param_grid with smaller values.
debug = False

# Imports

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import plotnine as gg
gg.theme_set(gg.theme_classic)
from sklearn.linear_model import Ridge, RidgeCV
import sklearn.linear_model as sklin
import sklearn.metrics as skmet

In [3]:
from Functions import get_param_names, get_ages_cols, get_category_dict, get_category_from_feature, \
RepeatableGridSearchCV, make_gridder_pd

## Directories etc.

In [4]:
# Input data directory and plot output directory (absolute, machine-specific paths).
data_save_path = "C:/Users/maria/MEGAsync/SLCNdata/Meta/all_data"
plot_save_path = "C:/Users/maria/MEGAsync/SLCN/MetaSLCNPlots"
# exist_ok=True creates the plot directory when it is missing and is a no-op when it is
# already there, without a separate exists() check that could race with another process.
os.makedirs(plot_save_path, exist_ok=True)

# Get data

In [5]:
# Load the four tables stored in data_save_path; the first CSV column becomes the index.
all_data, all_data_z, all_data_long, all_data_long_z = (
    pd.read_csv(os.path.join(data_save_path, f'{table_name}.csv'), index_col=0)
    for table_name in ('all_data', 'all_data_z', 'all_data_long', 'all_data_long_z')
)

In [6]:
ages_cols = get_ages_cols()

# Parameter columns: every column whose name contains one of the parameter names.
# Built as an ordered list (not via set()) so the column order is the same on every
# kernel start; the order ends up in the design matrices and in the 'X_cols' labels.
param_names = get_param_names()  # fetched once instead of once per column
param_cols = [c for c in all_data.columns if any(p in c for p in param_names)]

# Everything that is neither an age column nor a parameter column
behav_cols = [c for c in all_data.columns if (c not in ages_cols) and (c not in param_cols)]
category_dict = get_category_dict()

## Repeated CV

In [7]:
# Define model for gridder
model = Ridge()
n_repeats = 10  # 100
n_splitss = np.arange(2, 8)
scoring = 'r2'  # 'r2', 'neg_mean_squared_error'
param_grid = {'alpha': [0, 10, 30, 50, 1e2, 3e2, 5e2, 1e3, 3e3, 5e3, 1e4, 1e5, 1e6]}

if debug:
    n_repeats = 2
    n_splitss = [3, 5, ]
    param_grid = {'alpha': [0, 50]}

In [8]:
# [category_dict[p] for p in ['Beta', 'Alpha', 'Nalpha', 'Forget']]

In [9]:
# cols = param_cols

# # Set up empty DataFrames
# coef_dat = pd.DataFrame()
# r2_dat = pd.DataFrame()

# # Predict 1 task from 1 task
# dat = all_data_z[[c for c in all_data_z.columns if c not in ages_cols]].dropna()
# for predictor_task in ['bf_', 'rl_', 'ps_']:

#     # Get predictor and outcome data
#     X_dat = dat[[c for c in cols if predictor_task in c]]
#     y_dat = dat[[c for c in cols if predictor_task not in c]]

#     for y_col in y_dat.columns:

#         # Run regression
#         r2_row, coef_row = run_regression_get_r2_coef(X_dat, y_dat, y_col, predictor_task)

#         # Concat data
#         coef_dat = pd.concat([coef_dat, coef_row])
#         r2_dat = pd.concat([r2_dat, r2_row])

# # Predict 1 task from the other 2 tasks
# for outcome_task in ['bf_', 'rl_', 'ps_']:

#     # Get predictor and outcome data
#     X_dat = dat[[c for c in cols if outcome_task not in c]]
#     y_dat = dat[[c for c in cols if outcome_task in c]]

#     for y_col in y_dat.columns:

#         # Run regression
#         r2_row, coef_row = run_regression_get_r2_coef(X_dat, y_dat, y_col, 'both')

#         # Concat data
#         coef_dat = pd.concat([coef_dat, coef_row])
#         r2_dat = pd.concat([r2_dat, r2_row])


In [10]:
# X_dat

In [11]:
# y_dat[y_col]

In [12]:
def run_repeatedcv_get_r2(X_dat, y_dat, y_col, predictor_task):
    """Run the repeated grid search once per fold count and stack the result tables.

    Uses the notebook-level settings `model`, `param_grid`, `n_repeats`, `n_splitss`
    and `scoring`.

    Parameters
    ----------
    X_dat : pandas.DataFrame
        Predictor columns.
    y_dat : pandas.DataFrame
        Candidate outcome columns; only `y_col` is used.
    y_col : str
        Name of the outcome column to predict.
    predictor_task : str
        Label stored in the 'pred_task' column of the result.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The stacked grid-search tables (one `make_gridder_pd` table per entry of
        `n_splitss`, annotated with 'outcome', 'X_cols', 'pred_task' and 'n_splits'),
        and an empty DataFrame in the position where `run_regression_get_r2_coef`
        returns coefficients, so both functions can be used interchangeably.
    """
    per_split_results = []

    for n_splits in n_splitss:

        # Fit gridder to get best hyper-parameters
        gridder = RepeatableGridSearchCV(model, param_grid, n_repeats=n_repeats, n_splits=n_splits, scoring=scoring)
        gridder.fit(X_dat, y_dat[y_col])

        # Make pretty
        gridder_results = make_gridder_pd(gridder)
        gridder_results['outcome'] = y_col
        gridder_results['X_cols'] = '.'.join(X_dat.columns)
        gridder_results['pred_task'] = predictor_task
        gridder_results['n_splits'] = n_splits
#         gridder_results['scoring'] = scoring
#         gridder_results['n_repeats'] = n_repeats

        per_split_results.append(gridder_results)

    # DataFrame.append was removed in pandas 2.0: collect the tables and concatenate once.
    # pd.concat raises on an empty list, so fall back to an empty frame in that case.
    all_gridders = pd.concat(per_split_results) if per_split_results else pd.DataFrame()

    return all_gridders, pd.DataFrame()
    
# # Example use
# run_repeatedcv_get_r2(X_dat, y_dat, y_col, predictor_task)[0]

## Predict parameters of one task from the others

In [13]:
def run_regression_get_r2_coef(X_dat, y_dat, y_col, predictor_task, do_vizualize=False):
    """Fit a Ridge regression (alpha=100) of `y_dat[y_col]` on `X_dat`.

    Returns a tuple `(r2_row, coef_row)`:
    `r2_row` is a one-row DataFrame with the in-sample R^2 ('r2'), the outcome name
    ('outcome') and `predictor_task` ('pred_task'); `coef_row` has one row per
    predictor with columns 'predictor', 'coef', 'outcome' and 'pred_task'.
    With `do_vizualize=True` a predicted-vs-true scatter plot is printed as well.
    """
    target = y_dat[y_col]

    # Fit the model (plain least squares kept for reference)
#     ridge_fit = sklin.LinearRegression().fit(X_dat, target)
    ridge_fit = sklin.Ridge(alpha=100).fit(X_dat, target)

    # One coefficient row per predictor column
    coef_row = pd.DataFrame(ridge_fit.coef_, X_dat.columns)
    coef_row = coef_row.reset_index()
    coef_row = coef_row.rename(columns={'index': 'predictor', 0: 'coef'})
    coef_row['outcome'] = y_col
    coef_row['pred_task'] = predictor_task

    # R^2 on the same data the model was fitted on
    fitted = ridge_fit.predict(X_dat)
    r2_row = pd.DataFrame({
        'r2': [skmet.r2_score(target, fitted)],
        'outcome': [y_col],
        'pred_task': [predictor_task],
    })

    # Optional scatter plot of predicted against true values
    if do_vizualize:
        gg.options.figure_size = (3, 3)
        pred_true_dat = pd.DataFrame(np.array([fitted, target]).T)
        pred_true_dat = pred_true_dat.rename(columns={0: 'pred', 1: 'true'})
        scatter = gg.ggplot(pred_true_dat, gg.aes('true', 'pred')) + gg.geom_point()
        print(scatter)

    return r2_row, coef_row

# # Example use
# run_regression_get_r2_coef(X_dat, y_dat, y_col, predictor_task)

In [14]:
def run_all_regression_models(cols, all_data_z):
    """Run `run_repeatedcv_get_r2` for every outcome column in three predictor settings.

    1. One task predicts the columns of the other tasks
       ('pred_task' is the predictor task: 'bf_', 'rl_' or 'ps_').
    2. The other two tasks predict the columns of one task ('pred_task' = 'both').
    3. As 2., with 'PreciseYrs' and its square added as predictors
       ('pred_task' = 'both_and_age').

    Parameters
    ----------
    cols : list of str
        Feature columns to use; a column belongs to a task when its name contains
        'bf_', 'rl_' or 'ps_'.
    all_data_z : pandas.DataFrame
        Data containing `cols` and 'PreciseYrs'. Columns listed in the notebook-level
        `ages_cols` are excluded and rows with missing values are dropped before fitting.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `(r2_dat, coef_dat)`: the first and second outputs of every
        `run_repeatedcv_get_r2` call, stacked in call order.
    """
    tasks = ['bf_', 'rl_', 'ps_']
    cols = list(cols)
    non_age_cols = [c for c in all_data_z.columns if c not in ages_cols]

    # Collect per-model frames and concatenate once at the end (growing a DataFrame
    # with pd.concat inside the loops copies all earlier rows on every iteration).
    r2_rows, coef_rows = [], []

    def _fit_outcomes(X_dat, y_dat, pred_label):
        # Run one model search per outcome column and store its results.
        for y_col in y_dat.columns:
#             r2_row, coef_row = run_regression_get_r2_coef(X_dat, y_dat, y_col, pred_label)
            r2_row, coef_row = run_repeatedcv_get_r2(X_dat, y_dat, y_col, pred_label)
            r2_rows.append(r2_row)
            coef_rows.append(coef_row)

    def _stack(frames):
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.concat(frames) if frames else pd.DataFrame()

    # Predict 1 task from 1 task
    dat = all_data_z[non_age_cols].dropna()
    for predictor_task in tasks:
        print(predictor_task)
        X_dat = dat[[c for c in cols if predictor_task in c]]
        y_dat = dat[[c for c in cols if predictor_task not in c]]
        _fit_outcomes(X_dat, y_dat, predictor_task)

    # Predict 1 task from the other 2 tasks
    for outcome_task in tasks:
        print(outcome_task)
        X_dat = dat[[c for c in cols if outcome_task not in c]]
        y_dat = dat[[c for c in cols if outcome_task in c]]
        _fit_outcomes(X_dat, y_dat, 'both')

    # Predict 1 task from the other 2 tasks plus age (linear and quadratic term)
    dat_age = all_data_z[non_age_cols + ['PreciseYrs']].dropna()
    dat_age = dat_age.assign(PreciseYrs2=np.square(dat_age['PreciseYrs']))
    for outcome_task in tasks:
        print(outcome_task)
        X_dat = dat_age[[c for c in cols + ['PreciseYrs', 'PreciseYrs2'] if outcome_task not in c]]
        y_dat = dat_age[[c for c in cols if outcome_task in c]]
        _fit_outcomes(X_dat, y_dat, 'both_and_age')

    return _stack(r2_rows), _stack(coef_rows)

# # Example use
# run_all_regression_models(param_cols, all_data_z)

In [15]:
def beautify_coef_r2_df(dat):
    """Return a tidied copy of a results table, ready for plotting.

    Adds 'out_task' (the first two characters of 'outcome'), converts 'pred_task' to an
    ordered categorical with the order used in the plots, and resets the index.
    The input frame is left untouched, so re-running a cell that calls this function
    never changes data that an earlier cell displayed.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns 'outcome' and 'pred_task'.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index.
    """
    pred_task_order = ['both_and_age', 'both', 'bf_', 'ps_', 'rl_']
    return dat.assign(
        out_task=dat['outcome'].apply(lambda x: x[:2]),
        pred_task=pd.Categorical(dat['pred_task'], categories=pred_task_order, ordered=True),
    ).reset_index(drop=True)

# # Example use
# beautify_coef_r2_df(r2_dat_params)

In [None]:
# Fit the repeated-CV models for all parameter columns.
# NOTE: run_all_regression_models currently calls run_repeatedcv_get_r2, which returns an
# empty DataFrame in the coefficient position, so coef_dat_params holds no rows here.
r2_dat_params, coef_dat_params = run_all_regression_models(param_cols, all_data_z)
# r2_dat_behav, coef_dat_behav = run_all_regression_models(behav_cols, all_data_z)

bf_
rl_
ps_
bf_
rl_


In [None]:
r2_dat_params = beautify_coef_r2_df(r2_dat_params)
r2_dat_params = r2_dat_params.rename(columns={'mean_test_score': 'r2', 'std_test_score': 'r2sd'})
# Standard error as score SD / sqrt(n_splits), computed column-wise instead of with a
# row-by-row apply (same values; also works when the frame has no rows).
r2_dat_params['r2se'] = r2_dat_params['r2sd'] / np.sqrt(r2_dat_params['n_splits'])
r2_dat_params
# pd.melt(r2_dat_params, id_vars=[]

In [None]:
# For every (outcome, predictor set) keep the single row with the highest r2.
# idxmax selects whole rows, so r2se, n_splits and param_alpha all belong to the winning
# model. (Taking 'max' of each column separately would pair the best r2 with the largest
# r2se of the group, and matching rows back by r2 value could pick up rows of other groups.)
scored = r2_dat_params.dropna(subset=['r2'])  # rows without a score cannot be the best
best_idx = scored.groupby(['outcome', 'X_cols'])['r2'].idxmax()
best_models = scored.loc[best_idx.sort_values().to_numpy()].reset_index(drop=True)
best_scores = best_models[['outcome', 'X_cols', 'r2', 'r2se']]
best_models

In [None]:
def plot_r2(r2_dat):
    """Dodged bar chart of 'r2' per 'outcome', coloured by 'pred_task', one facet per 'out_task'.

    Sets the global plotnine figure size to (8, 4) and returns the ggplot object.
    """
    gg.options.figure_size = (8, 4)

    layers = [
        gg.geom_bar(stat='identity', position=gg.position_dodge(width=0.9)),
        gg.theme(axis_text_x=gg.element_text(rotation=90, hjust=0.5)),
        gg.scale_fill_manual(values=('black', 'grey', 'red', 'orange', 'yellow')),
        gg.labs(x=''),
        gg.facet_wrap('~ out_task', scales='free_x'),
    ]

    plot = gg.ggplot(r2_dat, gg.aes('outcome', 'r2', fill='pred_task'))
    for layer in layers:
        plot = plot + layer
    return plot

# # Use
# g_p = plot_r2(r2_dat_params)
# print(g_p)
# g_p.save(os.path.join(plot_save_path, '4_PredictParamByParam_r2.png'.format(suf)))

# g_b = plot_r2(r2_dat_behav)
# print(g_b)
# g_b.save(os.path.join(plot_save_path, '4_PredictBehavByBehav_r2.png'.format(suf)))

In [None]:
# r2 of the selected models, with point ranges spanning r2 -/+ r2se dodged like the bars
se_ranges = gg.geom_pointrange(
    gg.aes(ymin='r2-r2se', ymax='r2+r2se'),
    position=gg.position_dodge(width=0.9),
)
g_r2 = plot_r2(best_models) + se_ranges
print(g_r2)
r2_plot_file = os.path.join(plot_save_path, '4_PredictParamByParam_RepeatedCV_r2.png')
g_r2.save(r2_plot_file)

In [None]:
# Same bar layout, with the number of CV splits of each selected model on the y axis
g_ns = plot_r2(best_models) + gg.aes(y='n_splits')
print(g_ns)
ns_plot_file = os.path.join(plot_save_path, '4_PredictParamByParam_RepeatedCV_nsplilts.png')
g_ns.save(ns_plot_file)

In [None]:
# Same bar layout, with the Ridge alpha of each selected model on the y axis
g_a = plot_r2(best_models) + gg.aes(y='param_alpha')
print(g_a)
alpha_plot_file = os.path.join(plot_save_path, '4_PredictParamByParam_RepeatedCV_RidgeAlpha.png')
g_a.save(alpha_plot_file)

In [None]:
# def run_all_regression_models(cols):

#     # Set up empty DataFrames
#     pred_task_coef = pd.DataFrame()
#     pred_task_r2 = pd.DataFrame()

#     # Predict 1 task from 1 task
#     for predictor_task in ['bf_', 'rl_', 'ps_']:

#         # Get predictor and outcome data
#         pred_param_dat = all_data_z[[c for c in param_cols if predictor_task in c]]
#         out_param_dat = all_data_z[[c for c in param_cols if predictor_task not in c]]

#         for outcome_col in out_param_dat.columns:

#             # Run regression
#             r2_row, coef_row = run_regression_get_r2_coef(pred_param_dat, out_param_dat, outcome_col, predictor_task)

#             # Concat data
#             pred_task_coef = pd.concat([pred_task_coef, coef_row])
#             pred_task_r2 = pd.concat([pred_task_r2, r2_row])

#     # Predict 1 task from the other 2 tasks
#     for outcome_task in ['bf_', 'rl_', 'ps_']:

#         # Get predictor and outcome data
#         pred_param_dat = all_data_z[[c for c in param_cols if outcome_task not in c]]
#         out_param_dat = all_data_z[[c for c in param_cols if outcome_task in c]]

#         for outcome_col in out_param_dat.columns:

#             # Run regression
#             r2_row, coef_row = run_regression_get_r2_coef(pred_param_dat, out_param_dat, outcome_col, 'both')

#             # Concat data
#             pred_task_coef = pd.concat([pred_task_coef, coef_row])
#             pred_task_r2 = pd.concat([pred_task_r2, r2_row])
            
#     return pred_task_r2, pred_task_coef

# # Use
# run_all_regression_models(param_cols)

In [None]:
# Beautify DataFrames
# run_repeatedcv_get_r2 returns an empty coefficient frame, which has no 'outcome' /
# 'pred_task' columns to beautify; only process the coefficients when there are rows.
if not coef_dat_params.empty:
    coef_dat_params = beautify_coef_r2_df(coef_dat_params)
r2_dat_params = beautify_coef_r2_df(r2_dat_params)

# The behavioral models are not fitted in this notebook (the
# run_all_regression_models(behav_cols, all_data_z) call is commented out), so
# coef_dat_behav / r2_dat_behav do not exist; re-enable together with that call.
# coef_dat_behav = beautify_coef_r2_df(coef_dat_behav)
# r2_dat_behav = beautify_coef_r2_df(r2_dat_behav)

In [None]:
# Attach the category of the outcome and of the predictor to every coefficient row.
# Skipped when there are no coefficients (the repeated-CV run returns an empty frame,
# which has no 'outcome' / 'predictor' columns to look up).
if not coef_dat_params.empty:
    coef_dat_params['out_category'] = get_category_from_feature(coef_dat_params['outcome'], category_dict)
    coef_dat_params['pred_category'] = get_category_from_feature(coef_dat_params['predictor'], category_dict)
coef_dat_params

In [None]:
def plot_coef(coef_dat):
    """Bar chart of the coefficients ('coef' per 'predictor') of the 'both' models.

    Only rows with pred_task == 'both' are plotted (individual tasks are very similar).
    Bars are coloured by 'out_task' and the panels form a 'pred_task' x 'outcome' grid.
    Sets the global plotnine figure size to (70, 5) and returns the ggplot object.
    """
    is_both = coef_dat['pred_task'] == 'both'
    both_only = coef_dat.loc[is_both]

    gg.options.figure_size = (70, 5)

    layers = [
        gg.geom_bar(stat='identity'),
        gg.labs(x=''),
        gg.theme(axis_text_x=gg.element_text(rotation=90, hjust=0.5)),
        gg.theme(legend_position='none'),
        # gg.facet_wrap('~ pred_task + outcome', scales='free_x', labeller='label_context'),
        gg.facet_grid('pred_task ~ outcome', scales='free_x'),
    ]

    plot = gg.ggplot(both_only, gg.aes('predictor', 'coef', fill='out_task'))
    for layer in layers:
        plot = plot + layer
    return plot

# Use
# The repeated-CV run returns no coefficients, so coef_dat_params can be empty; plot_coef
# would then fail on the missing 'pred_task' column. Only plot when there are rows.
if not coef_dat_params.empty:
    g_p = plot_coef(coef_dat_params)
    print(g_p)
    g_p.save(os.path.join(plot_save_path, '4_PredictParamsByParams_coef_onlyboth.png'), limitsize=False)

# The behavioral models are not fitted in this notebook (the
# run_all_regression_models(behav_cols, all_data_z) call is commented out), so
# coef_dat_behav does not exist; re-enable together with that call.
# g_b = plot_coef(coef_dat_behav)
# print(g_b)
# g_b.save(os.path.join(plot_save_path, '4_PredictBehavByBehav_coef_onlyboth.png'), limitsize=False)