# Imports

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotnine as gg
from sklearn import linear_model, preprocessing, svm
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.manifold import TSNE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, LeaveOneOut, cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures
gg.theme_set(gg.theme_classic)

In [3]:
# from Functions import get_ages_cols, get_param_names, get_category_cols, name_from_index, bool_from_index

## Directories etc.

In [4]:
data_save_path = "C:/Users/maria/MEGAsync/SLCNdata/Meta/all_data"
plot_save_path = "C:/Users/maria/MEGAsync/SLCN/MetaSLCNPlots"
# Create the plot directory if it is missing; exist_ok avoids the separate
# exists-check, which could race with another process creating the directory.
os.makedirs(plot_save_path, exist_ok=True)

# Get data

In [5]:
# Load the four CSV tables from data_save_path; the first CSV column becomes the index
all_data, all_data_z, all_data_long, all_data_long_z = (
    pd.read_csv(os.path.join(data_save_path, csv_name), index_col=0)
    for csv_name in ('all_data.csv', 'all_data_z.csv', 'all_data_long.csv', 'all_data_long_z.csv')
)

In [6]:
# ages_dir = "C:/Users/maria/MEGAsync/SLCNdata/SLCNinfo2.csv"
# ages = pd.read_csv(os.path.join(ages_dir))

In [7]:
# ages_cols = get_ages_cols()
# param_names = get_param_names()
# category_cols = get_category_cols()

In [8]:
# param_cols = [c for c in all_data.columns for p in get_param_names() if p in c]

## Predict parameters of one task from the others

In [None]:
gg.options.figure_size = (3, 3)

# Collect one small frame per regression and concatenate once at the end;
# calling pd.concat inside the loop would re-copy the growing result every pass.
task_coef_frames = []
task_r2_frames = []

# Use the parameters of one task to predict each parameter of the other tasks
for predictor_task in ['bf_', 'rl_', 'ps_']:
    pred_param_dat = all_data_z[[c for c in param_cols if predictor_task in c]]
    # De-duplicate while keeping column order. Indexing with a set (as before)
    # yields an arbitrary column order and is rejected by recent pandas versions.
    outcome_cols = list(dict.fromkeys(c for c in param_cols if predictor_task not in c))
    out_param_dat = all_data_z[outcome_cols]

    for outcome_col in out_param_dat.columns:

        # Run regression
        reg = LinearRegression().fit(pred_param_dat, out_param_dat[outcome_col])

        # Get coefficients (one row per predictor)
        lin_reg_result = pd.DataFrame({
            'predictor': list(pred_param_dat.columns),
            'coef': reg.coef_,
        })
        lin_reg_result['outcome'] = outcome_col
        lin_reg_result['pred_task'] = predictor_task

        # Get R^2 (in-sample: evaluated on the rows used for fitting)
        pred_outcome = reg.predict(pred_param_dat)
        r2 = r2_score(out_param_dat[outcome_col], pred_outcome)
        r2_row = pd.DataFrame({'r2': [r2], 'outcome': [outcome_col], 'pred_task': [predictor_task]})

    #     # Visualize
    #     pred_true_dat = pd.DataFrame(np.array([pred_outcome, out_param_dat[outcome_col]]).T).rename(columns={0: 'pred', 1: 'true'})
    #     print(gg.ggplot(pred_true_dat, gg.aes('true', 'pred'))
    #      + gg.geom_point()
    #     )

        # Save data
        task_coef_frames.append(lin_reg_result)
        task_r2_frames.append(r2_row)

# The first two characters of the outcome column name identify its task
pred_task_r2 = pd.concat(task_r2_frames)
pred_task_r2['out_task'] = pred_task_r2['outcome'].apply(lambda x : x[:2])
pred_task_r2 = pred_task_r2.reset_index(drop=True)
pred_task_r2

pred_task_coef = pd.concat(task_coef_frames)
pred_task_coef['out_task'] = pred_task_coef['outcome'].apply(lambda x : x[:2])
pred_task_coef = pred_task_coef.reset_index(drop=True)
pred_task_coef

In [None]:
# TODO:
# predict each task from both, show in same panel
# Flip color with panels
gg.options.figure_size = (15, 10)
# Variant 1: one panel per predictor task, bars colored by the outcome's task.
# NOTE(review): this plot object is overwritten by the next assignment before it is printed or saved.
g = (gg.ggplot(pred_task_r2, gg.aes('outcome', 'r2', fill='out_task'))
     + gg.geom_bar(stat='identity')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
     + gg.labs(x='')
     + gg.theme(legend_position='none')
     + gg.facet_wrap('~ pred_task', scales='free_x')
    )
# Variant 2: one panel per outcome task, bars colored by predictor task.
# This is the plot that is printed and saved below.
g = (gg.ggplot(pred_task_r2, gg.aes('outcome', 'r2', fill='pred_task'))
     + gg.geom_bar(stat='identity')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
     + gg.labs(x='')
     + gg.theme(legend_position='none')
     + gg.facet_wrap('~ out_task', scales='free_x')
    )
print(g)
g.save(os.path.join(plot_save_path, '3_predicting_params_by_params_r2.png'))

In [None]:
gg.options.figure_size = (12, 12)

# Regression weights as bars, one panel per (predictor task, outcome) pair
coef_plot_path = os.path.join(plot_save_path, '3_predicting_params_by_params_coef.png')
g = gg.ggplot(pred_task_coef, gg.aes('predictor', 'coef', fill='out_task'))
g = g + gg.geom_bar(stat='identity')
g = g + gg.labs(x='')
g = g + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
g = g + gg.theme(legend_position='none')
g = g + gg.facet_wrap('~ pred_task + outcome', scales='free_x')
print(g)
g.save(coef_plot_path)

## Predict age using regularized regression

### Ridge regression
* L2-norm (returns small but non-zero coefficients)
* It is mainly used to prevent overfitting. Because it keeps all the features, it is not very useful when the number of features is extremely large (say, in the millions), as this poses computational challenges.
* It generally works well even in the presence of highly correlated features: it includes all of them in the model, and the coefficients are distributed among them depending on the correlation.

### Lasso regression
* L1-norm (sparse; tries to get many coefficients 0)
* It arbitrarily selects one feature among the highly correlated ones and reduces the coefficients of the rest to zero. Also, the chosen variable changes randomly when the model parameters change. For this reason it generally does not work as well as ridge regression.
* Since it provides sparse solutions, it (or some variant of this concept) is generally the model of choice when the number of features is in the millions or more. In such cases, a sparse solution is a great computational advantage, as the features with zero coefficients can simply be ignored.

### Conclusion
* Ridge might be better because of how it deals with correlated features

In [None]:
# Get regression data
def get_regr_data(regr_data_name):
    """Assemble the data used to regress age (``PreciseYrs``) on a feature set.

    Reads the notebook-level ``all_data_z``, ``wanted_cols``, ``ages_cols``,
    ``fit_pd_all`` and ``ages``.

    Parameters
    ----------
    regr_data_name : str
        ``'raw'``: the ``wanted_cols`` columns of ``all_data_z``, minus
        ``ages_cols[2:]`` and ``'age_group'``.
        ``'pc'``: ``fit_pd_all`` merged with ``ages``, keeping only ``'ID'`` and
        ``'PreciseYrs'`` from the columns of ``ages``.

    Returns
    -------
    regr_dat : pandas.DataFrame
        Rows without missing values, indexed by ``'ID'``.
    regr_X : pandas.DataFrame
        ``regr_dat`` without the ``'PreciseYrs'`` column.
    regr_y : pandas.Series
        The ``'PreciseYrs'`` column of ``regr_dat``.

    Raises
    ------
    ValueError
        If ``regr_data_name`` is neither ``'raw'`` nor ``'pc'``.
    """
    if regr_data_name == 'raw':
        excluded_cols = ages_cols[2:] + ['age_group']
        regr_dat = all_data_z[[c for c in wanted_cols if c not in excluded_cols]]  # behaviors
    elif regr_data_name == 'pc':
        unused_age_cols = [c for c in ages.columns if c not in ('ID', 'PreciseYrs')]
        regr_dat = pd.merge(fit_pd_all, ages).drop(columns=unused_age_cols)  # PCs
    else:
        # Raise a real exception instance; `raise(ValueError, "...")` raises a tuple,
        # which Python 3 rejects with an unrelated TypeError.
        raise ValueError("regr_data_name must either be 'pc' or 'raw'.")

    regr_dat = regr_dat.dropna().set_index('ID')
    regr_y = regr_dat['PreciseYrs']
    regr_X = regr_dat.drop(columns='PreciseYrs')

    return regr_dat, regr_X, regr_y

# Use
regr_data_name = 'pc'  # can be 'raw' or 'pc'
regr_dat, regr_X, regr_y = get_regr_data(regr_data_name)

In [None]:
regr_y

In [None]:
regr_X

In [None]:
# Fit a ridge regression (L2 penalty, alpha=0.8) predicting age from all features.
# The ordinary LinearRegression fit that used to precede this line was overwritten
# immediately and never used, so it has been dropped.
reg = Ridge(alpha=0.8).fit(regr_X, regr_y)

# One row per predictor: column name and fitted weight
lin_reg_result = pd.DataFrame({'predictor': list(regr_X.columns), 'coef': reg.coef_})
lin_reg_result

In [None]:
if regr_data_name == 'raw':
    # Order predictors by weight and freeze that order in a categorical (for plotting)
    lin_reg_result = lin_reg_result.sort_values(by='coef').reset_index(drop=True)
    ordered_names = lin_reg_result['predictor']
    lin_reg_result['predictor_cat'] = pd.Categorical(ordered_names, categories=ordered_names)

    # Descriptive columns derived from the predictor name
    lin_reg_result['category'] = lin_reg_result['predictor'].apply(name_from_index, names=category_cols)
    lin_reg_result['task'] = lin_reg_result['predictor'].map(lambda name: name[:2])

elif regr_data_name == 'pc':
    # Predictor names look like '<two chars><integer>'; split them into both parts
    name_prefix = lin_reg_result['predictor'].map(lambda name: name[:2])
    lin_reg_result['predictor_cat'] = lin_reg_result['predictor'].map(lambda name: int(name[2:]))
    lin_reg_result['task'] = name_prefix
    lin_reg_result['category'] = name_prefix

In [None]:
# Performance of this model (in-sample: predictions for the rows it was fit on)
pred_y = reg.predict(regr_X)

# Two columns: observed age and predicted age
linregr_true_pred = pd.DataFrame(
    np.column_stack([np.array(regr_y), pred_y]),
    columns=('true_y', 'pred_y'))
gg.options.figure_size = (6, 6)
g = gg.ggplot(linregr_true_pred, gg.aes('true_y', 'pred_y'))
g = g + gg.geom_point()
g = g + gg.labs(x='True age', y='Predicted age')
perf_plot_path = os.path.join(plot_save_path, '2Regr_lin_perf_{}.png'.format(''))
g.save(perf_plot_path)
print(g)

r2_in_sample = r2_score(regr_y, pred_y)
print("R2: {:.2f}".format(r2_in_sample))

In [None]:
gg.options.figure_size = (12, 6)
# Regression weight of every predictor, colored by task
g = (gg.ggplot(lin_reg_result, gg.aes('predictor_cat', 'coef', fill='task'))
 + gg.geom_point()
 + gg.geom_bar(stat='identity')
 + gg.labs(x='', fill='')
 + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
)
# NOTE(review): the '{}' placeholder is filled with an empty string, so the file name
# does not depend on which data set was used — confirm this is intended.
g.save(os.path.join(plot_save_path, '2Regr_lin_task_{}.png'.format('')))
print(g)

# Same plot, split into one panel per predictor category
g2 = g + gg.facet_grid('~ category', scales='free_x')
g2
g2.save(os.path.join(plot_save_path, '2Regr_lin__cat_{}.png'.format('')))
print(g2)

In [None]:
# Expand the predictors into all polynomial terms up to degree 2
poly = PolynomialFeatures(2)
regr_poly = poly.fit_transform(regr_X)
# Name the terms after the columns that were actually transformed (regr_X).
# regr_dat also holds the target 'PreciseYrs', so its column list does not
# line up with the features `poly` was fit on.
regr_poly = pd.DataFrame(regr_poly, columns=poly.get_feature_names(regr_X.columns), index=regr_X.index)
regr_poly

In [None]:
# Candidate ridge penalties. max_iter is an iteration count, so it is given as an
# int; the float 1e5 is rejected by scikit-learn versions that validate parameter types.
tuned_parameters = [{'alpha': [1e-3, 1e-2, 1e-1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'max_iter': [100000]}]

# 3-fold cross-validated grid search, scored by R^2
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 — drop it when running on a newer version.
ridge_grid = GridSearchCV(linear_model.Ridge(), tuned_parameters, scoring='r2', cv=3, iid=False, return_train_score=False)
ridge_best = ridge_grid.fit(regr_X, regr_y)
print(pd.DataFrame(ridge_best.cv_results_).sort_values(by='rank_test_score').head())

ridge_best.best_params_

In [None]:
# Ridge fit with a hand-picked penalty
# (grid-search alternative: alpha=ridge_best.best_params_['alpha'])
ridge = linear_model.Ridge(alpha=0.5)
ridge.fit(regr_X, regr_y)
# Label each weight with the column it belongs to. The model is fit on regr_X, so
# the polynomial term names from `poly` would not match the number of coefficients.
dat = pd.DataFrame({'index': list(regr_X.columns), 'coef': ridge.coef_})

# Plot only the non-zero weights
(gg.ggplot(dat.loc[dat.coef != 0], gg.aes('index', 'coef'))
 + gg.geom_point()
 + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
)

In [None]:
# Hold out 33% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): X_poly and y are only assigned in cells further down this notebook —
# those cells must have been run before this one.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.33, random_state=42)

In [None]:
# Fit on the training split and predict both splits
ridge = ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)

print("Train score: {}".format(ridge.score(X_train, y_train)))
print("Test score: {}".format(ridge.score(X_test, y_test)))
print(cross_val_score(ridge, X_test, y_test, cv=5))
# R^2 (the default score) is undefined on a single held-out sample, so the
# leave-one-out folds are scored with the negative squared error instead.
print(cross_val_score(ridge, X_test, y_test, cv=LeaveOneOut(), scoring='neg_mean_squared_error'))

# True vs. predicted values for the training and the test split
plt.plot(y_train, y_train_pred, '.')
plt.plot(y_test, y_test_pred, '.')

In [None]:
# Degree-2 polynomial expansion of X, wrapped in a frame whose columns are the term names
poly = PolynomialFeatures(2)
X_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names())
X_poly

In [None]:
# Refit the degree-2 expansion on X and list the names of the generated terms
poly = PolynomialFeatures(2)
poly.fit_transform(X)
poly.get_feature_names()

In [None]:
from sklearn.pipeline import make_pipeline

def PolynomialLasso(degree=2, **kwargs):
    """Return a pipeline that expands inputs to polynomial terms, then fits a Lasso.

    ``degree`` is passed to ``PolynomialFeatures``; every other keyword argument
    is forwarded to ``linear_model.Lasso``.
    """
    expansion_step = PolynomialFeatures(degree)
    lasso_step = linear_model.Lasso(**kwargs)
    return make_pipeline(expansion_step, lasso_step)

In [None]:
# Search over polynomial degree and lasso penalty jointly
tuned_parameters = {
    'polynomialfeatures__degree': np.arange(4),
    'lasso__alpha': [0.01, 0.1, 0.3, 0.5],
    'lasso__max_iter': [100000],  # iteration count must be an int (was the float 1e5)
}

lasso_grid = GridSearchCV(PolynomialLasso(), tuned_parameters, scoring='r2', cv=3, iid=False, return_train_score=False)
# The pipeline builds the polynomial terms itself, so it is fit on the plain
# predictors X; passing the already-expanded X_poly would expand the expansion again.
lasso_best = lasso_grid.fit(X, y)
print(pd.DataFrame(lasso_best.cv_results_).sort_values(by='rank_test_score').head())

lasso_best.best_params_

In [None]:
# Candidate lasso penalties. max_iter is an iteration count, so it is given as an
# int; the float 1e5 is rejected by scikit-learn versions that validate parameter types.
tuned_parameters = [{'alpha': [1e-3, 1e-2, 1e-1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'max_iter': [100000]}]

# 3-fold cross-validated grid search on the polynomial features, scored by R^2
lasso_grid = GridSearchCV(linear_model.Lasso(), tuned_parameters, scoring='r2', cv=3, iid=False, return_train_score=False)
lasso_best = lasso_grid.fit(X_poly, y)
print(pd.DataFrame(lasso_best.cv_results_).sort_values(by='rank_test_score').head())

lasso_best.best_params_

In [None]:
# Same grid as the preceding search (re-uses tuned_parameters), now for ridge on X_poly
ridge_grid = GridSearchCV(
    linear_model.Ridge(),
    tuned_parameters,
    scoring='r2',
    cv=3,
    iid=False,
    return_train_score=False,
)
ridge_best = ridge_grid.fit(X_poly, y)
ridge_cv_table = pd.DataFrame(ridge_best.cv_results_)
print(ridge_cv_table.sort_values(by='rank_test_score').head())

ridge_best.best_params_

In [None]:
# Refit the lasso on all rows with the penalty chosen by the grid search
best_lasso_alpha = lasso_best.best_params_['alpha']
lasso = linear_model.Lasso(alpha=best_lasso_alpha)
lasso.fit(X_poly, y)
lasso.coef_, lasso.intercept_
# One row per polynomial term: term name ('index') and fitted weight ('coef')
dat = (pd.DataFrame(lasso.coef_, index=poly.get_feature_names())
       .reset_index()
       .rename(columns={0: 'coef'}))

# Plot only the terms whose weight is not exactly zero
nonzero_terms = dat.loc[dat.coef != 0]
(gg.ggplot(nonzero_terms, gg.aes('index', 'coef'))
 + gg.geom_point()
 + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
)

In [None]:
# Refit the ridge model on all rows with the penalty chosen by the grid search
best_ridge_alpha = ridge_best.best_params_['alpha']
ridge = linear_model.Ridge(alpha=best_ridge_alpha)
ridge.fit(X_poly, y)
ridge.coef_, ridge.intercept_
# One row per polynomial term: term name ('index') and fitted weight ('coef')
dat = (pd.DataFrame(ridge.coef_, index=poly.get_feature_names())
       .reset_index()
       .rename(columns={0: 'coef'}))

# Plot only the terms whose weight is not exactly zero
nonzero_terms = dat.loc[dat.coef != 0]
(gg.ggplot(nonzero_terms, gg.aes('index', 'coef'))
 + gg.geom_point()
 + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
)

In [None]:
# Hold out 33% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.33, random_state=42)

# Fit on the training split and predict both splits
lasso = lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)

print("Train score: {}".format(lasso.score(X_train, y_train)))
print("Test score: {}".format(lasso.score(X_test, y_test)))
print(cross_val_score(lasso, X_test, y_test, cv=5))
# R^2 (the default score) is undefined on a single held-out sample, so the
# leave-one-out folds are scored with the negative squared error instead.
print(cross_val_score(lasso, X_test, y_test, cv=LeaveOneOut(), scoring='neg_mean_squared_error'))

# True vs. predicted values for the training and the test split
plt.plot(y_train, y_train_pred, '.')
plt.plot(y_test, y_test_pred, '.')

In [None]:
# Fit on the training split and predict both splits
ridge = ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)

print("Train score: {}".format(ridge.score(X_train, y_train)))
print("Test score: {}".format(ridge.score(X_test, y_test)))
print(cross_val_score(ridge, X_test, y_test, cv=5))
# R^2 (the default score) is undefined on a single held-out sample, so the
# leave-one-out folds are scored with the negative squared error instead.
print(cross_val_score(ridge, X_test, y_test, cv=LeaveOneOut(), scoring='neg_mean_squared_error'))

# True vs. predicted values for the training and the test split
plt.plot(y_train, y_train_pred, '.')
plt.plot(y_test, y_test_pred, '.')

# SVM

In [None]:
all_data_z[param_cols_by_task]

In [None]:
X, y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV

# Predict one 'bf' parameter from the parameters of all other tasks
x_params = [p for p in param_cols_by_task if 'bf' not in p]
y_param = 'bf_nalpha'  # 'bf_alpha'
print(y_param, '~', x_params)

X = all_data_z[x_params]
# Use the outcome named above so the printed formula and the fitted model agree
# (the column was previously hard-coded to 'bf_alpha').
y = all_data_z[y_param]

# Hold out 33% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

svr = svm.SVR(kernel='poly', C=1e4, gamma='scale', degree=3, epsilon=.1, verbose=True)
svr_fit = svr.fit(X_train, y_train)
y_train_pred = svr_fit.predict(X_train)
y_test_pred = svr_fit.predict(X_test)

print("Train score: {}".format(svr.score(X_train, y_train)))
print("Test score: {}".format(svr.score(X_test, y_test)))
print(cross_val_score(svr, X_test, y_test, cv=5))
# R^2 (the default score) is undefined on a single held-out sample, so the
# leave-one-out folds are scored with the negative squared error instead.
print(cross_val_score(svr, X_test, y_test, cv=LeaveOneOut(), scoring='neg_mean_squared_error'))

# True vs. predicted values for the training and the test split
plt.plot(y_train, y_train_pred, '.')
plt.plot(y_test, y_test_pred, '.')

In [None]:
# Hyper-parameter grid for the SVR
tuned_parameters = [
    {'kernel': ['rbf', 'poly'],
     'C': [1e-1, 1e1, 1e2, 1e3, 1e4],
     # every value listed once: a repeated entry only refits an identical model
     'gamma': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}]

# 2-fold cross-validated search on the full data, scored by R^2
clf = GridSearchCV(svm.SVR(), tuned_parameters, scoring='r2', cv=2, iid=False, return_train_score=False)
clf_fit = clf.fit(X, y)
# print(pd.DataFrame(clf_fit.cv_results_).sort_values(by='rank_test_score').head())

# clf = GridSearchCV(svm.SVR(), tuned_parameters, scoring='explained_variance', cv=2, iid=False, return_train_score=False)
# clf_fit = clf.fit(X_train, y_train)
# print(pd.DataFrame(clf_fit.cv_results_).sort_values(by='rank_test_score').head())
clf_fit.best_params_

In [None]:

# Rebuild the SVR with the hyper-parameters selected by the grid search
svr = svm.SVR(C=clf_fit.best_params_['C'], gamma=clf_fit.best_params_['gamma'], kernel=clf_fit.best_params_['kernel'])
svr_fit = svr.fit(X_train, y_train)
y_train_pred = svr_fit.predict(X_train)
y_test_pred = svr_fit.predict(X_test)

print("Train score: {}".format(svr.score(X_train, y_train)))
print("Test score: {}".format(svr.score(X_test, y_test)))
print(cross_val_score(svr, X_test, y_test, cv=5))
# R^2 (the default score) is undefined on a single held-out sample, so the
# leave-one-out folds are scored with the negative squared error instead.
print(cross_val_score(svr, X_test, y_test, cv=LeaveOneOut(), scoring='neg_mean_squared_error'))

# True vs. predicted values for the training and the test split
plt.plot(y_train, y_train_pred, '.')
plt.plot(y_test, y_test_pred, '.')

In [None]:
from sklearn.model_selection import GridSearchCV

# param_grid = {'polynomialfeatures__degree': np.arange(21),
#               'linearregression__fit_intercept': [True, False],
#               'linearregression__normalize': [True, False]}

# grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)



# Two sub-grids: RBF kernel (gamma x C) and linear kernel (C only)
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']

# NOTE(review): SVC is a classifier and 'precision' a classification metric, while the
# y_train built elsewhere in this notebook is taken from a z-scored parameter column —
# confirm which target this cell is meant to be run with.
clf = GridSearchCV(
    svm.SVC(), tuned_parameters, scoring=scores[0]
)
clf.fit(X_train, y_train)

In [None]:
# import numpy as np
# from sklearn.svm import SVR
# import matplotlib.pyplot as plt
from sklearn import svm

# #############################################################################
# Generate sample data
# Toy example: support vector regression with three kernels on a noisy sine curve.
# NOTE: this overwrites the notebook-level X and y.

# Sample data: 40 sorted points in [0, 5) and their sine
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = np.sin(X).ravel()

# Add uniform noise to every target
# y[::5] += 3 * (0.5 - np.random.rand(8))
y += 0.5 - np.random.rand(len(X))

# Regression models, one per kernel
svr_rbf = svm.SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = svm.SVR(kernel='linear', C=100, gamma='auto')
svr_poly = svm.SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)

# Plot settings
lw = 2

svrs = [svr_rbf, svr_lin, svr_poly]
kernel_label = ['RBF', 'Linear', 'Polynomial']
model_color = ['m', 'c', 'g']

# One panel per kernel: fitted curve, support vectors, and the remaining training points
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10), sharey=True)
for ix, svr in enumerate(svrs):
    panel = axes[ix]
    label = kernel_label[ix]
    color = model_color[ix]

    fitted_curve = svr.fit(X, y).predict(X)
    panel.plot(X, fitted_curve, color=color, lw=lw,
               label='{} model'.format(label))

    support_idx = svr.support_
    panel.scatter(X[support_idx], y[support_idx], facecolor="none",
                  edgecolor=color, s=50,
                  label='{} support vectors'.format(label))

    other_idx = np.setdiff1d(np.arange(len(X)), support_idx)
    panel.scatter(X[other_idx], y[other_idx],
                  facecolor="none", edgecolor="k", s=50,
                  label='other training data')
    panel.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1),
                 ncol=1, fancybox=True, shadow=True)

fig.text(0.5, 0.04, 'data', ha='center', va='center')
fig.text(0.06, 0.5, 'target', ha='center', va='center', rotation='vertical')
fig.suptitle("Support Vector Regression", fontsize=14)
plt.show()

In [None]:
# `clf` is a GridSearchCV wrapper; the support-vector attributes belong to the
# refitted best model, not to the search object itself.
best_svm = clf.best_estimator_

# get support vectors
best_svm.support_vectors_

# get indices of support vectors
best_svm.support_

# get number of support vectors for each class
best_svm.n_support_

## PLS

In [None]:
from sklearn.cross_decomposition import PLSRegression
# Toy example: 4 samples, 3 predictors, 2 targets.
# NOTE: this overwrites the notebook-level X.
X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]
Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]
pls2 = PLSRegression(n_components=2)
pls2.fit(X, Y)

# In-sample predictions; plot true vs. predicted values of the first target
Y_pred = pls2.predict(X)
Y_pred, Y
plt.plot(np.array(Y)[:,0], np.array(Y_pred)[:,0])

In [None]:
# Predictors: every parameter whose name does not contain 'bf'; outcomes: those that do
x_params = [p for p in param_cols_by_task if 'bf' not in p]
y_params = [p for p in param_cols_by_task if 'bf' in p]
len(x_params)

In [None]:
# Fit a PLS regression with 14 components predicting y_params from x_params
# NOTE(review): 14 is hard-coded — presumably chosen from len(x_params); confirm it still fits the data.
pls = PLSRegression(n_components=14)
pls.fit(all_data_z[x_params], all_data_z[y_params])

# In-sample predictions and fit (same rows as used for fitting)
pred = pls.predict(all_data_z[x_params])
pls.score(all_data_z[x_params], all_data_z[y_params])  # R^2 of self.predict(X) wrt. y.

In [None]:
# Reshape predictions and true values to long format (one row per value)
pred = pd.DataFrame(pred, columns=y_params)
pred_l = pred.melt(var_name='param_name', value_name='param_value').reset_index()
true_l = all_data_z[y_params].melt(var_name='param_name', value_name='param_value').reset_index()
# dat = pd.merge(pred, all_data_z[y_params], left_index=True, right_index=True, suffixes=['_pred', '_true'])
# dat
# Pair predicted and true values via the positional 'index' column created by reset_index()
dat_l = pd.merge(pred_l, true_l, on=['param_name', 'index'], suffixes=['_pred', '_true'])
dat_l

In [None]:
# Predicted (x) vs. true (y) values with a smoother, one panel per outcome parameter
(gg.ggplot(dat_l, gg.aes('param_value_pred', 'param_value_true', color='param_name'))
 + gg.geom_point(alpha=0.5)
 + gg.geom_smooth(color='black')
 + gg.facet_wrap('~ param_name')
)

# PCA on parameters

In [None]:
# Run PCA
# Request as many components as there are parameter columns
pca = PCA(n_components=z_dat[param_cols_by_param].shape[1])
# pca = KernelPCA(n_components=z_dat.shape[1])
# Fit on rows without missing values; `fit` holds the component scores of those rows
fit = pca.fit_transform(z_dat[param_cols_by_param].dropna())
# Number of leading components used in the following cells
n_comp = 11

In [None]:
# Explained variance: cumulative share of variance over the (0-based) component index
dat = pd.DataFrame({'PC': range(z_dat[param_cols_by_param].shape[1]), 'expl_var': np.cumsum(pca.explained_variance_ratio_)})
g = (gg.ggplot(dat, gg.aes('PC', 'expl_var'))
 + gg.geom_point()
 + gg.geom_line()
 + gg.geom_hline(yintercept=1, linetype='dotted')
 + gg.geom_vline(xintercept=n_comp, linetype='dotted')
)
g.save(os.path.join(plot_save_path, '5PCAExplVar_params.png'))

# Components are numbered from 0, so the first n_comp components are PC 0 .. n_comp - 1.
# (Filtering with `PC <= n_comp` reported the variance of n_comp + 1 components.)
expl_var_at_n_comp = dat.loc[dat.PC < n_comp, 'expl_var'].iloc[-1]
print("Explained variance at {} components: {}%.".format(n_comp, round(100 * expl_var_at_n_comp, 1)))
g

In [None]:
pd.DataFrame(fit)

In [None]:
# Age in PC space
# NOTE(review): the row labels are rebuilt from all_data with its own dropna(); this assumes
# they are exactly the rows kept by the dropna() before the PCA fit — TODO confirm.
fit_pd = pd.DataFrame(fit,
                      columns=['pc{}'.format(i) for i in range(z_dat[param_cols_by_param].shape[1])],
                      index=all_data.drop(columns=ages_cols[1:]).dropna().reset_index().ID)
fit_pd = fit_pd.reset_index()
# Attach the age columns (merge on the column names the two frames share)
fit_pd = fit_pd.merge(ages[ages_cols])

# Participants in the plane of the first two components, colored by age
g1 = (gg.ggplot(fit_pd, gg.aes('pc0', 'pc1', color='PreciseYrs'))
 + gg.geom_point()
)
g1.save(os.path.join(plot_save_path, '5PCPC0PC1_param.png'))

# Same plot for the third and fourth component
g2 = g1 + gg.aes('pc2', 'pc3')
g2.save(os.path.join(plot_save_path, '5PCPC2PC3_param.png'))
g1, g2

In [None]:
# PC changes over age
# Long format: one row per participant and retained component
pc_cols = ['pc{}'.format(i) for i in range(n_comp)]
fit_long = fit_pd.melt(id_vars=ages_cols, value_vars=pc_cols)

gg.options.figure_size = (10, 8)
# Component score against age, one panel per component
g = gg.ggplot(fit_long, gg.aes('PreciseYrs', 'value'))
g = g + gg.geom_point(alpha=0.5, size=0.5)
g = g + gg.geom_smooth(color='red')
g = g + gg.facet_wrap('~ variable', scales='free')
g.save(os.path.join(plot_save_path, '5PCAAgePC.png'))
g

In [None]:
# Copy of the current plot with 'meanT' on the x-axis
g_t = g + gg.aes(x='meanT')
# Save the meanT version; saving `g` here wrote the unchanged original plot to this file
g_t.save(os.path.join(plot_save_path, '5PCATPC.png'))
g_t

In [None]:
# Factor loadings
# components_ is (n_components, n_features); after slicing and transposing,
# rows are features and columns are the first n_comp components.
loadings = pca.components_[:n_comp].T
pc_names = ['pc{}'.format(i) for i in range(n_comp)]
# Label the rows with the columns the PCA was fit on. `sub_dat` is an unrelated
# frame that other cells reassign, so its columns need not match the PCA input.
loadings_pd = pd.DataFrame(loadings, columns=pc_names, index=z_dat[param_cols_by_param].columns)
loadings_pd = loadings_pd.reset_index()
loadings_long = loadings_pd.melt(
    value_vars=pc_names,
    id_vars=['index'])

In [None]:
# NOTE(review): the task label keeps the first 3 characters here, whereas other cells use the first 2 — confirm
loadings_long['task'] = loadings_long['index'].apply(lambda x : x[:3])
loadings_long['ACC_RT'] = loadings_long['index'].apply(name_from_index, names=['ACC', 'RT'])
# Component number parsed from names of the form 'pc<number>'
loadings_long['PC'] = loadings_long['variable'].apply(lambda x : int(x[2:]))

In [None]:
# Split each feature name into the part after the first 3 characters and its 2-character prefix
loadings_pd['param_name'] = loadings_pd['index'].apply(lambda x : x[3:])
loadings_pd['param_bool'] = loadings_pd['index'].apply(lambda x : x[:2])
# name_from_index is defined outside this notebook; presumably it maps a name to the matching entry of `names` — verify
loadings_pd['category'] = loadings_pd['param_name'].apply(name_from_index, names=param_names + ['ACC', 'RT'])
loadings_pd['ACC_RT'] = loadings_pd['param_name'].apply(name_from_index, names=['ACC', 'RT'])
loadings_pd['task'] = loadings_pd['index'].apply(lambda x: x[:2])
loadings_pd

In [None]:
# Loadings of the first five components (PC 0-4), one row of panels per component, one column per task
# NOTE: this reassigns the notebook-level name sub_dat
sub_dat = loadings_long[loadings_long['PC'] < 5]
g = (gg.ggplot(sub_dat, gg.aes('index', 'value', fill='ACC_RT'))
 + gg.geom_bar(stat='identity')
 + gg.theme(axis_text_x=gg.element_text(rotation=90, hjust=1))
 + gg.facet_grid('variable ~ task', scales='free_x')
 + gg.labs(x='', y='Loading')
 + gg.theme(legend_position='none')
)
g.save(os.path.join(plot_save_path, '5PCALoadings.png'))
g

## PCA on behavior

In [None]:
# Run PCA
# Behavioral measures entering the PCA
beh_cols = acc_cols + rt_cols + rtsd_cols
beh_dat = all_data[beh_cols].dropna()
# IDs of the rows that remain after dropping incomplete rows
IDs = all_data[beh_cols + ['ID']].dropna().ID
pca = PCA(n_components=beh_dat.shape[1])
fit = pca.fit_transform(beh_dat)
# Number of leading components used in the following cells
n_comp = 6

In [None]:
# Explained variance
gg.options.figure_size = (5, 5)
# Cumulative share of explained variance over the (0-based) component index
dat = pd.DataFrame({'PC': range(beh_dat.shape[1]), 'expl_var': np.cumsum(pca.explained_variance_ratio_)})
g = (gg.ggplot(dat, gg.aes('PC', 'expl_var'))
 + gg.geom_point()
 + gg.geom_line()
 + gg.geom_hline(yintercept=1, linetype='dotted')
 + gg.geom_vline(xintercept=n_comp, linetype='dotted')
)
g.save(os.path.join(plot_save_path, '3PCAExplVar_params.png'))

# Components are numbered from 0, so the first n_comp components are PC 0 .. n_comp - 1.
# (Filtering with `PC <= n_comp` reported the variance of n_comp + 1 components.)
expl_var_at_n_comp = dat.loc[dat.PC < n_comp, 'expl_var'].iloc[-1]
print("Explained variance at {} components: {}%.".format(n_comp, round(100 * expl_var_at_n_comp, 1)))
g

In [None]:
# Age in PC space
# Component scores, labelled with the participant IDs collected alongside the PCA input
fit_pd = pd.DataFrame(fit,
                      columns=['pc{}'.format(i) for i in range(beh_dat.shape[1])],
                      index=IDs)
fit_pd = fit_pd.reset_index()
# Attach the age columns (merge on the column names the two frames share)
fit_pd = fit_pd.merge(ages[ages_cols])

# Participants in the plane of the first two components, colored by age
g1 = (gg.ggplot(fit_pd, gg.aes('pc0', 'pc1', color='PreciseYrs'))
 + gg.geom_point()
)
g1.save(os.path.join(plot_save_path, '3PCPC0PC1_beh.png'))

# Same plot for the third and fourth component
g2 = g1 + gg.aes('pc2', 'pc3')
g2.save(os.path.join(plot_save_path, '3PCPC2PC3_beh.png'))
g1, g2

In [None]:
# PC changes over age
# Long format: one row per participant and retained component
fit_long = fit_pd.melt(
    value_vars=['pc{}'.format(i) for i in range(n_comp)],
    id_vars=ages_cols)

gg.options.figure_size = (10, 8)
# Component score against age, one panel per component
g = (gg.ggplot(fit_long, gg.aes('PreciseYrs', 'value'))
 + gg.geom_point(alpha=0.5, size=0.5)
 + gg.geom_smooth(color='red')
 + gg.facet_wrap('~ variable', scales='free')
)
g.save(os.path.join(plot_save_path, '3PCAAgePC.png'))
g

In [None]:
# NOTE(review): `loadings` is only recomputed for this PCA in the next cell, so when run
# top to bottom this shows the loadings of the previous PCA — confirm the intended order.
pd.DataFrame(loadings)

In [None]:
# Factor loadings
# After transposing: one row per feature, one column per retained component
pc_names = ['pc{}'.format(i) for i in range(n_comp)]
loadings = pca.components_[:n_comp].T
loadings_pd = pd.DataFrame(loadings, columns=pc_names, index=beh_dat.columns).reset_index()
# Long format: one row per (feature, component) pair
loadings_long = loadings_pd.melt(id_vars=['index'], value_vars=pc_names)

In [None]:
# NOTE(review): the task label keeps the first 3 characters here, whereas other cells use the first 2 — confirm
loadings_long['task'] = loadings_long['index'].apply(lambda x : x[:3])
loadings_long['ACC_RT'] = loadings_long['index'].apply(name_from_index, names=['ACC', 'RT'])
# Component number parsed from names of the form 'pc<number>'
loadings_long['PC'] = loadings_long['variable'].apply(lambda x : int(x[2:]))

In [None]:
# Split each feature name into the part after the first 3 characters and its 2-character prefix
loadings_pd['param_name'] = loadings_pd['index'].apply(lambda x : x[3:])
loadings_pd['param_bool'] = loadings_pd['index'].apply(lambda x : x[:2])
# name_from_index is defined outside this notebook; presumably it maps a name to the matching entry of `names` — verify
loadings_pd['category'] = loadings_pd['param_name'].apply(name_from_index, names=param_names + ['ACC', 'RT'])
loadings_pd['ACC_RT'] = loadings_pd['param_name'].apply(name_from_index, names=['ACC', 'RT'])
loadings_pd['task'] = loadings_pd['index'].apply(lambda x: x[:2])
loadings_pd

In [None]:
# Loadings of the first five components (PC 0-4), one row of panels per component, one column per task
# NOTE: this reassigns the notebook-level name sub_dat
sub_dat = loadings_long[loadings_long['PC'] < 5]
g = (gg.ggplot(sub_dat, gg.aes('index', 'value', fill='ACC_RT'))
 + gg.geom_bar(stat='identity')
 + gg.theme(axis_text_x=gg.element_text(rotation=90, hjust=1))
 + gg.facet_grid('variable ~ task', scales='free_x')
 + gg.labs(x='', y='Loading')
 + gg.theme(legend_position='none')
)
g.save(os.path.join(plot_save_path, '3PCALoadings_beh.png'))
g

## Z-score all features

In [None]:
# Prepare data
# Drop all age columns except the first, then standardize every remaining column,
# keeping the original row and column labels
sub_dat = all_data.drop(columns=ages_cols[1:])
z_dat = pd.DataFrame(preprocessing.scale(sub_dat), columns=sub_dat.columns, index=sub_dat.index)
z_dat

In [None]:
z_dat_ages = z_dat.reset_index().merge(ages[ages_cols], on=['ID'])
z_dat_ages

In [None]:
import copy

# Long format: one row per value, with task and parameter parsed from the feature name
z_dat_long = z_dat.melt(var_name='feature')
z_dat_long['task'] = z_dat_long.feature.apply(lambda x : x[:2])
z_dat_long['param'] = z_dat_long.feature.apply(lambda x : x[3:])

# Re-use the current plot with the z-scored data. Work on a copy: `g_z = g` would
# only alias the plot, so assigning g_z.data would also replace the data of `g`.
g_z = copy.deepcopy(g)
g_z.data = z_dat_long
g_z

## Regression models

In [None]:
# Run parameter regression models
# One OLS model per parameter; results are collected in lists and concatenated once.
# (DataFrame.append re-copied the growing frame every pass and was removed in pandas 2.0.)
param_r2_frames = []
param_coef_frames = []

for dep_var in param_cols:

    # Indep vars: all parameters of the other models (different 2-character prefix)
    indep_vars = '+'.join([p for p in param_cols if p[:2] != dep_var[:2]])
    model = smf.ols('{} ~ {}'.format(dep_var, indep_vars), data=all_data_z).fit()

    model_r2 = pd.DataFrame({'R2': [model.rsquared], 'dep_var': [dep_var]})
    model_coefs_param = pd.DataFrame(model.params, columns=['coef'])
    model_coefs_param['dep_var'] = dep_var

    param_r2_frames.append(model_r2)
    param_coef_frames.append(model_coefs_param)

# pd.concat rejects an empty list, so fall back to an empty frame as before
r2s_param = pd.concat(param_r2_frames) if param_r2_frames else pd.DataFrame()
coefs_param = pd.concat(param_coef_frames) if param_coef_frames else pd.DataFrame()

In [None]:
# Run behavior regression models
# One OLS model per behavioral measure; results are collected in lists and concatenated once.
# (DataFrame.append re-copied the growing frame every pass and was removed in pandas 2.0.)
beh_r2_frames = []
beh_coef_frames = []

for dep_var in acc_cols + rt_cols:

    # Predict with all behaviors of other models (different 2-character prefix)
    indep_vars = '+'.join([p for p in acc_cols + rt_cols if p[:2] != dep_var[:2]])
    model = smf.ols('{} ~ {}'.format(dep_var, indep_vars), data=all_data_z).fit()

    model_r2 = pd.DataFrame({'R2': [model.rsquared], 'dep_var': [dep_var]})
    model_coefs_beh = pd.DataFrame(model.params, columns=['coef'])
    model_coefs_beh['dep_var'] = dep_var

    beh_r2_frames.append(model_r2)
    beh_coef_frames.append(model_coefs_beh)

# pd.concat rejects an empty list, so fall back to an empty frame as before
r2s_beh = pd.concat(beh_r2_frames) if beh_r2_frames else pd.DataFrame()
coefs_beh = pd.concat(beh_coef_frames) if beh_coef_frames else pd.DataFrame()

In [None]:
# Predict age from parameters
# Three OLS models of age (PreciseYrs), differing only in the predictor set
indep_vars = '+'.join(param_cols_by_param)
model_param = smf.ols('PreciseYrs ~ {}'.format(indep_vars), data=all_data_z).fit()
param_coefs = pd.DataFrame(model_param.params, columns=['coef'])
param_coefs['indep_var'] = 'parameters'

# Predict age from behavior
indep_vars = '+'.join(rt_cols + acc_cols)
model_beh = smf.ols('PreciseYrs ~ {}'.format(indep_vars), data=all_data_z).fit()
beh_coefs = pd.DataFrame(model_beh.params, columns=['coef'])
beh_coefs['indep_var'] = 'behavior'

# Predict age from all
indep_vars = '+'.join(param_cols_by_param + rt_cols + acc_cols)
model_all = smf.ols('PreciseYrs ~ {}'.format(indep_vars), data=all_data_z).fit()
all_coefs = pd.DataFrame(model_all.params, columns=['coef'])
all_coefs['indep_var'] = 'all'

# One R^2 per predictor set
r2s_age = pd.DataFrame({
    'dep_var': ['PreciseYrs'] * 3,
    'R2': [model_param.rsquared, model_beh.rsquared, model_all.rsquared],
    'indep_vars': ['parameters', 'behavior', 'all']
})

# coefs_age = param_coefs.append(beh_coefs).append(all_coefs)
# Only the coefficients of the full model are prepared for plotting; the intercept row is removed
all_coefs = all_coefs.reset_index()
all_coefs['task'] = all_coefs['index'].apply(lambda x : x[:2])
all_coefs = all_coefs[all_coefs['index'] != 'Intercept']
all_coefs['category'] = all_coefs['index'].apply(name_from_index, names=param_names+['ACC', 'RT'])
# all_coefs

In [None]:
# Beautify data
# Derive task (first 2 characters) and parameter name (from the 4th character on) from dep_var
r2s_param = r2s_param.reset_index(drop=True)
r2s_param['task'] = r2s_param.dep_var.apply(lambda x : x[:2])
r2s_param['param'] = r2s_param.dep_var.apply(lambda x : x[3:])

coefs_param['task'] = coefs_param.dep_var.apply(lambda x : x[:2])
coefs_param['param'] = coefs_param.dep_var.apply(lambda x : x[3:])

r2s_beh = r2s_beh.reset_index(drop=True)
r2s_beh['task'] = r2s_beh.dep_var.apply(lambda x : x[:2])
r2s_beh['param'] = r2s_beh.dep_var.apply(lambda x : x[3:])
r2s_beh['category'] = r2s_beh['param'].apply(name_from_index, names=['ACC', 'RT'])

coefs_beh['task'] = coefs_beh.dep_var.apply(lambda x : x[:2])
coefs_beh['param'] = coefs_beh.dep_var.apply(lambda x : x[3:])
# NOTE(review): `names` is ordered ['RT', 'ACC'] here but ['ACC', 'RT'] for r2s_beh — confirm the order does not matter to name_from_index
coefs_beh['category'] = coefs_beh['param'].apply(name_from_index, names=['RT', 'ACC'])

In [None]:
# Predicting age
gg.options.figure_size = (5, 5)
# Absolute weight of every predictor in the full age model, one panel per task
g = (gg.ggplot(all_coefs, gg.aes('index', 'abs(coef)', fill='category'))
     + gg.stat_summary(geom='bar')
     + gg.stat_summary(geom='pointrange')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
     + gg.facet_wrap('~ task', scales='free_x')
     + gg.labs(x='', y='Weight when predicting age')
    )
g.save(os.path.join(plot_save_path, '3PredictingAgeCoefs.png'))
g

In [None]:
# R^2 of the three age models, with the y-axis fixed to the range 0-1
g = (gg.ggplot(r2s_age, gg.aes('indep_vars', 'R2'))
     + gg.geom_bar(stat='identity')
     + gg.coord_cartesian(ylim=(0, 1))
     + gg.labs(x='', y='R2 when predicting age')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1))
    )
g.save(os.path.join(plot_save_path, '3PredictingAgeR2.png'))
g

In [None]:
# Predicting parameters
gg.options.figure_size = (7, 5)
# R^2 of each parameter's regression model, one panel per task
g = (gg.ggplot(r2s_param, gg.aes('param', 'R2', fill='param'))
     + gg.geom_bar(stat='identity')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1),
                legend_position='none')
     + gg.facet_wrap('~ task', scales='free_x')
     + gg.labs(x='', y='R2 when predicting each parameter')
)
g.save(os.path.join(plot_save_path, '3PredictingParamsR2.png'))
g

In [None]:
# Re-use the current plot for the behavior models: new fill mapping and y label, then swap in r2s_beh
g_beh = g + gg.aes(fill='category') + gg.labs(y='R2 when predicting behaviors')
g_beh.data = r2s_beh
g_beh.save(os.path.join(plot_save_path, '3PredictingBehaviorR2.png'))
g_beh

In [None]:
# Plot coefficients
# Absolute regression weights summarized per predicted parameter (bar plus point range), one panel per task
g = (gg.ggplot(coefs_param, gg.aes('param', 'abs(coef)', fill='param'))
#      + gg.geom_point(position='jitter', size=0.1)
     + gg.stat_summary(geom='bar')
     + gg.stat_summary(geom='pointrange')
     + gg.theme(axis_text_x=gg.element_text(rotation=45, hjust=1),
                legend_position='none')
     + gg.facet_wrap('~ task', scales='free_x')
     + gg.labs(x='', y='Weight when predicting parameters')
    )
g.save(os.path.join(plot_save_path, '3PredictingParamsCoefs.png'))
g

In [None]:
# Re-use the current plot for the behavior models: new fill mapping and y label, then swap in coefs_beh
g_beh = g + gg.aes(fill='category') + gg.labs(y = 'Weight when predicting behaviors')
g_beh.data = coefs_beh
g_beh.save(os.path.join(plot_save_path, '3PredictingBehaviorCoefs.png'))
g_beh

# Dendrograms

In [None]:
# Clustered heatmaps of behavior + age and of parameters + age (rows with missing values dropped)
sns.clustermap(all_data[acc_cols + rt_cols + ['PreciseYrs']].dropna(), z_score=1)
sns.clustermap(all_data[param_cols_by_param + ['PreciseYrs']].dropna(), z_score=1)
# NOTE(review): plt.savefig writes only the current figure — presumably the second clustermap —
# so the first one is not saved; confirm that is intended.
plt.savefig(os.path.join(plot_save_path, '4DendrogramBeh_param.png'))

# tSNE

In [None]:
# Show participants in 2D space
# NOTE(review): no random_state is passed to TSNE, so the embedding may differ between runs
tsne_dat = TSNE(n_components=2).fit_transform(z_dat.dropna())
# NOTE(review): IDs are rebuilt from all_data; this assumes the same rows, in the same order, as z_dat.dropna() — TODO confirm
index = all_data.reset_index().drop(columns=ages_cols[1:]).dropna().ID
tsne_pd = pd.DataFrame(tsne_dat, columns=['tSNE1', 'tSNE2'], index=index).reset_index()
# Attach age, gender and meanT (merge on the column names the two frames share)
tsne_pd = tsne_pd.merge(ages[['ID', 'PreciseYrs', 'Gender', 'meanT']])

In [None]:
gg.options.figure_size = (5, 4)
g = (gg.ggplot(tsne_pd, gg.aes('tSNE1', 'tSNE2', color='PreciseYrs'))
 + gg.geom_point()
)
g.save(os.path.join(plot_save_path, '4TsneParticipants.png'))
g

In [None]:
# Long format: one row per participant and tSNE dimension
tsne_long = tsne_pd.melt(id_vars=['ID', 'PreciseYrs', 'Gender', 'meanT'])
# Each tSNE dimension against age, with a smoother, one panel per dimension
g = (gg.ggplot(tsne_long, gg.aes('PreciseYrs', 'value'))
 + gg.geom_point()
 + gg.geom_smooth()
 + gg.facet_grid('~ variable')
)
g.save(os.path.join(plot_save_path, '4TsneParticipants_.png'))
g

In [None]:
# Show measures in 2D space
# Transposed input: every feature (column of z_dat) becomes one point in the embedding
# NOTE(review): no random_state is passed to TSNE, so the embedding may differ between runs
tsne_dat = TSNE(n_components=2).fit_transform(z_dat.dropna().T)
tsne_pd = pd.DataFrame(tsne_dat, columns=['tSNE1', 'tSNE2'], index=z_dat.columns).reset_index()
# Descriptive columns derived from the feature name (helpers are defined outside this notebook)
tsne_pd['task'] = tsne_pd['index'].apply(lambda x : x[:2])
tsne_pd['param_name'] = tsne_pd['index'].apply(name_from_index)
tsne_pd['ACC_RT'] = tsne_pd['index'].apply(name_from_index, names=['ACC', 'RT'])
tsne_pd['param_bool'] = tsne_pd['index'].apply(bool_from_index)
tsne_pd

In [None]:
gg.options.figure_size = (5, 4)
g = (gg.ggplot(tsne_pd, gg.aes('tSNE1', 'tSNE2', color='task'))
 + gg.geom_point()
)
g.save(os.path.join(plot_save_path, '4TsneFeaturesBeh_task.png'))
g

# Transforming RLWM regression weights

In [None]:
# Summary statistics of the raw, sigmoid-transformed and z-scored 'rl_lrn-fix' weights
# NOTE(review): 'rl_lrn-fix_sig' is only created in a later cell, and `zscore` is not imported in this notebook — confirm both
all_data['rl_lrn-fix'].describe()
all_data['rl_lrn-fix_sig'].describe()
zscore(all_data['rl_lrn-fix']).describe()

In [None]:
# Illustrate the transform on an evenly spaced grid from -30 to 30
# NOTE(review): `sigmoid` and `zscore` are not defined or imported in this notebook — confirm where they come from
x = np.arange(-30, 30, 0.1)
# x = np.arange(-10, 10, 0.01)
# dat = pd.DataFrame({'x': x, 'y': 3 *sigmoid(x/3)})
# dat = pd.DataFrame({'x': x, 'y': sigmoid(x)})
dat = pd.DataFrame({'x': x, 'y': sigmoid(zscore(x))})
dat
(gg.ggplot(dat, gg.aes('x', 'y')) +
 gg.geom_point()
)

In [None]:
# Add the sigmoid-transformed weight as a new column; the commented lines are alternative scalings that were tried
# all_data['rl_lrn-fix_sig'] = sigmoid(zscore(all_data['rl_lrn-fix']))
all_data['rl_lrn-fix_sig'] = sigmoid(all_data['rl_lrn-fix'])
# all_data['rl_lrn-fix_sig'] = 50 * sigmoid(all_data['rl_lrn-fix'] / 50)
# all_data['rl_lrn-fix_sig'] = all_data['rl_lrn-fix_sig'].median() * sigmoid(all_data['rl_lrn-fix'] / all_data['rl_lrn-fix_sig'].median())

# Raw weight (x) against its transformed value (y)
(gg.ggplot(all_data, gg.aes('rl_lrn-fix', 'rl_lrn-fix_sig')) +
 gg.geom_point()
)

In [None]:
dat = all_data#[all_data['rl_lrn-fix'] < 50]
(gg.ggplot(dat, gg.aes('PreciseYrs', 'rl_lrn-fix_sig'))
 + gg.geom_point()
 + gg.geom_smooth()
)

In [None]:
dat = all_data[all_data['rl_lrn-fix'] < 50]
(gg.ggplot(dat, gg.aes('PreciseYrs', 'rl_lrn-fix'))
 + gg.geom_point()
 + gg.geom_smooth()
)

In [None]:
(gg.ggplot(all_data, gg.aes('PreciseYrs', 'rl_lrn-ns'))
 + gg.geom_point()
)

In [None]:
(gg.ggplot(all_data, gg.aes('PreciseYrs', 'rl_lrn-pcor'))
 + gg.geom_point()
)

In [None]:
(gg.ggplot(all_data, gg.aes('PreciseYrs', 'rl_lrn-pinc'))
 + gg.geom_point()
)