# Compare variance explained among top features

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

import numpy as np

import os

# Root folder of the project; every file below is read as `path` + a relative name.
# Set the INTERNSHIP_AUMC_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
# os.path.join(..., '') guarantees the trailing separator that the plain string
# concatenations (`path + "output/..."`) rely on.
path = os.path.join(os.environ.get('INTERNSHIP_AUMC_DIR', '/home/jln436/Internship_AUMC/'), '')

In [2]:
# Load one feature-importance table per model. Later cells read the `features`
# column of each table and slice its first rows.
lgbm_importance, rf_importance, Lasso_importance, Ridge_importance, SVM_importance = (
    pd.read_csv(path + importance_csv)
    for importance_csv in (
        "output/Importances/LightGBM_Importances_gain_no_cog_tests.csv",
        "output/Importances/RF_Importances_no_cog_tests.csv",
        "output/Importances/Lasso_Importances_no_cog_tests.csv",
        "output/Importances/Ridge_Importances_no_cog_tests.csv",
        "output/Importances/SVM_Importances_no_cog_tests.csv",
    )
)

In [3]:
# Training target first, then the training feature matrix.
target = pd.read_csv(path + 'output/dfs/y_train_final.csv')
df = pd.read_csv(path + "output/dfs/X_train_final.csv")
# Work with a copy whose outcome column is simply called 'target'.
y = target.rename({'cmcodt_mean': 'target'}, axis='columns')

In [4]:
# Remove the 'respnr' column before scaling (presumably a respondent id — confirm).
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the original
# in-place drop raised KeyError on a second execution because the column was gone.
df = df.drop(columns = ['respnr'], errors = 'ignore')
# Standardise every remaining column; the scaler is fitted on the full frame,
# i.e. before any cross-validation split is made.
scaler = StandardScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), columns = df.columns)

In [5]:
# Bin the continuous target into 10 equal-width intervals and keep only the
# integer bin index, so it can serve as the class label for stratified splitting.
y_stratified = pd.cut(y['target'], bins=10, labels=False)

## Perform 10-fold cross-validation with all models to get an idea of R2

In [6]:
def stratified_linear_regression(col_names, n, title, plotting = False):
    """Cross-validate an ordinary least-squares model on a subset of features.

    Uses the notebook-level globals ``df`` (feature frame), ``y`` (frame with a
    ``target`` column) and ``y_stratified`` (binned target, used only to
    stratify the folds).

    Parameters
    ----------
    col_names : sequence of str
        Columns of ``df`` used as predictors.
    n : int
        Number of cross-validation folds.
    title : str
        Label printed above the per-fold scores.
    plotting : bool, default False
        When true, one figure per fold is drawn showing the held-out targets
        (sorted ascending) against the corresponding predictions.

    Returns
    -------
    list of float
        R^2 on the held-out part of every fold, in fold order.

    Notes
    -----
    The printed mean is a plain arithmetic mean, so a single fold with an
    extreme score dominates it; inspect the per-fold lines as well.
    """
    print(title + ": \n --------------------------------------------")
    X = df[col_names]
    # Shuffled stratified folds with a fixed seed so the split is reproducible
    skf = StratifiedKFold(n_splits=n, 
                          shuffle=True, 
                          random_state=10)
    scores = []
    for train_index, test_index in skf.split(X, y_stratified):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.target.iloc[train_index], y.target.iloc[test_index]
        lm = LinearRegression()
        lm.fit(X_train, y_train)   
        y_pred = lm.predict(X_test)
        score = r2_score(y_test, y_pred)
        scores.append(score)
        # Generate plots to see what the predictions look like
        if plotting:
            tmp_df = pd.DataFrame(data = y_test)
            tmp_df['predicted'] = y_pred
            tmp_df = tmp_df.sort_values(by=['target'])
            # visualizing in a plot
            x_ax = range(len(y_test))
            plt.figure(figsize=(12, 6))
            plt.plot(x_ax, tmp_df.target, label="original", color = "red")
            plt.plot(x_ax, tmp_df.predicted, label="predicted", color = "purple")
            plt.title("CV plot")
            plt.xlabel('Individuals ordered by Score')
            plt.ylabel('Future Information Processing Speed')
            plt.legend(loc='best',fancybox=True, shadow=True)
            plt.grid(True)
            # NOTE(review): (0.5, 32) is in data coordinates — confirm it lies
            # inside the plotted range of the target.
            plt.text(0.5, 32, 'R^2: ' + str(round(score,2)), fontsize = 10)
            plt.show()
            
    for fold, score in enumerate(scores):
        if abs(score) > 1000:
            # Exploded fold: show it in scientific notation. The previous
            # round(score, -15) printed anything with |score| < 5e14 as 0.0.
            shown = "{:.4e}".format(score)
        else:
            shown = str(round(score, 3))
        print("CV " + str(fold) + ": " + shown)
    print("mean score: " + str(sum(scores)/len(scores)) + "\n --------------------------------------------" )
    return scores

In [7]:
import warnings
# Suppress all warnings emitted from here on (affects the whole kernel session).
warnings.filterwarnings("ignore")
print("91 Features selected by Lasso Regression: --------------------------------------------------------------------------------")
# NOTE(review): the banner mentions Lasso, but the calls below evaluate the full
# feature set and the first 91 features of every model's importance table.
all_features_scores = stratified_linear_regression(col_names=df.columns, n=10, title="All Features Included")
lgbm_gain_scores = stratified_linear_regression(col_names=lgbm_importance["features"][:91], n=10, title="LGBM")
rf_scores = stratified_linear_regression(col_names=rf_importance["features"][:91], n=10, title="Random Forest")
lasso_scores = stratified_linear_regression(col_names=Lasso_importance["features"][:91], n=10, title="Lasso")
ridge_scores = stratified_linear_regression(col_names=Ridge_importance["features"][:91], n=10, title="Ridge Regression")
svm_scores = stratified_linear_regression(col_names=SVM_importance["features"][:91], n=10, title="SVM")

91 Features selected by Lasso Regression: --------------------------------------------------------------------------------
All Features Included: 
 --------------------------------------------
CV 0: -0.126
CV 1: -1.6581e+19
CV 2: -0.128
CV 3: -0.059
CV 4: 0.335
CV 5: 0.005
CV 6: -1.957397e+21
CV 7: 0.128
CV 8: 0.029
CV 9: -0.112
mean score: -1.9739779718863592e+20
 --------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.453
CV 1: 0.385
CV 2: 0.365
CV 3: 0.314
CV 4: 0.321
CV 5: 0.331
CV 6: 0.305
CV 7: 0.364
CV 8: 0.429
CV 9: 0.337
mean score: 0.3603326415651222
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.455
CV 1: 0.439
CV 2: 0.437
CV 3: 0.382
CV 4: 0.315
CV 5: 0.293
CV 6: 0.303
CV 7: 0.424
CV 8: 0.481
CV 9: 0.396
mean score: 0.39248017445802963
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.403
CV 1: 0.436

In [8]:
print("20 Features selected by Lasso Regression: --------------------------------------------------------------------------------")
# Same comparison restricted to the first 20 features of every importance table.
# The *_scores variables from the previous run are overwritten.
lgbm_gain_scores = stratified_linear_regression(col_names=lgbm_importance["features"][:20], n=10, title="LGBM")
rf_scores = stratified_linear_regression(col_names=rf_importance["features"][:20], n=10, title="Random Forest")
lasso_scores = stratified_linear_regression(col_names=Lasso_importance["features"][:20], n=10, title="Lasso")
ridge_scores = stratified_linear_regression(col_names=Ridge_importance["features"][:20], n=10, title="Ridge Regression")
svm_scores = stratified_linear_regression(col_names=SVM_importance["features"][:20], n=10, title="SVM")

20 Features selected by Lasso Regression: --------------------------------------------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.409
CV 1: 0.427
CV 2: 0.383
CV 3: 0.411
CV 4: 0.387
CV 5: 0.349
CV 6: 0.32
CV 7: 0.369
CV 8: 0.515
CV 9: 0.357
mean score: 0.39267885707116157
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.405
CV 1: 0.382
CV 2: 0.395
CV 3: 0.393
CV 4: 0.476
CV 5: 0.291
CV 6: 0.315
CV 7: 0.348
CV 8: 0.525
CV 9: 0.365
mean score: 0.3894406323152694
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.476
CV 1: 0.427
CV 2: 0.428
CV 3: 0.397
CV 4: 0.48
CV 5: 0.406
CV 6: 0.321
CV 7: 0.5
CV 8: 0.537
CV 9: 0.488
mean score: 0.445911422565923
 --------------------------------------------
Ridge Regression: 
 --------------------------------------------
CV 0: 0.464
CV 1: 0.414
CV 2: 0.43
CV 3: 0.403
CV 4: 0.4

In [9]:
print("50 Features selected by Lasso Regression: --------------------------------------------------------------------------------")
# Same comparison restricted to the first 50 features of every importance table.
# The *_scores variables from the previous run are overwritten.
lgbm_gain_scores = stratified_linear_regression(col_names=lgbm_importance["features"][:50], n=10, title="LGBM")
rf_scores = stratified_linear_regression(col_names=rf_importance["features"][:50], n=10, title="Random Forest")
lasso_scores = stratified_linear_regression(col_names=Lasso_importance["features"][:50], n=10, title="Lasso")
ridge_scores = stratified_linear_regression(col_names=Ridge_importance["features"][:50], n=10, title="Ridge Regression")
svm_scores = stratified_linear_regression(col_names=SVM_importance["features"][:50], n=10, title="SVM")

50 Features selected by Lasso Regression: --------------------------------------------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.502
CV 1: 0.423
CV 2: 0.406
CV 3: 0.365
CV 4: 0.37
CV 5: 0.324
CV 6: 0.331
CV 7: 0.376
CV 8: 0.456
CV 9: 0.379
mean score: 0.39332149726604887
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.415
CV 1: 0.374
CV 2: 0.428
CV 3: 0.42
CV 4: 0.431
CV 5: 0.351
CV 6: 0.309
CV 7: 0.439
CV 8: 0.484
CV 9: 0.401
mean score: 0.40530421598574956
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.486
CV 1: 0.489
CV 2: 0.396
CV 3: 0.412
CV 4: 0.503
CV 5: 0.376
CV 6: 0.365
CV 7: 0.466
CV 8: 0.529
CV 9: 0.54
mean score: 0.4563583187143152
 --------------------------------------------
Ridge Regression: 
 --------------------------------------------
CV 0: 0.504
CV 1: 0.424
CV 2: 0.373
CV 3: 0.44
CV 4: 

## All unique features from each method combined

In [10]:
# Concatenate the first 91 features of every importance table (duplicates included) ...
tmp_list = [
    feature
    for importance_table in (lgbm_importance, rf_importance, Lasso_importance, Ridge_importance, SVM_importance)
    for feature in importance_table.features[0:91]
]
# ... then keep each feature name once; np.unique also returns them sorted.
combined_features = list(np.unique(tmp_list))

In [11]:
print("Combined Features: --------------------------------------------------------------------------------")
# Stored under its own name: the original assigned this result to
# `lgbm_gain_scores`, which silently replaced the LGBM scores.
combined_scores = stratified_linear_regression(combined_features, 10, "Combined")

Combined Features: --------------------------------------------------------------------------------
Combined: 
 --------------------------------------------
CV 0: 0.31
CV 1: 0.355
CV 2: -4.9194273e+22
CV 3: 0.289
CV 4: 0.242
CV 5: 0.35
CV 6: 0.172
CV 7: 0.38
CV 8: 0.456
CV 9: 0.411
mean score: -4.919427323855209e+21
 --------------------------------------------
