# Compare variance explained among top features

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import numpy as np

import os

# Project root. Every input/output location in this notebook is built by plain
# string concatenation (`path + "output/..."`), so the value must end with a
# path separator; os.path.join(..., '') appends one only when it is missing.
# Set the INTERNSHIP_AUMC_DIR environment variable to run on another machine;
# the default is the original hard-coded location.
path = os.path.join(os.environ.get('INTERNSHIP_AUMC_DIR', '/home/jln436/Internship_AUMC/'), '')

In [2]:
def _read_importance(relative_csv):
    """Read one importance table stored under the project root `path`."""
    return pd.read_csv(path + relative_csv)


# One table per model. Later cells read the `features` column of each table and
# slice its first rows, so the rows are presumably ordered from most to least
# important -- confirm against the notebooks that wrote these files.
lgbm_importance = _read_importance("output/Importances/LightGBM_Importances_gain.csv")
rf_importance = _read_importance("output/Importances/RF_Importances.csv")
Lasso_importance = _read_importance("output/Importances/Lasso_Importances.csv")
Ridge_importance = _read_importance("output/Importances/Ridge_Importances.csv")
SVM_importance = _read_importance("output/Importances/SVM_Importances.csv")

In [3]:
# Training features and the matching outcome table, both read from <path>/output/dfs/.
df = pd.read_csv(path + "output/dfs/X_train_final.csv")
target = pd.read_csv(path + "output/dfs/y_train_final.csv")

# Expose the outcome column under the generic name `target`, which the
# cross-validation code below accesses as `y.target`.
y = target.rename({'cmcodt_mean': 'target'}, axis='columns')

In [4]:
# Remove `respnr` before scaling (presumably a respondent identifier rather than
# a predictor -- confirm). The drop is non-mutating and uses errors='ignore' so
# the cell can be re-run safely: the previous in-place drop raised KeyError on a
# second execution because the column was already gone.
df = df.drop(columns=['respnr'], errors='ignore')

# Standardise every remaining column to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the CV split below,
# so held-out folds contribute to the scaling statistics.
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [5]:
# The target is continuous, so it is cut into 10 equal-width intervals; the
# integer bin codes (labels=False) are what StratifiedKFold stratifies on.
y_stratified = pd.cut(y['target'], bins=10, labels=False)

## Perform 10-fold cross-validation with all models to get an idea of $R^2$

In [6]:
def _plot_fold_predictions(y_test, y_pred, score):
    """Plot observed and predicted targets of one fold, ordered by observed value."""
    fold_df = y_test.to_frame().assign(predicted=y_pred).sort_values(by=['target'])
    x_ax = range(len(fold_df))
    plt.figure(figsize=(12, 6))
    plt.plot(x_ax, fold_df.target, label="original", color="red")
    plt.plot(x_ax, fold_df.predicted, label="predicted", color="purple")
    plt.title("CV plot")
    plt.xlabel('Individuals ordered by Score')
    plt.ylabel('Future Information Processing Speed')
    plt.legend(loc='best', fancybox=True, shadow=True)
    plt.grid(True)
    # Anchor the annotation in axes-fraction coordinates so it stays inside the
    # plot whatever the range of the target is (it used to sit at the fixed data
    # coordinates (0.5, 32)).
    plt.text(0.98, 0.05, 'R^2: ' + str(round(score, 2)), fontsize=10,
             transform=plt.gca().transAxes,
             horizontalalignment='right', verticalalignment='bottom')
    plt.show()


def stratified_linear_regression(col_names, n, title, plotting = False):
    """Cross-validate an ordinary least-squares model on a subset of features.

    Relies on three notebook-level globals: `df` (feature frame), `y` (frame
    with a `target` column) and `y_stratified` (binned target, used only to
    stratify the folds). Prints the R^2 of every fold and their mean.

    Parameters
    ----------
    col_names : list-like of column labels
        Columns of `df` used as predictors.
    n : int
        Number of stratified folds.
    title : str
        Label printed above the per-fold scores.
    plotting : bool, default False
        When true, show one observed-vs-predicted figure per fold.

    Returns
    -------
    list of float
        R^2 on the held-out part of each fold, in fold order.
    """
    print(title + ": \n --------------------------------------------")
    X = df[col_names]
    # Fixed seed so every feature subset is evaluated on the same folds.
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=10)

    scores = []
    for train_index, test_index in skf.split(X, y_stratified):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.target.iloc[train_index], y.target.iloc[test_index]

        lm = LinearRegression()
        lm.fit(X_train, y_train)
        y_pred = lm.predict(X_test)

        score = r2_score(y_test, y_pred)
        scores.append(score)

        if plotting:
            _plot_fold_predictions(y_test, y_pred, score)

    # Saved runs of this notebook contain folds with R^2 around -1e19 to -1e21
    # (presumably an unstable fit on near-collinear columns -- TODO confirm);
    # such values are rounded coarsely so they print compactly.
    for fold, fold_score in enumerate(scores):
        digits = -15 if abs(fold_score) > 1000 else 3
        print("CV " + str(fold) + ": " + str(round(fold_score, digits)))
    print("mean score: " + str(sum(scores)/n) + "\n --------------------------------------------" )
    return scores

In [7]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session,
# including any that scikit-learn may raise while building the folds.
warnings.filterwarnings("ignore")

# Size of the top-ranked feature subset taken from each method. Kept in one
# place so the printed header and the five slices below cannot drift apart.
n_top = 49
print(str(n_top) + " Features selected by Lasso Regression: --------------------------------------------------------------------------------")
# Baseline: ordinary least squares on every available column.
all_features_scores = stratified_linear_regression(df.columns, 10, "All Features Included")
lgbm_gain_scores = stratified_linear_regression(lgbm_importance.features[0:n_top], 10, "LGBM")
rf_scores = stratified_linear_regression(rf_importance.features[0:n_top], 10, "Random Forest")
lasso_scores = stratified_linear_regression(Lasso_importance.features[0:n_top], 10, "Lasso")
ridge_scores = stratified_linear_regression(Ridge_importance.features[0:n_top], 10, "Ridge Regression")
svm_scores = stratified_linear_regression(SVM_importance.features[0:n_top], 10, "SVM")

48 Features selected by Lasso Regression: --------------------------------------------------------------------------------
All Features Included: 
 --------------------------------------------
CV 0: -0.126
CV 1: -1.6581e+19
CV 2: -0.128
CV 3: -0.059
CV 4: 0.335
CV 5: 0.005
CV 6: -1.957397e+21
CV 7: 0.128
CV 8: 0.029
CV 9: -0.112
mean score: -1.9739779718863592e+20
 --------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.768
CV 1: 0.698
CV 2: 0.819
CV 3: 0.769
CV 4: 0.811
CV 5: 0.744
CV 6: 0.75
CV 7: 0.812
CV 8: 0.851
CV 9: 0.722
mean score: 0.7743766679386269
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.777
CV 1: 0.719
CV 2: 0.832
CV 3: 0.777
CV 4: 0.795
CV 5: 0.746
CV 6: 0.757
CV 7: 0.822
CV 8: 0.836
CV 9: 0.716
mean score: 0.7777275286339498
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.797
CV 1: 0.757
C

In [8]:
# Size of the top-ranked feature subset taken from each method. Kept in one
# place so the printed header and the five slices below cannot drift apart.
n_top = 20
print(str(n_top) + " Features selected by Lasso Regression: --------------------------------------------------------------------------------")
lgbm_gain_scores = stratified_linear_regression(lgbm_importance.features[0:n_top], 10, "LGBM", plotting = False)
rf_scores = stratified_linear_regression(rf_importance.features[0:n_top], 10, "Random Forest", plotting = False)
lasso_scores = stratified_linear_regression(Lasso_importance.features[0:n_top], 10, "Lasso", plotting = False)
ridge_scores = stratified_linear_regression(Ridge_importance.features[0:n_top], 10, "Ridge Regression", plotting = False)
svm_scores = stratified_linear_regression(SVM_importance.features[0:n_top], 10, "SVM", plotting = False)

20 Features selected by Lasso Regression: --------------------------------------------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.784
CV 1: 0.714
CV 2: 0.832
CV 3: 0.782
CV 4: 0.811
CV 5: 0.745
CV 6: 0.767
CV 7: 0.821
CV 8: 0.859
CV 9: 0.729
mean score: 0.7843029391353069
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.783
CV 1: 0.718
CV 2: 0.83
CV 3: 0.783
CV 4: 0.801
CV 5: 0.741
CV 6: 0.77
CV 7: 0.823
CV 8: 0.856
CV 9: 0.737
mean score: 0.7841449059401012
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.791
CV 1: 0.767
CV 2: 0.835
CV 3: 0.802
CV 4: 0.809
CV 5: 0.774
CV 6: 0.781
CV 7: 0.839
CV 8: 0.852
CV 9: 0.733
mean score: 0.7983581755695556
 --------------------------------------------
Ridge Regression: 
 --------------------------------------------
CV 0: 0.772
CV 1: 0.745
CV 2: 0.845
CV 3: 0.788
CV 4: 

In [9]:
# Size of the top-ranked feature subset taken from each method. Kept in one
# place so the printed header and the five slices below cannot drift apart.
n_top = 100
print(str(n_top) + " Features selected by Lasso Regression: --------------------------------------------------------------------------------")
lgbm_gain_scores = stratified_linear_regression(lgbm_importance.features[0:n_top], 10, "LGBM")
rf_scores = stratified_linear_regression(rf_importance.features[0:n_top], 10, "Random Forest")
lasso_scores = stratified_linear_regression(Lasso_importance.features[0:n_top], 10, "Lasso")
ridge_scores = stratified_linear_regression(Ridge_importance.features[0:n_top], 10, "Ridge Regression")
svm_scores = stratified_linear_regression(SVM_importance.features[0:n_top], 10, "SVM")

100 Features selected by Lasso Regression: --------------------------------------------------------------------------------
LGBM: 
 --------------------------------------------
CV 0: 0.758
CV 1: 0.68
CV 2: 0.775
CV 3: 0.763
CV 4: 0.798
CV 5: 0.754
CV 6: 0.75
CV 7: 0.816
CV 8: 0.819
CV 9: 0.705
mean score: 0.7618322977566467
 --------------------------------------------
Random Forest: 
 --------------------------------------------
CV 0: 0.776
CV 1: 0.699
CV 2: 0.805
CV 3: 0.752
CV 4: 0.778
CV 5: 0.74
CV 6: 0.734
CV 7: 0.815
CV 8: 0.826
CV 9: 0.711
mean score: 0.7636185834701974
 --------------------------------------------
Lasso: 
 --------------------------------------------
CV 0: 0.755
CV 1: 0.752
CV 2: 0.78
CV 3: 0.776
CV 4: 0.787
CV 5: 0.79
CV 6: -8.039678e+21
CV 7: 0.815
CV 8: 0.867
CV 9: 0.72
mean score: -8.039677636671886e+20
 --------------------------------------------
Ridge Regression: 
 --------------------------------------------
CV 0: 0.774
CV 1: 0.757
CV 2: 0.774
CV 3: 0.7

## All unique features from each method combined

In [10]:
# Pool the first 48 entries of every method's `features` column, then keep each
# name once; np.unique also returns the names sorted.
# NOTE(review): the 49-feature comparison cell slices [0:49] while this cell
# uses [0:48] -- confirm which cutoff is intended.
importance_tables = (lgbm_importance, rf_importance, Lasso_importance, Ridge_importance, SVM_importance)
tmp_list = [feature for table in importance_tables for feature in table.features[0:48]]
combined_features = list(np.unique(tmp_list))

In [11]:
print("Combined Features: --------------------------------------------------------------------------------")
# Stored under its own name: the result used to be assigned to `lgbm_gain_scores`,
# which overwrote the LightGBM scores computed earlier with the combined-feature ones.
combined_scores = stratified_linear_regression(combined_features, 10, "Combined")

Combined Features: --------------------------------------------------------------------------------
Combined: 
 --------------------------------------------
CV 0: 0.762
CV 1: 0.72
CV 2: 0.774
CV 3: 0.764
CV 4: 0.837
CV 5: 0.755
CV 6: 0.764
CV 7: 0.813
CV 8: 0.826
CV 9: 0.727
mean score: 0.774395783815075
 --------------------------------------------
