In [1]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def load(pth:str):
    """Read a CSV file and return it without its ``"Unnamed: 0"`` column.

    Parameters
    ----------
    pth : str
        Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table minus the ``"Unnamed: 0"`` column.

    Raises
    ------
    KeyError
        If the file has no column named ``"Unnamed: 0"``.
    """
    frame = pd.read_csv(pth)
    # `columns=` already implies the column axis, so no explicit axis is needed.
    return frame.drop(columns=["Unnamed: 0"])
# Load both CSV files with load(); each frame comes back without its "Unnamed: 0" column.
df_kirc, df_lusc = load("kirc_final_form.csv"), load("lusc_final_form.csv")

In [3]:
# Preview the first five rows of the KIRC frame.
df_kirc.head(5)

Unnamed: 0,FBXO22,AURKC,AKAP7,CCDC158,ZBTB39,CNV_Xq11.2,CNV_17q24.3,CNV_7q36.3,CNV_3p21.32,CNV_14q31.1,...,CNV_5q35.1,CNV_1p36.23,CNV_1p31.1,CNV_10p14,CNV_4q32.1,CNV_10q23.31,CNV_9p23,CNV_3p12.2,CNV_3q11.2,CNV_Xp11.4
0,450.94,10.55,205.5,13.42,70.01,0.00222,-0.006229,0.001387,0.003125,0.001942,...,0.44071,0.0,0.0,-0.00194,-0.45179,0.002775,-0.001247,0.006526,0.008165,0.00222
1,642.59,13.72,199.85,2.43,50.63,-0.024114,-0.015604,0.001387,-0.40374,0.13509,...,0.01419,-0.010892,-0.010892,-0.007886,0.007222,-0.007886,-0.001524,-0.40639,0.002636,-0.024114
2,698.41,41.52,140.04,4.17,30.78,-0.009681,0.010842,0.095508,-0.08206,-0.000416,...,0.11594,-0.11525,0.019028,-0.063435,0.0,-0.063435,-0.070536,-0.08206,-0.08206,-0.009681
3,808.0,9.56,283.05,9.27,43.94,-0.063156,-0.020687,0.98908,-0.53073,0.0,...,-0.011611,-0.02384,0.004481,-0.001801,0.003885,-0.001801,0.006692,-0.50784,0.005136,-0.063156
4,511.0,21.78,295.11,43.63,28.24,-0.064738,0.014692,0.6072,-0.63979,0.01562,...,-0.002078,-0.62979,-0.62979,-0.009819,-0.005434,-0.64358,0.0,-0.63869,-0.63869,-0.064738


In [4]:
# Preview the first five rows of the LUSC frame.
df_lusc.head(5)

Unnamed: 0,DES,FMO5,HMCES,NEXN,SLC7A5,CNV_6q12,CNV_18q11.2,CNV_12p,CNV_4q22.1,CNV_13q34,...,CNV_19q13.2,CNV_7q21.2,CNV_1q21.2,CNV_3p12.1,CNV_3p11.1,CNV_17p11.2,CNV_13q14.2,CNV_2q22.1,CNV_4p,CNV_4q
0,231.32,77.79,1572.76,159.9,1551.09,1.9427,0.078871,0.0,-0.10211,0.060508,...,0.1428,0.16661,0.033127,-0.12393,-0.13331,0.076422,-0.048229,0.0,-0.102,-0.102
1,81.14,199.85,2287.2,42.41,1745.2,3.6569,-0.13264,2.77,-0.44681,-0.59565,...,1.2459,0.44577,0.84593,-0.4197,-0.4197,1.2089,0.60487,0.11247,-0.447,-0.447
2,27.64,12.55,1340.84,88.26,1561.89,0.12739,3.6569,1.158,-0.25353,-0.31381,...,0.61785,0.33267,0.86192,-0.25245,0.13344,-0.001825,-0.13357,-0.043354,-0.16,-0.16
3,293.07,19.82,1605.83,165.57,4639.29,0.3794,-0.004569,0.661,-0.38649,-0.39919,...,0.80383,0.36395,0.45151,-0.3983,-0.3983,-0.048372,-0.39992,0.02693,-0.386,-0.386
4,95.34,22.1,1388.16,70.51,2271.4,0.42099,0.21115,0.0,-0.47099,-0.5347,...,0.35594,0.61697,0.35463,-0.61964,-1.2929,-0.51208,-0.5348,0.32743,-0.471,-0.471


In [5]:
# List the LUSC column labels. NOTE(review): the output below shows several
# CNV_* labels padded with trailing spaces, so exact-name lookups need the padding.
df_lusc.columns

Index(['DES', 'FMO5', 'HMCES', 'NEXN', 'SLC7A5', 'CNV_6q12    ',
       'CNV_18q11.2 ', 'CNV_12p', 'CNV_4q22.1  ', 'CNV_13q34   ',
       ...
       'CNV_19q13.2 ', 'CNV_7q21.2  ', 'CNV_1q21.2  ', 'CNV_3p12.1  ',
       'CNV_3p11.1  ', 'CNV_17p11.2 ', 'CNV_13q14.2 ', 'CNV_2q22.1  ',
       'CNV_4p', 'CNV_4q'],
      dtype='object', length=107)

In [6]:
def to_x_y(df, n_targets=5):
    """Split a frame column-wise into ``(features, targets)``.

    The first ``n_targets`` columns are treated as the targets and every
    remaining column as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose leading columns are the targets.
    n_targets : int, default 5
        Number of leading columns to return as targets. The default keeps the
        original behaviour of splitting after the fifth column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(features, targets)`` -- note that features come first.

    Raises
    ------
    ValueError
        If ``n_targets`` is negative.
    """
    if n_targets < 0:
        raise ValueError("n_targets must be non-negative")
    # Split by position rather than by label so duplicated column names cannot
    # pull extra columns into either side; copy so callers get independent frames.
    targets = df.iloc[:, :n_targets].copy()
    features = df.iloc[:, n_targets:].copy()
    return features, targets
# Each *_data value is the (features, targets) tuple returned by to_x_y.
kirc_data, lusc_data = to_x_y(df_kirc), to_x_y(df_lusc)


In [7]:
def train_models(data, model_name, *, n_iter=10, cv=5, random_state=42):
    """Fit one tuned Ridge or Lasso regressor per target column.

    For every column of ``targets`` a ``RandomizedSearchCV`` samples ``alpha``
    from 100 log-spaced values between 0.01 and 100 (scored by R^2), and the
    refitted best estimator is kept.

    Parameters
    ----------
    data : tuple
        ``(features, targets)``; ``targets`` must provide ``.items()`` yielding
        ``(name, column)`` pairs (e.g. a DataFrame).
    model_name : {"Ridge", "Lasso"}
        Which estimator family to fit.
    n_iter : int, default 10
        Number of alpha candidates sampled per target.
    cv : int, default 5
        Number of cross-validation folds used by the search.
    random_state : int, default 42
        Seed for the randomized search.

    Returns
    -------
    tuple of (dict, dict)
        ``(models, model_results)``. Both are keyed by
        ``"<ridge|lasso>_model_<target name>"``. ``models`` maps to the best
        fitted estimator; ``model_results`` maps to
        ``{("r2", "mse"): [r2, mse]}``.

    Raises
    ------
    ValueError
        If ``model_name`` is neither ``"Ridge"`` nor ``"Lasso"``.

    Notes
    -----
    * The reported R^2 / MSE are computed on the same rows the final model was
      fitted on, so they are in-sample scores, not held-out estimates.
    * The scaler is fitted on all rows before the cross-validated search runs,
      so the CV folds share scaling statistics.
    """
    # Reject bad names before doing any work; previously this was only checked
    # inside the loop, so it was skipped entirely when there were no targets.
    if model_name not in ("Ridge", "Lasso"):
        raise ValueError("Use 'Ridge' or 'Lasso' for model_name")
    estimator_cls = Ridge if model_name == "Ridge" else Lasso
    key_prefix = model_name.lower() + "_model_"

    features, targets = data
    features = StandardScaler().fit_transform(features)

    # Same candidate grid for both families: alpha from 0.01 to 100, log-spaced.
    param_grid = {'alpha': np.logspace(-2, 2, 100)}

    model_results = {}
    models = {}
    for target_name, target in targets.items():
        search = RandomizedSearchCV(
            estimator_cls(max_iter=10000),
            param_distributions=param_grid,
            n_iter=n_iter,
            cv=cv,
            scoring='r2',
            random_state=random_state,
        )
        search.fit(features, target)
        best_model = search.best_estimator_

        key = f"{key_prefix}{target_name}"
        models[key] = best_model

        # In-sample evaluation: predictions are made on the training features.
        y_pred = best_model.predict(features)
        r2 = r2_score(target, y_pred)
        mse = mean_squared_error(target, y_pred)
        model_results[key] = {("r2","mse"): [r2,mse]}

    return models, model_results

# Fit the KIRC models: one Ridge run and one Lasso run over the same (features, targets) tuple.
# Each call returns (fitted models dict, results dict).
kirc_ridge_models, kirc_ridge_results = train_models(kirc_data, "Ridge")
kirc_lasso_models, kirc_lasso_results = train_models(kirc_data, "Lasso")


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [8]:
# Show the KIRC result dicts (ridge first, then lasso).
# NOTE(review): train_models scores on the rows it was fitted on, so these are in-sample values.
print(kirc_ridge_results)
print(kirc_lasso_results)

{'ridge_model_FBXO22': {('r2', 'mse'): [0.5278038541744479, 12108.118331040767]}, 'ridge_model_AURKC': {('r2', 'mse'): [0.3081628865439239, 60.31244577978992]}, 'ridge_model_AKAP7': {('r2', 'mse'): [0.3853871062078382, 3434.1929408189967]}, 'ridge_model_CCDC158': {('r2', 'mse'): [0.37601873648677553, 338.5599156077676]}, 'ridge_model_ZBTB39': {('r2', 'mse'): [0.49515469345104546, 146.0014927566762]}}
{'lasso_model_FBXO22': {('r2', 'mse'): [0.2925143068506658, 18141.44517252217]}, 'lasso_model_AURKC': {('r2', 'mse'): [0.0, 87.17723378339558]}, 'lasso_model_AKAP7': {('r2', 'mse'): [0.10471155613310323, 5002.487394877579]}, 'lasso_model_CCDC158': {('r2', 'mse'): [0.0, 542.5802590634876]}, 'lasso_model_ZBTB39': {('r2', 'mse'): [0.0, 289.2004557885665]}}


In [9]:
# Fit the LUSC models: one Ridge run and one Lasso run over the same (features, targets) tuple.
lusc_ridge_models, lusc_ridge_results = train_models(lusc_data, "Ridge")
lusc_lasso_models, lusc_lasso_results = train_models(lusc_data, "Lasso") 

In [10]:
# Show the LUSC result dicts (ridge first, then lasso).
# NOTE(review): train_models scores on the rows it was fitted on, so the
# near-1.0 R^2 values below are in-sample and not a held-out estimate.
print(lusc_ridge_results)
print(lusc_lasso_results)

{'ridge_model_DES': {('r2', 'mse'): [0.9999992093904378, 0.006586707294205562]}, 'ridge_model_FMO5': {('r2', 'mse'): [0.9248989791381171, 307.8646042841314]}, 'ridge_model_HMCES': {('r2', 'mse'): [0.9654634919721381, 10174.446298554061]}, 'ridge_model_NEXN': {('r2', 'mse'): [0.9988059709837754, 4.014956796828832]}, 'ridge_model_SLC7A5': {('r2', 'mse'): [0.9398826561063115, 77847.08303096736]}}
{'lasso_model_DES': {('r2', 'mse'): [0.5954810209272261, 3370.1187511129515]}, 'lasso_model_FMO5': {('r2', 'mse'): [0.9135209258064937, 354.50711122030367]}, 'lasso_model_HMCES': {('r2', 'mse'): [0.9999999918551148, 0.0023994810562466685]}, 'lasso_model_NEXN': {('r2', 'mse'): [0.9999974407625158, 0.008605509407394712]}, 'lasso_model_SLC7A5': {('r2', 'mse'): [0.9999999268929504, 0.094667698173142]}}


In [11]:
# Prints the same two KIRC dicts as the earlier print cell, this time lasso first.
print(kirc_lasso_results)
print(kirc_ridge_results)

{'lasso_model_FBXO22': {('r2', 'mse'): [0.2925143068506658, 18141.44517252217]}, 'lasso_model_AURKC': {('r2', 'mse'): [0.0, 87.17723378339558]}, 'lasso_model_AKAP7': {('r2', 'mse'): [0.10471155613310323, 5002.487394877579]}, 'lasso_model_CCDC158': {('r2', 'mse'): [0.0, 542.5802590634876]}, 'lasso_model_ZBTB39': {('r2', 'mse'): [0.0, 289.2004557885665]}}
{'ridge_model_FBXO22': {('r2', 'mse'): [0.5278038541744479, 12108.118331040767]}, 'ridge_model_AURKC': {('r2', 'mse'): [0.3081628865439239, 60.31244577978992]}, 'ridge_model_AKAP7': {('r2', 'mse'): [0.3853871062078382, 3434.1929408189967]}, 'ridge_model_CCDC158': {('r2', 'mse'): [0.37601873648677553, 338.5599156077676]}, 'ridge_model_ZBTB39': {('r2', 'mse'): [0.49515469345104546, 146.0014927566762]}}


In [20]:
def features_selection(features, models_dict):
    """Report, for each fitted model, the features with a non-zero coefficient.

    Parameters
    ----------
    features : array-like of labels
        Feature labels, indexed with the positions of the non-zero
        coefficients (the notebook passes a DataFrame's ``columns``).
    models_dict : dict
        Maps a model name to a fitted estimator exposing ``coef_``.

    Returns
    -------
    dict
        ``{name: {'selected_features': ..., 'coefficients': ...}}`` with one
        entry per model, in the iteration order of ``models_dict``.
    """
    def _nonzero_subset(estimator):
        # np.nonzero yields the tuple of index arrays where coef_ != 0.
        kept = np.nonzero(estimator.coef_)
        return {
            'selected_features': features[kept],
            'coefficients': estimator.coef_[kept],
        }

    return {
        model_key: _nonzero_subset(estimator)
        for model_key, estimator in models_dict.items()
    }
# Run features_selection on every fitted model set, passing the matching feature-column labels.
# NOTE(review): the ridge calls apply the same "coefficient != 0" filter; ridge
# coefficients are presumably rarely exactly zero, so confirm these two are meant as a selection.
lusc_lasso_fs = features_selection(lusc_data[0].columns, lusc_lasso_models)
kirc_lasso_fs = features_selection(kirc_data[0].columns, kirc_lasso_models)
lusc_ridge_fs = features_selection(lusc_data[0].columns, lusc_ridge_models)
kirc_ridge_fs = features_selection(kirc_data[0].columns, kirc_ridge_models)

In [26]:
# Bundle everything per cohort. Each key is a (ridge_label, lasso_label) tuple and
# each value is the matching (ridge_object, lasso_object) tuple, so position 0 is
# always ridge and position 1 is always lasso.
kirc = {
        ("results_ridge","results_lasso") : (kirc_ridge_results, kirc_lasso_results),
        ("models_ridge","models_lasso") : (kirc_ridge_models, kirc_lasso_models),
        ("selected_features_ridge", "selected_features_lasso") : (kirc_ridge_fs, kirc_lasso_fs)
         }

# Same layout as `kirc`, filled with the LUSC objects.
lusc = {        
        ("results_ridge","results_lasso") : (lusc_ridge_results, lusc_lasso_results),
        ("models_ridge","models_lasso") : (lusc_ridge_models, lusc_lasso_models),
        ("selected_features_ridge", "selected_features_lasso") : (lusc_ridge_fs, lusc_lasso_fs)
         }

# Top-level container keyed by cohort name; this is the object pickled later on.
final = {"kirc":kirc, "lusc":lusc}

In [27]:
# Display the nested results container.
final

{'kirc': {('results_ridge',
   'results_lasso'): ({'ridge_model_FBXO22': {('r2',
      'mse'): [0.5278038541744479, 12108.118331040767]},
    'ridge_model_AURKC': {('r2', 'mse'): [0.3081628865439239,
      60.31244577978992]},
    'ridge_model_AKAP7': {('r2', 'mse'): [0.3853871062078382,
      3434.1929408189967]},
    'ridge_model_CCDC158': {('r2', 'mse'): [0.37601873648677553,
      338.5599156077676]},
    'ridge_model_ZBTB39': {('r2', 'mse'): [0.49515469345104546,
      146.0014927566762]}}, {'lasso_model_FBXO22': {('r2',
      'mse'): [0.2925143068506658, 18141.44517252217]},
    'lasso_model_AURKC': {('r2', 'mse'): [0.0, 87.17723378339558]},
    'lasso_model_AKAP7': {('r2', 'mse'): [0.10471155613310323,
      5002.487394877579]},
    'lasso_model_CCDC158': {('r2', 'mse'): [0.0, 542.5802590634876]},
    'lasso_model_ZBTB39': {('r2', 'mse'): [0.0, 289.2004557885665]}}),
  ('models_ridge',
   'models_lasso'): ({'ridge_model_FBXO22': Ridge(alpha=22.570197196339215, max_iter=10000),
 

In [32]:
import pickle
# Output path, relative to the notebook's working directory.
file_path = 'results.pickle'

# Save the dictionary to a file using pickle
# (binary mode is required; the fitted estimator objects inside `final` are serialized too).
with open(file_path, 'wb') as file:
    pickle.dump(final, file)

In [3]:
import pickle

# Reload the results container written by the pickle.dump cell.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load a 'results.pickle' that this notebook (or another trusted source) produced.
with open('results.pickle', 'rb') as pickle_file:
    loaded_dict = pickle.load(pickle_file)

In [4]:
# Display the container restored from 'results.pickle'.
loaded_dict

{'kirc': {('results_ridge',
   'results_lasso'): ({'ridge_model_FBXO22': {('r2',
      'mse'): [0.5278038541744479, 12108.118331040767]},
    'ridge_model_AURKC': {('r2', 'mse'): [0.3081628865439239,
      60.31244577978992]},
    'ridge_model_AKAP7': {('r2', 'mse'): [0.3853871062078382,
      3434.1929408189967]},
    'ridge_model_CCDC158': {('r2', 'mse'): [0.37601873648677553,
      338.5599156077676]},
    'ridge_model_ZBTB39': {('r2', 'mse'): [0.49515469345104546,
      146.0014927566762]}}, {'lasso_model_FBXO22': {('r2',
      'mse'): [0.2925143068506658, 18141.44517252217]},
    'lasso_model_AURKC': {('r2', 'mse'): [0.0, 87.17723378339558]},
    'lasso_model_AKAP7': {('r2', 'mse'): [0.10471155613310323,
      5002.487394877579]},
    'lasso_model_CCDC158': {('r2', 'mse'): [0.0, 542.5802590634876]},
    'lasso_model_ZBTB39': {('r2', 'mse'): [0.0, 289.2004557885665]}}),
  ('models_ridge',
   'models_lasso'): ({'ridge_model_FBXO22': Ridge(alpha=22.570197196339215, max_iter=10000),
 