### Objective
### This file contains the definitions of the functions employed.

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.linalg import svd
#from sklearn.decomposition import PCA

from sklearn.cluster import DBSCAN

import sklearn.cluster as cluster

import math

In [2]:
#fct ploting feature variables of a dataframe

def fct_show_raw_visualization(
    data,
    feature_keys,
    titles,
    colors,
    v_cols,
    v_name_dataframe):
    """Plot each requested feature of ``data`` in its own subplot.

    The subplots are laid out row by row on a grid that is ``v_cols``
    columns wide; grid cells that have no feature to show are removed.

    Parameters
    ----------
    data : mapping of key -> plottable column
        Indexed with each entry of ``feature_keys``; every value must expose
        a ``.plot(ax=..., color=..., title=..., rot=...)`` method
        (presumably a pandas DataFrame - confirm against callers).
    feature_keys : sequence
        Keys of the columns to plot, one subplot per key.
    titles : sequence
        Title/legend text for each feature, aligned with ``feature_keys``.
    colors : sequence
        Line colour for each feature, aligned with ``feature_keys``.
    v_cols : int
        Number of subplot columns.
    v_name_dataframe : object
        Appended (via ``str``) to the figure title.

    Returns
    -------
    None
        The figure is built on the current pyplot state; ``plt.show()`` is
        not called here.
    """
    nb_features = len(feature_keys)
    v_nb_rows = math.ceil(nb_features / v_cols)

    # squeeze=False keeps ``axes`` two-dimensional even when the grid has a
    # single row or a single column, so [row, col] indexing always works.
    fig, axes = plt.subplots(
        nrows=v_nb_rows,
        ncols=v_cols,
        figsize=(15, 20),
        dpi=80,
        facecolor="w",
        edgecolor="k",
        squeeze=False,
    )

    fig.suptitle('Features dataframe '+str(v_name_dataframe))

    # Remove every grid cell left over after the last feature (the original
    # code removed only the cell in column 1 of the last row).
    for spare in range(nb_features, v_nb_rows * v_cols):
        fig.delaxes(axes[spare // v_cols, spare % v_cols])

    for i, key in enumerate(feature_keys):
        # Row-major placement driven by v_cols rather than a hard-coded 2.
        ax = data[key].plot(
            ax=axes[i // v_cols, i % v_cols],
            color=colors[i],
            title="{} - {}".format(titles[i], key),
            rot=25,
        )
        ax.legend([titles[i]])
    plt.tight_layout()

In [3]:
#function creating a heatmap plot from a dataframe

def fct_show_feature_heatmap(data,val_name_dataframe):
    """Display the correlation matrix of ``data`` as a colour-coded image.

    ``data.corr()`` is drawn with ``plt.matshow``; both axes are labelled
    with ``data.columns`` and a colour bar is added. ``val_name_dataframe``
    is appended (via ``str``) to the plot title. The figure is shown
    immediately and nothing is returned.
    """
    column_labels = data.columns
    tick_positions = range(data.shape[1])
    label_style = {"fontsize": 12, "color": "b"}

    plt.matshow(data.corr())

    # x labels are rotated and moved below the image so they do not collide
    # with the title drawn above it.
    plt.xticks(tick_positions, column_labels, rotation=90, **label_style)
    plt.gca().xaxis.tick_bottom()
    plt.yticks(tick_positions, column_labels, **label_style)

    colorbar = plt.colorbar()
    colorbar.ax.tick_params(labelsize=12)

    plt.title("Feature Correlation Heatmap-"+str(val_name_dataframe), fontsize=10,color="b")
    plt.show()




In [4]:
#fct creating a correlation coefficient matrix from a dataframe
def fct_show_cor_coef_matrix(val_data,val_name_dataframe):
    """Show an annotated seaborn heatmap of the correlation coefficients.

    Only the diagonal and the lower triangle of ``val_data.corr()`` are
    drawn; the strictly-upper triangle is masked because it mirrors the
    lower one.

    Parameters
    ----------
    val_data : object exposing ``.corr()``
        Presumably a pandas DataFrame - confirm against callers.
    val_name_dataframe : object
        Appended (via ``str``) to the plot title.

    Notes
    -----
    ``sns.set(font_scale=1)`` changes seaborn's global theme, so it also
    affects figures drawn after this call. The figure is shown immediately
    and nothing is returned.
    """
    plt.figure(figsize=(14,7))

    # Compute the correlation matrix once and reuse it for mask and heatmap.
    corr_matrix = val_data.corr()

    # k=1 selects the strictly-upper triangle, leaving the diagonal visible.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

    # Scale all fonts in the legend and on the axes.
    sns.set(font_scale=1)

    heatmap = sns.heatmap(
        corr_matrix,
        vmin=-1,
        vmax=1,
        annot=True,
        annot_kws={'fontsize': 8},
        cmap="PiYG",
        mask=mask,
    )

    heatmap.set_title(
        'Correlation coeffcient matrix-'+str(val_name_dataframe),
        fontdict={'fontsize':15},
        pad=12,
        c="darkgreen",
    )

    plt.yticks(rotation=30,fontsize=8,c="darkgreen")
    plt.xticks(rotation=20,fontsize=8,c="darkgreen")

    plt.show()

In [5]:
#fct standardizing the dataframe val_df
def fct_stand_dataframe(val_df):
    """Return ``val_df`` standardized column by column.

    Each column has its own mean (``val_df.mean(axis=0)``) subtracted and is
    then divided by its own standard deviation (``val_df.std(axis=0)``).
    The input is not modified.
    """
    column_means = val_df.mean(axis=0)
    column_stds = val_df.std(axis=0)
    return (val_df - column_means) / column_stds

In [6]:
#function decomposing a matrix X (normalized dataframe) into matrices U,S,Vt
#left singular vector, singular values, right singular vector
#it returns matrices 
#U left singular vector
#s=array with singular values
#Vt=right singular vector(transpose)
#S=diagonal matrix with singular values along diagonal

#X_stand=standardized dataframe  X_stand= (X-mean)/std of X
#requires previously
#from scipy.linalg import svd and 
#import numpy as np

def fct_svd_decomposition(X_stand):
    """Factor ``X_stand`` with a reduced singular value decomposition.

    Returns
    -------
    tuple
        ``(U, s, Vt, S)`` where ``U``, ``s`` and ``Vt`` are exactly what
        ``scipy.linalg.svd(X_stand, full_matrices=False)`` returns and ``S``
        is ``np.diag(s)``, i.e. the singular values on a diagonal matrix.
    """
    left_vectors, singular_values, right_vectors_t = svd(
        X_stand, full_matrices=False
    )
    diagonal_matrix = np.diag(singular_values)
    return left_vectors, singular_values, right_vectors_t, diagonal_matrix

In [7]:
#fct verifying the svd decomposition
#it returns True/False depending on whether the product
#U @ S @ Vt gives a matrix close to the initial matrix that was decomposed

#val_initial_matrix_to_decompose= the initial standardized dataframe 
#val_component_S=diagonal matrix with the singular values along the diagonal
def fct_verify_svd_decomposition(
        val_initial_matrix_to_decompose,
        val_component_U,
        val_component_S,
        val_component_Vt):
    """Check that ``U @ S @ Vt`` reproduces the matrix that was decomposed.

    Returns the result of ``np.allclose`` (default tolerances) between
    ``val_initial_matrix_to_decompose`` and the product of the three
    factors. ``val_component_S`` must be the diagonal matrix of singular
    values, not the 1-D array.
    """
    # Transposing Vt twice, as the earlier version did, yields Vt again,
    # so the factor is used directly.
    reconstructed = val_component_U @ val_component_S @ val_component_Vt
    return np.allclose(val_initial_matrix_to_decompose, reconstructed)
    
    
    
    

In [8]:
#function projecting data into a lower dimension space
#it returns the projected dataframe
def fct_components_projection_to_lower_dim_space(
        val_component_U,
        val_component_S,
        val_new_data_dimension):
    """Project the decomposed data onto its first principal directions.

    The first ``val_new_data_dimension`` columns of ``val_component_U`` are
    multiplied by the matching leading square block of ``val_component_S``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``val_component_U`` and columns named
        ``PC1`` ... ``PC<val_new_data_dimension>``.
    """
    nb_kept = val_new_data_dimension

    column_names = [
        "PC{}".format(rank) for rank in range(1, int(nb_kept + 1))
    ]

    scores = val_component_U[:, :nb_kept] @ val_component_S[:nb_kept, :nb_kept]

    return pd.DataFrame(scores, columns=column_names)
    
  

In [9]:
#function returning  the explained variance and 
#the cummulative explained  variance
#when considering each possible  number of features (data dimension)

#val_array_singular_values=array with all the singular values

def fct_variance_and_cum_explained_variance(val_array_singular_values):
    """Return the explained-variance ratios and their running total.

    The variance captured along a singular direction is proportional to the
    *square* of its singular value, so the ratio for component ``i`` is
    ``s_i**2 / sum(s_j**2)``. The earlier version divided the raw singular
    values by their sum, which understates the share of the leading
    components.

    Parameters
    ----------
    val_array_singular_values : array-like of float
        All singular values of the decomposed matrix.

    Returns
    -------
    var_explained : list of float
        Explained-variance ratio of each component, in input order.
    cum_var_explained : numpy.ndarray
        Cumulative sum of ``var_explained``.
    """
    squared_values = np.square(
        np.asarray(val_array_singular_values, dtype=float)
    )

    # Kept as a list to preserve the original return type.
    var_explained = list(squared_values / squared_values.sum())

    cum_var_explained = np.cumsum(var_explained)

    return var_explained, cum_var_explained
    

In [10]:
#function computing and ploting 
#the  explained variance and the cumulative explained variance
#it returns the explained variance, cumulative explained variance
#and shows the plots

def fct_compute_and_plot_sing_values_and_cum_variance_explained(
        val_array_singular_values, val_name_dataframe):
    """Compute and plot the explained variance and its cumulative sum.

    Both quantities come from ``fct_variance_and_cum_explained_variance``.
    They are drawn side by side against the 1-based index of the singular
    value: the per-component ratio on the left and the cumulative curve, as
    a step plot, on the right.

    Parameters
    ----------
    val_array_singular_values : numpy.ndarray
        Array of singular values; its ``shape[0]`` sets the x-axis length.
    val_name_dataframe : object
        Appended (via ``str``) to the figure title.

    Returns
    -------
    tuple
        ``(var_explained, cum_var_explained)`` exactly as returned by the
        helper.

    Notes
    -----
    The two ``sns.set`` calls change seaborn's global theme and therefore
    affect figures drawn afterwards.
    """
    var_explained, cum_var_explained = fct_variance_and_cum_explained_variance(
        val_array_singular_values=val_array_singular_values
    )

    # x axis: 1, 2, ..., number of singular values
    nb_values = val_array_singular_values.shape[0]
    x = np.arange(1, nb_values + 1)

    sns.set(style="darkgrid")
    sns.set(rc={'axes.facecolor':'lavender', 'figure.facecolor':'seashell'})

    fig, (ax_ratio, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle('Standardized Dataframe '+str(val_name_dataframe),color="purple")

    sns.lineplot(
        ax=ax_ratio,
        x=x,
        y=var_explained,
        marker='o',
        color="purple",
        linewidth=2,
    )
    sns.lineplot(
        ax=ax_cumulative,
        x=x,
        y=cum_var_explained,
        marker='*',
        color="magenta",
        linewidth=2,
        drawstyle='steps-pre',
    )

    ax_ratio.set_title("Explained Variance per Singular Value.",c="purple")
    ax_ratio.set_xlabel('Id Singular Value',fontsize=12,c="purple")
    ax_ratio.set_ylabel('Explained Variance Ratio',fontsize=12,color="purple")

    ax_cumulative.set_title("Cummulative Explained Variance.",c="purple")
    ax_cumulative.set_xlabel('Id Singular Value',fontsize=12,c="purple")
    ax_cumulative.set_ylabel('Cumulative Explained Variance',fontsize=12,color="purple")

    plt.show()

    return var_explained, cum_var_explained
    
    

In [11]:
#function defining the space dimension (number of principal components, PCs, to employ)
#from the desired retained (explained) variance

#val_desired_explained_variance= the desired explained variance
def fct_define_nb_pcs_from_explained_variance(
        val_desired_explained_variance,
        val_cum_variance_explained):
    """Return how many leading components reach the desired variance.

    The result is the smallest ``k`` such that the cumulative explained
    variance of the first ``k`` components is at least
    ``val_desired_explained_variance``.

    The earlier version returned the number of components whose cumulative
    variance is strictly *below* the target. That is one component short of
    reaching it, and is 0 when the first component alone already exceeds it.

    Parameters
    ----------
    val_desired_explained_variance : float
        Target cumulative explained variance.
    val_cum_variance_explained : array-like of float
        Cumulative explained variance, one entry per component.

    Returns
    -------
    int
        Number of components to keep, never larger than the number of
        entries in ``val_cum_variance_explained``.
    """
    cumulative = np.asarray(val_cum_variance_explained)

    nb_below_target = int((cumulative < val_desired_explained_variance).sum())

    # One more component is needed to reach the target. The cap covers
    # targets that are never reached, e.g. a target of 1.0 against a final
    # cumulative value of 0.9999999 caused by rounding.
    return min(nb_below_target + 1, int(cumulative.size))

In [12]:
#https://vitalflux.com/pca-explained-variance-concept-python-example/

#function returning the dataset  projection 
#into the directions capturing the greatest variance

def fct_data_projected_in_lower_dimension_space(
        val_initial_standardized_matrix_to_decompose,
        val_desire_explained_variance,
        val_name_standardized_dataframe):
    """Project a standardized dataset onto its leading singular directions.

    Steps:

    1. factor the matrix with ``fct_svd_decomposition``;
    2. check the factorization with ``fct_verify_svd_decomposition``;
    3. compute and plot the (cumulative) explained variance;
    4. derive the number of components from the desired explained variance;
    5. build the projected dataframe.

    Parameters
    ----------
    val_initial_standardized_matrix_to_decompose : array-like
        The standardized data to decompose.
    val_desire_explained_variance : float
        Cumulative explained variance used to pick the new dimension.
    val_name_standardized_dataframe : object
        Name shown in the title of the explained-variance figure.

    Returns
    -------
    pandas.DataFrame
        The projection returned by
        ``fct_components_projection_to_lower_dim_space``.

    Raises
    ------
    SystemExit
        If the SVD factors do not reproduce the input matrix. The exit now
        carries an error message (non-zero status); the earlier bare
        ``sys.exit()`` reported success.
    """
    # matrix factorization
    v_component_U, v_s, v_component_Vt, v_component_S = fct_svd_decomposition(
        X_stand=val_initial_standardized_matrix_to_decompose
    )

    # decomposition verification
    rep_verification = fct_verify_svd_decomposition(
        val_initial_matrix_to_decompose=val_initial_standardized_matrix_to_decompose,
        val_component_U=v_component_U,
        val_component_S=v_component_S,
        val_component_Vt=v_component_Vt,
    )

    # The same message was printed in both branches; print it once.
    print("response svd decomposition verification: ",rep_verification)
    if not rep_verification:
        import sys
        sys.exit(
            "SVD verification failed: U @ S @ Vt does not reproduce "
            "the initial matrix"
        )

    # explained variance per direction and its cumulative sum (also plotted)
    var_explained, cum_var_explained = (
        fct_compute_and_plot_sing_values_and_cum_variance_explained(
            val_array_singular_values=v_s,
            val_name_dataframe=val_name_standardized_dataframe,
        )
    )

    # number of principal components derived from the desired variance
    val_new_space_dimension = fct_define_nb_pcs_from_explained_variance(
        val_desired_explained_variance=val_desire_explained_variance,
        val_cum_variance_explained=cum_var_explained,
    )

    print("New (PC space) dimension: ", val_new_space_dimension)

    # projection of the data into the lower-dimension space
    df_projected = fct_components_projection_to_lower_dim_space(
        val_component_U=v_component_U,
        val_component_S=v_component_S,
        val_new_data_dimension=val_new_space_dimension,
    )

    return df_projected


    

In [13]:
#function returning a list with the inertias for a list 
#of values corresponding to numbers of clusters

#it also returns a dict: key=number of clusters, value=fitted kmeans object

#val_type_init_kmeans_obj="random" for the KMeans algorithm
#val_type_init_kmeans_obj='k-means++' for the KMeans++ algorithm

def fct_compute_inertia_for_list_of_values(
        val_name_dataframe,
        val_list_number_clusters,
        val_type_init_kmeans_obj,
        val_verbose_inertia):
    """Fit one KMeans model per requested cluster count.

    Parameters
    ----------
    val_name_dataframe : array-like
        The data passed to ``KMeans.fit`` (despite the name, this is the
        data itself, not a label).
    val_list_number_clusters : iterable of int
        Cluster counts to try.
    val_type_init_kmeans_obj : str
        Forwarded as ``init`` to ``KMeans``.
    val_verbose_inertia : int
        Forwarded as ``verbose`` to ``KMeans``.

    Returns
    -------
    li_inertias : list
        ``inertia_`` of each fitted model, in the order of
        ``val_list_number_clusters``.
    di_kmeans_obj : dict
        Maps each cluster count to its fitted ``KMeans`` object.

    Notes
    -----
    Every model uses ``n_init="auto"`` and ``random_state=42``.
    """
    di_kmeans_obj = {}
    li_inertias = []

    for nb_clusters in val_list_number_clusters:
        model = cluster.KMeans(
            n_clusters=nb_clusters,
            init=val_type_init_kmeans_obj,
            n_init="auto",
            verbose=val_verbose_inertia,
            random_state=42,
        )
        fitted_model = model.fit(val_name_dataframe)

        di_kmeans_obj[nb_clusters] = fitted_model
        li_inertias.append(fitted_model.inertia_)

    return li_inertias, di_kmeans_obj

In [14]:
#function ploting a list of values
def fct_plot_li_vals(
        val_x,
        val_y,
        val_color,
        val_title_x_label="Number of clusters",
        val_title_y_label="Inertia",
        val_title="K-means random initialization, Inertia Score by n cluster centers"):
    """Plot ``val_y`` against ``val_x`` as a dashed line with round markers.

    Parameters
    ----------
    val_x, val_y : sequence
        Coordinates of the points to draw.
    val_color : colour spec
        Used for the line, the title and both axis labels.
    val_title_x_label, val_title_y_label, val_title : str
        Axis labels and figure title.

    Notes
    -----
    The x ticks are placed on the values of ``val_x``. The earlier version
    always used 1..10, which mislabelled the axis for any other range. The
    figure is shown and then closed; nothing is returned.
    """
    fig = plt.figure(figsize=(14,8))
    plt.plot(val_x, val_y, '--o',color=val_color)
    plt.xticks(list(val_x))
    plt.title(val_title,color=val_color)
    plt.xlabel(val_title_x_label,color=val_color)
    plt.ylabel(val_title_y_label,color=val_color)
    plt.show()
    plt.close(fig)

In [15]:
#fct computing the clusters for a list of possible
#numbers of centroids according to the KMeans or KMeans++ algorithm
#and plotting the inertia

#it returns the dict of the kmeans objects where
#key=number of clusters, value=fitted kmeans object
def fct_define_clusters_and_plot_inertia(
        va_name_dataframe,
        va_list_number_clusters,
        va_type_init_kmeans_obj,
        va_color,
        va_verbose_inertia,
        va_title_x_label="Number of clusters",
        va_title_y_label="Inertia",
        va_title="K-means random initialization, Inertia Score by n cluster centers"):
    """Fit KMeans for several cluster counts and plot the inertia curve.

    Delegates the fitting to ``fct_compute_inertia_for_list_of_values`` and
    the drawing to ``fct_plot_li_vals``.

    Returns
    -------
    dict
        Maps each cluster count of ``va_list_number_clusters`` to the
        kmeans object produced for it.
    """
    # one inertia per requested number of clusters, plus the fitted models
    inertias, kmeans_by_nb_clusters = fct_compute_inertia_for_list_of_values(
        val_name_dataframe=va_name_dataframe,
        val_list_number_clusters=va_list_number_clusters,
        val_type_init_kmeans_obj=va_type_init_kmeans_obj,
        val_verbose_inertia=va_verbose_inertia,
    )

    # inertia versus number of clusters
    fct_plot_li_vals(
        val_x=va_list_number_clusters,
        val_y=inertias,
        val_color=va_color,
        val_title_x_label=va_title_x_label,
        val_title_y_label=va_title_y_label,
        val_title=va_title,
    )

    return kmeans_by_nb_clusters
    

In [16]:
#https://stackoverflow.com/questions/50297142/get-cluster-points-after-kmeans-in-a-list-format

#function creating a dict: key= id cluster, value=id points in the cluster
def fct_point_sets_per_cluster(val_cluster_object):
    """Group the sample positions of a fitted clustering by cluster label.

    Parameters
    ----------
    val_cluster_object : object
        Must expose ``labels_`` (one label per sample). If it also exposes
        ``n_clusters``, the labels ``0 .. n_clusters-1`` are used as keys,
        as before. Otherwise the distinct values found in ``labels_`` are
        used, so objects without ``n_clusters`` are supported too.

    Returns
    -------
    dict
        Maps each cluster id to a numpy array with the positions (row
        numbers) of the samples carrying that label.
    """
    labels = np.asarray(val_cluster_object.labels_)

    nb_clusters = getattr(val_cluster_object, "n_clusters", None)
    if nb_clusters is not None:
        cluster_ids = range(nb_clusters)
    else:
        # No declared cluster count: fall back on the labels actually seen.
        cluster_ids = [int(label) for label in np.unique(labels)]

    return {
        cluster_id: np.where(labels == cluster_id)[0]
        for cluster_id in cluster_ids
    }
    

In [17]:
#fct returning a dict  with the dataframes 
#containing the samples belonging to each cluster

#val_di_points_per_cluster=dict, key=id cluster
#value=array with the sample ids in the cluster

#val_dataframe_with_all_samples= the dataframe with all the samples
#the dataframe should be idexed by the sample id 
def fct_create_dict_dataframes_with_cluster_points_from_dict_pts_per_cluster(
        val_di_points_per_cluster,
        val_dataframe_with_all_samples):
    """Build one dataframe per cluster from the sample positions.

    Parameters
    ----------
    val_di_points_per_cluster : dict
        Maps a cluster id to an iterable of row positions.
    val_dataframe_with_all_samples : object exposing ``.iloc``
        The dataframe holding all samples; rows are selected by position.

    Returns
    -------
    dict
        Maps each cluster id to the rows of the dataframe selected with
        ``.iloc`` for that cluster.
    """
    return {
        cluster_id: val_dataframe_with_all_samples.iloc[list(sample_positions)]
        for cluster_id, sample_positions in val_di_points_per_cluster.items()
    }

In [18]:
#fct returning a dict  with the dataframes 
#containing the samples belonging to each cluster
#from the cluster object

def fct_create_dict_dataframes_with_cluster_points_from_cluster_object(
        val_cluster_object,
        val_df_with_all_samples):
    """Return, for every cluster, the dataframe of the samples it contains.

    The sample positions per cluster come from
    ``fct_point_sets_per_cluster``. They are then turned into dataframes by
    ``fct_create_dict_dataframes_with_cluster_points_from_dict_pts_per_cluster``.

    Returns
    -------
    dict
        Maps each cluster id to the dataframe with the samples of that
        cluster.
    """
    # key=id cluster, value=array with the sample positions in the cluster
    positions_per_cluster = fct_point_sets_per_cluster(val_cluster_object)

    # key=id cluster, value=dataframe with the samples in the cluster
    return fct_create_dict_dataframes_with_cluster_points_from_dict_pts_per_cluster(
        val_di_points_per_cluster=positions_per_cluster,
        val_dataframe_with_all_samples=val_df_with_all_samples,
    )
    
    

In [19]:
#fct creating a dict with the dataframes 
#comprised of the samples belonging to each cluster
#(when the number of desired clusters is already selected)
#and saving (pickling) the dictionary to a file

#val_name_dict_dfs_pickled=the path of the file in which the dict
#with the dataframes is pickled
def fct_create_and_save_in_memory_dict_dfs_with_cluster_samples_from_cluster_object(
        val_cluster_obj,
        val_dataframe_with_all_samples,
        val_name_dict_dfs_pickled):
    """Build the per-cluster dataframes and pickle them to a file.

    Parameters
    ----------
    val_cluster_obj : object
        Cluster object forwarded to
        ``fct_create_dict_dataframes_with_cluster_points_from_cluster_object``.
    val_dataframe_with_all_samples : object
        The dataframe holding all samples.
    val_name_dict_dfs_pickled : path-like
        File opened in binary write mode; it receives the pickled dict.

    Returns
    -------
    None
    """
    # key=id cluster, value=dataframe with the samples in the cluster
    dataframes_per_cluster = (
        fct_create_dict_dataframes_with_cluster_points_from_cluster_object(
            val_cluster_object=val_cluster_obj,
            val_df_with_all_samples=val_dataframe_with_all_samples,
        )
    )

    with open(val_name_dict_dfs_pickled, 'wb') as pickle_file:
        pickle.dump(dataframes_per_cluster, pickle_file)

In [20]:
#fct creating a dataframe with the coefficients of the best estimator
#in an descending order

#we consider that the gridsearchcv object has 
#a pipeline with the steps 
#[('poly_features', PolynomialFeatures(include_bias=False)),
#('scaler', StandardScaler()),
#('model', ...)]

#val_gridsearch_fit_object: afit gridsearch object

def fct_create_sorted_df_regarding_coef_decreasing_order(
        val_gridsearch_fit_object,
        val_step_model="model"):
    """Return the best estimator's coefficients sorted in decreasing order.

    Parameters
    ----------
    val_gridsearch_fit_object : object
        A fitted grid-search object. The code reads
        ``best_estimator_.steps[0][1]`` and expects that object to be
        indexable with ``0`` (feature generator exposing
        ``get_feature_names_out()``) and with ``val_step_model`` (model
        exposing ``coef_``).
        NOTE(review): the surrounding comments describe a flat
        poly/scaler/model pipeline, but this access pattern implies a
        nested pipeline in the first step - confirm.
    val_step_model : str, default "model"
        Name of the model step inside the nested pipeline.

    Returns
    -------
    pandas.DataFrame
        A single-row dataframe with one column per feature, columns ordered
        from the largest to the smallest coefficient value.
    """
    inner_pipeline = val_gridsearch_fit_object.best_estimator_.steps[0][1]

    # the names of the columns (features)
    v_names_cols = inner_pipeline[0].get_feature_names_out()

    # reshape(1, -1) lays the coefficients out as one row whether coef_ is
    # 1-D, a column or a single row. The earlier reshape(1, shape[0])
    # failed on a single-row (1, n) array.
    coef_row = np.asarray(inner_pipeline[val_step_model].coef_).reshape(1, -1)

    df = pd.DataFrame(data=coef_row, columns=v_names_cols)

    # reorder the columns by decreasing coefficient value
    return df[df.sum().sort_values(ascending=False).index]

In [21]:
#function creating a dataframe with the non zero coefficients
#per feature resulting from Lasso algorithm used as feature selector


#val_name_feature_selector=the name the feature selector (steps)  in the pipleline
#e.g. when pipeline is lasso_feature_selector_with_Ridge_model_pipe
#then val_name_feature_selector ='feature_selector_l2



def fct_create_df_with_Lasso_coefs_used_as_feature_selector(
        val_name_fit_grisearchcv_obj,
        val_name_feature_selector='feature_selector_l2'):
    """Return the non-zero coefficients of the feature-selector estimator.

    NOTE: a function with the same name (taking an extra ``val_round``
    argument) is defined again later in this file; once that later
    definition has run it replaces this one.

    Parameters
    ----------
    val_name_fit_grisearchcv_obj : object
        Fitted grid-search object. The coefficients are read from
        ``best_estimator_.named_steps['estim']
        .named_steps[val_name_feature_selector].estimator_.coef_`` and the
        feature names from
        ``best_estimator_.steps[0][1][0].get_feature_names_out()``.
    val_name_feature_selector : str
        Name of the selector step inside the ``'estim'`` pipeline.

    Returns
    -------
    pandas.DataFrame
        A single-row dataframe restricted to the features whose coefficient
        is not exactly zero.
    """
    best_estimator = val_name_fit_grisearchcv_obj.best_estimator_

    # selector coefficients, zeros included, laid out as a single row
    selector_step = best_estimator.named_steps['estim'].named_steps[
        val_name_feature_selector
    ]
    raw_coefs = selector_step.estimator_.coef_
    coef_row = raw_coefs.reshape(1, raw_coefs.shape[0])

    # names of the generated features
    feature_names = best_estimator.steps[0][1][0].get_feature_names_out()

    coef_frame = pd.DataFrame(data=coef_row, columns=feature_names)

    # a column is dropped when every one of its values equals zero
    is_all_zero = (coef_frame == 0).sum() == coef_frame.shape[0]
    zero_columns = is_all_zero[is_all_zero].index.tolist()

    return coef_frame.drop(columns=zero_columns)

In [22]:
#function creating barplot from a dataframe

#val_df=dataframe to plot
def fct_create_barplot(
        val_df_with_absolute_vals,
        val_title="RIDGE FEATURE IMPORTANCE",
        val_xlabel="Feature",
        val_ylabel="Ridge Coefficient"):
    """Draw ``val_df_with_absolute_vals`` as a bar plot.

    Parameters
    ----------
    val_df_with_absolute_vals : object exposing ``.plot(kind="bar", ...)``
        The dataframe to plot (callers in this file pass absolute
        coefficient values).
    val_title, val_xlabel, val_ylabel : str
        Title and axis labels of the figure.

    Notes
    -----
    ``sns.set_style('darkgrid')`` changes seaborn's global style. The
    figure is shown and then closed; nothing is returned.
    """
    sns.set_style('darkgrid')

    text_style = {"c": "navy", "fontsize": 22, "weight": 'bold'}

    axes = val_df_with_absolute_vals.plot(kind="bar", figsize = (22, 14))
    axes.set_title(val_title, **text_style)
    axes.set_xlabel(val_xlabel, **text_style)
    axes.set_ylabel(val_ylabel, **text_style)

    plt.show()
    plt.close()

In [23]:
#function creating a dataframe with the non zero coefficients
#per feature resulting from Lasso algorithm used as feature selector


#val_name_feature_selector=the name the feature selector (steps)  in the pipleline
#e.g. when pipeline is lasso_feature_selector_with_Ridge_model_pipe
#then val_name_feature_selector ='feature_selector_l2



def fct_create_df_with_Lasso_coefs_used_as_feature_selector(
        val_name_fit_grisearchcv_obj,
        val_name_feature_selector='feature_selector_l2',
        val_round=4):
    """Return the selector coefficients that stay non-zero after rounding.

    Parameters
    ----------
    val_name_fit_grisearchcv_obj : object
        Fitted grid-search object. The coefficients are read from
        ``best_estimator_.named_steps['estim']
        .named_steps[val_name_feature_selector].estimator_.coef_`` and the
        feature names from
        ``best_estimator_.steps[0][1][0].get_feature_names_out()``.
    val_name_feature_selector : str
        Name of the selector step inside the ``'estim'`` pipeline.
    val_round : int, default 4
        Number of decimals the coefficients are rounded to before the zero
        test.

    Returns
    -------
    pandas.DataFrame
        A single-row dataframe with the *rounded* coefficients, restricted
        to the features whose rounded coefficient is not zero.
    """
    best_estimator = val_name_fit_grisearchcv_obj.best_estimator_

    # selector coefficients, zeros included, laid out as a single row
    selector_step = best_estimator.named_steps['estim'].named_steps[
        val_name_feature_selector
    ]
    raw_coefs = selector_step.estimator_.coef_
    coef_row = raw_coefs.reshape(1, raw_coefs.shape[0])

    # names of the generated features
    feature_names = best_estimator.steps[0][1][0].get_feature_names_out()

    rounded = pd.DataFrame(data=coef_row, columns=feature_names).round(val_round)

    # number of zero entries per column; any column containing a zero goes
    nb_zeros_per_column = (rounded == 0).sum(axis=0)
    columns_to_drop = nb_zeros_per_column[nb_zeros_per_column != 0].index.tolist()

    return rounded.drop(columns=columns_to_drop)

In [24]:
#function creating a dataframe
#with the Ridge (used as model) coefficients resulting from
# the pipeline
#Lasso feature selector and Ridge regression model


#it returns a dataframe with the Ridge coefficients per column

#val_fit_gridsearchcv_object=fit gridsearchcv object

#val_df_with_coefs_from_lasso_selector=thh dataframe
#with the coefficients of the selected features by the Lasso algorithm
#used as selector
#these coefficients are ≠ zero


def fct_create_df_LR_or_Ridge_coefs_from_pipe_using_lasso_feature_selector(
        val_fit_gridsearchcv_object,
        val_df_with_coefs_from_lasso_selector):
    """Return the model coefficients labelled with the selected features.

    Parameters
    ----------
    val_fit_gridsearchcv_object : object
        Fitted grid-search object; the coefficients are read from
        ``best_estimator_.named_steps["estim"]["model"].coef_``.
    val_df_with_coefs_from_lasso_selector : pandas.DataFrame
        Dataframe whose columns are the features kept by the selector. Only
        its column labels are used here.

    Returns
    -------
    pandas.DataFrame
        A single-row dataframe with one model coefficient per selected
        feature.

    Raises
    ------
    ValueError
        Raised by pandas when the number of model coefficients differs from
        the number of selected-feature columns.
    """
    coef_model = val_fit_gridsearchcv_object.best_estimator_.named_steps[
        "estim"
    ]["model"].coef_

    # reshape(1, -1) lays the coefficients out as one row whether coef_ is
    # 1-D, a column or a single row. The earlier reshape(1, shape[0])
    # failed on a single-row (1, n) array.
    coef_model_reshaped = np.asarray(coef_model).reshape(1, -1)

    # dataframe with the coefficients per selected feature
    df_model = pd.DataFrame(
        data=coef_model_reshaped,
        columns=val_df_with_coefs_from_lasso_selector.columns,
    )

    return df_model

In [25]:
#function which treats and plots the resuts
#of the pipeline
#Lasso feature selector and Ridge regression model
#that is the pipeline
#lasso_feature_selector_with_Ridge_model_pipe

#it returns a dataframe with the sorted absolute values 
#of the Ridge or LR coefs 
#in a decreasing order
#when features are selected with 
#Lasso algo
# and
#a dateframe with the

#val_round=the number of decimals used to round the coefficients from the Lasso selector
#the value depends on the data:
#if a coefficient is so small that its leading decimals are all zero, the following
#algorithm (Ridge) may treat it as zero and remove the variable.
#Consequently the number of features selected by Lasso will be
#greater than the number of features utilized by Ridge and the 
#function fct_create_df_LR_or_Ridge_coefs_from_pipe_using_lasso_feature_selector
#will raise an error when it builds the dataframe with the Ridge coefficients
def fct_plot_and_treat_LR_or_Ridge_results_when_lasso_feature_selector(
        val_fit_gridsearchcv_obj,
        val_name_feature_selector_lasso='feature_selector_l2',
        val_title_fig="FEATURE IMPORTANCE",
        val_xlabel_fig="Feature",
        val_ylabel_fig="Coefficient",
        val_round=4):
    """Plot and sort the model coefficients of a selector + model pipeline.

    Steps:

    1. collect the non-zero selector coefficients with
       ``fct_create_df_with_Lasso_coefs_used_as_feature_selector``;
    2. label the model coefficients with those features via
       ``fct_create_df_LR_or_Ridge_coefs_from_pipe_using_lasso_feature_selector``;
    3. draw the absolute values as a bar plot;
    4. return the coefficients sorted in decreasing order.

    Parameters
    ----------
    val_fit_gridsearchcv_obj : object
        Fitted grid-search object.
    val_name_feature_selector_lasso : str
        Name of the selector step, forwarded to step 1.
    val_title_fig, val_xlabel_fig, val_ylabel_fig : str
        Title and axis labels of the bar plot.
    val_round : int
        Rounding precision forwarded to step 1.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(absolute values sorted decreasingly, signed values sorted
        decreasingly)``.
    """
    # non-zero coefficients produced by the selector
    selector_coefs = fct_create_df_with_Lasso_coefs_used_as_feature_selector(
        val_name_fit_grisearchcv_obj=val_fit_gridsearchcv_obj,
        val_name_feature_selector=val_name_feature_selector_lasso,
        val_round=val_round,
    )

    # model coefficients, one column per selected feature
    model_coefs = fct_create_df_LR_or_Ridge_coefs_from_pipe_using_lasso_feature_selector(
        val_fit_gridsearchcv_object=val_fit_gridsearchcv_obj,
        val_df_with_coefs_from_lasso_selector=selector_coefs,
    )

    abs_model_coefs = model_coefs.abs()

    # bar plot of the absolute coefficient values
    fct_create_barplot(
        val_df_with_absolute_vals=abs_model_coefs,
        val_title=val_title_fig,
        val_xlabel=val_xlabel_fig,
        val_ylabel=val_ylabel_fig,
    )

    # column orders, from the largest to the smallest value
    order_by_magnitude = abs_model_coefs.sum().sort_values(ascending=False).index
    order_by_value = model_coefs.sum().sort_values(ascending=False).index

    return abs_model_coefs[order_by_magnitude], model_coefs[order_by_value]
                                                                           
                                                                          

In [26]:
#fct creating a dataframe with the coefficients of the best estimator
#in an descending order for a pipeline without feature selection

#it returns a sorted dataframe with the absolute values 
#of the (feature) coefficients in an decreasing order
#and
#a sorted dataframe with the values 
#of the (feature) coefficients in an decreasing order

#we consider that the gridsearchcv object has 
#a pipeline with the steps 
#[('poly_features', PolynomialFeatures(include_bias=False)),
#('scaler', StandardScaler()),
#('model', ...)]

#val_gridsearch_fit_object: afit gridsearch object

def fct_create_sorted_df_regarding_model_coef_no_feature_selection_decreasing_order(
        val_gridsearch_fit_object,
        val_step_model="model"):
    """Return the best estimator's coefficients sorted two ways.

    Parameters
    ----------
    val_gridsearch_fit_object : object
        A fitted grid-search object. The code reads
        ``best_estimator_.steps[0][1]`` and expects that object to be
        indexable with ``0`` (feature generator exposing
        ``get_feature_names_out()``) and with ``val_step_model`` (model
        exposing ``coef_``).
        NOTE(review): the surrounding comments describe a flat
        poly/scaler/model pipeline, but this access pattern implies a
        nested pipeline in the first step - confirm.
    val_step_model : str, default "model"
        Name of the model step inside the nested pipeline.

    Returns
    -------
    df1_abs_sorted : pandas.DataFrame
        Single-row dataframe of absolute coefficient values, columns
        ordered by decreasing magnitude.
    df1_sorted : pandas.DataFrame
        Single-row dataframe of signed coefficient values, columns ordered
        by decreasing value.
    """
    inner_pipeline = val_gridsearch_fit_object.best_estimator_.steps[0][1]

    # the names of the columns (features)
    v_names_cols = inner_pipeline[0].get_feature_names_out()

    # reshape(1, -1) lays the coefficients out as one row whether coef_ is
    # 1-D, a column or a single row. The earlier reshape(1, shape[0])
    # failed on a single-row (1, n) array.
    coef_row = np.asarray(inner_pipeline[val_step_model].coef_).reshape(1, -1)

    df = pd.DataFrame(data=coef_row, columns=v_names_cols)
    df_abs = df.abs()

    df1_abs_sorted = df_abs[df_abs.sum().sort_values(ascending=False).index]
    df1_sorted = df[df.sum().sort_values(ascending=False).index]

    return df1_abs_sorted, df1_sorted
                                                    

In [27]:
#fct creating and ploting a dataframe with the  feature coefficients
#when no feature selection is employed 
#it returns 
#it returns a sorted dataframe with the absolute values 
#of the (feature) coefficients in an decreasing order
#and
#a sorted dataframe with the values 
#of the (feature) coefficients in an decreasing order

def fct_plot_and_treat_coefficients_model_without_feature_selection(
        val_gridsearchcv_fit_object,
        val_step_model="model",
        val_title_fig="FEATURE IMPORTANCE",
        val_xlabel_fig="Feature",
        val_ylabel_fig="Ridge Coefficient"):
    """Sort and plot the feature coefficients of a pipeline without selector.

    Parameters
    ----------
    val_gridsearchcv_fit_object : object
        Fitted grid-search object.
    val_step_model : str, default "model"
        Name of the model step. It is now forwarded to the helper; the
        earlier version ignored it and always used ``"model"``.
    val_title_fig, val_xlabel_fig, val_ylabel_fig : str
        Title and axis labels of the bar plot.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(absolute values sorted decreasingly, signed values sorted
        decreasingly)``, as returned by
        ``fct_create_sorted_df_regarding_model_coef_no_feature_selection_decreasing_order``.
    """
    # absolute and signed coefficient values, both in decreasing order
    df_abs_sorted, df_sorted = (
        fct_create_sorted_df_regarding_model_coef_no_feature_selection_decreasing_order(
            val_gridsearch_fit_object=val_gridsearchcv_fit_object,
            val_step_model=val_step_model,
        )
    )

    # bar plot of the absolute values of the coefficients
    fct_create_barplot(
        val_df_with_absolute_vals=df_abs_sorted,
        val_title=val_title_fig,
        val_xlabel=val_xlabel_fig,
        val_ylabel=val_ylabel_fig,
    )

    return df_abs_sorted, df_sorted

In [28]:
#it returns a dictionary
#key=the value of min_samples
#value=list with the number of clusters found for each epsilon

#and a dict with the dbscan objects per min_samples and epsilon
#key=value of min_samples, value=dict, key=value of epsilon, value=dbscan object

#TODO: A DICT WITH THE DATAFRAMES PER VALUE OF MIN_SAMPLES SHOULD BE ADDED
#EACH DATAFRAME WILL HAVE THE SAMPLES FOR EACH CLUSTER CORRESPONDING
#TO THE VALUE OF KEY EPSILON
#THIS WILL BE USED TO FIND THE MOST IMPORTANT FEATURES OF EACH CLUSTER
#CREATED BY DBSCAN JUST LIKE WE DID WITH THE OTHER UNSUPERVISED ALGOS
#KMEANS AND KMEANS++ (TO BE DONE)

def fct_create_dbscan_clusters(
        val_array_epsilons,
        val_li_samples,
        val_dataframe):
    """Fit DBSCAN for every (min_samples, epsilon) combination.

    Parameters
    ----------
    val_array_epsilons : iterable of float
        Values forwarded as ``eps``.
    val_li_samples : iterable of int
        Values forwarded as ``min_samples``.
    val_dataframe : array-like
        The data passed to ``DBSCAN.fit``.

    Returns
    -------
    di : dict
        Maps each ``min_samples`` value to a list holding the number of
        clusters found for each epsilon, in the order of
        ``val_array_epsilons``.
    di_1 : dict
        Maps each ``min_samples`` value to a dict that maps each epsilon to
        the fitted DBSCAN object.

    Notes
    -----
    DBSCAN labels noise samples ``-1``. That label is excluded from the
    cluster count; the earlier version counted it as one more cluster.
    """
    # key=min_samples, value=[..., nb clusters for the ith epsilon, ...]
    di = {}

    # key=min_samples, value=dict(key=epsilon, value=dbscan object)
    di_1 = {}

    for nb_min_samples in val_li_samples:
        di[nb_min_samples] = []
        di_1[nb_min_samples] = {}

        for epsilon in val_array_epsilons:
            dbscan = cluster.DBSCAN(
                eps=epsilon, min_samples=nb_min_samples
            ).fit(val_dataframe)

            labels = np.asarray(dbscan.labels_)
            # -1 marks noise, not a cluster
            n_clusters = len(np.unique(labels[labels != -1]))

            di[nb_min_samples].append(n_clusters)
            di_1[nb_min_samples][epsilon] = dbscan

    return di, di_1

In [29]:
#val_di=dict
#key=nb samples
#value=[..., nb clusters for the ith epsilon,.....]
def fct_plot_clusters_versus_epsilon(val_di,val_array_epsilons):
    """Plot the number of clusters against epsilon, one curve per key.

    Parameters
    ----------
    val_di : dict
        Maps a number of samples to the list of cluster counts obtained for
        each epsilon.
    val_array_epsilons : sequence
        The epsilon values, used as x coordinates for every curve.

    Notes
    -----
    All curves are drawn on one figure and the figure is shown once. The
    earlier version called ``plt.show()`` inside the loop, so only the
    first curve landed on the sized figure and the others were split
    across separate figures.
    """
    plt.figure(figsize=(14,7))

    for nb_samples, nb_clusters_per_epsilon in val_di.items():
        plt.plot(
            val_array_epsilons,
            nb_clusters_per_epsilon,
            label="nb samples: "+str(nb_samples),
        )

    plt.xlabel('Epsilon')
    plt.ylabel('Number of Clusters')
    plt.title("How the Number of Clusters varies with parameter epsilon ")
    # a legend without any curve only triggers a warning
    if val_di:
        plt.legend()
    plt.show()
        
        
        
        #plt.close(fig)

In [30]:
def fct_create_and_plot_dbscan_clusters(
        val_array_epsilons,
        val_li_samples,
        val_dataframe):
    """Run DBSCAN over a parameter grid and plot cluster counts vs epsilon.

    The fitting is delegated to ``fct_create_dbscan_clusters`` and the
    drawing to ``fct_plot_clusters_versus_epsilon``.

    Returns
    -------
    tuple
        The two dictionaries returned by ``fct_create_dbscan_clusters``:
        the cluster counts per number of samples, and the dbscan objects
        per number of samples and epsilon.
    """
    nb_clusters_per_samples, dbscan_objects = fct_create_dbscan_clusters(
        val_array_epsilons=val_array_epsilons,
        val_li_samples=val_li_samples,
        val_dataframe=val_dataframe,
    )

    # number of clusters per value of the parameter epsilon
    fct_plot_clusters_versus_epsilon(
        val_di=nb_clusters_per_samples,
        val_array_epsilons=val_array_epsilons,
    )

    return nb_clusters_per_samples, dbscan_objects