# Feature selection Techniques

This notebook identifies the most relevant features of the dataset using several feature selection techniques.

## Filter Methods 

Calculate the correlations between the features and target attributes

### Division of the features into two categories 
    - Categorical features (Nominal): albumin - sugar - red_blood_cells  - pus_cell  - pus_cell_clumps  - bacteria  - hypertension  - diabetes_mellitus  - coronary_artery_disease  - appetite  - peda_edema  - aanemia  
 
    - Numerical features (Ordinal) : age  - blood_pressure  - specific_gravity    - blood_glucose_random  - blood_urea  - serum_creatinine  - sodium  - potassium  - hemoglobin  - packed_cell_volume  - white_blood_cell_count  - red_blood_cell_count
    

To divide them, we looked at the values each feature takes in the dataset and at the meaning of the feature.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [None]:
# Tunable parameters for the filter-based feature selection below.

# Feature/feature thresholds: a pair whose correlation exceeds the value is
# treated as redundant.
correlation_numfeature_numfeature=0.7
correlation_catfeature_catfeature=0.7
# NOTE(review): not referenced by any cell visible in this notebook.
correlation_numfeature_catfeature=0.7
# Feature/target thresholds: a feature is kept only when its correlation with
# the target exceeds the value.
correlation_target_numfeature=0.2
correlation_target_catfeature=0.2
# Filled in later with the features that pass the target-correlation filter.
important_num_features=[]
important_cat_features=[]
# How the columns are split into numerical / categorical features:
#   'manually'      -> the user types the type of every column,
#   'automatically' -> split on the number of unique values (threshold below),
#   any other value -> use the hard-coded lists written for this dataset.
how_to_divide_features_num_cat='default' 
# In 'automatically' mode, a column with at most this many unique values is
# considered categorical.
threshold_unique_values_tobe_cat_feature=5

## List of personalized functions used in this notebook:

In [None]:
def numberOfLigne(numberGraphe,numberOfColumn):
    """Return how many subplot rows are needed for a grid of plots.

    Parameters
    ----------
    numberGraphe : number of plots to place on the grid.
    numberOfColumn : number of plots per row.

    Returns
    -------
    int
        ``numberGraphe / numberOfColumn`` rounded up.
    """
    plots_per_row_ratio = numberGraphe / numberOfColumn
    row_count = math.ceil(plots_per_row_ratio)
    return int(row_count)


In [None]:
def detect_correlated_feature_feature(mask,list_important_feature,corr_matrix=None):
    """Extract the strongly correlated feature pairs selected by a boolean mask.

    Parameters
    ----------
    mask : 2-D boolean DataFrame (or array)
        True where a pair of features is considered strongly correlated.
    list_important_feature : sequence of str
        Feature names, positionally aligned with the rows/columns of ``mask``.
    corr_matrix : pandas.DataFrame, optional
        The correlation matrix ``mask`` was computed from. When omitted, the
        notebook-level ``corr`` variable is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    strongly_correlated_features : pandas.DataFrame
        Long-format table with one row per selected cell. With unnamed
        index/columns the label columns are ``level_0`` and ``level_1``,
        which is what ``delete_correlated_feature`` reads.
    selected_pairs : list of tuple
        ``(feature_i, feature_j)`` name pairs for every selected off-diagonal
        cell; a symmetric mask yields each pair in both orders.
    """
    if corr_matrix is None:
        # Backward-compatible fallback on the notebook-level matrix.
        corr_matrix = corr

    # Positions of the True cells; used to build the name pairs for plotting.
    row_positions, col_positions = np.where(mask)
    selected_pairs = [
        (list_important_feature[i], list_important_feature[j])
        for i, j in zip(row_positions, col_positions)
        if i != j
    ]

    # Keep only the masked cells, then flatten to (row label, column label, value).
    strongly_correlated_features = corr_matrix[mask].stack().dropna().reset_index()

    return strongly_correlated_features, selected_pairs

In [None]:
def delete_correlated_feature(strongly_correlated_features,important_features):
    """Drop the redundant member of every strongly correlated feature pair.

    The features are visited in the order given. A feature that has not been
    deleted yet is kept, and every feature strongly correlated with it is
    marked as deleted.

    Parameters
    ----------
    strongly_correlated_features : pandas.DataFrame
        Long-format table with the columns ``level_0`` and ``level_1`` holding
        the two feature names of each strongly correlated pair.
    important_features : sequence of str
        Candidate features, in priority order (first = kept in preference).

    Returns
    -------
    important_features : numpy.ndarray
        The features that remain, in their original order.
    deleted_features : numpy.ndarray
        The removed features, each listed once, in order of removal.
    """
    # Work on the argument itself (not on a notebook-level list) so the
    # function gives the right answer for numerical and categorical features.
    important_features = np.asarray(important_features)

    deleted_order = []
    deleted_lookup = set()
    for f in important_features:
        if f in deleted_lookup:
            continue
        partners = set(
            strongly_correlated_features.loc[
                strongly_correlated_features['level_0'] == f, 'level_1'
            ]
        )
        for candidate in important_features:
            if candidate in partners and candidate not in deleted_lookup:
                deleted_lookup.add(candidate)
                deleted_order.append(candidate)

    keep_mask = np.array([f not in deleted_lookup for f in important_features], dtype=bool)
    deleted_features = np.array(deleted_order, dtype=important_features.dtype)

    return important_features[keep_mask], deleted_features


In [None]:
def transform_unique_paris(selected_pairs):
    """Collapse ``(a, b)`` / ``(b, a)`` duplicates into a list of unique pairs.

    Each pair is sorted so that both orderings map to the same tuple. The
    result keeps the order in which pairs are first seen, so the output (and
    the plots drawn from it) is the same on every run instead of depending on
    set iteration order.

    Parameters
    ----------
    selected_pairs : iterable of 2-item sequences of feature names.

    Returns
    -------
    list of tuple
        Unique, internally sorted pairs.
    """
    normalised_pairs = (tuple(sorted(pair)) for pair in selected_pairs)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(normalised_pairs))

In [None]:
def features_num_cat_manually(columns , target , cat_features,num_features):
    """Ask the user to classify every non-target column as 'cat' or 'num'.

    The question is repeated until the answer is exactly ``cat`` or ``num``.
    The column name is then appended to ``cat_features`` or ``num_features``;
    both lists are modified in place and nothing is returned.
    """
    accepted_answers = ('cat', 'num')
    for col in columns:
        if col == target:
            continue

        type_feature = None
        while type_feature not in accepted_answers:
            print(f'Enter the type of feature {col} by writing cat or num:')
            type_feature = input()

        destination = cat_features if type_feature == 'cat' else num_features
        destination.append(col)

In [None]:
def features_num_cat_automatically(columns , target , cat_features,num_features,Threshold,df=None):
    """Classify columns as categorical or numerical from their unique-value count.

    Parameters
    ----------
    columns : iterable of str
        Column names to classify.
    target : str
        Name of the target column; it is skipped.
    cat_features, num_features : list
        Output lists, extended in place.
    Threshold : int
        A column with at most this many distinct values is categorical,
        otherwise it is numerical.
    df : pandas.DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level ``clean_df``.
    """
    if df is None:
        df = clean_df

    # Iterate over the columns that were passed in, not over every column of
    # the frame, so the caller controls what gets classified.
    for col in columns:
        if col == target:
            continue
        if len(df[col].unique()) <= Threshold:
            cat_features.append(col)
        else:
            num_features.append(col)

In [None]:
def get_name_target(columns):
    """Prompt until the user types a name that is present in ``columns``.

    Returns the accepted name, to be used as the target column.
    """
    target = None
    is_known_column = False
    while not is_known_column:
        print('Donner le nom de la target')
        target = input()
        is_known_column = target in columns
    return target

In [None]:
# Load the cleaned dataset (path is relative to the notebook's working
# directory) and preview its first rows.
clean_df = pd.read_csv(r'Data/clean_dataset.csv')
clean_df.head()

In [None]:
# Split the columns into categorical / numerical features according to the
# mode chosen in the configuration cell.
cat_features=[]
num_features=[]
if how_to_divide_features_num_cat in ('manually', 'automatically'):
    # Both interactive modes start by asking which column is the target.
    target=get_name_target(clean_df.columns)
    if how_to_divide_features_num_cat=='manually':
        features_num_cat_manually(clean_df.columns , target , cat_features,num_features)
    else:
        features_num_cat_automatically(clean_df.columns , target , cat_features,num_features,threshold_unique_values_tobe_cat_feature)
else:
    # Default: hard-coded split written for this dataset.
    target='classification'

    num_features=[
        'age',
        'blood_pressure',
        'specific_gravity',
        'blood_glucose_random',
        'blood_urea',
        'serum_creatinine',
        'sodium',
        'potassium',
        'hemoglobin',
        'packed_cell_volume',
        'white_blood_cell_count',
        'red_blood_cell_count',
    ]

    cat_features=[
        'albumin',
        'sugar',
        'red_blood_cells',
        'pus_cell',
        'pus_cell_clumps',
        'bacteria',
        'hypertension',
        'diabetes_mellitus',
        'coronary_artery_disease',
        'appetite',
        'peda_edema',
        'aanemia',
    ]

In [None]:
# Display the categorical feature names chosen above.
cat_features

In [None]:
# Display the numerical feature names chosen above.
num_features

In [None]:
# Display the name of the target column.
target

In [None]:
# One array holding every feature name: categorical first, then numerical.
all_features = np.concatenate([cat_features, num_features])
all_features

### Variance test
The objective of this test is to identify the constant (zero-variance) features in the dataset.

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Fit a variance filter with a threshold of 0 on every column of the frame.
# get_support() is a boolean mask with True for the columns the selector keeps.
selector = VarianceThreshold(threshold = 0)
selector.fit(clean_df)

print(selector.get_support())

In [None]:
# Columns flagged False by the selector are constant (zero variance).
columns_to_delete =clean_df.columns[np.logical_not(selector.get_support())]

print(columns_to_delete)

clean_df = clean_df.drop(columns=columns_to_delete)

# Keep the feature lists in sync with the frame: a dropped column that stayed
# in these lists would raise a KeyError in every later `clean_df[...]` lookup.
cat_features = [f for f in cat_features if f not in columns_to_delete]
num_features = [f for f in num_features if f not in columns_to_delete]
all_features = np.concatenate((cat_features, num_features), axis=0)

clean_df.head()

In [None]:
# Summarise the variance test.
print('conclusion')
constant_columns_found = len(columns_to_delete) != 0
if constant_columns_found:
    print('columns deleted : ',columns_to_delete)
else:
    print('There is no constant in the project')

### Correlation between numerical features and  the target (categorical variable) 

In [None]:
# Absolute point-biserial correlation (binary categorical target vs. numerical
# feature) for every numerical feature.
target_correlations = []
for feature in num_features:
    pb_corr, pb_p_value = stats.pointbiserialr(clean_df[feature], clean_df[target])
    target_correlations.append([feature, abs(pb_corr)])

corr = pd.DataFrame(target_correlations, columns=['num_feature', 'correlation with the target'])

display(corr.sort_values(by='correlation with the target', ascending=False))

# Keep the features whose correlation exceeds the configured threshold,
# strongest first.
above_threshold = corr['correlation with the target'] > correlation_target_numfeature
features = corr[above_threshold].sort_values(by='correlation with the target', ascending=False)

important_num_features = features['num_feature'].values.copy()




In [None]:
print('conclusion :')
print(f'Important features according to the parametres correlation_target_feature {correlation_target_numfeature}')
print('-----------------------------')
print(important_num_features)
print('-----------------------------')
print(f'features deleted according to the parametres correlation_target_feature {correlation_target_numfeature}')
# Report the exact complement of the "kept" filter (which uses `>`), so that a
# feature sitting exactly on the threshold, or with a NaN correlation, is
# listed as deleted instead of disappearing from both lists.
print(corr[~(corr['correlation with the target']>correlation_target_numfeature)])



In [None]:
# Visual check of the result: one boxplot per retained numerical feature,
# split by the target class.

numberGrapheColumn=2
numberGrapheLigne =numberOfLigne(len(important_num_features),numberGrapheColumn)

plt.figure(figsize = (20, 25))

for plotnumber, feature in enumerate(important_num_features, start=1):
    ax = plt.subplot(numberGrapheLigne, numberGrapheColumn, plotnumber)
    sns.boxplot(x=target, y=feature, data=clean_df)

plt.show()

# The plots can be used to confirm the correlation results above.

### Correlation between numerical features (Pearson)

In [None]:
# Pairwise correlation matrix (DataFrame.corr with its default method) of the
# numerical features that passed the target filter, drawn as an annotated
# heatmap. `corr` is also read as a notebook-level variable by
# detect_correlated_feature_feature.
corr=clean_df[important_num_features].corr()

plt.figure(figsize=(16, 6))
sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Flag the pairs whose absolute correlation exceeds the threshold. The
# diagonal (a feature with itself) is excluded explicitly; filtering on
# `abs(corr) < 1` would also throw away genuinely duplicated features whose
# correlation is exactly 1.
off_diagonal = ~np.eye(len(corr), dtype=bool)
mask = (abs(corr) > correlation_numfeature_numfeature) & off_diagonal

strongly_correlated_features,selected_pairs=detect_correlated_feature_feature(mask,important_num_features)


display(strongly_correlated_features)

print(selected_pairs)


In [None]:
# Numerical features still in play before the redundancy filter is applied.
important_num_features

In [None]:
# Remove the redundant numerical features when at least one strongly
# correlated pair was found.
if selected_pairs:
    kept_and_deleted = delete_correlated_feature(strongly_correlated_features,important_num_features)
    important_num_features, deleted_features = kept_and_deleted
    print(deleted_features)
    print(important_num_features)

In [None]:
# Summarise the numerical feature/feature correlation step.
print('Conclusion')
if not selected_pairs:
    print('There is no correlation')
else:
    separator = '------------------------'
    print(separator)
    print("feature deleted due to high correlation ",deleted_features)
    print(separator)
    print("numerical feature:",important_num_features)

In [None]:
# Show the (feature, feature) name pairs flagged as strongly correlated.
# A symmetric mask yields each pair in both orders at this stage.

selected_pairs

In [None]:
# Collapse (a, b) / (b, a) duplicates so each correlated pair is listed once.
unique_pairs_list=transform_unique_paris(selected_pairs)

print(unique_pairs_list)

In [None]:
# Visual check of the result: one scatter plot per strongly correlated pair
# of numerical features. Relies on numberGrapheColumn set in an earlier cell.

numberGrapheLigne =numberOfLigne(len(unique_pairs_list),numberGrapheColumn)

plt.figure(figsize = (15, 7))

for plotnumber, features in enumerate(unique_pairs_list, start=1):
    feature_x, feature_y = features[0], features[1]
    ax = plt.subplot(numberGrapheLigne, numberGrapheColumn, plotnumber)
    sns.scatterplot(data=clean_df, x=feature_x,y=feature_y)

plt.show()


# The plots can be used to confirm the correlation results above.

### Correlation between categorical features and the categorical target

In [None]:
def carmerV(clean_df,col1,col2,correction=True):
    """Bias-corrected Cramér's V between two categorical columns.

    Parameters
    ----------
    clean_df : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the two categorical columns.
    correction : bool, default True
        Forwarded to ``scipy.stats.chi2_contingency``. The default keeps
        SciPy's continuity correction, which only affects 2x2 tables.

    Returns
    -------
    float
        Cramér's V. Returns ``0.0`` when the statistic is undefined: fewer
        than two observations, or a column with a single category. Those
        cases previously produced NaN through a division by zero.
    """
    contingency_table = pd.crosstab(clean_df[col1], clean_df[col2])

    n = contingency_table.sum().sum()
    r, k = contingency_table.shape
    # A one-row or one-column table carries no association information.
    if n <= 1 or min(r, k) < 2:
        return 0.0

    chi2 = stats.chi2_contingency(contingency_table, correction=correction)[0]
    phi2 = chi2 / n

    # Subtract the expected bias from phi^2 and from the table dimensions
    # before forming the ratio.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1)**2) / (n - 1)
    kcorr = k - ((k - 1)**2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # Happens when a column has as many categories as there are rows.
        return 0.0

    V = np.sqrt(phi2corr / denominator)

    return V

In [None]:
# Cramér's V between every categorical feature and the target, strongest first.
corr = pd.DataFrame(
    [[cat_feature, carmerV(clean_df, cat_feature, target)] for cat_feature in cat_features],
    columns=['cat_feature', 'correlation with the target'],
)

corr.sort_values(by='correlation with the target', ascending=False)

In [None]:
# Keep the categorical features whose correlation with the target exceeds the
# configured threshold, strongest first.
above_threshold = corr['correlation with the target'] > correlation_target_catfeature
features = corr[above_threshold].sort_values(by='correlation with the target', ascending=False)

important_cat_features = features['cat_feature'].values.copy()

important_cat_features

In [None]:
print('conclusion :')
print(f'Important features according to the parametres correlation_target_feature {correlation_target_catfeature}')
print('-----------------------------')
print(important_cat_features)
print('-----------------------------')
print(f'features deleted according to the parametres correlation_target_feature {correlation_target_catfeature}')
# Report the exact complement of the "kept" filter (which uses `>`), so that a
# feature sitting exactly on the threshold, or with a NaN correlation, is
# listed as deleted instead of disappearing from both lists.
print(corr[~(corr['correlation with the target']>correlation_target_catfeature)])


### Correlation between categorical features 

In [None]:
# Square matrix of Cramér's V values between every pair of categorical features.
cramers_results = np.array([
    [carmerV(clean_df, f1, f2) for f2 in cat_features]
    for f1 in cat_features
])
corr_cat_features = pd.DataFrame(cramers_results, columns = cat_features, index =cat_features)
corr_cat_features

In [None]:
# Categorical features still in play before the redundancy filter is applied.
important_cat_features

In [None]:
# Restrict the Cramér's V matrix to the categorical features retained so far,
# so that the positions found in the mask line up with important_cat_features.
# detect_correlated_feature_feature reads the notebook-level `corr` when it
# builds its result table, so `corr` must hold the matrix the mask is computed
# from. It still held the feature/target table at this point; re-run the
# feature/target cells before re-reading that table.
corr = corr_cat_features.loc[important_cat_features, important_cat_features]

# Exclude the diagonal explicitly instead of relying on a `< 0.97` cut-off,
# which also hid genuinely redundant pairs with a very high Cramér's V.
off_diagonal = ~np.eye(len(corr), dtype=bool)
mask = (corr > correlation_catfeature_catfeature) & off_diagonal

strongly_correlated_features, selected_pairs=detect_correlated_feature_feature(mask,important_cat_features)

display(strongly_correlated_features)

print(selected_pairs)

In [None]:
# Remove the redundant categorical features when at least one strongly
# correlated pair was found.
if selected_pairs:
    kept_and_deleted = delete_correlated_feature(strongly_correlated_features,important_cat_features)
    important_cat_features, deleted_features = kept_and_deleted

    print(important_cat_features)

    print(deleted_features)

In [None]:
# Summarise the categorical feature/feature correlation step.
print('Conclusion')
if len(selected_pairs)>0:
    print('------------------------')
    print(f"feature deleted due to high correlation ",deleted_features)
    print('------------------------')
    # This list holds the categorical features; the label used to say
    # "numerical feature:" (copied from the numerical section).
    print(f"categorical feature:",important_cat_features)
else:
    print('There is no correlation')

In [None]:
# Visual check: one scatter plot per strongly correlated pair of categorical
# features. Relies on numberGrapheColumn set in an earlier cell.
if selected_pairs:

    unique_pairs_list=transform_unique_paris(selected_pairs)

    print(unique_pairs_list)

    numberGrapheLigne =numberOfLigne(len(unique_pairs_list),numberGrapheColumn)

    plt.figure(figsize = (15, 7))

    for plotnumber, features in enumerate(unique_pairs_list, start=1):
        feature_x, feature_y = features[0], features[1]
        ax = plt.subplot(numberGrapheLigne, numberGrapheColumn, plotnumber)
        sns.scatterplot(data=clean_df, x=feature_x,y=feature_y)

    plt.show()



### Correlation between categorical features and num features  

In [None]:
from sklearn.feature_selection import f_classif



# ANOVA F-test (f_classif) of every retained numerical feature against every
# retained categorical feature, with a boxplot for each pair that is printed.
plt.figure(figsize = (10,200))
plotnumber=1


for cat_feature in  important_cat_features:
    # The categorical feature plays the role of the class label here.
    f_scores, p_values = f_classif(clean_df[important_num_features],clean_df[cat_feature])
    
    # Print the results for the current categorical feature
    for j, feature in enumerate(important_num_features):
        # NOTE(review): only pairs with p > 0.05 are reported, i.e. pairs where
        # the F-test finds no significant relationship - confirm this is the
        # intended direction. correlation_numfeature_catfeature is not used here.
        if p_values[j]>0.05 :
            print(f" {feature} and {cat_feature}: F-score = {f_scores[j]}, p-value = {p_values[j]}")
            # The grid reserves one row per (categorical, numerical) combination.
            ax = plt.subplot(len(important_cat_features)*len(important_num_features), 1, plotnumber)
            sns.boxplot(x=cat_feature, y=feature, data=clean_df)
            plotnumber+=1
plt.show()

### Store the important features in a JSON file

In [None]:
# Write the list of selected features to a JSON file
import json

# Numerical features first, then categorical, as a plain Python list.
important_features = np.concatenate((important_num_features, important_cat_features), axis=0).tolist()
print(important_features)
# Open the file in write mode
with open('important_features.json', 'w') as file:
    # Write the list to the file in JSON format
    json.dump(important_features, file)

## Feature importance using tree model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on all features (categorical + numerical) and read its
# feature importances; random_state is fixed so the run is repeatable.
rf = RandomForestClassifier(n_estimators=100,
                                  random_state=0)

rf.fit(clean_df[all_features], clean_df[target])

# One importance value per entry of all_features, in the same order.
feature_importances = rf.feature_importances_

feature_importances

In [None]:
# Order the features from most to least important.
indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[indices]
sorted_feature_names = [all_features[position] for position in indices]

# Bar chart of the sorted importances.
bar_positions = range(len(sorted_feature_importances))
plt.figure(figsize=(8, 6))
plt.bar(bar_positions, sorted_feature_importances)
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.title('Feature Importances')
plt.tight_layout()
plt.show()

## Feature importance using Permutation Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance



# Hold out a validation split, fit a random forest on the training part and
# measure permutation importance on the validation part.
# RandomForestClassifier is imported in an earlier cell of this notebook.
train_X, val_X, train_y, val_y = train_test_split(clean_df[all_features], clean_df[target], random_state=1)
my_model = RandomForestClassifier(n_estimators=100,
                                  random_state=0).fit(train_X, train_y)


perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
# Last expression of the cell: renders the weights table.
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

## Feature importance using  Coefficients in  Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Coefficient magnitudes are only comparable between features that share the
# same scale, so standardise the features before fitting. Scaling also helps
# the solver converge; max_iter is raised above the default as a safety margin.
X_scaled = StandardScaler().fit_transform(clean_df[all_features])

logreg = LogisticRegression(max_iter=1000)

logreg.fit(X_scaled, clean_df[target])

# coef_[0] is the single coefficient row of a binary problem
# (assumes the target has two classes - TODO confirm).
coefficients = logreg.coef_[0]

# Get the absolute feature importance
feature_importances = np.abs(coefficients)

# Sort feature importances in descending order
sorted_idx = np.argsort(feature_importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(8, 6))
plt.bar(range(len(sorted_idx)), feature_importances[sorted_idx])
plt.xticks(range(len(sorted_idx)), [clean_df[all_features].columns[i] for i in sorted_idx], rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.title('Feature Importances (Logistic Regression)')
plt.tight_layout()
plt.show()

## Represent the data in another format and check whether it improves the results of the machine learning algorithms

### PCA representation

In [None]:
from sklearn.decomposition import PCA

# Keep as many components as needed to explain 99% of the variance.
pca= PCA (n_components=0.99,whiten=True)

# Fit on the features only. The target column must not take part in the
# projection, otherwise the label leaks into the components that are later
# saved next to it as a modelling dataset.
# NOTE(review): PCA is scale-sensitive; confirm the cleaned data is already
# standardised, otherwise the widest-ranging columns dominate the components.
pca_data=pca.fit_transform(clean_df.drop(columns=[target]))
variance_explained =pca.explained_variance_ratio_
print('The variance added by each component')
print(variance_explained)
print('number of PCA is ',len(variance_explained))

In [None]:
# Running total of the variance explained as components are added.
cumulative_variance_ratio = np.cumsum(variance_explained)

# Plot the cumulative explained variance ratio against the component count.
component_counts = range(1, len(cumulative_variance_ratio) + 1)
plt.plot(component_counts, cumulative_variance_ratio, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Variance Explained by Principal Components')
plt.show()

In [None]:
# First two principal components, coloured by the target class.
scatter = sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=clean_df[target], palette='viridis')

# Manually set the legend title and labels. Assigning a list of strings to
# `legend_.texts` does not change what is drawn; the existing Text objects
# have to be updated in place.
# NOTE(review): assumes the legend entries come in the order ckd, notckd -
# confirm against the encoding of the target column.
scatter.legend_.set_title('classification')
for legend_text, label in zip(scatter.legend_.get_texts(), ['ckd', 'notckd']):
    legend_text.set_text(label)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Scatter Plot with 2 PC')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

from matplotlib.colors import ListedColormap

# Create the 3D axes and attach them to the figure explicitly
fig = plt.figure(figsize=(6,6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# Two-colour colormap built from a seaborn palette
cmap = ListedColormap(sns.color_palette("husl", 2).as_hex())

# Scatter of the first three principal components, coloured by the target.
# Indexing column 2 requires the PCA above to have kept at least 3 components.
sc = ax.scatter(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2], s=40, c=clean_df[target], marker='o', cmap=cmap, alpha=1)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('Scatter Plot with 3 PC')


# Legend built from the scatter's colour classes, placed outside the axes
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)


In [None]:
# Column labels PC1, PC2, ... - one per retained principal component.
name_columns_pc = [f'PC{index + 1}' for index in range(len(variance_explained))]
name_columns_pc

In [None]:
# Principal components as a frame, with the target appended as the last column.
pc_frame = pd.DataFrame(pca_data, columns=name_columns_pc)
df_pca = pd.concat([pc_frame, clean_df[target]], axis=1)
df_pca.head()

In [None]:
# Save the PCA representation (components + target) to a CSV file, without the index
df_pca.to_csv('Data/pca_dataset.csv', index=False)

### LDA representation

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

# Fit LDA on the full dataset (no train/test split is made in this cell) and
# project the features onto the discriminant axis/axes
X_lda=lda.fit_transform(clean_df[all_features], clean_df[target])

# Total number of values in the projection (rows x components)
X_lda.size

In [None]:
# LDA projection on the x axis against the class value on the y axis,
# coloured by class.
# NOTE(review): plotting X_lda directly presumes a single LDA component - confirm.
plt.scatter(X_lda, clean_df[target], c=clean_df[target], cmap='viridis')
plt.xlabel('LDA Component')
plt.ylabel('Class')
plt.title('Scatter Plot of LDA-Transformed Data')
plt.colorbar(label='Class')
plt.show()

In [None]:
# Column labels LDA1, LDA2, ... - one per entry of explained_variance_ratio_.
lda_component_count = len(lda.explained_variance_ratio_)
name_columns_lda = [f'LDA{index + 1}' for index in range(lda_component_count)]
name_columns_lda

In [None]:
# LDA projection as a frame, with the target appended as the last column.
lda_frame = pd.DataFrame(X_lda, columns=name_columns_lda)
df_lda = pd.concat([lda_frame, clean_df[target]], axis=1)
df_lda.head()

In [None]:
# Save the LDA representation (components + target) to a CSV file, without the index
df_lda.to_csv('Data/lda_dataset.csv', index=False)

## TO DO:
- Automate the feature selection based on correlation and on feature importance (reorganise the code and create a JSON file that contains the values)
- Automate the division of the dataset according to the variation (this should be easy)
