# Feature selection Techniques

This notebook identifies the most relevant features by applying several feature selection techniques.

## Filter Methods 

Calculate the correlations between the features and target attributes

### Division of the features into two categories
    - Categorical features (Nominal): albumin - sugar - red_blood_cells  - pus_cell  - pus_cell_clumps  - bacteria  - hypertension  - diabetes_mellitus  - coronary_artery_disease  - appetite  - peda_edema  - aanemia  
 
    - Numerical features (Ordinal) : age  - blood_pressure  - specific_gravity    - blood_glucose_random  - blood_urea  - serum_creatinine  - sodium  - potassium  - hemoglobin  - packed_cell_volume  - white_blood_cell_count  - red_blood_cell_count
    

To split them, we inspected the values each feature takes in the dataset and the meaning behind the feature.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned dataset from the project's Data folder and preview its first rows.
clean_df = pd.read_csv(r'Data/clean_dataset.csv')
clean_df.head()

In [None]:
# Name of the column used as the prediction target throughout the notebook.
target='classification'

In [None]:
# Columns treated as categorical (nominal) in the analyses below.
cat_features = [
    'albumin',
    'sugar',
    'red_blood_cells',
    'pus_cell',
    'pus_cell_clumps',
    'bacteria',
    'hypertension',
    'diabetes_mellitus',
    'coronary_artery_disease',
    'appetite',
    'peda_edema',
    'aanemia',
]

In [None]:
# Columns treated as numerical in the analyses below.
num_features = [
    'age',
    'blood_pressure',
    'specific_gravity',
    'blood_glucose_random',
    'blood_urea',
    'serum_creatinine',
    'sodium',
    'potassium',
    'hemoglobin',
    'packed_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
]

In [None]:
# One array holding every feature name: categorical names first, numerical names after.
features = np.concatenate([cat_features, num_features])
features

In [None]:
# Total number of features across the numerical and categorical groups.
len(num_features)+len(cat_features)

### Variance test
The objective of this test is to detect constant (zero-variance) columns in the dataset

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Fit a zero-variance filter on every column of the dataframe.
# In the printed mask, a False entry marks a column whose variance does not exceed the threshold.
selector = VarianceThreshold(threshold=0).fit(clean_df)

print(selector.get_support())

Conclusion:
- There are no constant columns in the dataset

### Correlation between numerical features (Pearson)

In [None]:
# Pairwise Pearson correlation matrix of the numerical features.
corr = clean_df.loc[:, num_features].corr(method='pearson')

# Annotated heatmap of the matrix.
plt.figure(figsize=(16, 6))
sns.heatmap(data=corr, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Feature pairs with |r| above 0.7 are considered strongly correlated.
correlation_threshold=0.7

# Drop the self-correlations by masking the diagonal explicitly. Filtering on
# `abs(corr) < 1` would also discard two distinct features that are perfectly
# correlated, and it depends on the diagonal being exactly 1.0 in floating point.
off_diagonal = ~np.eye(len(corr), dtype=bool)
mask = (corr.abs() > correlation_threshold) & off_diagonal

# Long format: one row per ordered pair, with columns 'level_0', 'level_1' and 0 (the correlation).
strongly_correlated_features = corr[mask].stack().dropna().reset_index()

# Every feature that takes part in at least one strongly correlated pair.
selected_features = set(strongly_correlated_features['level_0']).union(strongly_correlated_features['level_1'])

strongly_correlated_features

In [None]:
# Display the set of features that appear in at least one strongly correlated pair.
selected_features

In [None]:
def maximizeCorrelation(corr,selected_features):
    """Pick the feature whose strong correlations add up to the largest total.

    Parameters
    ----------
    corr : pandas.DataFrame
        Long-format correlation table with a 'level_0' column (feature name)
        and a column labelled ``0`` (correlation with the paired feature).
    selected_features : iterable of str
        Candidate feature names to compare.

    Returns
    -------
    list
        ``[best_feature, total]`` where ``total`` is the sum of the absolute
        correlations of ``best_feature``. ``['', 0]`` when no candidate has a
        positive total.
    """
    best_feature_tokeep = ''
    best_total = 0  # avoids shadowing the builtin `max`
    for feature in selected_features:
        # Correlation strength is |r|: a strong negative correlation must count
        # as much as a strong positive one instead of cancelling it out.
        total = corr.loc[corr['level_0'] == feature, 0].abs().sum()
        if total > best_total:
            best_total = total
            best_feature_tokeep = feature
    return [best_feature_tokeep, best_total]

In [None]:
# Returns [feature, summed correlation] for the strongly correlated feature with the largest total.
maximizeCorrelation(strongly_correlated_features,selected_features)

In [None]:
# Evaluates ceil(k! / 4), where k is the number of strongly correlated features.
# NOTE(review): the purpose of this figure is not evident from the notebook — confirm or remove.
math.ceil(math.factorial(len(selected_features)) /4)

In [None]:
import warnings

# Visual check of the Pearson results: plot hemoglobin against
#   - packed_cell_volume and red_blood_cell_count (found strongly correlated),
#   - potassium and age (found not correlated).
# The subset is stored under its own name: reusing `features` here would silently
# shrink the feature list that the modelling cells further down rely on.
hemoglobin_check_features = ['packed_cell_volume','red_blood_cell_count','potassium','age']

fig, axes = plt.subplots(2, 2, figsize=(15, 7))

for ax, feature in zip(axes.ravel(), hemoglobin_check_features):
    sns.scatterplot(data=clean_df, x='hemoglobin', y=feature, ax=ax)

plt.show()

# The plots confirm the correlation results.

#### Conclusion:
- We can delete 'packed_cell_volume' and 'red_blood_cell_count' features and keep the 'hemoglobin' feature, as it maximizes the desired outcome.
- We need to first check the correlation of these features with the target variable before deciding whether to delete them.

### Correlation between numerical features and  the target (categorical variable) 

In [None]:
# Point-biserial correlation is used between a binary categorical variable and a numerical one.
# Only the absolute value of the coefficient is kept for each numerical feature.
corr = pd.DataFrame(
    [
        [feature, abs(stats.pointbiserialr(clean_df[feature], clean_df[target])[0])]
        for feature in num_features
    ],
    columns=['num_feature', 'correlation with the target'],
)

corr.sort_values(by='correlation with the target', ascending=False)

In [None]:
# Visual check of the correlations with the target: box plots per class for
#   - hemoglobin, specific_gravity, packed_cell_volume (highly correlated with the target),
#   - potassium, white_blood_cell_count, age (weakly correlated with the target).
# The subset is stored under its own name: reusing `features` here would silently
# shrink the feature list that the modelling cells further down rely on.
target_check_features = ['hemoglobin','specific_gravity','packed_cell_volume','potassium','white_blood_cell_count','age']

fig, axes = plt.subplots(3, 2, figsize=(20, 15))

for ax, feature in zip(axes.ravel(), target_check_features):
    sns.boxplot(x=target, y=feature, data=clean_df, ax=ax)

plt.show()

# The plots confirm the correlation results.

Conclusion:
- The most important features are : hemoglobin, specific_gravity ,packed_cell_volume ,red_blood_cell_count
- feature to delete : potassium

### Correlation between categorical features and the categorical target

In [None]:
def carmerV(clean_df,col1,col2):
    """Bias-corrected Cramér's V between two categorical columns.

    Parameters
    ----------
    clean_df : pandas.DataFrame
        Table holding both columns.
    col1, col2 : str
        Names of the two columns to compare.

    Returns
    -------
    float
        Strength of association, 0 meaning none. Returns 0.0 for degenerate
        tables (fewer than two observations, or a column with a single level),
        where the statistic is undefined.
    """
    contingency_table = pd.crosstab(clean_df[col1], clean_df[col2])
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape

    # With a single level or a single observation the formula divides by zero.
    if n < 2 or min(r, k) < 2:
        return 0.0

    # Cramér's V is built on the plain Pearson chi-squared statistic. SciPy applies
    # Yates' continuity correction to 2x2 tables by default, which pulls V down for
    # binary features (a feature compared with itself would score below 1).
    chi2 = stats.chi2_contingency(contingency_table, correction=False)[0]

    # Bias correction of phi^2 and of the table dimensions.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1)**2) / (n - 1)
    kcorr = k - ((k - 1)**2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        # Happens when a column has as many levels as there are observations.
        return 0.0

    return np.sqrt(phi2corr / denominator)

In [None]:
# Cramér's V between each categorical feature and the target, as a two-column table.
corr = pd.DataFrame(
    [[col, carmerV(clean_df, col, target)] for col in cat_features],
    columns=['cat_feature', 'correlation with the target'],
)

corr.sort_values(by='correlation with the target', ascending=False)

Conclusion:
- The most important features are : albumin, hypertension ,diabetes_mellitus ,red_blood_cells


### Correlation between categorical features 

In [None]:
# Square matrix of Cramér's V for every ordered pair of categorical features.
cramers_results = np.array([
    [carmerV(clean_df, f1, f2) for f2 in cat_features]
    for f1 in cat_features
])

# Label both axes with the feature names.
corr_cat_features = pd.DataFrame(cramers_results, index=cat_features, columns=cat_features)
corr_cat_features

In [None]:
correlation_threshold=0.7

# Ignore each feature's association with itself by masking the diagonal explicitly.
# An upper cut-off on the value (e.g. `< 0.97`) would also hide two distinct
# features that are almost perfectly associated.
off_diagonal = ~np.eye(len(corr_cat_features), dtype=bool)
mask = (corr_cat_features > correlation_threshold) & off_diagonal

# Cells that do not pass the mask are shown as NaN.
corr_cat_features[mask]

Conclusion:
- Categorical features are not correlated with each other (have low correlation)  

In [None]:
### TODO: correlation test between categorical and numerical features (use ANOVA; its assumptions must be checked first)

## Feature importance using tree model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rebuild the complete feature list explicitly, so the model never trains on a
# smaller subset that an earlier cell may have left under the name `features`.
features = list(cat_features) + list(num_features)

rf = RandomForestClassifier(n_estimators=100,
                                  random_state=0)

rf.fit(clean_df[features], clean_df[target])

# Impurity-based importances, in the same order as `features`.
feature_importances = rf.feature_importances_

feature_importances

In [None]:
# List the column names of the dataframe.
clean_df.columns

In [None]:
# Order the importances from largest to smallest.
indices = feature_importances.argsort()[::-1]
sorted_feature_importances = feature_importances[indices]

# The importances follow the column order of the training matrix (clean_df[features]).
# Indexing clean_df.columns instead would attach the wrong names to the bars whenever
# the dataframe's column order differs from `features`.
trained_columns = clean_df[features].columns
sorted_feature_names = [trained_columns[i] for i in indices]

# Plot the feature importances
plt.figure(figsize=(8, 6))
plt.bar(range(len(sorted_feature_importances)), sorted_feature_importances)
plt.xticks(range(len(sorted_feature_importances)), sorted_feature_names, rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.title('Feature Importances')
plt.tight_layout()
plt.show()

## Feature importance using Permutation Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Hold out a validation split; permutation importance is measured on it.
train_X, val_X, train_y, val_y = train_test_split(
    clean_df[features],
    clean_df[target],
    random_state=1,
)

# Random forest fitted on the training split only.
my_model = RandomForestClassifier(n_estimators=100, random_state=0)
my_model.fit(train_X, train_y)

# Fit the permutation-importance wrapper on the validation split and render the weights table.
perm = PermutationImportance(my_model, random_state=1)
perm.fit(val_X, val_y)
eli5.show_weights(perm, feature_names=list(val_X.columns))

## Feature importance using  Coefficients in  Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

logreg = LogisticRegression()

# Coefficient magnitudes can only be compared across features when every feature
# is on the same scale, so the inputs are standardised before fitting. On raw data
# a feature measured in large units gets a tiny coefficient regardless of its relevance.
# The pipeline fits `logreg` in place, so its coefficients can be read directly afterwards.
scaled_logreg = make_pipeline(StandardScaler(), logreg)
scaled_logreg.fit(clean_df[features], clean_df[target])

coefficients = logreg.coef_[0]

# Get the absolute feature importance
feature_importances = np.abs(coefficients)

# Sort feature importances in descending order
sorted_idx = np.argsort(feature_importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(8, 6))
plt.bar(range(len(sorted_idx)), feature_importances[sorted_idx])
plt.xticks(range(len(sorted_idx)), [clean_df[features].columns[i] for i in sorted_idx], rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.title('Feature Importances (Logistic Regression)')
plt.tight_layout()
plt.show()

## Represent the data in other formats and check whether they improve the results of the machine learning algorithms

### PCA representation

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca= PCA (n_components=0.99,whiten=True)

# The target column is removed before projecting: leaving it in would leak the label
# into the components that are later saved as model inputs.
# The remaining columns are standardised so that variables measured in large units
# do not dominate the explained variance.
pca_inputs = StandardScaler().fit_transform(clean_df.drop(columns=[target]))

pca_data=pca.fit_transform(pca_inputs)
variance_explained =pca.explained_variance_ratio_
print('The variance added by each component')
print(variance_explained)
print('number of PCA is ',len(variance_explained))

In [None]:
# Running total of the explained-variance ratios, component by component.
cumulative_variance_ratio = np.cumsum(variance_explained)
component_counts = range(1, len(cumulative_variance_ratio) + 1)

# Plot the cumulative explained variance ratio
plt.plot(component_counts, cumulative_variance_ratio, marker='o')
plt.title('Cumulative Variance Explained by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()

In [None]:
scatter = sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=clean_df[target], palette='viridis')

# Manually set the legend labels.
# Assigning a list of strings to `legend_.texts` does not relabel anything: the legend
# entries are Text artists and must be updated through `set_text`.
# NOTE(review): assumes the first legend entry corresponds to 'ckd' and the second to
# 'notckd' — confirm against the class encoding used during preprocessing.
scatter.legend_.set_title('classification')
for legend_text, class_name in zip(scatter.legend_.get_texts(), ['ckd', 'notckd']):
    legend_text.set_text(class_name)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Scatter Plot with 2 PC')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

from matplotlib.colors import ListedColormap

# Create the figure and attach a 3-D axes to it explicitly.
fig = plt.figure(figsize=(6,6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# Two-colour map built from a seaborn palette.
cmap = ListedColormap(sns.color_palette("husl", 2).as_hex())

# First three principal components, coloured by the target column.
pc1, pc2, pc3 = pca_data[:, 0], pca_data[:, 1], pca_data[:, 2]
sc = ax.scatter(pc1, pc2, pc3, s=40, c=clean_df[target], marker='o', cmap=cmap, alpha=1)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('Scatter Plot with 3 PC')

# Legend placed outside the axes, on the right.
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)


In [None]:
# One column label per retained principal component: PC1, PC2, ...
name_columns_pc = [f'PC{i}' for i, _ in enumerate(variance_explained, start=1)]
name_columns_pc

In [None]:
# Principal components as a dataframe, with the target column appended on the right.
df_pca = pd.concat(
    (pd.DataFrame(data=pca_data, columns=name_columns_pc), clean_df[target]),
    axis=1,
)
df_pca.head()

In [None]:
# Save the PCA dataset (components plus target) to a CSV file, without the index column
df_pca.to_csv('Data/pca_dataset.csv', index=False)

### LDA representation

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA on the selected feature columns of the whole dataframe and project them.
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X=clean_df[features], y=clean_df[target])

# Total number of values in the projected array.
X_lda.size

In [None]:
# LDA projection on the x-axis against the class value on the y-axis, coloured by class.
plt.scatter(x=X_lda, y=clean_df[target], c=clean_df[target], cmap='viridis')
plt.title('Scatter Plot of LDA-Transformed Data')
plt.xlabel('LDA Component')
plt.ylabel('Class')
plt.colorbar(label='Class')
plt.show()

In [None]:
# One column label per discriminant component: LDA1, LDA2, ...
name_columns_lda = [f'LDA{i}' for i, _ in enumerate(lda.explained_variance_ratio_, start=1)]
name_columns_lda

In [None]:
# LDA components as a dataframe, with the target column appended on the right.
df_lda = pd.concat(
    (pd.DataFrame(data=X_lda, columns=name_columns_lda), clean_df[target]),
    axis=1,
)
df_lda.head()

In [None]:
# Save the LDA dataset (components plus target) to a CSV file, without the index column
df_lda.to_csv('Data/lda_dataset.csv', index=False)

## TO DO:
- Correlation between categorical and numerical features
- Make a conclusion 
- Use random forest and other algo for feature importance

# RQ:
- Can add LDA (the model finds linear combinations of the features that achieve maximum separability between the classes and minimum variance within each class) (done)
- PCA as preprocessing step (done)
- Display the data with only two principal components, then with three (done)