In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import statistics as stats # https://docs.python.org/3/library/statistics.html#statistics.fmean
#import scipy.stats as spstats
import matplotlib.pyplot as plt

### Validation & Normalization methods ###
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

### ML models ###
from sklearn.linear_model import LogisticRegression, SGDClassifier # C1 loss: log_loss => LogisticRegression with SGD
from sklearn.linear_model import Perceptron # C2
from sklearn.svm import SVC # C3
from sklearn.svm import LinearSVC # C4
from sklearn.tree import DecisionTreeClassifier # C5
from sklearn.ensemble import RandomForestClassifier # C6
from sklearn.neural_network import MLPClassifier # C7

### Metrics ###
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, make_scorer
from imblearn.metrics import geometric_mean_score # https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.geometric_mean_score.html
import time
import timeit # https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit

### Pipeline ###
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Analysis ###
from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif

### Custom Modules ###
from functions.data_types import optimize_dtypes
from functions.dataframe_actions import df_info, df_clean, show_value_counts, fill_missing_values
from functions.ml_training import train_classifiers, train_classifiers_tuned

### Other configurations ###
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
# #import warnings library
# import warnings
# # ignore all warnings
# warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(working_memory=1024*20) 

In [4]:
# to read .csv files from another directory
data_location = "../Data/" # "/<path>"

df = pd.read_csv(data_location + "train_filled.csv")
df = optimize_dtypes(df)
df.head()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
0,Female,Good,7.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,No,No,Yes,No,No,No,Yes,No,No,No,No,Yes,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 65 to 69,1.75,88.4375,28.796875,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Tested positive using home test without a heal...,No
1,Male,Very good,0.0,4.308594,Within past 5 years (2 years but less than 5 y...,Yes,7.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Not at all (right now),Yes,Hispanic,Age 50 to 54,1.669922,90.75,32.53125,No,Yes,No,No,"Yes, received tetanus shot but not sure what type",No,No,No
2,Male,Excellent,2.0,15.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 18 to 24,1.849609,90.75,26.390625,No,No,No,No,"Yes, received tetanus shot but not sure what type",No,No,No
3,Male,Fair,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,8.0,1 to 5,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,Hispanic,Age 65 to 69,1.650391,93.0,34.125,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,No
4,Male,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,No,No,No,No,Yes,No,Yes,No,Yes,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 50 to 54,1.830078,117.9375,35.25,Yes,No,Yes,No,"Yes, received Tdap",No,No,No


In [5]:
df_mapped = pd.read_csv(data_location + "train_filled_mapped.csv")
df_mapped = optimize_dtypes(df_mapped)
df_mapped.head()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
0,0,2,7.0,2.0,3,1,7.0,3,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,6569,1.75,88.4375,28.796875,0,0,0,0,0,0,1,0
1,1,3,0.0,4.308594,1,1,7.0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,5054,1.669922,90.75,32.53125,0,1,0,0,1,0,0,0
2,1,4,2.0,15.0,2,1,6.0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1824,1.849609,90.75,26.390625,0,0,0,0,1,0,0,0
3,1,1,0.0,0.0,3,1,8.0,2,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,6569,1.650391,93.0,34.125,0,0,1,0,1,0,0,0
4,1,4,0.0,0.0,3,1,6.0,3,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,5054,1.830078,117.9375,35.25,1,0,1,0,3,0,0,0


In [6]:
#how many cases are included in the dataset
length = len(df)
#how many features are in the dataset
features = df.shape[1]-1 # - HadHeartAttack

# Number of cases with heart attack
hha_yes = len(df[df['HadHeartAttack']=='Yes'])

#Number of cases without heart attack
hha_no = len(df[df['HadHeartAttack']=='No'])

#Rate of heart attacks over all cases
rate = (float(hha_yes)/(length))*100

print ("There are "+ str(len(df))+" cases in this dataset")
print ("There are {}".format(features)+" features in this dataset")
print ("There are {}".format(hha_yes)+" cases with heart attack")
print ("There are {}".format(hha_no)+" cases without heart attack")
print ("The percentage of heart attack cases is: {:.2f}%".format(rate))

There are 353653 cases in this dataset
There are 38 features in this dataset
There are 20086 cases with heart attack
There are 333567 cases without heart attack
The percentage of heart attack cases is: 5.68%


In [7]:
# # plot distribution
# #to see how distribution is in regard to the HadHeartAttack, we need to first split the dataset into two groups
# malignant = df_mapped[df_mapped['HadHeartAttack'] == 0]
# benign = df_mapped[df_mapped['HadHeartAttack'] == 1]

# #also bring features back, basically redefining them again
# features = list(df_mapped.columns[1:39])
# # sns.set(style="whitegrid",palette="muted",font_scale = 1.2)
# # plt.rcParams.update({'font.size': 11})
# fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(18,20))
# axes = axes.ravel()
# for i,plot1 in enumerate(axes):
#     plot1.figure
#     binwidth= (max(df_mapped[features[i]]) - min(df_mapped[features[i]]))/38
#     plot1.hist([benign[features[i]],malignant[features[i]]], bins=np.arange(min(df_mapped[features[i]]), max(df_mapped[features[i]]) + binwidth, binwidth) , # density : If True, draw and return a probability density: each bin will display the bin's raw count divided by the total number of counts and the bin width (density = counts / (sum(counts) * np.diff(bins))),
#                 alpha=0.8, density=False, stacked=True ,edgecolor="black",label=['B','M'])          # so that the area under the histogram integrates to 1 (np.sum(density * np.diff(bins)) == 1).If stacked is also True, the sum of the histograms is normalized to 1.
                
#     plot1.legend(loc='upper right')
#     plot1.set_title(features[i])
#     # plot1.grid()
# plt.tight_layout()
# plt.show()
# # https://github.com/bora-pajo/breast-cancer-prediction/blob/master/examples_save1.ipynb

$BMI = weight/height^2$, so `WeightInKilograms` and `HeightInMeters` are not needed since they are represented by `BMI`. (check in dataset report if it is indeed calculated like this) The numbers are approximately like this.

In [8]:
# Separate target variable from feature variables
X_mapped = df_mapped.drop(['HadHeartAttack', 'WeightInKilograms', 'HeightInMeters'], axis=1, inplace=False)  # Features
y_mapped = df_mapped['HadHeartAttack']     

In [9]:
original_dtypes = X_mapped.dtypes

X_mapped_scaled = (X_mapped - X_mapped.min(axis=0)) / (X_mapped.max(axis=0)-X_mapped.min(axis=0))              # min max scale
# X_mapped_scaled = (X_mapped - X_mapped.mean())/X_mapped.std() # If we use StandardScaler, the feature names will be lost, so we do it mannually.

# x_scaled.hist(figsize=(16, 20), bins=30, edgecolor="black") # plot to show features after scaling
# plt.subplots_adjust()

# Cast back to original data types
for column, dtype in original_dtypes.items():
    X_mapped_scaled[column] = X_mapped_scaled[column].astype(dtype)

In [10]:
X_mapped_scaled.head(3)

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,0,0,0.233276,0.06665,1,1,0.260986,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0.191528,0,0,0,0,0,0,0
1,1,0,0.0,0.143677,0,1,0.260986,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.234131,0,1,0,0,0,0,0
2,1,1,0.06665,0.5,0,1,0.217407,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.164062,0,0,0,0,0,0,0


In [11]:
X_mapped_scaled.dtypes

Sex                            uint8
GeneralHealth                  uint8
PhysicalHealthDays           float16
MentalHealthDays             float16
LastCheckupTime                uint8
PhysicalActivities             uint8
SleepHours                   float16
RemovedTeeth                   uint8
HadAngina                      uint8
HadStroke                      uint8
HadAsthma                      uint8
HadSkinCancer                  uint8
HadCOPD                        uint8
HadDepressiveDisorder          uint8
HadKidneyDisease               uint8
HadArthritis                   uint8
HadDiabetes                    uint8
DeafOrHardOfHearing            uint8
BlindOrVisionDifficulty        uint8
DifficultyConcentrating        uint8
DifficultyWalking              uint8
DifficultyDressingBathing      uint8
DifficultyErrands              uint8
SmokerStatus                   uint8
ECigaretteUsage                uint8
ChestScan                      uint8
RaceEthnicityCategory          uint8
A

In [12]:
# #correlation map
# sns.set(style='white',font_scale = 1.3)
# mask = np.triu(np.ones_like(X_mapped_scaled.corr('pearson'), dtype=bool))
# np.fill_diagonal(mask, False)

# f,ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(X_mapped_scaled.corr('pearson'),mask=mask,annot=True, linewidths=.5, fmt= '.1f',ax=ax,cmap= 'coolwarm',annot_kws={'size': 13})
# plt.title('Pearson Correlation Map')

In [13]:
# #correlation map
# sns.set(style='white',font_scale = 1.3)
# mask = np.triu(np.ones_like(X_mapped_scaled.corr('spearman'), dtype=bool))
# np.fill_diagonal(mask, False)

# f,ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(X_mapped_scaled.corr('spearman'),mask=mask,annot=True, linewidths=.5, fmt= '.1f',ax=ax,cmap= 'coolwarm',annot_kws={'size': 13})
# plt.title('Spearman Correlation Map')

In [14]:
# #correlation map
# sns.set(style='white',font_scale = 1.3)
# mask = np.triu(np.ones_like(X_mapped_scaled.corr('kendall'), dtype=bool))
# np.fill_diagonal(mask, False)

# f,ax = plt.subplots(figsize=(20, 20))
# sns.heatmap(X_mapped_scaled.corr('kendall'),mask=mask,annot=True, linewidths=.5, fmt= '.1f',ax=ax,cmap= 'coolwarm',annot_kws={'size': 13})
# plt.title('Kendall Correlation Map')

In [15]:
# sns.set(style='white', font_scale=1)
# df_6best_mi = X_mapped_scaled.loc[:,['PhysicalActivities','AlcoholDrinkers','FluVaxLast12', 'ChestScan', 'Sex','RemovedTeeth']].join(y_mapped)
# g = sns.PairGrid(df_6best_mi,hue='HadHeartAttack',height=2.3, aspect=1,corner=True)
# g.map_diag(sns.kdeplot,fill=True)
# g.map_lower(sns.scatterplot,alpha=0.4)
# g.map_lower(sns.kdeplot,levels=4)
# # g.map_upper(corrdot)
# g.add_legend();
# #https://newbedev.com/correlation-matrix-plot-with-coefficients-on-one-side-scatterplots-on-another-and-distributions-on-diagonal

In [14]:
# Define multiple metrics
scoring = {'Balanced Accuracy': make_scorer(balanced_accuracy_score),
           'F1-score': make_scorer(f1_score, average='weighted'),
           'G-Mean score': make_scorer(geometric_mean_score, average='weighted')
          }

cv=StratifiedKFold(n_splits=3, random_state=None)

In [32]:
# https://www.yourdatateacher.com/2021/04/26None, random, randomture-selection-via-grid-search-in-supervised-models/

pipeline = Pipeline([('scaler', MinMaxScaler()),('selector',SelectKBest(f_classif)),('model',RandomForestClassifier(random_state = 13))])

search = GridSearchCV(
    estimator = pipeline,
    param_grid = {
  'selector__k': list(range(10, 20))
#  , 'model__n_estimators':np.arange(10,200,10)   
 },
    n_jobs=-1,
    scoring=scoring,
    cv=cv,
    verbose=0,
    refit='Balanced Accuracy'
)
search.fit(X_mapped,y_mapped)
search.best_params_

{'selector__k': 16}

In [33]:
# https://www.yourdatateacher.com/2021/04/26None, random, randomture-selection-via-grid-search-in-supervised-models/

pipeline = Pipeline([('scaler', MinMaxScaler()),('selector',SelectKBest(chi2)),('model',RandomForestClassifier(random_state = 13))])

search = GridSearchCV(
    estimator = pipeline,
    param_grid = {
  'selector__k': list(range(10, 20))
#  , 'model__n_estimators':np.arange(10,200,10)   
 },
    n_jobs=-1,
    scoring=scoring,
    cv=cv,
    verbose=0,
    refit='Balanced Accuracy'
)
search.fit(X_mapped,y_mapped)
search.best_params_

{'selector__k': 17}

In [34]:
# https://www.yourdatateacher.com/2021/04/26None, random, randomture-selection-via-grid-search-in-supervised-models/

pipeline = Pipeline([('scaler', MinMaxScaler()),('selector',SelectKBest(mutual_info_classif)),('model',RandomForestClassifier(random_state = 13))])

search = GridSearchCV(
    estimator = pipeline,
    param_grid = {
  'selector__k': list(range(10, 20))
#  , 'model__n_estimators':np.arange(10,200,10)   
 },
    n_jobs=-1,
    scoring=scoring,
    cv=cv,
    verbose=0,
    refit='Balanced Accuracy'
)
search.fit(X_mapped,y_mapped)
search.best_params_

{'selector__k': 16}

In [35]:
# find best scored 12 features
select_feature = SelectKBest(f_classif, k=16).fit(X_mapped_scaled, y_mapped)

# Get columns to keep and create new dataframe with those only
cols = select_feature.get_support(indices=True)
x_kbest_anova = X_mapped_scaled.iloc[:,cols]

# print('Score list:', select_feature.scores_)
# print('Feature list:', X_mapped_scaled.columns)
print('Selected feature list:', x_kbest_anova.columns)

Selected feature list: Index(['PhysicalHealthDays', 'PhysicalActivities', 'RemovedTeeth', 'HadAngina',
       'HadStroke', 'HadCOPD', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'DifficultyWalking',
       'DifficultyErrands', 'ChestScan', 'AgeCategory', 'AlcoholDrinkers',
       'PneumoVaxEver'],
      dtype='object')


In [36]:
# find best scored 12 features
select_feature = SelectKBest(chi2, k=16).fit(X_mapped_scaled, y_mapped)

# Get columns to keep and create new dataframe with those only
cols = select_feature.get_support(indices=True)
x_kbest_chi2 = X_mapped_scaled.iloc[:,cols]

# print('Score list:', select_feature.scores_)
# print('Feature list:', X_mapped_scaled.columns)
print('Selected feature list:', x_kbest_chi2.columns)

Selected feature list: Index(['PhysicalHealthDays', 'RemovedTeeth', 'HadAngina', 'HadStroke',
       'HadCOPD', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes',
       'DeafOrHardOfHearing', 'DifficultyWalking', 'DifficultyDressingBathing',
       'DifficultyErrands', 'ChestScan', 'AgeCategory', 'AlcoholDrinkers',
       'PneumoVaxEver'],
      dtype='object')


In [37]:
# find best scored 12 features
select_feature = SelectKBest(mutual_info_classif, k=16).fit(X_mapped_scaled, y_mapped)

# Get columns to keep and create new dataframe with those only
cols = select_feature.get_support(indices=True)
x_kbest_mi = X_mapped_scaled.iloc[:,cols]

# print('Score list:', select_feature.scores_)
# print('Feature list:', X_mapped_scaled.columns)
print('Selected feature list:', x_kbest_mi.columns)

Selected feature list: Index(['Sex', 'PhysicalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadAngina', 'HadArthritis',
       'DifficultyWalking', 'ChestScan', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'CovidPos'],
      dtype='object')


In [38]:
from mrmr import mrmr_classif
y_mrmr = pd.Series(y_mapped)
mrmr_features = mrmr_classif(X_mapped_scaled, y_mrmr, K=16)

x_mrmr = X_mapped_scaled[mrmr_features]
x_mrmr.head(0)

100%|██████████| 16/16 [00:03<00:00,  4.83it/s]


Unnamed: 0,HadAngina,ChestScan,HadStroke,PneumoVaxEver,HadDiabetes,PhysicalHealthDays,RemovedTeeth,Sex,AgeCategory,HadCOPD,DifficultyWalking,HadKidneyDisease,AlcoholDrinkers,DeafOrHardOfHearing,HadArthritis,BlindOrVisionDifficulty


In [22]:
# Create the RFE object and rank each pixel
clf_rf = RandomForestClassifier(random_state=13)      
rfe = RFE(estimator=clf_rf, n_features_to_select=16, step=1)
rfe = rfe.fit(X_mapped_scaled, y_mapped)

print('Chosen best',rfe.n_features_,'feature by rfe:',X_mapped_scaled.columns[rfe.support_])
print("Feature Ranking: " , rfe.ranking_)

Chosen best 16 feature by rfe: Index(['PhysicalHealthDays', 'MentalHealthDays', 'PhysicalActivities',
       'SleepHours', 'HadAngina', 'HadStroke', 'HadAsthma',
       'HadDepressiveDisorder', 'HadArthritis', 'DifficultyWalking',
       'ChestScan', 'BMI', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'CovidPos'],
      dtype='object')
Feature Ranking:  [ 3 19  1  1 16  1  1 10  1  1  1  8  9  1 13  1  4  5 15  7  1 18 14 12
 21  1 17 11  1  2  1  1  1  6 20  1]


## Feature Selection Summary

In [39]:
# Convert indexes to lists
x_kbest_anova_list = x_kbest_anova.columns.tolist()
x_kbest_chi2_list = x_kbest_chi2.columns.tolist()
x_kbest_mi_list = x_kbest_mi.columns.tolist()
x_mrmr_list = x_mrmr.columns.tolist()
x_rfe_list = X_mapped_scaled.columns[rfe.support_].tolist()

# Combine all feature lists
all_features = set(x_kbest_anova_list) | set(x_kbest_chi2_list) | set(x_kbest_mi_list) | set(x_mrmr_list) | set(x_rfe_list)

# Features suggested by all algorithms
common_features = set(x_kbest_anova_list) & set(x_kbest_chi2_list) & set(x_kbest_mi_list) & set(x_mrmr_list) & set(x_rfe_list)

# Convert to lists
all_features_list = list(all_features)
common_features_list = list(common_features)

print("Features included in at least one method:")
print(len(all_features_list), " : ", all_features_list)
print("Features included in all methods: ")
print(len( common_features_list), " : ",  common_features_list)


Features included in at least one method:
28  :  ['RemovedTeeth', 'HIVTesting', 'AlcoholDrinkers', 'HadKidneyDisease', 'DeafOrHardOfHearing', 'AgeCategory', 'HadAngina', 'HadArthritis', 'Sex', 'DifficultyWalking', 'LastCheckupTime', 'HadDiabetes', 'HadCOPD', 'SleepHours', 'FluVaxLast12', 'BlindOrVisionDifficulty', 'BMI', 'HadStroke', 'DifficultyErrands', 'PneumoVaxEver', 'PhysicalHealthDays', 'PhysicalActivities', 'HadAsthma', 'MentalHealthDays', 'CovidPos', 'HadDepressiveDisorder', 'ChestScan', 'DifficultyDressingBathing']
Features included to all methods: 
6  :  ['PneumoVaxEver', 'PhysicalHealthDays', 'HadArthritis', 'HadAngina', 'DifficultyWalking', 'ChestScan']


In [40]:
# Count the number of times each feature appears
print("Number of times each feature appears (descending order):")
feature_counts = {}
for feature in all_features:
    feature_counts[feature] = sum(feature in s for s in [x_kbest_anova_list, x_kbest_chi2_list, x_kbest_mi_list, x_mrmr_list, x_rfe_list])

sorted_feature_counts = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
for feature, count in sorted_feature_counts:
    print(feature, ":", count)

# Create a final dataframe with final features
final_features = [feature for feature, count in sorted_feature_counts if count >= 3]
X_final = X_mapped_scaled[final_features]

print("Final dataframe shape:", X_final.shape)
# print("Final dataframe columns:", final_df.columns)
X_final.head(0)

Number of times each feature appears (descending order):
HadAngina : 5
HadArthritis : 5
DifficultyWalking : 5
PneumoVaxEver : 5
PhysicalHealthDays : 5
ChestScan : 5
RemovedTeeth : 4
AlcoholDrinkers : 4
HadStroke : 4
HadKidneyDisease : 3
DeafOrHardOfHearing : 3
AgeCategory : 3
HadDiabetes : 3
HadCOPD : 3
PhysicalActivities : 3
HIVTesting : 2
Sex : 2
SleepHours : 2
FluVaxLast12 : 2
BMI : 2
DifficultyErrands : 2
CovidPos : 2
LastCheckupTime : 1
BlindOrVisionDifficulty : 1
HadAsthma : 1
MentalHealthDays : 1
HadDepressiveDisorder : 1
DifficultyDressingBathing : 1
Final dataframe shape: (353653, 15)


Unnamed: 0,HadAngina,HadArthritis,DifficultyWalking,PneumoVaxEver,PhysicalHealthDays,ChestScan,RemovedTeeth,AlcoholDrinkers,HadStroke,HadKidneyDisease,DeafOrHardOfHearing,AgeCategory,HadDiabetes,HadCOPD,PhysicalActivities


In [44]:
df_train_mapped_selected = pd.concat([X_final, y_mapped], axis=1)
df_train_mapped_selected.to_csv('../Data/train_mapped_selected.csv', index=False)

In [41]:
# *** Functions for Mutual Information (score and plot) ***
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=13)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#:~:text=Mutual%20information%20(MI)%20%5B1,higher%20values%20mean%20higher%20dependency.
# https://www.kaggle.com/code/ryanholbrook/mutual-information/tutorial
# https://www.kaggle.com/code/palazar/exercise-mutual-information/edit

In [45]:
mi_scores = make_mi_scores(X_final, y_mapped)
print(mi_scores.head(37))
# print(mi_scores.tail(20))  # uncomment to see bottom 20
plt.style.use('default')
plt.figure(dpi=100, figsize=(8, 10))
plt.grid()
plot_mi_scores(mi_scores.head(37))
# plot_mi_scores(mi_scores.tail(20))  # uncomment to see bottom 20

In [27]:
# # The "accuracy" scoring is proportional to the number of correct classifications
# clf_rf_4 = RandomForestClassifier(random_state=13)
# min_features_to_select = 10  # Minimum number of features to consider
# rfecv = RFECV(estimator=clf_rf_4,min_features_to_select=min_features_to_select, step=1, cv=cv,scoring='f1_macro')
# rfecv = rfecv.fit(X_mapped_scaled, y_mapped)

# print('Optimal number of features :', rfecv.n_features_)
# print('Best features :', X_mapped_scaled.columns[rfecv.support_])

In [28]:
# # Plot number of features VS. cross-validation scores
# print("Optimal number of features : %d" % rfecv.n_features_)

# # Plot number of features VS. cross-validation 
# sns.set(font_scale = 1)
# plt.figure(figsize=(8,6))
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross validation score (F1 score)")
# plt.plot(
#     range(min_features_to_select, len(rfecv.cv_results_['mean_test_score']) + min_features_to_select),
#     rfecv.cv_results_['mean_test_score'],
# )
# plt.axhline(y=rfecv.cv_results_['mean_test_score'].max(), color='r', linestyle='dotted',alpha=1)
# plt.show()