In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
#from sklearn.feature_selection import chi2 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

dataset1 = pd.read_csv("prep.csv", index_col=None) # index_col None says there in no index column otherwise the first column is taken as index
df2 = dataset1
df2=pd.get_dummies(df2, drop_first=True)

indep_X = df2.drop('classification_yes', 1)
dep_Y = df2['classification_yes']

In [16]:
def rfeFeature(indep_X, dep_Y, n ):
    rfeList=[]
    
    log_model = LogisticRegression(solver='lbfgs')
    RF = RandomForestClassifier(n_estimators = 10, criterion='entropy', random_state=0)
    DT = DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0)
    svc_model = SVC(kernel='linear', random_state=0)
    NB = GaussianNB()
    knn = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p = 2)
    
    rfeModelList = [log_model, RF, svc_model, DT]
    
    for i in rfeModelList:
        print(i)
        log_rfe = RFE(i, n)#test = SelectKBest(score_func=chi2, k= n)
        fit1 = log_rfe.fit(indep_X, dep_Y)
        selected_columns = indep_X.columns[log_rfe.get_support()]  # Retrieve selected feature names
        print(f"Selected features for model {i}: {list(selected_columns)}")  # Print the selected feature names

        
        log_rfe_feature = fit1.transform(indep_X)
        
        rfeList.append(log_rfe_feature)
    return rfeList


In [3]:
 def split_scalar(indep_X, dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size =0.25, random_state= 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.fit_transform(X_test)
        return X_train, X_test, y_train, y_test
        

In [4]:
def confusionMatrix_prediction(classifier, X_test):
    y_pred = classifier.predict(X_test)
    
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    
    Accuracy = accuracy_score(y_test, y_pred)
    
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm
    

In [7]:
def logistic(X_train, y_train, X_test):
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [8]:
def svm_linear(X_train, y_train, X_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'linear', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [9]:
def svm_NL(X_train, y_train, X_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel = 'rbf', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [10]:
def Navie(X_train, y_train, X_test):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [11]:
def knn(X_train, y_train, X_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [12]:
def Decision(X_train, y_train, X_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [13]:
def random(X_train, y_train, X_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    classifier, Accuracy, report, X_test, y_test, cm = confusionMatrix_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, y_test, cm

In [14]:
def rfe_Classification_create_Table(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    RFE_dataframe = pd.DataFrame(index=['Logistic', 'SVC', 'Random', 'DecisionTree' ], columns = ['Logistic', 'SVMl','SVMnl', 'KNN', 'Navie', 'Decision', 'Random' ])
    print(len(acclog), len(accsvml), len(accsvmnl), len(accknn), len(accnav), len(accdes), len(accrf))
    for number, idex in enumerate(RFE_dataframe.index):
        RFE_dataframe['Logistic'][idex]=acclog[number]
        RFE_dataframe['SVMl'][idex]=accsvml[number]
        RFE_dataframe['SVMnl'][idex]=accsvmnl[number]
        RFE_dataframe['KNN'][idex]=accknn[number]
        RFE_dataframe['Navie'][idex]=accnav[number]
        RFE_dataframe['Decision'][idex]=accdes[number]
        RFE_dataframe['Random'][idex]=accrf[number]
    return RFE_dataframe



In [17]:


#for j in [3]:  
rfeList = rfeFeature(indep_X, dep_Y, 3)
acclog = [] 
accsvml= [] 
accsvmnl= []
accknn= [] 
accnav= [] 
accdes= []
accrf = []
for i in rfeList:
    X_train, X_test, y_train, y_test = split_scalar (i, dep_Y)
    classifier, Accuracy, report, X_test, y_test, cm = logistic(X_train, y_train, X_test)
    acclog.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = svm_linear(X_train, y_train, X_test)
    accsvml.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = svm_NL(X_train, y_train, X_test)
    accsvmnl.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = knn(X_train, y_train, X_test)
    accknn.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = Navie(X_train, y_train, X_test)
    accnav.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = Decision(X_train, y_train, X_test)
    accdes.append(Accuracy)
    classifier, Accuracy, report, X_test, y_test, cm = random(X_train, y_train, X_test)
    accrf.append(Accuracy)

result = rfe_Classification_create_Table(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
result 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Selected features for model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False): ['sg_c', 'sg_d', 'htn_yes']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_s

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Logistic,0.94,0.94,0.94,0.94,0.64,0.94,0.94
SVC,0.94,0.95,0.94,0.94,0.93,0.77,0.79
Random,0.87,0.87,0.87,0.87,0.64,0.87,0.87
DecisionTree,0.98,0.95,0.96,0.95,0.64,0.95,0.95


In [None]:
# from the above table we can see the RFEFeature model will be using DecisionTree with K=3 and using Logistic ML algorithm the prediction is 98%
# Selected features for model DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
#                        max_features='sqrt', max_leaf_nodes=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=1, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, presort=False,
#                        random_state=0, splitter='best'): ['hrmo', 'sg_c', 'dm_yes']
# 3 input fields are 'hrmo', 'sg', 'dm'