In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

In [3]:
def rfefeature(indep_x, dep_y, n):
    """Reduce the feature set with Recursive Feature Elimination, once per model.

    Four estimators drive RFE in turn, always in this order: logistic
    regression, random forest, linear-kernel SVC, decision tree.  Each
    estimator is printed before its selector is fitted.

    Parameters
    ----------
    indep_x : feature table handed to ``RFE.fit`` / ``RFE.transform``.
    dep_y : target values handed to ``RFE.fit``.
    n : number of features each selector is asked to keep.

    Returns
    -------
    list
        Four transformed copies of ``indep_x`` (one per estimator, in the
        order listed above), each holding only the selected features.
    """
    estimators = (
        LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
        SVC(kernel='linear', random_state=0),
        DecisionTreeClassifier(criterion='gini', max_features='sqrt',
                               splitter='best', random_state=0),
    )

    reduced_sets = []
    for estimator in estimators:
        print(estimator)
        selector = RFE(estimator, n_features_to_select=n)
        selector.fit(indep_x, dep_y)
        reduced_sets.append(selector.transform(indep_x))
    return reduced_sets

In [4]:
def scaler_split(indep_x, dep_y):
    """Split into train/test (70/30, seed 0) and standardise the features.

    The scaler is fitted on the training features only and then applied to
    both partitions, so no statistics from the test rows influence scaling.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` with both feature partitions
        standardised; the target partitions are returned as split.
    """
    parts = train_test_split(indep_x, dep_y, test_size=0.30, random_state=0)
    raw_train, raw_test, y_train, y_test = parts

    scaler = StandardScaler()
    scaler.fit(raw_train)
    return scaler.transform(raw_train), scaler.transform(raw_test), y_train, y_test

In [5]:
def cm_prediction(classifier, x_test, y_test=None):
    """Predict on the test features and compute the standard evaluation metrics.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    x_test : test features passed to ``classifier.predict``.
    y_test : true labels for ``x_test``.  Optional for backward
        compatibility: when omitted, the notebook-level variable ``Y_test``
        is used, which is the only source this function had before the
        parameter existed.  Prefer passing the labels explicitly so the
        result does not depend on hidden notebook state.

    Returns
    -------
    tuple
        ``(cm, Accuracy, report, x_test, y_test, classifier)``.  Note the
        order: the confusion matrix comes first and the classifier last.

    Raises
    ------
    NameError
        If ``y_test`` is omitted and no notebook-level ``Y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # Legacy path: fall back to the global the original implementation read.
        try:
            y_test = Y_test
        except NameError:
            raise NameError(
                "cm_prediction needs the true labels: pass y_test or define "
                "a notebook-level Y_test before calling"
            ) from None

    y_pred = classifier.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return cm, Accuracy, report, x_test, y_test, classifier

In [6]:
def logistic(x_train, y_train, x_test):
    """Fit a logistic-regression classifier (seed 0) and evaluate it.

    ``LogisticRegression`` comes from the notebook's import cell.  No labels
    are passed on for evaluation; ``cm_prediction`` obtains the true test
    labels itself.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = LogisticRegression(random_state=0)
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [7]:
def svm_linear(x_train, y_train, x_test):
    """Fit a linear-kernel SVC (seed 0) and evaluate it.

    ``SVC`` comes from the notebook's import cell.  No labels are passed on
    for evaluation; ``cm_prediction`` obtains the true test labels itself.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = SVC(kernel='linear', random_state=0)
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [8]:
def svm_nl(x_train, y_train, x_test):
    """Fit a non-linear (RBF-kernel) SVC with seed 0 and evaluate it.

    No labels are passed on for evaluation; ``cm_prediction`` obtains the
    true test labels itself.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = SVC(kernel='rbf', random_state=0)
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [9]:
def naive(x_train, y_train, x_test):
    """Fit a Gaussian naive-Bayes classifier and evaluate it.

    No labels are passed on for evaluation; ``cm_prediction`` obtains the
    true test labels itself.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = GaussianNB()
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [10]:
def decision(x_train, y_train, x_test):
    """Fit an entropy-criterion decision tree (seed 0) and evaluate it.

    No labels are passed on for evaluation; ``cm_prediction`` obtains the
    true test labels itself.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [11]:
def random(x_train, y_train, x_test):
    """Fit a 10-tree, entropy-criterion random forest (seed 0) and evaluate it.

    No labels are passed on for evaluation; ``cm_prediction`` obtains the
    true test labels itself.

    NOTE: this function's name would shadow the standard-library ``random``
    module if that module were ever imported in the notebook.

    Returns
    -------
    tuple
        The 6-tuple produced by ``cm_prediction(model, x_test)``, passed
        through in the same order.
    """
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    return cm_prediction(model, x_test)

In [12]:
def rfe_classification(acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand):
    """Tabulate accuracies: one row per RFE selector, one column per classifier.

    Each argument is a sequence of accuracies for one downstream classifier,
    indexed by the position of the RFE selector that produced the features.
    Only the first four entries of each sequence are read.

    Row labels follow the selector order used by ``rfefeature``
    (logistic regression, random forest, linear SVC, decision tree), so
    position ``k`` of every sequence lands in the row named after the
    selector that actually produced it.

    Returns
    -------
    pandas.DataFrame
        4 x 7 table with columns
        ``Logistic, SVML, SVMNL, KNN, Naive, Decision, Random``.

    Raises
    ------
    IndexError
        If any sequence has fewer than four entries.
    """
    selectors = ['Logistic', 'Random', 'SVM', 'Decision']
    scores_by_classifier = {
        'Logistic': acclog,
        'SVML': accsvml,
        'SVMNL': accsvmnl,
        'KNN': accknn,
        'Naive': accnaive,
        'Decision': accdeci,
        'Random': accrand,
    }

    rfedataframe = pd.DataFrame(index=selectors, columns=list(scores_by_classifier))
    for column, scores in scores_by_classifier.items():
        for position, selector in enumerate(selectors):
            # Single-step .loc write: chained ``df[col][row] = v`` assigns to a
            # temporary under pandas Copy-on-Write and leaves the table empty.
            rfedataframe.loc[selector, column] = scores[position]
    return rfedataframe

In [13]:
# Load the employee-attrition records; the CSV path is relative to the
# notebook's working directory.
dataset = pd.read_csv("Employee_Attrition_Dataset.csv")
# Bare last expression so the notebook renders the frame.
dataset

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [14]:
# Remove columns that are not used as predictors.
# NOTE(review): in the rows displayed above EmployeeCount and StandardHours
# show a single value and EmployeeNumber looks like an identifier -- confirm.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
dataset = dataset.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours'],
                       errors='ignore')

In [15]:
# One-hot encode the categorical columns as 0/1 integers.  drop_first=True
# omits the first level of every categorical, which is why the Yes/No
# Attrition column appears as the single Attrition_Yes column used as the
# target in the next cell.
df1 = pd.get_dummies(dataset, dtype = int, drop_first = True)
# Bare last expression so the notebook renders the encoded frame.
df1

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,2,94,3,2,4,5993,...,0,0,0,0,0,1,0,0,1,1
1,49,279,8,1,3,61,2,2,2,5130,...,0,0,0,0,1,0,0,1,0,0
2,37,1373,2,2,4,92,2,1,3,2090,...,1,0,0,0,0,0,0,0,1,1
3,33,1392,3,4,4,56,3,1,3,2909,...,0,0,0,0,1,0,0,1,0,1
4,27,591,2,1,1,40,3,1,2,3468,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,3,41,4,2,4,2571,...,1,0,0,0,0,0,0,1,0,0
1466,39,613,6,1,4,42,2,3,1,9991,...,0,0,0,0,0,0,0,1,0,0
1467,27,155,4,3,2,87,4,2,2,6142,...,0,0,1,0,0,0,0,1,0,1
1468,49,1023,2,3,4,63,2,2,2,5390,...,0,0,0,0,0,1,0,1,0,0


In [16]:
# Separate the target (Attrition_Yes) from the predictors (every other column).
dep_y = df1['Attrition_Yes']
indep_x = df1.drop(columns="Attrition_Yes")

In [None]:
# Run RFE once per selector model, keeping 3 features each time.
rfelist = rfefeature(indep_x, dep_y, n=3)

# One accuracy accumulator per downstream classifier; each receives one
# entry per element of rfelist.
acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand = (
    [] for _ in range(7)
)

LogisticRegression()
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
SVC(kernel='linear', random_state=0)


In [None]:
def knn(x_train, y_train, x_test):
    """Fit a k-nearest-neighbours classifier and evaluate it with cm_prediction.

    Defined in this cell because the evaluation loop calls ``knn`` and no
    earlier cell provides it.  Scikit-learn's default settings are used.
    NOTE(review): confirm the intended hyper-parameters.

    Returns the 6-tuple produced by ``cm_prediction``, in the same order.
    """
    classifier = KNeighborsClassifier()
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test)


# Start from empty accumulators so re-running this cell never appends a
# second batch of accuracies behind the first.
acclog, accsvml, accsvmnl, accknn, accnaive, accdeci, accrand = (
    [], [], [], [], [], [], []
)

# (trainer, accumulator) pairs, in the order the classifiers are evaluated.
evaluations = [
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_nl, accsvmnl),
    (naive, accnaive),
    (knn, accknn),
    (decision, accdeci),
    (random, accrand),
]

for i in rfelist:
    x_train, x_test, y_train, y_test = scaler_split(i, dep_y)
    # cm_prediction looks up the true labels under the notebook-level name
    # Y_test when none are passed to it, so publish the current split there.
    Y_test = y_test

    for trainer, accuracies in evaluations:
        # Only the accuracy (second element) is collected here.
        classifier, accuracy, report, x_test, y_test, cm = trainer(x_train, y_train, x_test)
        accuracies.append(accuracy)

# Keyword arguments keep every list attached to its own column.
result = rfe_classification(
    acclog=acclog,
    accsvml=accsvml,
    accsvmnl=accsvmnl,
    accknn=accknn,
    accnaive=accnaive,
    accdeci=accdeci,
    accrand=accrand,
)

In [None]:
# Accuracy table for K = 3 (presumably K is the number of features RFE was
# asked to keep, i.e. the 3 passed to rfefeature above).
result