In [1]:
# import needed libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [19]:
# select the k best features using the chi-square test

def selectkbest(indep_x,dep_y,n):
    """Reduce ``indep_x`` to its ``n`` best-scoring features.

    A ``SelectKBest`` selector using ``chi2`` as the score function is fitted
    on ``indep_x`` against ``dep_y``; the same ``indep_x`` is then transformed
    so that only the selected columns remain.

    Parameters
    ----------
    indep_x : feature matrix passed to the selector.
    dep_y : target values the features are scored against.
    n : number of features to keep (forwarded as ``k``).

    Returns
    -------
    The transformed ``indep_x`` as produced by the fitted selector.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    fitted_selector = selector.fit(indep_x, dep_y)
    return fitted_selector.transform(indep_x)

# split into train and test sets, then standardize the features with StandardScaler

def split_scalar(indep_x,dep_y,test_size=0.25,random_state=0):
    """Split the data into train/test partitions and standardize the features.

    Parameters
    ----------
    indep_x : feature matrix to split.
    dep_y : target values, split with the same row assignment as ``indep_x``.
    test_size : forwarded to ``train_test_split``; defaults to ``0.25``, the
        value that was previously hard-coded.
    random_state : forwarded to ``train_test_split``; defaults to ``0``, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where both feature partitions
        have been passed through the same ``StandardScaler``.
    """
    x_train,x_test,y_train,y_test=train_test_split(
        indep_x,dep_y,test_size=test_size,random_state=random_state)
    sc=StandardScaler()
    # The scaler is fitted on the training partition only and then re-used on
    # the test partition, so test-set statistics never enter the scaling.
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

# confusion matrix, accuracy, and classification report from predictions on the test set

def cm_prediction(classifier,x_test,y_true=None):
    """Predict on ``x_test`` and score the predictions against the true labels.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    x_test : feature rows to predict on.
    y_true : true labels for ``x_test``. Optional for backward compatibility:
        when omitted, the notebook-level global ``y_test`` is used, which is
        what this function always read before the parameter existed. Passing
        the labels explicitly avoids depending on that hidden global state.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, x_test, y_true, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.

    Raises
    ------
    NameError
        If ``y_true`` is omitted and no global ``y_test`` has been defined.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_true is None:
        # Legacy two-argument call: fall back to the module-level labels.
        y_true=y_test
    y_pred=classifier.predict(x_test)
    cm=confusion_matrix(y_true,y_pred)
    Accuracy=accuracy_score(y_true,y_pred)
    report=classification_report(y_true,y_pred)
    return classifier,Accuracy,report,x_test,y_true,cm

# classification logistic regression 

def logistic(x_train,y_train,x_test):
    """Train a logistic-regression classifier and evaluate it on ``x_test``.

    A ``LogisticRegression(random_state=0)`` model is fitted on
    ``x_train``/``y_train`` and handed to ``cm_prediction`` for scoring.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, x_test, y_test, cm)``; everything
        except ``x_test`` is taken from ``cm_prediction``'s result.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=0)
    model.fit(x_train, y_train)
    # The fourth element (the echoed x_test) is ignored; the argument itself
    # is returned instead.
    fitted, accuracy, summary, _echoed_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, x_test, labels, matrix

# Support vector machine - linear

def svm_linear(x_train,y_train,x_test):
    """Train a linear-kernel SVM and evaluate it on ``x_test``.

    An ``SVC(kernel='linear', random_state=0)`` model is fitted on
    ``x_train``/``y_train`` and handed to ``cm_prediction`` for scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='linear', random_state=0)
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# Support vector machine - RBF

def svm_NL(x_train,y_train,x_test):
    """Train an RBF-kernel (non-linear) SVM and evaluate it on ``x_test``.

    An ``SVC(kernel='rbf', random_state=0)`` model is fitted on
    ``x_train``/``y_train`` and handed to ``cm_prediction`` for scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='rbf', random_state=0)
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# naive bayes - Gaussian naive bayes

def Navie(x_train,y_train,x_test):
    """Train a Gaussian naive Bayes classifier and evaluate it on ``x_test``.

    A ``GaussianNB()`` model is fitted on ``x_train``/``y_train`` and handed
    to ``cm_prediction`` for scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.naive_bayes import GaussianNB

    model = GaussianNB()
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# K-nearest neighbors

def knn(x_train,y_train,x_test):
    """Train a k-nearest-neighbours classifier and evaluate it on ``x_test``.

    A ``KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)`` model
    is fitted on ``x_train``/``y_train`` and handed to ``cm_prediction`` for
    scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# Decision tree

def Decision(x_train,y_train,x_test):
    """Train a decision-tree classifier and evaluate it on ``x_test``.

    A ``DecisionTreeClassifier(criterion='entropy', random_state=0)`` model is
    fitted on ``x_train``/``y_train`` and handed to ``cm_prediction`` for
    scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# Random Forest classifier

def random(x_train,y_train,x_test):
    """Train a random-forest classifier and evaluate it on ``x_test``.

    A ``RandomForestClassifier(n_estimators=10, criterion='entropy',
    random_state=0)`` model is fitted on ``x_train``/``y_train`` and handed to
    ``cm_prediction`` for scoring.

    Returns
    -------
    tuple
        The six values produced by ``cm_prediction``:
        ``(classifier, Accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(x_train, y_train)
    # Use a dedicated name for the echoed test features rather than
    # overwriting the x_train argument.
    fitted, accuracy, summary, evaluated_x, labels, matrix = cm_prediction(model, x_test)
    return fitted, accuracy, summary, evaluated_x, labels, matrix

# create dataframe for select k 

def selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    """Collect per-model accuracies into a one-row summary table.

    Each argument is a sequence of accuracies for one model; element ``i`` of
    every sequence fills row ``i`` of the table. The table has the single row
    ``"ChiSquare"``, so only element ``0`` of each sequence is read.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf :
        Accuracy sequences for the ``Logistic``, ``SVMl``, ``SVMnl``, ``KNN``,
        ``Navie``, ``Decision`` and ``Random`` columns respectively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``["ChiSquare"]`` with one column per model.

    Raises
    ------
    IndexError
        If any accuracy sequence is empty.
    """
    scores_by_column={
        'Logistic':acclog,
        'SVMl':accsvml,
        'SVMnl':accsvmnl,
        'KNN':accknn,
        'Navie':accnav,
        'Decision':accdes,
        'Random':accrf,
    }
    dataframe=pd.DataFrame(index=["ChiSquare"],columns=list(scores_by_column))
    for number,idex in enumerate(dataframe.index):
        for column,scores in scores_by_column.items():
            # A single .loc write targets the frame itself. The previous
            # chained form, dataframe[column][idex] = ..., assigns into an
            # intermediate Series and is silently lost under pandas
            # Copy-on-Write.
            dataframe.loc[idex,column]=scores[number]
    return dataframe

# Load the preprocessed dataset.

dataset1=pd.read_csv("prep.csv",index_col=None)

df2=dataset1

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each encoded column.

df2=pd.get_dummies(df2,drop_first=True)

# Separate the features from the target column.
# `columns=` replaces the old positional axis argument (`drop(label, 1)`),
# which pandas 2.0 no longer accepts.

indep_x=df2.drop(columns='classification_yes')
dep_y=df2['classification_yes']

# Keep the 5 best features according to the chi-square score.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split, so test rows influence which columns are kept. Consider
# fitting it on the training partition only.

kbest=selectkbest(indep_x,dep_y,5)

# One accuracy list per algorithm; each receives a single chi-square entry.

acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

# Split into train and test partitions and standardize the features.

x_train,x_test,y_train,y_test=split_scalar(kbest,dep_y)

# Fit and score every algorithm on the same split, in the original order.
# The module-level names (classifier, Accuracy, report, x_test, y_test, cm)
# are rebound on each iteration and end up holding the random-forest results.

for model_fn,scores in (
    (logistic,acclog),
    (svm_linear,accsvml),
    (svm_NL,accsvmnl),
    (knn,accknn),
    (Navie,accnav),
    (Decision,accdes),
    (random,accrf),
):
    classifier,Accuracy,report,x_test,y_test,cm=model_fn(x_train,y_train,x_test)
    scores.append(Accuracy)

# Gather all accuracies into the summary dataframe.

result=selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)


result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
ChiSquare,0.94,0.94,0.95,0.89,0.83,0.96,0.95
