In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
#train_test split
def train_test(Indep, dep):
    x_train,x_test,y_train,y_test=train_test_split(Indep, dep, test_size=0.25, random_state=0)
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

In [3]:
#cm score
def CM(classifier,x_test):
    y_pred=classifier.predict(x_test)
    cm=confusion_matrix(y_test,y_pred)
    cr=classification_report(y_test,y_pred)
    Accuracy=accuracy_score(y_test, y_pred )
    return cm, cr, Accuracy, x_test, y_test

In [4]:
#LogisticRegression Model
def logistic(x_train,y_train,x_test):       
    classifier = LogisticRegression(solver='lbfgs', max_iter=200)
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test  

In [5]:
#RandomForestClassifier Model
def rf(x_train,y_train,x_test):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test

In [6]:
#GaussianNB Model
def gaus(x_train,y_train,x_test):
    classifier=GaussianNB()
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test

In [7]:
#KNeighborsClassifier Model
def KN(x_train,y_train,x_test):
    classifier=KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test

In [8]:
#DecisionTreeClassifier Model
def DT(x_train,y_train,x_test):
    classifier=DecisionTreeClassifier(criterion = 'gini', max_features='sqrt',splitter='best',random_state = 0)
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test

In [9]:
#SVC Model
def SVCM(x_train,y_train,x_test):
    classifier=SVC(kernel = 'linear', random_state = 0)
    classifier.fit(x_train,y_train)
    cm, cr, Accuracy, x_test, y_test=CM(classifier,x_test)
    return cm, cr, Accuracy, x_test, y_test

In [10]:
#feature Selection
def rfe_f(Indep,dep,n):
    rfelist = []
    log=LogisticRegression(solver='lbfgs', max_iter=200)
    rf=RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    dt=DecisionTreeClassifier(criterion = 'gini', max_features='sqrt',splitter='best',random_state = 0)
    sv=SVC(kernel = 'linear', random_state = 0)

    rfemodel=[log,rf,dt,sv]
    
    for i in rfemodel:
        print (i)
        best = RFE(estimator=i, n_features_to_select=n)
        fit1=best.fit(Indep,dep)
        feature=fit1.transform(Indep)
        rfelist.append(feature)
    return rfelist

In [11]:
#Table Creation
def selectk_regression(logm,rfm, guasm,knn, dtm, svcm):
    dataframe = pd.DataFrame(index=['Logistic','SVC','Random','DecisionTree'],columns=['Logistic','RandomForest', 'Guassian','KNN','DecisionTree', 'SVC'])
    for num, idex in enumerate(dataframe.index):
        dataframe['Logistic'][idex]=logm[num]
        dataframe['RandomForest'][idex]=rfm[num]
        dataframe['Guassian'][idex]=guasm[num]
        dataframe['KNN'][idex]=knn[num]
        dataframe['DecisionTree'][idex]=dtm[num]
        dataframe['SVC'][idex]=svcm[num]
    return dataframe

In [12]:
dataset= pd.read_csv("prep.csv")
ds=dataset

In [13]:
ds= pd.get_dummies(ds, drop_first=True)


In [14]:
Indep=ds.drop('classification_yes', axis=1)
dep=ds['classification_yes']


In [26]:
rfelist = rfe_f(Indep,dep,5)

LogisticRegression(max_iter=200)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
SVC(kernel='linear', random_state=0)


In [27]:
logm=[]
rfm=[]
guasm=[]
knn=[]
dtm=[]
svcm=[]

In [28]:

for i in rfelist:
    x_train,x_test, y_train,y_test= train_test(i, dep)

    cm, cr, Accuracy, x_test, y_test= logistic(x_train,y_train,x_test)
    logm.append(Accuracy)

    cm, cr, Accuracy, x_test, y_test= rf(x_train,y_train,x_test)
    rfm.append(Accuracy)

    cm, cr, Accuracy, x_test, y_test= gaus(x_train,y_train,x_test)
    guasm.append(Accuracy)

    cm, cr, Accuracy, x_test, y_test= KN(x_train,y_train,x_test)
    knn.append(Accuracy)

    cm, cr, Accuracy, x_test, y_test= DT(x_train,y_train,x_test)
    dtm.append(Accuracy)

    cm, cr, Accuracy, x_test, y_test= SVCM(x_train,y_train,x_test)
    svcm.append(Accuracy)

result = selectk_regression(logm,rfm, guasm,knn, dtm, svcm)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic'][idex]=logm[num]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or S

In [29]:
#5
result

Unnamed: 0,Logistic,RandomForest,Guassian,KNN,DecisionTree,SVC
Logistic,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.97,0.98,0.91,0.97,0.95,0.97
Random,0.95,0.98,0.85,0.94,0.97,0.98
DecisionTree,0.99,0.99,0.99,0.99,0.99,0.99


In [25]:
#4
result

Unnamed: 0,Logistic,RandomForest,Guassian,KNN,DecisionTree,SVC
Logistic,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.97,0.97,0.87,0.98,0.92,0.97
Random,0.98,0.98,0.81,0.98,0.98,0.98
DecisionTree,0.96,0.96,0.96,0.96,0.96,0.96


In [20]:
#3
result

Unnamed: 0,Logistic,RandomForest,Guassian,KNN,DecisionTree,SVC
Logistic,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.94,0.92,0.9,0.94,0.9,0.94
Random,0.98,0.97,0.79,0.98,0.97,0.98
DecisionTree,0.87,0.87,0.87,0.87,0.87,0.87
