In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest,f_regression

from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import pickle
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
def selectkbest(indep_X,dep_Y,n):
        test = SelectKBest(score_func=chi2, k=n)
        fit1= test.fit(indep_X,dep_Y)
        selectk_features = fit1.transform(indep_X)
        return selectk_features

In [3]:
def split_scalar(indep_x,dep_y):
    X_train,X_test,y_train,y_test=train_test_split(indep_x,dep_y,test_size=0.25,random_state=0)
    sc=StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,y_train,y_test

In [4]:
def cm_prediction(classifier,X_test):
     y_pred = classifier.predict(X_test)
        
        # Making the Confusion Matrix
     from sklearn.metrics import confusion_matrix
     cm = confusion_matrix(y_test, y_pred)
        
     from sklearn.metrics import accuracy_score 
     from sklearn.metrics import classification_report 
        #from sklearn.metrics import confusion_matrix
        #cm = confusion_matrix(y_test, y_pred)
        
     Accuracy=accuracy_score(y_test, y_pred )
        
     report=classification_report(y_test, y_pred)
     return  classifier,Accuracy,report,X_test,y_test,cm

In [5]:
def logistic(X_train,y_train,X_test,y_test):       
       
        param_grid = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'saga'],
             'penalty':['l2']} 
        grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted') 
        
        #classifier = LogisticRegression(random_state = 0)
        #classifier.fit(X_train, y_train)
        grid.fit(X_train,y_train)
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm   


In [6]:
def svm_linear(X_train,y_train,X_test,y_test):
                
        param_grid = {'kernel':['linear','rbf','poly','sigmoid'],
             'gamma':['auto','scale'],
             'C':[10,100,1000,2000,3000]} 

        grid = GridSearchCV (SVC(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted') 
        grid.fit(X_train, y_train)
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm

In [7]:
def svm_NL(X_train,y_train,X_test,y_test):
                
        param_grid = {'kernel':['linear','rbf','poly','sigmoid'],
             'gamma':['auto','scale'],
             'C':[10,100,1000,2000,3000]} 

        grid = GridSearchCV (SVC(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted') 
        grid.fit(X_train, y_train)
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm

In [8]:
def knn(X_train,y_train,X_test,y_test):

     # Create an untuned KNN model
     knn = KNeighborsClassifier()
       
     param_grid = {'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],'algorithm': ['auto', 'ball_tree', 'kd_tree']}

     # Create a GridSearchCV object
     grid= GridSearchCV(knn, param_grid, cv=5)
     
     # Fit the grid search object to the data
     grid.fit(X_train, y_train)
     classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
     return  classifier,Accuracy,report,X_test,y_test,cm

In [9]:
def BernoulliNB(X_train,y_train,X_test,y_test):       
        
        from sklearn.naive_bayes import BernoulliNB
        # Create a BernoulliNB classifier
        clf = BernoulliNB()
        # Define the grid of parameters to search over
        param_grid = {
              'alpha': [0.1, 0.5, 1.0]
                }
        # Create a GridSearchCV object
        grid = GridSearchCV(clf, param_grid, cv=5)

        # Fit the grid search object to the data
        grid.fit(X_train, y_train)
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm 
           

In [10]:
def GaussianNB(X_train,y_train,X_test,y_test): 

        from sklearn.naive_bayes import GaussianNB

        # Create an untuned KNN model
        clf = GaussianNB()

        # Define the parameter grid
        param_grid = {
                'var_smoothing': np.logspace(0, -9, num=100)
                }

        # Create a GridSearchCV object
        grid = GridSearchCV(clf, param_grid, cv=5)

        # Fit the grid search object to the data
        grid.fit(X_train, y_train)
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm 

In [11]:
def Decision(X_train,y_train,X_test,y_test):
        
        param_grid = {'criterion':['gini','entropy'],
              'max_features': ['auto','sqrt','log2'],
              'splitter':['best','random']} 

        grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted') 
   
        # fitting the model for grid search 
        grid.fit(X_train, y_train) 
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm      


In [12]:
def random(X_train,y_train,X_test,y_test):

    
        param_grid = {'criterion':['gini','entropy'],
                      'max_features': ['auto','sqrt','log2'],
                      'n_estimators':[10,100]} 

        grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1') 
   
        grid.fit(X_train, y_train) 
        
        classifier,Accuracy,report,X_test,y_test,cm=cm_prediction(grid,X_test)
        return  classifier,Accuracy,report,X_test,y_test,cm

In [13]:
def selectk_Classification(acclog,accsvml,accsvmnl,accknn,accbern,accgaus,accdes,accrf): 
    
    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Logistic','SVMl','SVMnl','KNN','BernoulliNB','GaussianNB','Decision','Random'])
    for number,idex in enumerate(dataframe.index):      
        dataframe['Logistic'][idex]=acclog[number]       
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['KNN'][idex]=accknn[number]
        dataframe['BernoulliNB'][idex]=accbern[number]
        dataframe['GaussianNB'][idex]=accgaus[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe

In [14]:
dataset1=pd.read_csv("autism.csv",index_col=None)

df2=dataset1

In [15]:
df2.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jaundice,austim,Class/ASD
0,1,0,1,1,1,1,0,1,1,1,18.605397,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,13.829369,0,0,0,0
2,1,1,1,1,1,1,0,0,1,1,14.679893,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,61.035288,0,0,0,0
4,0,0,0,0,1,0,0,0,1,1,14.256686,1,0,1,0


In [16]:
indep_X=df2.drop(['Class/ASD'], axis=1)
dep_Y=df2['Class/ASD']

In [21]:
kbest=selectkbest(indep_X,dep_Y,3)   

acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accbern=[]
accgaus=[]
accdes=[]
accrf=[]

In [22]:
X_train, X_test, y_train, y_test=split_scalar(kbest,dep_Y) 


In [23]:
classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train,y_train,X_test,y_test)
acclog.append(Accuracy)

classifier,Accuracy,report,X_test,y_test,cm=svm_linear(X_train,y_train,X_test,y_test)  
accsvml.append(Accuracy)
    
classifier,Accuracy,report,X_test,y_test,cm=svm_NL(X_train,y_train,X_test,y_test)  
accsvmnl.append(Accuracy)
    
classifier,Accuracy,report,X_test,y_test,cm=knn(X_train,y_train,X_test,y_test)  
accknn.append(Accuracy)
    
classifier,Accuracy,report,X_test,y_test,cm=BernoulliNB(X_train,y_train,X_test,y_test)  
accbern.append(Accuracy)

classifier,Accuracy,report,X_test,y_test,cm=GaussianNB(X_train,y_train,X_test,y_test)  
accgaus.append(Accuracy)
    
classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train,y_train,X_test,y_test)  
accdes.append(Accuracy)
    
classifier,Accuracy,report,X_test,y_test,cm=random(X_train,y_train,X_test,y_test)  
accrf.append(Accuracy)
    
result=selectk_Classification(acclog,accsvml,accsvmnl,accknn,accbern,accgaus,accdes,accrf)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [24]:
result # K=3   is the best

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,BernoulliNB,GaussianNB,Decision,Random
ChiSquare,0.86,0.87,0.87,0.86,0.875,0.875,0.86,0.845


In [20]:
result # K=4

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,BernoulliNB,GaussianNB,Decision,Random
ChiSquare,0.865,0.85,0.85,0.855,0.86,0.86,0.85,0.85
