In [2]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession, Row, Column, SQLContext, DataFrame, DataFrameStatFunctions
from pyspark.sql.functions import split
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("app1").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, DoubleType

import calendar
from datetime import date,datetime
import math
from pyspark.sql.functions import date_format, lit

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pyarrow.parquet as pq
import scipy.io
import pandas as pd
le = LabelEncoder()

from sklearn.feature_selection import VarianceThreshold
from skfeature.function.similarity_based import fisher_score
from skfeature.utility import construct_W

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import time

In [4]:
def two_class_feature_selection(variance, Xtrain, y, Xtest):   
    score = fisher_score.fisher_score(Xtrain, y)
    idx = fisher_score.feature_ranking(score)
    features_number = len([i for i in range(len(idx)) if idx[i]==0])
    Xfisher = Xtrain[:, idx[0:features_number]]
    XtestFisher = Xtest[:, idx[0:features_number]]
    #print(Xfisher.shape)
    
    sel = VarianceThreshold(threshold=variance)
    Xvariance = sel.fit_transform(Xtrain)  
    #print(Xvariance.shape)
    XtestVariance = sel.transform(Xtest)
    #print(XtestVariance.shape)
    
    return Xvariance,Xfisher,XtestVariance,XtestFisher

In [5]:
wines = sqlContext.read.format('csv').options(header='true',delimiter=',',mode='DROPMALFORMED',inferSchema=True).load("cardio.csv")

In [6]:
wines.printSchema()

root
 |-- V1: integer (nullable = true)
 |-- V2: integer (nullable = true)
 |-- V3: integer (nullable = true)
 |-- V4: integer (nullable = true)
 |-- V5: integer (nullable = true)
 |-- V6: integer (nullable = true)
 |-- V7: integer (nullable = true)
 |-- V8: integer (nullable = true)
 |-- V9: integer (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: integer (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: integer (nullable = true)
 |-- V14: integer (nullable = true)
 |-- V15: integer (nullable = true)
 |-- V16: integer (nullable = true)
 |-- V17: integer (nullable = true)
 |-- V18: integer (nullable = true)
 |-- V19: integer (nullable = true)
 |-- V20: integer (nullable = true)
 |-- V21: integer (nullable = true)
 |-- V22: integer (nullable = true)
 |-- V23: integer (nullable = true)
 |-- V24: integer (nullable = true)
 |-- V25: integer (nullable = true)
 |-- V26: integer (nullable = true)
 |-- V27: integer (nullable = true)
 |-- V28: integer (nullable = true

In [41]:
wines = wines.withColumn("_c8", wines["_c8"].cast(StringType()))

In [56]:
wines.count()

731

In [7]:
winesTrain = wines.sample(False,0.5)
winesTest = wines.subtract(winesTrain)
print(winesTrain.count())
print(winesTest.count())
print(winesTrain.count()+winesTest.count())
print(wines.count())

1046
1071
2117
2126


In [78]:
print(YwinesTrain)

[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.

In [8]:
winesTrainPandas = winesTrain.toPandas()
winesTestPandas = winesTest.toPandas()

In [9]:
winesTrain = winesTrainPandas.values
winesTest = winesTestPandas.values

XwinesTrain = winesTrain[:,0:35]
YwinesTrain = winesTrain[:,35]
XwinesTest = winesTest[:,0:35]
Ytest = winesTest[:,35]

scaler = StandardScaler().fit(XwinesTrain)
XwinesTrain = scaler.transform(XwinesTrain)
XwinesTest = scaler.transform(XwinesTest)

In [12]:
#print(XtrainSMOTE.shape,XtrainADASYN.shape,XtrainROS.shape)

In [15]:
results = pd.DataFrame(columns=['classifier','resampler','selection_algorithm','accuracy','precision','recall','duration'])

In [11]:
smote = SMOTE(k=3)
adasyn = ADASYN(k=3)
ros = RandomOverSampler()
resamplers = np.array([smote, adasyn])

In [16]:
#KNeighborsTest
weights = ['uniform','distance']
algorithms = ['ball_tree','kd_tree','brute']

for i in resamplers:
    Xtrain, ytrain = i.fit_sample(XwinesTrain,YwinesTrain)
    #print(Xtrain.shape)
    XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), Xtrain,ytrain,XwinesTest)
    #print("After selection")
    for j in range(len(algorithms)):
        for k in range(len(weights)):
            for l in (range(4)):
                classifier=KNeighborsClassifier(n_neighbors=((2*l)+1),weights=weights[k],algorithm=algorithms[j])
                
                start=time.clock()
                predicted=classifier.fit(XtrainVariance,ytrain).predict(XtestVariance)
                end=time.clock()
                duration=end-start
                
                start=time.clock()
                predicted2=classifier.fit(XtrainFisher,ytrain).predict(XtestFisher)
                end=time.clock()
                duration2=end-start
                
                #print(str(classifier)[:5])
                results.loc[len(results)]=[str(classifier),str(i),"variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted, average='macro'), recall_score(Ytest,predicted, average='macro'), duration]
                results.loc[len(results)]=[str(classifier),str(i),"fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2, average='macro'), recall_score(Ytest,predicted2, average='macro'), duration2]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [17]:
results.to_csv("cardioKNeighborsK1.csv",sep=";")
results.drop(results.index, inplace=True)

In [18]:
#BayesTest
multinomial = MultinomialNB()
gauss = GaussianNB()
bernoulli = BernoulliNB()
classifiers = np.array([gauss, bernoulli])

for i in resamplers:
    Xtrain, ytrain = i.fit_sample(XwinesTrain,YwinesTrain)
    #print(Xtrain.shape)
    XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), Xtrain,ytrain,XwinesTest)
    #print("After selection")
    for j in classifiers:
   
        start=time.clock()
        predicted=j.fit(XtrainVariance,ytrain).predict(XtestVariance)
        end=time.clock()
        duration=end-start
    
        start=time.clock()
        predicted2=j.fit(XtrainFisher,ytrain).predict(XtestFisher)
        end=time.clock()
        duration2=end-start
        
        #print(str(j)[:5])
        results.loc[len(results)]=[str(j),str(i),"variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted, average="macro"), recall_score(Ytest,predicted, average="macro"), duration]
        results.loc[len(results)]=[str(j),str(i),"fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2,average='macro'), recall_score(Ytest,predicted2, average="macro"), duration2]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [19]:
results.to_csv("cardioBayesK1.csv",sep=";")
results.drop(results.index, inplace=True)

In [21]:
#DecisionTreeTest
criterion = np.array(['gini','entropy'])
splitters = np.array(['random','best'])
max_features = np.array(['auto','sqrt','log2', None])

for i in resamplers:
    Xtrain, ytrain = i.fit_sample(XwinesTrain,YwinesTrain)
    #print(Xtrain.shape)
    XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), Xtrain,ytrain,XwinesTest)
    #print("After selection")
    for j in criterion:
        for k in splitters:
            for l in max_features:
                classifier=DecisionTreeClassifier(criterion=j,splitter=k,max_features=l)
                
                start=time.clock()
                predicted=classifier.fit(XtrainVariance,ytrain).predict(XtestVariance)
                end=time.clock()
                duration=end-start
                
                start=time.clock()
                predicted2=classifier.fit(XtrainFisher,ytrain).predict(XtestFisher)
                end=time.clock()
                duration2=end-start
                
                #print(str(classifier)[:5])
                results.loc[len(results)]=[str(classifier),str(i),"variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted,average="macro"), recall_score(Ytest,predicted,average="macro"), duration]
                results.loc[len(results)]=[str(classifier),str(i),"fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2,average="macro"), recall_score(Ytest,predicted2,average="macro"), duration2]



TypeError: precision_score() got an unexpected keyword argument 'averafe'

In [152]:
results.to_csv("cardioDTK1.csv",sep=";")
results.drop(results.index, inplace=True)

In [79]:
Xtrain.shape

(670, 7)

In [133]:
#KNeighborsTest - no resampling
weights = ['uniform','distance']
algorithms = ['ball_tree','kd_tree','brute']

#print(Xtrain.shape)
XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), XwinesTrain,YwinesTrain,XwinesTest)
#print("After selection")
for j in range(len(algorithms)):
    for k in range(len(weights)):
        for l in (range(4)):
            classifier=KNeighborsClassifier(n_neighbors=((2*l)+1),weights=weights[k],algorithm=algorithms[j])
            
            start=time.clock()
            predicted=classifier.fit(XtrainVariance,YwinesTrain).predict(XtestVariance)
            end=time.clock()
            duration=end-start
            
            start=time.clock()
            predicted2=classifier.fit(XtrainFisher,YwinesTrain).predict(XtestFisher)
            end=time.clock()
            duration2=end-start
            
            #print(str(classifier)[:5])
            results.loc[len(results)]=[str(classifier),"none","variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted), recall_score(Ytest,predicted), roc_auc_score(Ytest,predicted), duration]
            results.loc[len(results)]=[str(classifier),"none","fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2), recall_score(Ytest,predicted2), roc_auc_score(Ytest,predicted), duration2]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [134]:
results.to_csv("resultsKNeighborsNoResampling.csv",sep=";")
results.drop(results.index, inplace=True)

In [135]:
#BayesTest - no resampling
gauss = GaussianNB()
bernoulli = BernoulliNB()
classifiers = np.array([gauss, bernoulli])

XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), XwinesTrain,YwinesTrain,XwinesTest)
#print("After selection")
for j in classifiers:

    start=time.clock()
    predicted=j.fit(XtrainVariance,YwinesTrain).predict(XtestVariance)
    end=time.clock()
    duration=end-start

    start=time.clock()
    predicted2=j.fit(XtrainFisher,YwinesTrain).predict(XtestFisher)
    end=time.clock()
    duration2=end-start
    
    #print(str(j)[:5])
    results.loc[len(results)]=[str(j),str(i),"variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted), recall_score(Ytest,predicted), roc_auc_score(Ytest,predicted), duration]
    results.loc[len(results)]=[str(j),str(i),"fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2,average='macro'), recall_score(Ytest,predicted2), roc_auc_score(Ytest,predicted), duration2]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [136]:
results.to_csv("resultsBayesNoResampling.csv",sep=";")
results.drop(results.index, inplace=True)

In [137]:
#DecisionTreeTest - no resampling
criterion = np.array(['gini','entropy'])
splitters = np.array(['random','best'])
max_features = np.array(['auto','sqrt','log2', None])
XtrainVariance,XtrainFisher,XtestVariance,XtestFisher = two_class_feature_selection((0.8*(1-0.8)), XwinesTrain,YwinesTrain,XwinesTest)
#print("After selection")
for j in criterion:
    for k in splitters:
        for l in max_features:
            classifier=DecisionTreeClassifier(criterion=j,splitter=k,max_features=l)
            
            start=time.clock()
            predicted=classifier.fit(XtrainVariance,YwinesTrain).predict(XtestVariance)
            end=time.clock()
            duration=end-start
            
            start=time.clock()
            predicted2=classifier.fit(XtrainFisher,YwinesTrain).predict(XtestFisher)
            end=time.clock()
            duration2=end-start
            
            #print(str(classifier)[:5])
            results.loc[len(results)]=[str(classifier),"none","variance",accuracy_score(Ytest,predicted),precision_score(Ytest,predicted), recall_score(Ytest,predicted), roc_auc_score(Ytest,predicted), duration]
            results.loc[len(results)]=[str(classifier),"none","fisher",accuracy_score(Ytest,predicted2),precision_score(Ytest,predicted2), recall_score(Ytest,predicted2), roc_auc_score(Ytest,predicted), duration2]

In [138]:
results.to_csv("resultsDTNoResampling.csv",sep=";")
results.drop(results.index, inplace=True)