In [32]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession, Row, Column, SQLContext, DataFrame, DataFrameStatFunctions
from pyspark.sql.functions import split
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("app1").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [50]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, DoubleType

import calendar
from datetime import date,datetime
import math
from pyspark.sql.functions import date_format, lit

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pyarrow.parquet as pq
import scipy.io
import pandas as pd
le = LabelEncoder()

from sklearn.feature_selection import VarianceThreshold
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import time

In [34]:
wines = sqlContext.read.format('csv').options(header='true',delimiter=';',mode='DROPMALFORMED',inferSchema=True).load("winequality-white.csv")

In [35]:
wines.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [36]:
wines = wines.withColumn("quality", wines["quality"].cast(StringType()))

In [37]:
#wines.select(wines["*"]).where(wines["quality"]==10).count()

In [38]:
# 3- 20
# 4- 163
# 5- 1457
# 6- 2198
# 7- 880
# 8- 175
# 9- 5

In [89]:
winesTrain = wines.sample(False,0.75,40)
winesTest = wines.subtract(winesTrain)
print(winesTrain.count())
print(winesTest.count())
print(winesTrain.count()+winesTest.count())
print(wines.count())

3635
859
4494
4898


In [90]:
winesTrainPandas = winesTrain.toPandas()
winesTestPandas = winesTest.toPandas()

In [97]:
winesTrain = winesTrainPandas.values
winesTest = winesTestPandas.values

XwinesTrain = winesTrain[:,0:11]
YwinesTrain = winesTrain[:,11]
XwinesTest = winesTest[:,0:11]
YwinesTest = winesTest[:,11]

scaler = StandardScaler().fit(XwinesTrain)
XwinesTrain = scaler.transform(XwinesTrain)
XwinesTest = scaler.transform(XwinesTest)



In [105]:
XtrainSMOTE, YtrainSMOTE = SMOTE(k=1).fit_sample(XwinesTrain, YwinesTrain)
XtrainADASYN, YtrainADASYN = ADASYN(k=1).fit_sample(XwinesTrain, YwinesTrain)
XtrainROS, YtrainROS = RandomOverSampler().fit_sample(XwinesTrain,YwinesTrain)
print(XtrainSMOTE.shape,XtrainADASYN.shape,XtrainROS.shape)



(11382, 11) (11271, 11) (11382, 11)


In [104]:
bayes = GaussianNB()
tree = DecisionTreeClassifier()
neigh = KNeighborsClassifier(n_neighbors=5)
classifiers=np.array([bayes,tree,neigh])
for i in classifiers:
    start=time.clock()
    predicted=i.fit(XtrainSMOTE,YtrainSMOTE).predict(XwinesTest)
    end=time.clock()
    duration=end-start
    
    start=time.clock()
    predicted2=i.fit(XtrainADASYN,YtrainADASYN).predict(XwinesTest)
    end=time.clock()
    duration2=end-start
    
    start=time.clock()
    predicted3=i.fit(XtrainROS,YtrainROS).predict(XwinesTest)
    end=time.clock()
    duration3=end-start
    
    start=time.clock()
    predicted4=i.fit(XwinesTrain,YwinesTrain).predict(XwinesTest)
    end=time.clock()
    duration4=end-start
    
    print("Klasyfikator", str(i)[:5])
    print("Trafnosc: ", accuracy_score(YwinesTest,predicted), accuracy_score(YwinesTest,predicted2), accuracy_score(YwinesTest,predicted3), accuracy_score(YwinesTest,predicted4))
    print("Precyzja: ", precision_score(YwinesTest,predicted, average='macro'), precision_score(YwinesTest,predicted2, average='macro'), precision_score(YwinesTest,predicted3, average='macro'), precision_score(YwinesTest,predicted4, average='macro'))
    print("Czulosc: ", recall_score(YwinesTest,predicted, average='macro'), recall_score(YwinesTest,predicted2, average='macro'), recall_score(YwinesTest,predicted3, average='macro'), recall_score(YwinesTest,predicted4, average='macro')) 
    #print("AUC: ",roc_auc_score(YwinesTest,predicted, average='micro'),roc_auc_score(YwinesTest,predicted2, average='micro'),roc_auc_score(YwinesTest,predicted3, average='micro'),roc_auc_score(YwinesTest,predicted4, average='micro'))
    print("Czas: ", duration,duration2,duration3,duration4)

Klasyfikator Gauss
Trafnosc:  0.28754365541327126 0.2828870779976717 0.320139697322468 0.4528521536670547
Precyzja:  0.23398430243722487 0.23339979088463622 0.25718590446448675 0.3562631263067968
Czulosc:  0.35498702489289363 0.34294670470691774 0.3728133263979516 0.38619779109504143
Czas:  0.02597036882843895 0.024473051994846173 0.02782324152030924 0.010020986328981962


  'recall', 'true', average, warn_for)


Klasyfikator Decis
Trafnosc:  0.43538998835855647 0.43538998835855647 0.4342258440046566 0.440046565774156
Precyzja:  0.24797235605710716 0.24668086670756442 0.23942810946347542 0.2349523627276055
Czulosc:  0.2628375828425371 0.259592454452909 0.23988777944457815 0.2324484540155535
Czas:  0.2420971310339155 0.2908884971138832 0.13081212940414844 0.054282855210203707


  'recall', 'true', average, warn_for)


Klasyfikator KNeig
Trafnosc:  0.4342258440046566 0.42258440046565776 0.4481955762514552 0.5110593713620489
Precyzja:  0.2537197956401608 0.2551362475565369 0.26177221710391646 0.3211261020504938
Czulosc:  0.288222142272263 0.2959663745897051 0.2913502190822959 0.28414044102585273
Czas:  0.25444373712525703 0.27886445129752246 0.28469556738446045 0.13335677157556347


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
