In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, multilabel_confusion_matrix,precision_recall_fscore_support,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer

def data_preprocess(data):
    x = data.drop(['quality'], axis=1) #data without quality column
    y = data['quality']                #quality column
    #split data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15) #0.15 works better than 0.25
    return x_train, x_test, y_train, y_test

#creation of the classifier
def classifier_creation(data):
    x_train, x_test, y_train, y_test=data_preprocess(data)
    clf = SVC(kernel = 'rbf', decision_function_shape='ovo') #creation of SVM, rbf is faster but less accurate than 'linear'
    clf.fit(x_train,y_train)
    prediction = clf.predict(x_test)
    return x_train, x_test, y_train, y_test, prediction

#metrics for f1_score, precission score and recall score
def metrics(data):
    x_train, x_test, y_train, y_test, prediction = classifier_creation(data)
    f1 = f1_score(y_test, prediction,average='weighted', labels=np.unique(prediction))
    precision = precision_score(y_test, prediction,average='weighted')
    recall = recall_score(y_test, prediction,average='weighted')
    print("Precision Score : ",precision)
    print("Recall Score : ",recall)
    print("f1_Score : ",f1)
 
#removing 33% of pH values
def ph_remove(data):
    phNewValues = data['pH'].values
    rowsToBeNan = np.random.randint(0,1599,528)
    for i in rowsToBeNan:
        phNewValues[i]=np.nan
    data.update(phNewValues)
    return data

#choosing a method to handle the missing values
def choose_method(method_choice,data):
    if method_choice=='1': #First method, delete pH column
        data=data.drop('pH',axis=1) #deletes pH column
        metrics(data)
        return data
        
    if method_choice=='2': #Second method, replace all NaN values with average value of pH
        average_ph_value = data['pH'].mean() #keeps the mean of pH column
        data['pH'].values[np.isnan(data['pH'].values)]= average_ph_value #replace NaN values with mean
        metrics(data)
        return data
        
    if method_choice=='3': #Third method, use Logistic Regression
        #prepei na ftiaxtei to x_test y_test
        'TODO'
        LogReg = LogisticRegression()
        LogReg.fit(x_test,y_test)
        predict = LogReg.predict(x_train)
        data['pH'].values[np.isnan(data['pH'].values)]= predict
        metrics(data)
        return data
        
    if method_choice=='4': #Fourth method, use K-Means 
        data = pd.read_csv('winequality-red.csv')
        columns = data.columns #saves the names of the columns to use it later
        kmeans=KMeans(n_clusters=4).fit(data) #kmeans model

        raw_data = ph_remove(data) #removes the 33% of ph values
        is_nan_values = np.isnan(raw_data) #keeps NaN values
        data_mean = np.nanmean(raw_data) #keeps the mean of values, ignoring NaN
        cleared_data = np.where(is_nan_values, data_mean, raw_data) #replace NaN values with mean

        #set new labels and centroids according to new data
        labels = kmeans.fit_predict(cleared_data)
        centroids = kmeans.cluster_centers_
        cleared_data[is_nan_values] = centroids[labels][is_nan_values]

        cleared_data = pd.DataFrame(cleared_data, columns = columns)            
        metrics(cleared_data)
        return cleared_data
        
def main():
    data = pd.read_csv('winequality-red.csv')
    data = ph_remove(data)
    method_choice= input('Choose method:\n1. Delete column\n2. Fill with the average data value of the column\n3. Use Logistic Regression\n4. Use K-Means to fill missing values')
    data = choose_method(method_choice,data)
    
if __name__ == "__main__":
    main()

Choose method:
1. Delete column
2. Fill with the average data value of the column
3. Use Logistic Regression
4. Use K-Means to fill missing values 4


Precision Score :  0.45062207907035495
Recall Score :  0.4666666666666667
f1_Score :  0.4832100330264944


  _warn_prf(average, modifier, msg_start, len(result))
