# Predicting Wine Quality using Wine Quality Dataset

### Import modules that will be used in process

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import cvxopt
%matplotlib inline

import plotly.express as px

import warnings
import matplotlib.cbook

# warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
# warnings.simplefilter(action='ignore', category=FutureWarning)
# from pandas.core.common import SettingWithCopyWarning
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.filterwarnings('ignore')

from numpy import mean, std,isnan, asarray, polyfit
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier #SGDClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix,recall_score, precision_score,classification_report
from sklearn.datasets import make_classification
# from sklearn.datasets.samples_generator import make_blobs
from sklearn.svm import LinearSVC, SVC
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from scipy.stats import pearsonr

### Import data set (csv)

### We already produced the df_final dataframe in our EDA file, so here we can simply import the saved file.
#### df_final = pd.read_csv('Dataframe_final.csv')
#### df_final.sample(5)

In [2]:
# Load the raw wine data from the CSV file (path is relative to the working directory).
df = pd.read_csv("winequalityN.csv")
# df.sample(5)                  # shows 5 randomly chosen rows
# df.head(5)                    # shows the first 5 rows of the data set

In [4]:
# Save a random 1000-row sample as instance.json, to be used as test input during deployment.
# NOTE(review): in file order this runs on the frame exactly as loaded, before data_correcting();
# the cell number suggests it may have been executed later in the notebook - confirm which is intended.
df.sample(1000).to_json (r'instance.json')

In [3]:
def data_correcting(data_):
    """Encode the wine type as a number and fill missing values, in place.

    The "type" column is recoded as red -> 0 and white -> 1; any other value
    is left as it is. Missing values are then filled with the column means of
    an 80 % random sample of the rows (fixed seed 42), which stands in for the
    training part of the data.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame containing a "type" column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data_``.
    """
    type_codes = {"red": 0, "white": 1}  # "RED": 0, "WHITE": 1
    # Assign the recoded column back to the frame: an in-place replace on the
    # result of ``data_["type"]`` is a chained operation that is not guaranteed
    # to reach the frame itself.
    data_["type"] = data_["type"].map(lambda kind: type_codes.get(kind, kind))

    train_df = data_.sample(frac=0.8, random_state=42)
    # Fill the frame that was passed in (not a module-level global), and only
    # use numeric columns for the means so a stray text column cannot break it.
    data_.fillna(train_df.mean(axis=0, numeric_only=True), inplace=True)

    # To check the class sizes afterwards:
    #   data_.pivot_table(index=['type'], aggfunc='size')

    return data_

# Encode the wine type and fill the missing values of the loaded frame.
df = data_correcting(df)

def oversampling_data(data_):
    """Balance red and white wines by oversampling, then derive model columns.

    The smaller of the two classes ("type" == 0 for red, "type" == 1 for
    white) is topped up with randomly drawn copies of its own rows until both
    classes have the same size, whichever class happens to be the smaller one.
    The copies are drawn with replacement from the rows originally present.
    Afterwards two columns are set on the result:

    * "total_acidity" = fixed acidity + volatile acidity + citric acid
    * "quality" is collapsed to two classes: 3, 4, 5 -> 0 (low) and
      6, 7, 8, 9 -> 1 (high). Any other quality value becomes NaN.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame with "type", "fixed acidity", "volatile acidity", "citric acid"
        and "quality" columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame; duplicated rows keep their original index labels.

    Raises
    ------
    ValueError
        If one class is missing entirely, so there is nothing to draw from.
    """
    is_red = data_["type"] == 0
    is_white = data_["type"] == 1
    red_count = int(is_red.sum())
    white_count = int(is_white.sum())

    if white_count > red_count:
        minority, shortfall = data_.loc[is_red], white_count - red_count
    else:
        minority, shortfall = data_.loc[is_white], red_count - white_count

    if shortfall > 0:
        # One vectorised draw instead of appending a single row per iteration.
        extra_rows = minority.sample(n=shortfall, replace=True)
        data_ = pd.concat([data_, extra_rows])
    else:
        # Already balanced: still work on a copy so the caller's frame is
        # never modified by the column assignments below.
        data_ = data_.copy()

    data_["total_acidity"] = (
        data_["fixed acidity"] + data_["volatile acidity"] + data_["citric acid"]
    )
    quality_mapping = {3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1}
    data_["quality"] = data_["quality"].map(quality_mapping)

    # To check the class sizes afterwards:
    #   data_.pivot_table(index=['type'], aggfunc='size')

    return data_
    

# The EDA shows that the data contains some outliers, so we consider two variants:
# 1. Remove the outliers.
# 2. Replace them with the max/min limit values, so that the rows are kept together with their (possibly good) values for the other features.
# The whole analysis was shown in the initial stage of the project, so here we show only the result of our final decisions.
# 3. Combine our target variable into two classes: low quality (3, 4, 5) and high quality (6, 7, 8, 9).


def first_data(data_):
    """Variant 1: drop rows that are outliers in the sulfur/sugar columns.

    The columns "free sulfur dioxide", "total sulfur dioxide" and
    "residual sugar" are processed one after another. For each of them only
    the rows strictly inside mean +/- 3 standard deviations are kept, and the
    mean and standard deviation are computed on the rows that survived the
    previous columns.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame containing the three columns above.

    Returns
    -------
    pandas.DataFrame
        The filtered rows.
    """
    kept = data_
    for column in ("free sulfur dioxide", "total sulfur dioxide", "residual sugar"):
        center = kept[column].mean()
        spread = kept[column].std()
        lower_limit = center - 3*spread
        upper_limit = center + 3*spread
        inside = (kept[column] > lower_limit) & (kept[column] < upper_limit)
        kept = kept[inside]

    return kept
    
# Variant 1: balanced data with the outlier rows removed.
dataframe_1 = first_data(oversampling_data(df)).copy()

def second_data(data_):
    """Variant 2: cap outliers in the sulfur/sugar columns instead of dropping rows.

    In "free sulfur dioxide", "total sulfur dioxide" and "residual sugar",
    values beyond mean +/- 3 standard deviations are replaced by the nearest
    limit, so every row is kept. The statistics are NumPy population
    statistics (ddof=0) of the column and ignore missing values.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Frame containing the three columns above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_`` with the three columns capped (stored as floats).
    """

    def replace_outliers(arr):
        # Work on a float copy so a fractional limit is never truncated.
        values = np.array(arr, dtype=float)
        # NaN-aware statistics: a single missing value must not turn both
        # limits into NaN and thereby switch the capping off.
        center = np.nanmean(values)
        spread = np.nanstd(values)
        return np.clip(values, center - 3 * spread, center + 3 * spread)

    # Copy first so the caller's frame keeps its original values.
    capped = data_.copy()
    for column in ("free sulfur dioxide", "total sulfur dioxide", "residual sugar"):
        capped[column] = replace_outliers(capped[column])

    return capped

# Variant 2: balanced data with the outliers replaced by the limit values.
dataframe_2 = second_data(oversampling_data(df)).copy()


def lst_of_dataframes(d1, d2):
    """Return [d1, d2], each reduced to the model columns.

    Only the five feature columns and the "quality" target are kept, in the
    order listed below.

    Parameters
    ----------
    d1, d2 : pandas.DataFrame
        Frames that contain at least the six columns below.

    Returns
    -------
    list of pandas.DataFrame
        Two frames, in the order ``[d1, d2]``.
    """
    model_columns = ["total_acidity", "chlorides", "pH", "sulphates", "alcohol", "quality"]

    return [frame[model_columns] for frame in (d1, d2)]
        
def get_dataset(dataframe):
    """Split a frame into standardised features and the "quality" target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a "quality" column; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is the output of ``StandardScaler().fit_transform``
        on the feature columns, ``y`` is the "quality" column.
    """
    target = dataframe["quality"]
    features = dataframe.drop(columns="quality")
    # The scaler is fitted on all rows passed in, i.e. before any train/test
    # split done by the caller.
    scaled_features = StandardScaler().fit_transform(features)

    return scaled_features, target

def checking_better_dataframe(lst):
    """Return the dataframe on which logistic regression is most accurate.

    Each candidate is turned into ``(X, y)`` by ``get_dataset``, split 80/20
    with a fixed seed (42), fitted with a logistic regression, and scored by
    accuracy on the 20 % test part. Any number of candidates is accepted.
    When accuracies tie, the later candidate wins, which matches the
    two-frame rule "the first one only if it is strictly better".

    Parameters
    ----------
    lst : list of pandas.DataFrame
        Candidate frames, e.g. the result of
        ``lst_of_dataframes(dataframe_1, dataframe_2)``.

    Returns
    -------
    pandas.DataFrame
        The element of ``lst`` with the highest test accuracy.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    if not lst:
        raise ValueError("checking_better_dataframe needs at least one dataframe")

    best_frame = None
    best_accuracy = float("-inf")

    for candidate in lst:
        X, y = get_dataset(candidate)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        # Only the settings that matter are pinned. The remaining arguments
        # that used to be spelled out were library defaults, and one of them
        # (multi_class) is deprecated in recent scikit-learn releases.
        log_reg = LogisticRegression(
            C=1.0, fit_intercept=True, max_iter=100, solver='lbfgs', tol=0.0001
        )
        log_reg.fit(X_train, y_train)
        y_hat = log_reg.predict(X_test)
        # scikit-learn's convention is (y_true, y_pred).
        accuracy = metrics.accuracy_score(y_test, y_hat)

        # ">=" lets a later candidate win a tie.
        if accuracy >= best_accuracy:
            best_frame, best_accuracy = candidate, accuracy

    return best_frame
        
# Pick the variant with the better logistic-regression accuracy as the final frame.
# lst_of_dataframes() already reduces both frames to these six columns, so the
# trailing column selection leaves the set of columns unchanged.
df_final = checking_better_dataframe(lst_of_dataframes(dataframe_1, dataframe_2))[["total_acidity", "chlorides", "pH", "sulphates", "alcohol", "quality"]]


In [4]:
# Summary statistics of the final frame (displayed as the cell output).
df_final.describe()

Unnamed: 0,total_acidity,chlorides,pH,sulphates,alcohol,quality
count,9796.0,9796.0,9796.0,9796.0,9796.0,9796.0
mean,8.300812,0.067328,3.250254,0.57675,10.473834,0.600653
std,1.677094,0.045563,0.166826,0.17459,1.141221,0.489789
min,4.13,0.009,2.72,0.22,8.0,0.0
25%,7.2,0.042,3.14,0.46,9.5,0.0
50%,7.89,0.059,3.24,0.55,10.3,1.0
75%,8.96625,0.081,3.36,0.65,11.2,1.0
max,17.045,0.611,4.01,2.0,14.9,1.0


In [5]:
# Show 5 randomly chosen rows of the final frame (no seed, so the rows differ per run).
df_final.sample(5)

Unnamed: 0,total_acidity,chlorides,pH,sulphates,alcohol,quality
6291,8.77,0.078,3.3,0.48,10.2,0
6105,11.17,0.136,3.35,0.94,10.0,0
5150,11.93,0.09,3.17,0.53,10.5,0
5294,7.355,0.122,3.47,0.53,9.9,0
5069,8.59,0.073,3.29,0.61,9.2,1


In [6]:
# Save the final frame for the later notebooks. to_csv is called with its defaults,
# so the row index is written as the first, unnamed column of the file.
df_final.to_csv('Dataframe_final.csv')