In [365]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [366]:
# Load the raw wine-quality CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/raw/winequalityN.csv')


## Pipeline

In this part, I use a scikit-learn `Pipeline` to automate the machine-learning workflow and reproduce the operations performed in `Final Project.ipynb`.

### Predefined functions to preprocess the data

In [374]:
def cutBins(df):
    """
    function: replace the numeric 'quality' column with a three-level
              'quality_range' column (0 = low, 1 = medium, 2 = high),
              using the bin edges 0, 4, 7 and 10
    param: DataFrame containing a 'quality' column (left unmodified)
    return: new DataFrame with 'quality_range' added and 'quality' removed
    """
    edges = [0, 4, 7, 10]
    codes = [0, 1, 2]  # 'low'=0, 'medium'=1, 'high'=2

    # Bin the scores first, then attach the result and drop the raw column
    # in one chained expression that yields a fresh frame.
    quality_range = pd.cut(x=df['quality'], bins=edges, labels=codes)
    return df.assign(quality_range=quality_range).drop(columns='quality')

def mapToNumber(df):
    """
    function: map the wine type from white to 0 and red to 1
    param: DataFrame with a 'type' column (left unmodified)
    return: new DataFrame after mapping the type

    Any 'type' value other than 'white' or 'red' (including missing values)
    has no entry in the mapping and therefore becomes NaN.
    """
    # Work on a copy so the caller's frame is not mutated, as the contract
    # ("new DataFrame") promises.
    df_mapped = df.copy()
    # Bracket access avoids attribute-style column assignment, which is fragile
    # when a column name collides with a DataFrame attribute.
    df_mapped['type'] = df_mapped['type'].map({'white': 0, 'red': 1})
    return df_mapped

def fillNullValues(df):
    """
    function: fill the null values of every column that contains nulls
              with that column's mean
    param: DataFrame (left unmodified)
    return: new DataFrame after filling the null values
    """
    df_mean = df.copy()
    # Columns holding at least one null value.
    null_columns = df_mean.columns[df_mean.isnull().any()]
    for col in null_columns:
        # Assign the filled column back explicitly: an in-place fillna on a
        # `.loc[:, col]` selection operates on a temporary object and is not
        # guaranteed to write through to the frame (it never does under
        # pandas Copy-on-Write).
        df_mean[col] = df_mean[col].fillna(df_mean[col].mean())
    return df_mean
    
def null_cell(df):
    """
    function: summarise the missing values of each column
    param: DataFrame
    return: DataFrame indexed by column name with 'total_null' (count of nulls)
            and 'total_null_perc' (nulls divided by number of rows), sorted by
            'total_null' in descending order and restricted to columns that
            have at least one null
    """
    is_missing = df.isnull()
    null_counts = is_missing.sum()
    # count() on the boolean mask gives the number of rows per column.
    null_fractions = null_counts / is_missing.count()

    summary = pd.concat(
        [null_counts, null_fractions],
        axis=1,
        keys=['total_null', 'total_null_perc'],
    )
    ranked = summary.sort_values('total_null', ascending=False)
    has_nulls = ranked['total_null'] > 0
    return ranked[has_nulls]

In [375]:
# NOTE(review): `y_train` is first assigned by the train_test_split cell further
# down in this notebook, so on a fresh kernel (Restart & Run All) this cell would
# raise NameError; the output shown comes from an out-of-order run.
y_train

4152    6
931     5
4252    6
1743    6
1238    7
       ..
905     4
5192    6
3980    5
235     6
5157    7
Name: quality, Length: 5197, dtype: int64

In [376]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression

In [377]:

# Select the model inputs and the target.
# NOTE(review): 'quality' is selected into X while also being the target `y`;
# cutBins() below converts it to 'quality_range', so X keeps a feature derived
# from the target. This looks like target leakage — confirm the intended target
# (raw quality vs. quality_range) and drop the derived column from X.
X = df[['type','quality','alcohol', 'density', 'volatile acidity', 'chlorides',
       'citric acid', 'fixed acidity', 'free sulfur dioxide',
       'total sulfur dioxide', 'sulphates', 'residual sugar', 'pH']] 
y = df.quality

# Preprocess the features: mean-fill nulls, bin quality into quality_range,
# then encode the wine type as 0/1.
X=fillNullValues(X)
X=cutBins(X)
X=mapToNumber(X)

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, random_state = 1)

In [378]:
# StandardScaler is not imported by any of the import cells visible above, so
# bring it into scope here; otherwise this cell raises NameError on a fresh kernel.
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: standardize the features, then fit a logistic-regression
# classifier (step names 'scaler' and 'LR').
logistic_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('LR', LogisticRegression()),
])

In [379]:
# Fit the pipeline on the training split; the value returned by fit() is bound
# to `lr` and shown as the cell output.
lr = logistic_model.fit(X_train,y_train)
lr

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('LR',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

## Performance Evaluation 

In [380]:
from sklearn import metrics

# Predict labels for the held-out test split and report the accuracy score
# of those predictions against y_test.
y_pred = lr.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.6053846153846154