# Citric Acid Draft 7

# Libraries

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
import datetime
import sqlite3
import imblearn


from IPython.display import display
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import MinMaxScaler

# Load Raw Data

In [32]:
# Location of the raw red-wine quality CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
REDWINE_CSV_PATH = "C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\RedWine\\DataSets\\winequality-red.csv"

# index_col=False: do not use the first CSV column as the index.
redwine = pd.read_csv(REDWINE_CSV_PATH, index_col=False)

# Two Groups: Binarize Quality (7–8 → 1, 3–6 → 0)

In [33]:
# Collapse the quality score into a binary label in a single pass:
# scores 7 and 8 become 1, scores 3 through 6 become 0.
# Values that are not keys of the mapping (including 0 and 1) are left as-is,
# so re-running this cell does not change the column again.
quality_to_label = {3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1}
redwine["quality"] = redwine["quality"].replace(quality_to_label)

# Data Normalization

In [34]:
# Rescale every column of `redwine` to the [0, 1] range.
# Column labels and the row index are taken from `redwine` itself rather than
# from a hand-maintained list, so they cannot drift from what was loaded.
# NOTE(review): `feature_columns` below reads from `redwine`, not from
# `scaled_redwine`; confirm whether the model is meant to use the scaled data.
scaler = MinMaxScaler()
scaled_redwine = pd.DataFrame(
    scaler.fit_transform(redwine),
    columns=redwine.columns,
    index=redwine.index,
)
scaled_redwine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846,0.0
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385,0.0
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385,0.0
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385,0.0
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846,0.0


# Functions

### Feature Columns

In [35]:
def feature_columns(feature_cols):
    """Select the model inputs and the target from the global `redwine` frame.

    Parameters
    ----------
    feature_cols : str or list of str
        Column name(s) of `redwine` to use as features. A single string is
        treated as a one-element list so that `X` is always a DataFrame.

    Returns
    -------
    X : pandas.DataFrame
        `redwine` restricted to `feature_cols`.
    y : pandas.Series
        The `quality` column of `redwine`.

    Raises
    ------
    KeyError
        If a requested column is not present in `redwine`.
    """
    # The argument is honoured instead of being overwritten by a hard-coded
    # ["citric acid"] list, which made the parameter meaningless.
    if isinstance(feature_cols, str):
        feature_cols = [feature_cols]
    selected = list(feature_cols)

    X = redwine[selected]
    y = redwine["quality"]

    return X, y

### Random Over Sampler

In [36]:
def random_oversampler(X, y, random_state=0):
    """Balance the classes by randomly duplicating under-represented samples.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Class labels aligned with `X`.
    random_state : int or None, default 0
        Seed passed to `RandomOverSampler` so the resampled data is the same
        on every run. Pass `None` for a different random draw each time.

    Returns
    -------
    X_resampled, y_resampled
        The oversampled features and labels returned by `fit_resample`.
    """
    # Only the "auto" sampler was ever used: the earlier "minority" sampler was
    # immediately overwritten and the Pipeline was built but never fitted, so
    # both have been removed.
    oversample = RandomOverSampler(
        sampling_strategy="auto", random_state=random_state
    )
    X_resampled, y_resampled = oversample.fit_resample(X, y)

    return X_resampled, y_resampled

### Logistic Regression Model

In [37]:
def logistic_regression_model(X, y):
    """Fit a liblinear logistic regression on a 60/40 train/test split.

    The split is shuffled with a fixed seed (`random_state=0`) and holds out
    40% of the rows for testing.

    Returns
    -------
    tuple
        `(log_model, X_test, y_test, y_predict)`: the fitted model, the
        held-out features and labels, and the model's predictions on the
        held-out features.
    """
    split = train_test_split(X, y, shuffle=True, test_size=0.4, random_state=0)
    X_train, X_test, y_train, y_test = split

    # `fit` returns the estimator itself, so construction and fitting chain.
    log_model = LogisticRegression(solver="liblinear", random_state=None).fit(
        X_train, y_train
    )

    return log_model, X_test, y_test, log_model.predict(X_test)

### Confusion Matrix

In [38]:
def print_confusion_matrix(y_test, y_predict):
    """Print the confusion matrix of true labels against predicted labels."""
    matrix = confusion_matrix(y_test, y_predict)
    print(matrix)

### Classification Report

In [39]:
def print_classification_report(y_test, y_predict):
    """Print scikit-learn's text classification report for the predictions."""
    report = classification_report(y_test, y_predict)
    print(report)

### Receiver Operating Characteristic (ROC) Curve

In [40]:
def display_roc_curve(log_model, X_test, y_test, y_predict=None):
    """Plot the ROC curve of `log_model` on the held-out data and save it.

    Parameters
    ----------
    log_model : fitted classifier exposing `predict_proba`
    X_test : held-out features
    y_test : held-out true labels
    y_predict : ignored
        Kept (now optional) so existing four-argument calls keep working.
        Hard 0/1 predictions are not used for the AUC; see below.

    Side effects
    ------------
    Writes the figure to 'Log_ROC' via `plt.savefig` and shows it.
    """
    # Score with the predicted probability of the positive class (column 1).
    # The AUC is computed from the same scores that draw the curve, so the
    # number in the legend really is the area under the plotted line. Using
    # hard predictions instead collapses the curve to a single threshold.
    positive_scores = log_model.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(y_test, positive_scores)
    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    # Diagonal reference line: a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')

    plt.show()

# Logistic Regression

In [41]:
# Single-feature model: predict the binary quality label from citric acid.
feature_cols = ["citric acid"]

# NOTE(review): the classes are oversampled *before* logistic_regression_model
# performs its train/test split, so duplicated rows can land on both sides of
# the split. Confirm whether that is intended before trusting the test metrics.
X, y = random_oversampler(*feature_columns(feature_cols))
log_model, X_test, y_test, y_predict = logistic_regression_model(X, y)

### Reports

In [42]:
# Confusion matrix for the held-out split produced by logistic_regression_model.
print_confusion_matrix(y_test, y_predict)

[[344 222]
 [119 421]]


In [43]:
# Classification report for the same held-out labels and predictions.
print_classification_report(y_test, y_predict)

              precision    recall  f1-score   support

           0       0.74      0.61      0.67       566
           1       0.65      0.78      0.71       540

    accuracy                           0.69      1106
   macro avg       0.70      0.69      0.69      1106
weighted avg       0.70      0.69      0.69      1106



In [44]:
# Needs log_model, X_test, y_test and y_predict from the "Logistic Regression"
# cell. NOTE(review): the output recorded below is a NameError for X_test, so
# it looks stale; re-run the notebook top to bottom from a fresh kernel.
display_roc_curve(log_model, X_test, y_test, y_predict)

NameError: name 'X_test' is not defined