# Citric Acid Density Fixed Acidity Draft 2

# Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
import datetime
import sqlite3
import imblearn


from IPython.display import display
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

# Load Raw Data

In [2]:
# Read the red-wine quality CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs unchanged on a machine that has this exact directory layout.
redwine = pd.read_csv(
    filepath_or_buffer="C:\\Users\\Space\\Documents\\py\\Projects\\TuringCollege\\RedWine\\DataSets\\winequality-red.csv",
    index_col=False,  # do not take the index from a CSV column
)

# Two Groups

In [3]:
# Collapse the quality score into two groups: 7 and 8 become 1, 3-6 become 0.
# The 7s are rewritten to 1 by the first call, so the second call must not
# (and does not need to) list 7 again.
redwine["quality"] = redwine["quality"].replace([7, 8], 1)
redwine["quality"] = redwine["quality"].replace([3, 4, 5, 6], 0)

# Data Normalization

In [4]:
# Rescale every column of `redwine` with a default-configured MinMaxScaler and
# wrap the resulting array back into a DataFrame.
# Column labels and index are taken from `redwine` itself rather than from a
# hand-written list, so they cannot drift out of sync with the loaded file.
# NOTE(review): the scaler is fitted on the full data set, i.e. before the
# train/test split done in linear_regression_model, so test rows influence
# the scaling - confirm this is acceptable for the analysis.
scaler = MinMaxScaler()
scaled_redwine_linear = pd.DataFrame(
    scaler.fit_transform(redwine),
    columns=redwine.columns,
    index=redwine.index,
)

In [5]:
# The quality column is not used by the regressions below; remove it.
scaled_redwine_linear = scaled_redwine_linear.drop(columns=["quality"])

## Functions

### Feature Columns

In [10]:
def feature_columns(feature_cols, dependent_variable, data=None):
    """Select predictor columns and a target column from a data frame.

    Args:
        feature_cols: List of column labels to use as predictors.
        dependent_variable: Column label of the target variable.
        data: Data frame to select from. When omitted, the module-level
            ``scaled_redwine_linear`` frame is used, so existing two-argument
            calls behave exactly as before.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``data[feature_cols]`` and ``y`` is
        ``data[dependent_variable]``.

    Raises:
        KeyError: If a requested column label is not present in the frame.
    """
    if data is None:
        # Fall back to the notebook-wide scaled frame for backward compatibility.
        data = scaled_redwine_linear

    X = data[feature_cols]
    y = data[dependent_variable]

    return X, y

## Linear Regression

In [11]:
def linear_regression_model(X, y, test_size=0.2, random_state=0):
    """Fit a LinearRegression on a shuffled train split and predict the test split.

    Args:
        X: Predictor data passed to ``train_test_split``.
        y: Target data passed to ``train_test_split``.
        test_size: Share of the data held out for testing. Defaults to 0.2,
            the value previously hard-coded here.
        random_state: Seed for the shuffled split. Defaults to 0, the value
            previously hard-coded here.

    Returns:
        A tuple ``(model, X_test, y_test, y_predict)``: the fitted
        ``LinearRegression``, the held-out predictors and targets, and the
        model's predictions for ``X_test``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    return model, X_test, y_test, y_predict

## Print R Squared

In [12]:
def print_rsquared(y_test, y_predict):
    """Print the R-squared of the predictions, rounded to two decimals.

    Args:
        y_test: True target values.
        y_predict: Predicted target values.
    """
    score = r2_score(y_test, y_predict)
    rounded_score = np.round(score, 2)
    print('R Squared is: ', rounded_score)

# Citric Acid

In [14]:
# Single-predictor regression: alcohol as a function of citric acid.
dependent_variable = "alcohol"
feature_cols = ["citric acid"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.01


# Density

In [15]:
# Single-predictor regression: alcohol as a function of density.
dependent_variable = "alcohol"
feature_cols = ["density"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.26


# Fixed Acidity

In [16]:
# Single-predictor regression: alcohol as a function of fixed acidity.
dependent_variable = "alcohol"
feature_cols = ["fixed acidity"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.0


# Sulphates

In [17]:
# Single-predictor regression: alcohol as a function of sulphates.
dependent_variable = "alcohol"
feature_cols = ["sulphates"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.0


# Chlorides

In [18]:
# Single-predictor regression: alcohol as a function of chlorides.
dependent_variable = "alcohol"
feature_cols = ["chlorides"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.07


# Total Sulfur Dioxide

In [19]:
# Single-predictor regression: alcohol as a function of total sulfur dioxide.
dependent_variable = "alcohol"
feature_cols = ["total sulfur dioxide"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.03


# Free Sulfur Dioxide

In [20]:
# Single-predictor regression: alcohol as a function of free sulfur dioxide.
dependent_variable = "alcohol"
feature_cols = ["free sulfur dioxide"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  -0.01


# pH

In [21]:
# Single-predictor regression: alcohol as a function of pH.
dependent_variable = "alcohol"
feature_cols = ["pH"]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.05


# Multivariate Linear Regression

In [22]:
# Multi-predictor regression: alcohol as a function of six features at once.
dependent_variable = "alcohol"
feature_cols = [
    "density", "volatile acidity", "residual sugar",
    "total sulfur dioxide", "chlorides", "pH",
]

X, y = feature_columns(
    feature_cols=feature_cols, dependent_variable=dependent_variable
)
(log_model, X_test, y_test, y_predict) = linear_regression_model(X=X, y=y)
print_rsquared(y_test=y_test, y_predict=y_predict)

R Squared is:  0.44
