In [None]:
#title: "Wine Quality Prediction"
#author: "Natalie Eversole"
#date: "2022-07-21"

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Read the red-wine quality table from the working directory.
df = pd.read_csv(filepath_or_buffer='winequality-red.csv')
# Summary statistics for every numeric column (shown as the cell output).
df.describe()

In [None]:
# Heatmap of the pairwise correlations between all columns of df.
corrMatrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
# Draw on the axes created above (they are also the current axes).
sns.heatmap(data=corrMatrix, annot=True, ax=ax)
plt.show()

In [None]:
#Find strongly correlated variables (absolute correlation above 0.67)
# Compute the correlation matrix once instead of on every loop iteration.
corr = df.corr()
for a in range(len(corr.columns)):
    # Only the lower triangle (b < a) is scanned, so each pair is tested once
    # and the diagonal is skipped.
    for b in range(a):
        if abs(corr.iloc[a, b]) > 0.67:
            # Prints the row variable of the pair; a variable involved in
            # several strong pairs is printed once per pair.
            name = corr.columns[a]
            print(name)

In [None]:
# Mean alcohol content for each quality score, drawn as bars.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='alcohol', ax=fig.gca())

In [None]:
# Mean sulphates level for each quality score, drawn as bars.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='sulphates', ax=fig.gca())

In [None]:
# Mean total sulfur dioxide for each quality score, drawn as bars.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='quality', y='total sulfur dioxide', ax=fig.gca())

In [None]:
#Normalize
# Standardize every column of df. The scaler returns a bare array, so the
# original column names are passed back in; without them the plots built from
# norm_df are labelled 0, 1, 2, ... instead of by feature name.
sc = StandardScaler()
norm_df = pd.DataFrame(sc.fit_transform(df), columns=df.columns)

#Plot normalized correlation matrix
# Pearson correlation is unchanged by standardization, so this heatmap is
# expected to match the one computed from the raw data.
corrMatrix = norm_df.corr()
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(corrMatrix, annot=True, ax=ax)
plt.show()

In [None]:
# Pairwise scatter plots of the standardized columns, with a histogram of each
# column on the diagonal.
pd.plotting.scatter_matrix(
    frame=norm_df,
    alpha=0.2,
    figsize=(20, 20),
    diagonal='hist',
)
plt.show()

In [None]:
# Copy of the data without 'citric acid', with missing values replaced by the
# column mean.
# NOTE(review): the modelling code that follows builds its features from df
# (including 'citric acid'), not from df_new — confirm which one is intended.
df_new = df.drop(columns='citric acid')
# A bare expression in the middle of a cell is evaluated and discarded, so the
# per-column null counts are printed explicitly.
print(df_new.isnull().sum())
df_new = df_new.fillna(df_new.mean())

# Feature matrix (eleven columns) and target column.
x = df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']]
y = df['quality']

#Split to train & test sets
train, test, train_labels, test_labels = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=42)

#Normalize
# Fit the scaler on the training split only and apply the same mean/std to the
# test split. Re-fitting on the test split would scale the two sets with
# different statistics and use test data to set preprocessing parameters.
sc = StandardScaler()
norm_train_data = pd.DataFrame(sc.fit_transform(train), columns=x.columns)
norm_test_data = pd.DataFrame(sc.transform(test), columns=x.columns)

In [None]:
#Random Forest Classifier
# Baseline model with 100 trees, trained on the unscaled features.
# random_state is fixed (matching the seed used for the train/test split) so
# the reported accuracies are the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train, train_labels)
#prediction
pred = rfc.predict(test)
print("Initial accuracy")
print('Accuracy of training data: ', rfc.score(train, train_labels))
print('Accuracy of testing data: ',rfc.score(test, test_labels))

In [None]:
#Random Forest Classifier
# NOTE(review): this cell repeats the baseline fit of the preceding cell and
# rebinds rfc and pred; one of the two can probably be removed.
# random_state is fixed (matching the seed used for the train/test split) so
# the predictions kept in pred are the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train, train_labels)
#prediction
pred = rfc.predict(test)
print("Initial accuracy")
print('Accuracy of training data: ', rfc.score(train, train_labels))
print('Accuracy of testing data: ',rfc.score(test, test_labels))

In [None]:
# Pair the true test labels with the model's predictions and show the first
# 50 rows as the cell output.
pred_df = {
    'Original Values': test_labels,
    'Predicted Values': pred,
}
pd.DataFrame(data=pred_df).head(n=50)

In [None]:
#tune rfc parameters for a random grid search
# The base estimator must exist before get_params() is called on it, otherwise
# the cell raises NameError on a fresh kernel. It is seeded so the search
# results are repeatable.
rf = RandomForestClassifier(random_state=42)
print('Parameters currently in use:\n')
print(rf.get_params())

# Candidate values for each hyperparameter sampled by the search.
n_estimators = [int(v) for v in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['sqrt']
max_depth = [int(v) for v in np.linspace(10, 110, num = 11)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 6]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 100 sampled combinations, each scored with 3-fold cross-validation on the
# training split, using all available cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(train, train_labels)

print(rf_random.best_params_)

In [None]:
# Report accuracy of the randomized-search result on both splits.
print("Accuracy after Randomized Search")
random_train_accuracy = rf_random.score(train, train_labels)
print('New accuracy of training data: ', random_train_accuracy)
random_test_accuracy = rf_random.score(test, test_labels)
print('New accuracy of testing data: ', random_test_accuracy)

In [None]:
# Create the parameter grid based on the results of random search
# NOTE(review): these ranges are hard-coded; if the randomized search is re-run
# and picks different best parameters, the grid should be revisited.
param_grid = {
    'bootstrap': [False],
    'max_depth': [30, 40, 50, 60, 70],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 4, 6],
    'n_estimators': [1000, 1200, 1400]
}

# Seed the estimator so the grid-search scores are repeatable and can be
# compared across runs.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 3-fold cross-validation on the
# training split, using all available cores.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train, train_labels)

print(grid_search.best_params_)

In [None]:
# Report accuracy of the grid-search result on both splits.
print("Accuracy after Grid Search")
grid_train_accuracy = grid_search.score(train, train_labels)
print('Accuracy of training data: ', grid_train_accuracy)
grid_test_accuracy = grid_search.score(test, test_labels)
print('Accuracy of testing data: ', grid_test_accuracy)

In [None]:
# Accuracy decreased after performing grid search, indicating that our initial model predicted the quality of wine more accurately on this test split.

In [None]:
# Overlaid histograms of the true and predicted quality values from the
# initial model.
pred_plot = pd.DataFrame(data=pred_df)
pred_plot.plot(kind='hist', alpha=0.5)