In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as pltx
import joblib


# Load the wine-quality data set from the project's data directory
wine_set = pd.read_csv("data/winequality.csv")

# Report the size of the data set as a (rows, columns) tuple
print(f"Rows, columns: {wine_set.shape}")

In [None]:
# Preview the first five rows of the data set (all columns)
wine_set.head(n=5)


In [None]:
# Preview the last five rows of the data set (all columns)
wine_set.tail(n=5)

In [None]:
# Count the missing values in each column (if any are present)
missing_per_column = wine_set.isna().sum()
print(missing_per_column)
# Author's observation: no missing values


In [None]:
# Explore individual variables, starting with a histogram of 'quality'
quality_histogram = pltx.histogram(data_frame=wine_set, x="quality")
quality_histogram.show()

In [None]:
# Pairwise correlation between the data set's columns, drawn as an annotated heatmap.
# This file binds `plt` to the top-level `matplotlib` package, so the pyplot
# submodule is imported explicitly here instead of relying on another library
# having loaded it as a side effect.
import matplotlib.pyplot

corr_val = wine_set.corr()
fig, ax = matplotlib.pyplot.subplots(figsize=(15, 10))
# Draw on the axes created above rather than on whichever axes happen to be current
sns.heatmap(
    corr_val,
    xticklabels=corr_val.columns,
    yticklabels=corr_val.columns,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    ax=ax,
)

In [None]:
# Classify each wine as good (1) or 'not as good' (0).
# A quality score equal or superior to 7 is considered good.
# The comparison is vectorised instead of looping over the column in Python;
# the explicit int64 cast keeps the label an integer 0/1 column.
wine_set['good'] = wine_set['quality'].ge(7).astype('int64')

# Separate the columns into feature and target variables

# features: every column except the raw score and the label derived from it
X = wine_set.drop(columns=['quality', 'good'])
# target
Y = wine_set['good']

In [None]:
# Bar plot of the 'good' label against alcohol
sns.barplot(data=wine_set, x="alcohol", y="good")

In [None]:
# Bar plot of the 'good' label against density
sns.barplot(data=wine_set, x="density", y="good")

In [None]:
# Bar plot of the 'good' label against free sulfur dioxide
sns.barplot(data=wine_set, x="free sulfur dioxide", y="good")

In [None]:
# Pairwise plots of the data set's columns, coloured by the 'good' label
sns.pairplot(data=wine_set, hue="good")

In [None]:
# Count how many wines fall into each class of the 'good' label
good_counts = wine_set['good'].value_counts()
good_counts
# Counts recorded by the author:
# Good wine: 1277
# Not as good wine: 5220

In [None]:
# Standardizing feature variables
from sklearn.preprocessing import StandardScaler

# Rebuild the unscaled feature frame from wine_set instead of aliasing X:
# X is overwritten with the scaled array below, so `X_features = X` would
# capture already-scaled data if this cell were executed a second time.
X_features = wine_set.drop(columns=['quality', 'good'])
X = StandardScaler().fit_transform(X_features)



In [None]:
# Split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the unscaled features (X_features) first, then fit the scaler on the
# training rows only, so that statistics of the test rows do not leak into
# the features the models are trained on.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_features, Y, test_size=.25, random_state=0
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Machine learning modelling

# First modelling technique --- Decision Tree
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# fit() hands back the estimator, so construction and training are chained
decision_tree_model = DecisionTreeClassifier(random_state=1).fit(X_train, Y_train)
Y_decision_tree_pred = decision_tree_model.predict(X_test)

# Per-class precision / recall / f1 on the held-out split
decision_tree_report = classification_report(Y_test, Y_decision_tree_pred)
print(decision_tree_report)

In [None]:
# Second modelling technique --- KNN
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

knn_modelling = KNeighborsClassifier(n_neighbors=3)
knn_modelling.fit(X_train, Y_train)
Y_knn_pred = knn_modelling.predict(X_test)

# Report on the KNN predictions computed above; Y_random_forest_pred is not
# defined anywhere in this notebook and raised a NameError here.
print(classification_report(Y_test, Y_knn_pred))

In [None]:
# Modelling technique --- Logistic Regression
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator, so construction and training are chained
log_regress = LogisticRegression(random_state=40).fit(X_train, Y_train)
Y_log_regress_pred = log_regress.predict(X_test)

# Per-class precision / recall / f1 on the held-out split
log_regress_report = classification_report(Y_test, Y_log_regress_pred)
print(log_regress_report)

In [None]:
# Compare the feature statistics of good vs. not-as-good wine

# Keep only the rows labelled good (good == 1) and summarise them
is_good = wine_set['good'] == 1
wine_set_temp = wine_set.loc[is_good]
wine_set_temp.describe()

In [None]:
# Keep only the rows labelled not as good (good == 0) and summarise them
is_not_good = wine_set['good'] == 0
wine_set_temp1 = wine_set.loc[is_not_good]
wine_set_temp1.describe()

In [None]:
# AutoML: let TPOT search for a classification pipeline on the training split
import tpot

autoML = tpot.TPOTClassifier(
                            generations=10,
                            population_size=100,
                            offspring_size=None,
                            mutation_rate=0.9,
                            crossover_rate=0.1,
                            scoring=None,
                            cv=5,
                            subsample=1.0,
                            n_jobs=1,
                            max_time_mins=None,
                            max_eval_time_mins=5,
                            # Fixed seed so the search gives the same result on a
                            # fresh run, like the other seeded models in this notebook
                            random_state=42,
                            config_dict=None,
                            template=None,
                            warm_start=False,
                            memory=None,
                            use_dask=False,
                            periodic_checkpoint_folder=None,
                            early_stop=None,
                            verbosity=2,
                            disable_update_check=False,
                            )
autoML.fit(X_train, Y_train)

In [None]:


from sklearn.metrics import accuracy_score, confusion_matrix as cm
import matplotlib.pyplot as pyplt

# Score the TPOT pipeline on the held-out split
predictions = autoML.predict(X_test)
score = round(accuracy_score(Y_test, predictions), 3)
cm1 = cm(Y_test, predictions)

# Confusion-matrix heatmap, labelled through the axes the heatmap was drawn on
confusion_ax = sns.heatmap(cm1, annot=True, fmt=".0f")
confusion_ax.set_xlabel("Predicted Values")
confusion_ax.set_ylabel("Actual Values")
confusion_ax.set_title("Accuracy Score: {0}".format(score), size=15)
pyplt.show()


In [None]:
# Save the fitted KNN model to a joblib file.
# The handle is not called `input`: at notebook level that name would replace
# the built-in input() for the rest of the kernel session.
with open('model/autoML-model_knn.joblib', 'wb') as model_file:
    joblib.dump(knn_modelling, model_file)

In [None]:
# Save the fitted decision-tree model to a joblib file.
# The handle is not called `input`: at notebook level that name would replace
# the built-in input() for the rest of the kernel session.
with open('model/autoML-model_Decision-Tree.joblib', 'wb') as model_file:
    joblib.dump(decision_tree_model, model_file)

In [None]:
# Save the fitted logistic-regression model to a joblib file.
# The handle is not called `input`: at notebook level that name would replace
# the built-in input() for the rest of the kernel session.
with open('model/autoML-model_Log-Regress.joblib', 'wb') as model_file:
    joblib.dump(log_regress, model_file)