In [1]:
#import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
%matplotlib inline

In [2]:
#import Csv
redWine = pd.read_csv('/Users/heath/Project-3-Wine/Data/wineQualityReds.csv', index_col=0)
whiteWine = pd.read_csv('/Users/heath/Project-3-Wine/Data/wineQualityWhites.csv', index_col=0)

In [3]:
#add color to the csvs based on data location
redWine['wine_color'] = 'red'
whiteWine['wine_color'] = 'white'

In [4]:
#rename white wine data set columns that are mismatched to red wine data
whiteWine.rename({"fixed.acidity": "fixed acidity",
                 "volatile.acidity": "volatile acidity",
                 "citric.acid": "citric acid",
                 "residual.sugar": "residual sugar",
                 "free.sulfur.dioxide": "free sulfur dioxide",
                 "total.sulfur.dioxide": "total sulfur dioxide",
                 "pH": "ph"},
                axis = "columns", inplace = True)


whiteWine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,ph,sulphates,alcohol,quality,wine_color
1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white
2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,white
3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white
5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,white


In [5]:
#concatenate red & white wine frames
frames = [redWine, whiteWine]
wine = pd.concat(frames)
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,ph,sulphates,alcohol,quality,wine_color
1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [6]:
#summarize wine data----- white wine data much more prevalent
wine.info()
wine['wine_color'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 1 to 4898
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   ph                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  wine_color            6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 710.6+ KB


white    4898
red      1599
Name: wine_color, dtype: int64

In [7]:
#encode white wine color data
label_color = LabelEncoder()
wine['wine_color']= label_color.fit_transform(wine['wine_color'])
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,ph,sulphates,alcohol,quality,wine_color
1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [8]:
#Separate dataset as response variable & feature variable
X = wine.drop('wine_color', axis =1)
y = wine['wine_color']

In [9]:
#split train data & test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state =42)

In [10]:
#apply scaling to get optimized result- limits bias by not allowing large numbers in a field to overpower smaller numbers in another
#ie sulfur v chlorides

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc = sc.transform(X_test)

In [11]:
clf = svm.SVC()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)

In [13]:
print(classification_report(y_test, clf_pred))
print(confusion_matrix(y_test, clf_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       341
           1       0.99      1.00      1.00       959

    accuracy                           0.99      1300
   macro avg       1.00      0.99      0.99      1300
weighted avg       0.99      0.99      0.99      1300

[[335   6]
 [  1 958]]


In [14]:
#understand summary metrics to get feel for bins
wine['alcohol'].describe()

count    6497.000000
mean       10.491801
std         1.192712
min         8.000000
25%         9.500000
50%        10.300000
75%        11.300000
max        14.900000
Name: alcohol, dtype: float64

In [15]:
# create bins 
bins = (8, 9.5, 10.3, 11.3, 14.9)
bin_names = ['low_alcohol', 'medium_low', 'medium_high','high_alcohol']
wine['alcohol'] = pd.cut(wine['alcohol'], bins, labels=bin_names)
wine['alcohol'].unique()

[low_alcohol, medium_low, medium_high, high_alcohol, NaN]
Categories (4, object): [low_alcohol < medium_low < medium_high < high_alcohol]

In [16]:
wine["alcohol"]= wine["alcohol"].astype(str)
label_alc = LabelEncoder()
wine['alcohol']= label_color.fit_transform(wine['alcohol'])
wine.head(40)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,ph,sulphates,alcohol,quality,wine_color
1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,1,5,0
2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,3,5,0
3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,3,5,0
4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,3,6,0
5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,1,5,0
6,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,1,5,0
7,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,1,5,0
8,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,3,7,0
9,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,1,7,0
10,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,2,5,0


In [17]:
#0=high
#1=low_alcohol
#2=medium-high
#3=medium-low


#Separate dataset as response variable & feature variable
X = wine.drop('alcohol', axis =1)
y = wine['alcohol']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state =42)

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc = sc.transform(X_test)

In [20]:
clf = svm.SVC()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)

In [21]:
print(classification_report(y_test, clf_pred))
print(confusion_matrix(y_test, clf_pred))

              precision    recall  f1-score   support

           0       0.90      0.84      0.87       328
           1       0.80      0.79      0.79       368
           2       0.69      0.70      0.69       325
           3       0.57      0.60      0.58       279

    accuracy                           0.74      1300
   macro avg       0.74      0.73      0.73      1300
weighted avg       0.74      0.74      0.74      1300

[[277   0  50   1]
 [  0 289   7  72]
 [ 31  11 229  54]
 [  1  63  48 167]]
