In [1]:
#Import deps 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

.... Some Introductory Text 

In [2]:
# Import Data 
all_wine_raw_df = pd.read_csv('./Wine_data_both.csv') 
red_wine_df = pd.read_csv('./Wine_data_red.csv')
white_wine_df = pd.read_csv('./Wine_data_white.csv')

In [3]:
# Previewing Data
display(all_wine_raw_df)
display(all_wine_raw_df['quality'].value_counts())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Wine
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,White
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,White
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,White
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,White
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,Red
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,Red
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,Red
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,Red


quality
6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: count, dtype: int64

In [4]:
#Encode Wine column using get_dummies
wine_cats = pd.get_dummies(all_wine_raw_df['Wine'],dtype=int)
wine_cats.value_counts()

Red  White
0    1        4898
1    0        1599
Name: count, dtype: int64

In [5]:
# Concatenate encoded wine column to main dataframe
all_wine_raw_concat = pd.concat([all_wine_raw_df.drop('Wine',axis=1),wine_cats],join='outer',axis=1)
all_wine_raw_concat

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Red,White
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1,0


In [6]:
# Setup X and y variables
X = all_wine_raw_concat.drop(columns='quality')
y = all_wine_raw_concat['quality']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)
X_train.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Red,White
count,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0,4872.0
mean,7.210088,0.340368,0.318007,5.460211,0.055947,30.67303,115.857245,0.994704,3.219333,0.52967,10.489429,0.246921,0.753079
std,1.297567,0.164934,0.144987,4.781869,0.034602,17.980481,56.704002,0.003027,0.159228,0.147407,1.194576,0.431265,0.431265
min,3.9,0.08,0.0,0.6,0.012,1.0,6.0,0.98713,2.72,0.22,8.0,0.0,0.0
25%,6.4,0.23,0.24,1.8,0.038,17.0,78.0,0.9923,3.11,0.43,9.5,0.0,1.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.9949,3.21,0.5,10.3,0.0,1.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,0.0,1.0
max,15.6,1.33,1.23,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.2,1.0,1.0


In [7]:
# Preprocess data - Scaling
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Build Model 
clf = RandomForestClassifier(random_state=13)
clf.fit(X_train_scaled,y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6873846153846154


In [9]:
# Evaluate and Optimize Model

... Text interpreting and describing results 

In [10]:
# Export Results