In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
%matplotlib inline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score


#Accesses and reads data from csv 
red_df = pd.read_csv('winequality-red.csv', sep = ';')
white_df = pd.read_csv('winequality-white.csv', sep = ';')

#join red and white data
wines=[red_df,white_df]
all_df = pd.concat(wines, ignore_index=True)

# splitting wine into good and bad groups
# 2-6.5 are "bad" quality, and wines that are  between 6.5 - 8 are "good"
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
red_df['quality'] = pd.cut(red_df['quality'], bins = bins, labels = group_names)
white_df['quality'] = pd.cut(white_df['quality'], bins = bins, labels = group_names)
all_df['quality'] = pd.cut(all_df['quality'], bins = bins, labels = group_names)


# Assigning numeric labels to our quality variables
label_quality = LabelEncoder()


#Change dataframe to reflect new label
red_df['quality'] = label_quality.fit_transform(red_df['quality'])
white_df['quality'] = label_quality.fit_transform(white_df['quality'])
all_df['quality'] = label_quality.fit_transform(all_df['quality'])


#set dependent and independent variables
#red wine variables
Y_red = red_df.quality
X_red = red_df.drop('quality', axis=1)
#white wine variables
Y_white = white_df.quality
X_white = white_df.drop('quality', axis=1)
#combined variables
Y_all = all_df.quality
X_all = all_df.drop('quality', axis=1)



#split data into training and testing
Xr_train, Xr_test, Yr_train, Yr_test = train_test_split(X_red, Y_red, test_size = 0.2, random_state = 0)
Xw_train, Xw_test, Yw_train, Yw_test = train_test_split(X_white, Y_white, test_size = 0.2, random_state = 0)
Xa_train, Xa_test, Ya_train, Ya_test = train_test_split(X_all, Y_all, test_size = 0.2, random_state = 0)

#Feature Scaling
#take account of variations in mean and SDs
sc = StandardScaler()
#red wine variables
Xr_train = sc.fit_transform(Xr_train)
Xr_test = sc.transform(Xr_test)
#white wine variables
Xw_train = sc.fit_transform(Xw_train)
Xw_test = sc.transform(Xw_test)
#combined sets variables
Xa_train = sc.fit_transform(Xa_train)
Xa_test = sc.transform(Xa_test)


forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
forest.fit(Xr_train, Yr_train)
forest.fit(Xw_train, Yw_train)
forest.fit(Xa_train, Ya_train)


print('Red Wines Random Forest Classifier Training Accuracy:', forest.score(Xr_train, Yr_train))
print('White Wines Random Forest Classifier Training Accuracy:', forest.score(Xw_train, Yw_train))
print('All Wines Random Forest Classifier Training Accuracy:', forest.score(Xa_train, Ya_train))



Red Wines Random Forest Classifier Training Accuracy: 0.8553557466770915
White Wines Random Forest Classifier Training Accuracy: 0.810617662072486
All Wines Random Forest Classifier Training Accuracy: 0.9915335770636906
