# Imports, Splitting, Scaling

In [66]:
#Imports
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split

In [67]:
#Read in Data
red_df = pd.read_csv('../Resources/winequality-red.csv', delimiter=';')
white_df = pd.read_csv('../Resources/winequality-white.csv', delimiter=';')

In [68]:
#Splitting
target_red = red_df['quality']
features_red = red_df.drop(columns='quality')

target_white = pd.get_dummies(white_df['quality'])
features_white = white_df.drop(columns='quality')

X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(features_red, target_red, random_state=42)
    
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(features_white, target_white, random_state=42)

In [69]:
#Scaling
X_scaler_red = skl.preprocessing.StandardScaler()
X_scaler_white = skl.preprocessing.StandardScaler()

X_scaler_red.fit(X_train_red)
X_scaler_white.fit(X_train_white)

X_train_scaled_red = X_scaler_red.transform(X_train_red)
X_test_scaled_red = X_scaler_red.transform(X_test_red)

X_train_scaled_white = X_scaler_white.transform(X_train_white)
X_test_scaled_white = X_scaler_white.transform(X_test_white)

# Model Testing

## Logistic Regression
- Train: .61
- Test: .58

In [70]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

Training Data Score: 0.6138448707256047
Testing Data Score: 0.5875
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       1.00      0.08      0.14        13
           5       0.63      0.74      0.68       164
           6       0.56      0.59      0.57       169
           7       0.43      0.27      0.33        48
           8       0.00      0.00      0.00         5

    accuracy                           0.59       400
   macro avg       0.44      0.28      0.29       400
weighted avg       0.58      0.59      0.57       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM - Linear Kernel
- Train: .59
- Test: .56

In [71]:
# SVM - Linear Kernel
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

Training Data Score: 0.5904920767306089
Testing Data Score: 0.565
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.61      0.76      0.68       164
           6       0.52      0.60      0.56       169
           7       0.00      0.00      0.00        48
           8       0.00      0.00      0.00         5

    accuracy                           0.56       400
   macro avg       0.19      0.23      0.21       400
weighted avg       0.47      0.56      0.51       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM - Poly Model
- Train: .68
- Test: .57

In [72]:
# SVM - Poly Kernel
from sklearn.svm import SVC
model = SVC(kernel='poly')
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

  _warn_prf(average, modifier, msg_start, len(result))


Training Data Score: 0.683069224353628
Testing Data Score: 0.5775
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.61      0.79      0.69       164
           6       0.56      0.52      0.54       169
           7       0.48      0.27      0.35        48
           8       0.00      0.00      0.00         5

    accuracy                           0.58       400
   macro avg       0.27      0.26      0.26       400
weighted avg       0.54      0.58      0.55       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM - rbf Model
- Train: .67
- Test: .62

In [73]:
# SVM - rbf Kernel
from sklearn.svm import SVC
model = SVC(kernel='rbf')
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

Training Data Score: 0.6713928273561302
Testing Data Score: 0.6275
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.66      0.78      0.72       164
           6       0.59      0.65      0.62       169
           7       0.62      0.27      0.38        48
           8       0.00      0.00      0.00         5

    accuracy                           0.63       400
   macro avg       0.31      0.28      0.29       400
weighted avg       0.60      0.63      0.60       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest
- Train: 1
- Test: .67

In [131]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50)
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

Training Data Score: 1.0
Testing Data Score: 0.66
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        13
           5       0.72      0.74      0.73       164
           6       0.62      0.72      0.67       169
           7       0.58      0.46      0.51        48
           8       0.00      0.00      0.00         5

    accuracy                           0.66       400
   macro avg       0.32      0.32      0.32       400
weighted avg       0.63      0.66      0.64       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree
- Train: 1
- Test: .58

In [102]:
# Decision Trees
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train_scaled_red, y_train_red)
# Score the model
print(f'Training Data Score: {model.score(X_train_scaled_red, y_train_red)}')
print(f'Testing Data Score: {model.score(X_test_scaled_red, y_test_red)}')
# Predict outcomes for test data set
predicts = model.predict(X_test_scaled_red)
# Calculate classification report
from sklearn.metrics import classification_report
predict = model.predict(X_test_scaled_red)
print(classification_report(y_test_red, predict))

Training Data Score: 1.0
Testing Data Score: 0.5875
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.14      0.15      0.15        13
           5       0.68      0.67      0.68       164
           6       0.62      0.56      0.59       169
           7       0.44      0.60      0.51        48
           8       0.00      0.00      0.00         5

    accuracy                           0.59       400
   macro avg       0.31      0.33      0.32       400
weighted avg       0.60      0.59      0.59       400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
