In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from scipy import stats
import re

#import acquire
#import prepare

In [2]:
# Load the wine data from a CSV file (path is relative to the working directory).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index saved into the CSV — confirm, and consider index_col=0 when reading.
df = pd.read_csv('Wine_Viable.csv')

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id,is_viable
0,0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,False
1,1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,False
2,2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2,False


## Features to model:
### volatile acidity, citric acid, fixed acidity (possible drop), sulphates, alcohol, free sulfur dioxide (possible drop)

Features marked "(possible drop)" are candidates to remove from modeling.

In [4]:
def split_wine(df):
    '''
    Split the WineQT data into train, validate, and test dataframes.

    Two stratified splits on the quality column are performed: 20% of the
    rows are held out as test, then 30% of the remaining rows become
    validate and the rest become train.
    Returns train, validate, test (in that order).
    '''
    seed = 123

    # first cut: hold out the test set
    remainder, test = train_test_split(
        df,
        test_size=.2,
        random_state=seed,
        stratify=df.quality,
    )

    # second cut: carve validate out of what is left
    train, validate = train_test_split(
        remainder,
        test_size=.3,
        random_state=seed,
        stratify=remainder.quality,
    )

    return train, validate, test

In [11]:
def model_prep(train, validate, test):
    '''
    Reduce each split to the modeling columns and separate features from target.

    Returns train_x, validate_x, test_x, train_y, validate_y, test_y.
    Each *_x holds the six feature columns; each *_y is a one-column
    dataframe containing is_viable. All six frames get a fresh 0..n-1 index.
    '''
    target = 'is_viable'
    #features to keep (target listed last)
    keep_columns = ['volatile acidity','citric acid','fixed acidity','sulphates', 'alcohol', 'free sulfur dioxide', target]

    x_frames = []
    y_frames = []
    for split in (train, validate, test):
        subset = split[keep_columns]
        x_frames.append(subset.drop(columns=target).reset_index(drop=True))
        y_frames.append(subset[[target]].reset_index(drop=True))

    train_x, validate_x, test_x = x_frames
    train_y, validate_y, test_y = y_frames
    return train_x, validate_x, test_x, train_y, validate_y, test_y

In [55]:
def get_tree(train_x, validate_x, train_y, validate_y):

    '''
    Fit a decision tree (max_depth=4, random_state=123) on the train split
    and print its accuracy on train and on validate. Returns None.
    '''
    # build and fit in one step; fit() hands back the fitted estimator
    clf = DecisionTreeClassifier(max_depth=4, random_state=123).fit(train_x, train_y)

    # score and report train first, then validate
    train_acc = clf.score(train_x, train_y)
    print(f"Accuracy of Decision Tree on train data is {train_acc}")

    validate_acc = clf.score(validate_x, validate_y)
    print(f"Accuracy of Decision Tree on validate data is {validate_acc}")
    


def get_rf(train_x, validate_x, train_y, validate_y):

    '''
    Fit a random forest (max_depth=4, random_state=123) on the train split
    and print its accuracy on train and on validate.

    train_y and validate_y hold the target; a one-column dataframe (as
    produced by model_prep), a series, or a 1-D array are all accepted.
    Returns None.
    '''
    # A single-column target is flattened to 1-D before fitting so the fit
    # does not rely on the notebook-wide warnings filter to hide sklearn's
    # DataConversionWarning about a column-vector y. Other shapes pass through.
    fit_y = np.asarray(train_y)
    if fit_y.ndim == 2 and fit_y.shape[1] == 1:
        fit_y = fit_y.ravel()

    #create classifier and fit
    rf = RandomForestClassifier(max_depth=4, random_state=123)
    rf = rf.fit(train_x, fit_y)

    print(f"Accuracy of Random Forest on train data is {rf.score(train_x, train_y)}")
    print(f"Accuracy of Random Forest on validate data is {rf.score(validate_x, validate_y)}")


def get_reg(train_x, validate_x, train_y, validate_y):

    '''
    Fit a logistic regression (liblinear solver) on the train split and
    print its accuracy on train and on validate.

    train_y and validate_y hold the target; a one-column dataframe (as
    produced by model_prep), a series, or a 1-D array are all accepted.
    Returns None.
    '''
    # A single-column target is flattened to 1-D before fitting so the fit
    # does not rely on the notebook-wide warnings filter to hide sklearn's
    # DataConversionWarning about a column-vector y. Other shapes pass through.
    fit_y = np.asarray(train_y)
    if fit_y.ndim == 2 and fit_y.shape[1] == 1:
        fit_y = fit_y.ravel()

    #create classifier and fit
    logit = LogisticRegression(solver='liblinear')
    logit = logit.fit(train_x, fit_y)

    print(f"Accuracy of Logistic Regression on train is {logit.score(train_x, train_y)}")
    print(f"Accuracy of Logistic Regression on validate is {logit.score(validate_x, validate_y)}")



def get_knn(train_x, validate_x, train_y, validate_y):

    '''
    Fit a k-nearest-neighbors classifier (n_neighbors=7, uniform weights) on
    the train split and print its accuracy on train and on validate.

    train_y and validate_y hold the target; a one-column dataframe (as
    produced by model_prep), a series, or a 1-D array are all accepted.
    Returns None.
    '''
    # A single-column target is flattened to 1-D before fitting so the fit
    # does not rely on the notebook-wide warnings filter to hide sklearn's
    # DataConversionWarning about a column-vector y. Other shapes pass through.
    fit_y = np.asarray(train_y)
    if fit_y.ndim == 2 and fit_y.shape[1] == 1:
        fit_y = fit_y.ravel()

    knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')
    knn = knn.fit(train_x, fit_y)

    # print results
    print(f"Accuracy of KNN on train is {knn.score(train_x, train_y)}")
    print(f"Accuracy of KNN on validate is {knn.score(validate_x, validate_y)}")



In [9]:
# Produce the train / validate / test dataframes with split_wine (stratified on quality).
train, validate, test = split_wine(df)

In [12]:
# Separate the modeling features (*_x) from the is_viable target (*_y) for each split.
train_x, validate_x, test_x, train_y, validate_y, test_y = model_prep(train, validate, test)

In [13]:
train_x

Unnamed: 0,volatile acidity,citric acid,fixed acidity,sulphates,alcohol,free sulfur dioxide
0,0.280,0.54,12.5,1.36,9.800000,12.0
1,0.420,0.35,9.6,0.66,11.100000,17.0
2,0.670,0.02,7.3,0.68,11.066667,31.0
3,0.580,0.00,7.4,0.58,11.300000,7.0
4,0.645,0.25,8.6,0.60,10.000000,8.0
...,...,...,...,...,...,...
634,0.640,0.21,7.7,0.45,9.900000,32.0
635,0.470,0.43,10.8,0.76,10.800000,27.0
636,0.430,0.30,8.3,0.61,10.500000,7.0
637,0.885,0.20,8.2,0.46,10.000000,7.0


In [14]:
train_y

Unnamed: 0,is_viable
0,True
1,True
2,True
3,True
4,True
...,...
634,False
635,True
636,False
637,False


In [27]:
# Baseline accuracy as a percentage.
# NOTE(review): 348 and 639 are hard-coded — presumably viable rows / total rows
# in train; confirm against train_y and consider computing it from the data.
348/639*100

54.460093896713616

## Baseline accuracy: 54.46% (predicting viable every time; 348 of 639 training rows)

## Decision Tree

In [39]:
get_tree(train_x, validate_x, train_y, validate_y)

Accuracy of Decision Tree on train data is 0.7856025039123631
Accuracy of Decision Tree on validate data is 0.7163636363636363


## Random Forest

In [40]:
get_rf(train_x, validate_x, train_y, validate_y)

Accuracy of Random Forest on train data is 0.8184663536776213
Accuracy of Random Forest on validate data is 0.7527272727272727


## Logistic Regression

In [41]:
get_reg(train_x, validate_x, train_y, validate_y)

Accuracy of Logistic Regression on train is 0.7449139280125195
Accuracy of Logistic Regression on validate is 0.72


## KNN

In [56]:
get_knn(train_x, validate_x, train_y, validate_y)

Accuracy of KNN on train is 0.7652582159624414
Accuracy of KNN on validate is 0.6654545454545454


### Random Forest performed best, with a validate accuracy of 75.27% — 20.81 percentage points above the 54.46% baseline