In [1]:
!pip install sklearn

from sklearn import tree
import pandas as pd
import os
from pickle import dump
import numpy as np



In [2]:
# make dataframe with wine csv
df_wine = pd.read_csv(os.path.join("..", "Resources", "winequality-joined.csv"))
df_wine.head()

Unnamed: 0,color,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# sorting the quality column to get range of values for target (3 to 9)
wine_desc = df_wine.sort_values(by='quality', ascending=False)
wine_desc.head()

Unnamed: 0,color,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
827,white,7.4,0.24,0.36,2.0,0.031,27.0,139.0,0.99055,3.28,0.48,12.5,9
820,white,6.6,0.36,0.29,1.6,0.021,24.0,85.0,0.98965,3.41,0.61,12.4,9
1605,white,7.1,0.26,0.49,2.2,0.032,31.0,113.0,0.9903,3.37,0.42,12.9,9
876,white,6.9,0.36,0.34,4.2,0.018,57.0,119.0,0.9898,3.28,0.36,12.7,9
774,white,9.1,0.27,0.45,10.6,0.035,28.0,124.0,0.997,3.2,0.46,10.4,9


In [4]:
# we are going to make the range 0 to 10 so that new outlier data can be added by user
# hard code a drop down menu on the user interface to select 0-10 so people can't type decimals
target = df_wine["quality"]
target_names = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] 
target_names

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [10]:
wine_qual_drop = df_wine.drop("quality", axis=1)
feature_names = wine_qual_drop.columns
wine_qual_drop.head()

Unnamed: 0,color,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(wine_qual_drop, target, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'white'

In [7]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)
clf.score(X_test_scaled, y_test)

0.6875

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=420)
model = rf.fit(X_train_scaled, y_train)
model.score(X_test_scaled, y_test)

0.7395833333333334

In [9]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.26544459158597344, 'Glucose'),
 (0.1641806068402667, 'BMI'),
 (0.1420791105058266, 'Age'),
 (0.120112695186777, 'DiabetesPedigreeFunction'),
 (0.08748996351500753, 'BloodPressure'),
 (0.07980383203508229, 'Pregnancies'),
 (0.07168196872218029, 'Insulin'),
 (0.06920723160888621, 'SkinThickness')]

In [10]:
res=rf.predict(X_test_scaled[0].reshape(1,-1))[0]
target_names[res]

'negative'

In [11]:
dump(X_scaler, open('../scaler.pkl', 'wb'))
dump(model, open('../model.pkl', 'wb'))

In [30]:
from pickle import load
model = load(open('model.pkl', 'rb'))
# load the scaler
scaler = load(open('scaler.pkl', 'rb'))

new_data = np.array([[6,148,72,35,0,33.6,0.627,50]])

In [36]:
target_names[model.predict(scaler.transform(new_data))[0]]

'positive'