In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the wine dataset from 'Wine_data.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
wine_df = pd.read_csv(
    'Wine_data.csv',
)
# Bare last expression so the notebook renders the frame.
wine_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,White,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,White,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,White,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,White,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,White,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,Red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,Red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
6494,Red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,Red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


## **PREPROCESSING**

In [3]:
# Separate the label from the predictors.
# y: the 'quality' column, which the models below are trained to predict.
y = wine_df['quality']
# X: the two feature columns used by every model in this notebook.
X = wine_df[
    ["residual sugar", 'alcohol']
]

In [4]:
# Swap the space in "residual sugar" for an underscore. rename() returns a
# new frame, so X is rebound rather than modified in place.
X = X.rename({"residual sugar": "residual_sugar"}, axis="columns")

In [5]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,residual_sugar,alcohol
0,20.7,8.8
1,1.6,9.5
2,6.9,10.1
3,8.5,9.9
4,8.5,9.9


In [6]:
# Preview the first five target values.
y.head(n=5)

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

## Scaling Data

* Train/Test
* StandardScaler

In [7]:
# Hold out a test set. No test_size is passed, so scikit-learn's default
# split proportion applies; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [8]:
# Build the scaler and fit it on the training features only, so the test
# rows do not influence the scaling statistics.
scaler = StandardScaler()
# The result of fit() is bound to X_scaler, which is the name used for all
# later transforms.
X_scaler = scaler.fit(X=X_train)


In [9]:
# Apply the scaling fitted on the training set to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
# Show the first scaled training row as a quick sanity check.
X_train_scaled[0]

array([-0.08824573,  0.25267266])

# Logistic Regression

In [10]:
# Logistic regression on the scaled features. max_iter is raised to 1000 to
# give the solver more iterations to converge.
LR_clf = LogisticRegression(max_iter=1000)
LR_clf.fit(X_train_scaled, y_train)

# score() reports mean accuracy on the given split.
print(f"Training Data Score: {LR_clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test_scaled, y_test)}")

Training Data Score: 0.5106732348111659
Testing Data Score: 0.5378461538461539


# Random Forest Classifier

In [11]:
# Random forest with 500 trees; random_state=1 keeps the fit reproducible.
RF_clf = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
)
RF_clf.fit(X_train_scaled, y_train)

# Compare train vs. test accuracy; a large gap between them indicates
# overfitting.
print(f'Training Score: {RF_clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test_scaled, y_test)}')

Training Score: 0.8019293924466339
Testing Score: 0.5501538461538461


In [13]:
# Predict the held-out rows with the random forest and put the predictions
# next to the true labels.
predictions = RF_clf.predict(X_test_scaled)
result_df = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,  # a Series, so the frame keeps y_test's row labels
    }
)
result_df.head()


Unnamed: 0,Prediction,Actual
3103,7,7
1419,5,7
4761,6,6
4690,6,6
4032,5,5


# Pickle

In [14]:
import pickle
import requests
import json

In [15]:
# Persist the two fitted models and the fitted scaler to disk.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left to garbage collection.
with open('LR_model.pkl', 'wb') as model_file:
    pickle.dump(LR_clf, model_file)

with open('RF_model.pkl', 'wb') as model_file:
    pickle.dump(RF_clf, model_file)

# Save the scaler too: new inputs must be scaled with the same fitted
# statistics before they are passed to either model.
with open('LR_RF_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(X_scaler, scaler_file)


In [16]:
# Round-trip check: reload the scaler and both models from the pickle files
# written earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you
# trust. These three files are produced by this notebook itself.
# `with` closes each file handle as soon as the object has been loaded.
with open('LR_RF_scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

with open('RF_model.pkl', 'rb') as model_file:
    RF_model = pickle.load(model_file)

with open('LR_model.pkl', 'rb') as model_file:
    LR_model = pickle.load(model_file)



In [17]:
# One sample to score. Values must follow the column order the scaler was
# fitted on: [residual_sugar, alcohol].
test_data = [[10.6, 10.4]]

# The scaler was fitted on a DataFrame, so it remembers the column names.
# Passing a bare list makes scikit-learn warn that "X does not have valid
# feature names", so wrap the sample in a DataFrame that carries the scaler's
# own names. If the scaler has no stored names, the getattr default of None
# leaves the frame with positional columns.
test_frame = pd.DataFrame(
    test_data,
    columns=getattr(scaler, "feature_names_in_", None),
)

data_scaled = scaler.transform(test_frame)

# Predicted quality class from each reloaded model.
print(RF_model.predict(data_scaled))
print(LR_model.predict(data_scaled))

[9]
[6]


