In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import requests
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

warnings.filterwarnings('ignore')


In [2]:
# Carregando a base de dados
red = pd.read_csv('winequality-red.csv',sep=';')

In [3]:
# Observando o tamanho da base de dados
red.shape

(1599, 12)

In [4]:
# Visualizando as 5 primeiras linhas 
red.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [51]:
# Observando se existem dados faltantes na base e o tipo das variáveis
red.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [52]:
# Visualizando de forma geral as principais metricas estatisticas das variáveis
red.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
# Verificando dados duplicados
red.duplicated().sum()

240

In [6]:
# Removendo os dados dulicados
print('Verificando quantidades de linhas e colunas antes: ',red.shape)
red = red.drop_duplicates()
print('Verificando quantidades de linhas e colunas depois: ',red.shape)

Verificando quantidades de linhas e colunas antes:  (1599, 12)
Verificando quantidades de linhas e colunas depois:  (1359, 12)


## Regressor que estima o valor da qualidade do vinho tinto

In [7]:
# Tiraremos as variaveis residual sugar e alcohol pois elas tem forte correlação com a variavél density como vimos na anlises acima.

X = red.drop(['quality'], axis = 1)
y = red['quality']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.25, 
                                                    random_state = 13,
                                                    stratify = y)

In [8]:
X_train.shape, X_test.shape

((1019, 11), (340, 11))

In [9]:
patch = os.getcwd()

In [16]:
# Trazendo os dados para uma mesma escala

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

pickle.dump(scaler, open(patch+'//scaler.pkl','wb'))



In [17]:
# Instanciando o modelo e treinando o modelo

model = LinearRegression()
model.fit(X_train_std, y_train)

LinearRegression()

In [18]:
pickle.dump(model, open(patch+'//model_wine.pkl','wb'))

In [15]:
# Predição para a base de teste
X_test_std = scaler.transform(X_test)
y_pred = model.predict(X_test_std)

In [14]:
# Verificação das principais métricas de Regressões
print('R²: ', r2_score(y_test,y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

R²:  0.36131127021692855
MAE: 0.5111867897829282
MSE: 0.4359161080549701
RMSE: 0.6602394323690233


In [19]:
modelo2 = pickle.load(open(patch+'//model_wine.pkl','rb'))

In [20]:
# Predição para a base de teste
scaler1 = pickle.load(open(patch+'//scaler.pkl','rb'))
X_test_std = scaler1.transform(X_test)
y_pred = modelo2.predict(X_test_std)

In [21]:
# Verificação das principais métricas de Regressões
print('R²: ', r2_score(y_test,y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

R²:  0.36131127021692855
MAE: 0.5111867897829282
MSE: 0.4359161080549701
RMSE: 0.6602394323690233


In [22]:
df_json = X_test.sample().to_json(orient='records')
df_json

'[{"fixed acidity":6.6,"volatile acidity":0.58,"citric acid":0.0,"residual sugar":2.2,"chlorides":0.1,"free sulfur dioxide":50.0,"total sulfur dioxide":63.0,"density":0.99544,"pH":3.59,"sulphates":0.68,"alcohol":11.4}]'

In [23]:
url = 'http://0.0.0.0:5010/predict'
data = df_json
header = {'Content-type': 'application/json'}

requisicao = requests.post(url = url, data = data, headers = header )

ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=5010): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000262FD140040>: Failed to establish a new connection: [WinError 10049] O endereço solicitado não é válido no contexto'))

Note: you may need to restart the kernel to use updated packages.
