In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from csv import reader
from random import randrange
from math import sqrt
from sklearn.metrics import accuracy_score, classification_report

In [5]:
def predict(row, coefficients):
    """Evaluate the linear model for a single sample.

    ``coefficients[0]`` is the intercept and ``coefficients[k + 1]`` is the
    weight multiplied with ``row[k]``.  The last element of ``row`` is never
    read here (``coefficients_sgd`` keeps the target value in that slot).

    Returns the intercept plus the weighted sum of the leading row entries.
    """
    total = coefficients[0]
    # Count from 1 so every input value lines up with the weight that follows
    # the intercept.
    for position, value in enumerate(row[:-1], start=1):
        total += coefficients[position] * value
    return total

def coefficients_sgd(train, l_rate, n_epoch):
    """Fit linear-regression coefficients with stochastic gradient descent.

    Parameters
    ----------
    train : sequence of rows
        Each row holds the feature values followed by the target as its last
        element.  At least one row is required, because the first row decides
        how many coefficients are created.
    l_rate : float
        Step size applied to every per-sample update.
    n_epoch : int
        Number of full passes over ``train``.

    Returns
    -------
    list
        ``[intercept, w_1, ..., w_k]`` with one weight per feature column.
    """
    # One slot for the intercept plus one per feature: a row has k features
    # and one target, so len(train[0]) == k + 1.
    coef = [0.0 for _ in range(len(train[0]))]

    for _ in range(n_epoch):
        for row in train:
            # Signed residual of the current model on this sample.
            error = predict(row, coef) - row[-1]
            # The intercept is updated like a weight on a constant input of 1.
            coef[0] -= l_rate * error
            for i in range(len(row) - 1):
                coef[i + 1] -= l_rate * error * row[i]

    return coef

In [7]:
# Load the white-wine data; the column labels are supplied explicitly through
# `names=`: eleven measurements followed by the quality score.
df = pd.read_csv(
    'winequality-white.csv',
    names=[
        'Fixed acidity',
        'Volatile acidity',
        'Citric acid',
        'Residual sugar',
        'Chlorides',
        'Free sulfur dioxide',
        'Total sulfur dioxide',
        'Density',
        'pH',
        'Sulphates',
        'Alcohol',
        'Quality',
    ],
)
df

Unnamed: 0,Fixed acidity,Volatile acidity,Citric acid,Residual sugar,Chlorides,Free sulfur dioxide,Total sulfur dioxide,Density,pH,Sulphates,Alcohol,Quality
0,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6
1,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6
2,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6
3,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6
4,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6
5,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6
6,6.2,0.320,0.16,7.00,0.045,30.0,136.0,0.99490,3.18,0.47,9.600000,6
7,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6
8,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6
9,8.1,0.220,0.43,1.50,0.044,28.0,129.0,0.99380,3.22,0.45,11.000000,6


In [8]:
# Replace every measurement column with the integer category codes of its values.
# NOTE(review): these columns already display as plain numbers in the loaded frame,
# so this appears to keep only the rank of each distinct value and discard the
# spacing between values; confirm that this is intended.
for f in [
    'Fixed acidity',
    'Volatile acidity',
    'Citric acid',
    'Residual sugar',
    'Chlorides',
    'Free sulfur dioxide',
    'Total sulfur dioxide',
    'Density',
    'pH',
    'Sulphates',
    'Alcohol',
]:
    df[f] = df[f].astype('category').cat.codes

In [9]:
# The eleven input columns of the model; 'Quality' is left out because it is the target.
features = [
    'Fixed acidity',
    'Volatile acidity',
    'Citric acid',
    'Residual sugar',
    'Chlorides',
    'Free sulfur dioxide',
    'Total sulfur dioxide',
    'Density',
    'pH',
    'Sulphates',
    'Alcohol',
]
X = df.get(features)
X

Unnamed: 0,Fixed acidity,Volatile acidity,Citric acid,Residual sugar,Chlorides,Free sulfur dioxide,Total sulfur dioxide,Density,pH,Sulphates,Alcohol
0,31,36,36,302,34,57,146,878,23,22,5
1,23,42,34,17,38,13,106,471,53,26,12
2,43,38,40,101,39,33,69,560,49,21,24
3,34,28,32,128,47,59,163,601,42,17,21
4,34,28,32,128,47,59,163,601,42,17,21
5,43,38,40,101,39,33,69,560,49,21,24
6,22,46,16,103,34,33,110,544,41,24,15
7,31,36,36,302,34,57,146,878,23,22,5
8,23,42,34,17,38,13,106,471,53,26,12
9,43,26,43,15,33,30,102,453,45,22,43


In [10]:
# Target vector: the 'Quality' column as a plain array.
Y = df.loc[:, 'Quality'].values
Y

array([6, 6, 6, ..., 6, 7, 6])

In [11]:
# Min-max scale the feature columns, then append the target as the last column:
# the row layout that predict() and coefficients_sgd() read.
# NOTE(review): the scaler is fitted on every row before the train/test split that
# follows, so the held-out rows influence the scaling; consider fitting it on the
# training rows only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
ds = np.column_stack((X, Y))
print(ds)

[[ 0.46268657  0.29032258  0.41860465 ...,  0.28205128  0.04901961  6.        ]
 [ 0.34328358  0.33870968  0.39534884 ...,  0.33333333  0.11764706  6.        ]
 [ 0.64179104  0.30645161  0.46511628 ...,  0.26923077  0.23529412  6.        ]
 ..., 
 [ 0.3880597   0.24193548  0.22093023 ...,  0.29487179  0.10784314  6.        ]
 [ 0.20895522  0.32258065  0.34883721 ...,  0.19230769  0.82352941  7.        ]
 [ 0.28358209  0.19354839  0.44186047 ...,  0.11538462  0.6372549   6.        ]]


In [15]:
# Hold out 30% of the rows for evaluation.  A fixed random_state makes the shuffle,
# and therefore every number computed from train/test, repeatable across runs.
train, test = train_test_split(ds, test_size=0.3, random_state=42)

In [16]:
# SGD hyper-parameters: step size and number of passes over the training rows.
l_rate, n_epoch = 0.0001, 100

# Fit on the training split and show the learned [intercept, weights...] list.
coefs = coefficients_sgd(train, l_rate=l_rate, n_epoch=n_epoch)
print(coefs)

[3.4372468792770579, 0.75279408697817674, -0.10682648284960305, 0.65109074416130897, 0.26724739674730158, 0.24069893769521539, 0.58866343802600818, 0.21559000394083616, 0.08267330561094198, 1.1595444849965131, 0.64305285251194866, 2.1200709998305198]


In [17]:
# Model output for every held-out row, in the same order as `test`.
predictions = [predict(row, coefs) for row in test]

In [18]:
# True target of every held-out row.  The target is the last column of each row (the
# same layout predict() and coefficients_sgd() rely on), and iterating over `test`
# itself keeps this correct for any dataset size or test_size instead of assuming
# exactly 1470 rows and 12 columns.
y_test = [row[-1] for row in test]

In [19]:
# Alias: the error computation in the next cell reads the model outputs as `predicted`.
predicted = predictions

In [20]:
# Root-mean-squared error (RMSE) of the predictions on the held-out rows.
assert len(y_test) > 0, "no held-out targets to evaluate"
assert len(predicted) == len(y_test), "predictions and targets must align one-to-one"

soma = 0  # running sum of squared errors
for y_hat, y_true in zip(predicted, y_test):
    erro = y_hat - y_true
    soma += erro**2

# Average once, after the loop, instead of recomputing it on every iteration.
erroMedio = soma / float(len(y_test))

# NOTE: despite its name, `acc` holds an RMSE (lower is better), not an accuracy.
acc = sqrt(erroMedio)

In [21]:
# Show the value computed in the previous cell: the square root of the mean squared
# prediction error (an RMSE, not an accuracy, despite the variable name).
acc

0.8005400689392048