In [1]:
import time
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle
import numpy as np
from itertools import combinations_with_replacement
from sympy.core import Mul

In [2]:
def lvec(a):
    n = a.shape[0]
    s = np.empty(0)
    for i in range(n):
        temp = a[i]*a[i:]
        temp[0] = temp[0]/2
        s = np.append(s,temp)
    return s

In [3]:
def create_z(x):
    s = lvec(x)
    z = np.empty(0)
    z = np.append(z,s)
    z = np.append(z,x)
    z = np.append(z,1)
    return z

In [4]:
def create_r(x):
    z = create_z(x)
    eta = lvec(z)
    s = lvec(x)
    r = np.empty(0)
    r = np.append(r,eta)
    r = np.append(r,s)
    return r


In [5]:
def new_feature(x):
    r = create_r(x)
    xnew = np.empty(0)
    xnew = np.append(xnew,r)
    xnew = np.append(xnew,x)
    xnew = np.append(xnew,1)
    return xnew

In [6]:
def Create_features(variables):
    max_degree = 4
    min_degree = 0
    if not variables or max_degree == 0:
        return np.array([1])
    variables = np.array(list(variables) + [1])
    monomials_list_comm = np.empty(0)
    for item in combinations_with_replacement(variables, max_degree):
        powers = dict()
        for variable in variables:
            powers[variable] = 0
        for variable in item:
            if variable != 1:
                powers[variable] += 1
        if max(powers.values()) >= min_degree:
            monomials_list_comm = np.append(monomials_list_comm,Mul(*item))
    return np.array(list(monomials_list_comm))

In [7]:
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped

In [8]:
def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped

In [9]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [10]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw



In [11]:
def sgd(features, outputs):
    max_epochs = 500
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights


In [12]:
def init1():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    df = pd.read_csv('winequalityN.csv')
    
    
    # filling the missing values:
    for col , value in df.items():
        if col != 'type': # type col -> object datatype
            df[col] = df[col].fillna(df[col].mean())
            
    #Mapping values of target variable quality to 'low', 'medium' and 'high' categories for classification
    df['quality']=df['quality'].map({0:'low',1:'low',2:'low',3:'low', 4:'medium', 5:'medium', 6:'medium', 7:'medium', 8:'high', 9:'high',10:'high'})  
    #df['quality']=df['quality'].map({'low':0,'medium':1,'high':2})
    
    #1 denotes white wine and 0 denotes not white (red wine)
    dummies = pd.get_dummies(df['type'], drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop('type', axis=1, inplace=True)
    
    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    #data.drop(data.columns[[0]], axis=1, inplace=True)
    
    diag_map = {'low': 1.0, 'medium': -1.0,'high':-1.0}
    df['quality'] = df['quality'].map(diag_map)
    
    
    # put features & outputs in different data frames
    #Y=df.loc[:,'Rings']
    #X=df.iloc[:,0:-1]
    
    X = df.iloc[:,0:-1]
    y = df.loc[:,'quality']

    
    # make predictions
    #yhat = ovr.predict(X)
    # filter features
    #remove_correlated_features(X)
    #Xnew = np.apply_along_axis(new_feature, 1, X)
    #X = pd.DataFrame(Xnew)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    #X_normalized = MinMaxScaler().fit_transform(Xnew)
    #X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)
    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    
    y_train_predicted=np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))


# In[57]:


regularization_strength = 10000
learning_rate = 0.0005


# In[58]:

start = time.time()
# init1 assure if a data point belongs to class 1 or not #
# For Class 1 #
init1()

end = time.time()
print("Time taken to Evaluate: ",end - start)

reading dataset...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 2841964.1975582293
Epoch is: 2 and Cost is: 140214.23459847042
Epoch is: 4 and Cost is: 6042034.558077971
Epoch is: 8 and Cost is: 4566616.060672355
Epoch is: 16 and Cost is: 6022149.777780089
Epoch is: 32 and Cost is: 2921833.275570493
Epoch is: 64 and Cost is: 359064.5651312444
Epoch is: 128 and Cost is: 5062253.72251188
Epoch is: 256 and Cost is: 2950460.0568397464
Epoch is: 499 and Cost is: 4221058.499922915
training finished.
weights are: [  30.03960509    9.14281065   -2.52363241  -82.25516889    0.91494462
 -224.14829714 -559.4832991    -2.20136567   -7.61809845   -2.53571192
  -54.47850588  105.66540771]
testing the model...
accuracy on test dataset: 0.9984615384615385
recall on test dataset: 0.0
precision on test dataset: 0.0
Time taken to Evaluate:  33.45958185195923


In [1]:
import time
import numpy as np
from itertools import combinations_with_replacement
from sympy.core import Mul
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
def new_features(variables):
    variables=variables.tolist()
    max_degree = 4
    min_degree = 0
    if not variables or max_degree == 0:
        return np.array([1])
    variables = np.array(list(variables) + [1])
    monomials_list_comm = np.empty(0)
    for item in combinations_with_replacement(variables, max_degree):
        powers = dict()
        for variable in variables:
            powers[variable] = 0
        for variable in item:
            if variable != 1:
                powers[variable] += 1
        if max(powers.values()) >= min_degree:
            monomials_list_comm = np.append(monomials_list_comm,Mul(*item))
    return np.array(list(monomials_list_comm))

In [3]:
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [4]:
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))
    
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # average
    return dw

In [5]:
def sgd(features, outputs):
    max_epochs = 500
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            
            prev_cost = cost
            nth += 1
    return weights

In [6]:
def init1():
    print("reading dataset...")
    # read data in pandas (pd) data frame
    df = pd.read_csv('winequalityN.csv')
    
    
    # filling the missing values:
    for col , value in df.items():
        if col != 'type': # type col -> object datatype
            df[col] = df[col].fillna(df[col].mean())
            
    #Mapping values of target variable quality to 'low', 'medium' and 'high' categories for classification
    df['quality']=df['quality'].map({0:'low',1:'low',2:'low',3:'low', 4:'medium', 5:'medium', 6:'medium', 7:'medium', 8:'high', 9:'high',10:'high'})  
    #df['quality']=df['quality'].map({'low':0,'medium':1,'high':2})
    
    #1 denotes white wine and 0 denotes not white (red wine)
    dummies = pd.get_dummies(df['type'], drop_first=True)
    df = pd.concat([df, dummies], axis=1)
    df.drop('type', axis=1, inplace=True)
    
    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    #data.drop(data.columns[[0]], axis=1, inplace=True)
    
    diag_map = {'low': 1.0, 'medium': -1.0,'high':-1.0}
    df['quality'] = df['quality'].map(diag_map)
    
    
    # put features & outputs in different data frames
    #Y=df.loc[:,'Rings']
    #X=df.iloc[:,0:-1]
    
    X = df.iloc[:,0:-1]
    y = df.loc[:,'quality']

    
    # make predictions
    #yhat = ovr.predict(X)
    # filter features
    #remove_correlated_features(X)
    #Xnew = np.apply_along_axis(new_feature, 1, X)
    #X = pd.DataFrame(Xnew)
    #remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    #X_normalized = MinMaxScaler().fit_transform(Xnew)
    #X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b
    #X.insert(loc=len(X.columns), column='intercept', value=1)
    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    
    y_train_predicted=np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)
    # An unknow data given by me to check which class it belongs to
    #a = np.array([0.8, 0.6, 0.75, 0.75, 1])
    #yp = np.sign(np.dot(a, W))
    #print("Y value for a : ",yp)
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))


# In[57]:


regularization_strength = 10000
learning_rate = 0.0005


# In[58]:

start = time.time()
# init1 assure if a data point belongs to class 1 or not #
# For Class 1 #
init1()

end = time.time()
print("Time taken to Evaluate: ",end - start)

reading dataset...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 1366464.8913781878
Epoch is: 2 and Cost is: 1618089.0551780465
Epoch is: 4 and Cost is: 2781971.353480598
Epoch is: 8 and Cost is: 1582259.1003524382
Epoch is: 16 and Cost is: 3169951.884592682
Epoch is: 32 and Cost is: 2340112.3549028724
Epoch is: 64 and Cost is: 4161018.4436278734
Epoch is: 128 and Cost is: 1061039.6787408798
Epoch is: 256 and Cost is: 1807322.4705918992
Epoch is: 499 and Cost is: 5482528.361199586
training finished.
weights are: [ 2.32330102e+01  1.19657611e+01 -5.83697287e+00 -1.04806259e+02
  8.36943461e-01 -7.65616138e+01 -7.89759407e+02 -6.51888488e-01
  1.07764809e+00  2.06416530e+00 -1.05511156e+01  9.78972800e+01]
testing the model...
accuracy on test dataset: 0.9984615384615385
recall on test dataset: 0.0
precision on test dataset: 0.0
Time taken to Evaluate:  96.25946617126465


In [38]:
import time
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [39]:
df = pd.read_csv('winequalityN.csv')

In [40]:
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [41]:
# filling the missing values:
for col , value in df.items():
    if col != 'type': # type col -> object datatype
        df[col] = df[col].fillna(df[col].mean())
            
    #Mapping values of target variable quality to 'low', 'medium' and 'high' categories for classification
df['quality']=df['quality'].map({0:'low',1:'low',2:'low',3:'low', 4:'medium', 5:'medium', 6:'medium', 7:'medium', 8:'high', 9:'high',10:'high'})  
    #df['quality']=df['quality'].map({'low':0,'medium':1,'high':2})
    
    #1 denotes white wine and 0 denotes not white (red wine)
dummies = pd.get_dummies(df['type'], drop_first=True)
df = pd.concat([df, dummies], axis=1)
df.drop('type', axis=1, inplace=True)
    
    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    #data.drop(data.columns[[0]], axis=1, inplace=True)
    
diag_map = {'low': 1.0, 'medium': -1.0,'high':-1.0}
df['quality'] = df['quality'].map(diag_map)
    

In [42]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,white
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,-1.0,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,-1.0,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,-1.0,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,-1.0,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,-1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,-1.0,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,-1.0,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,-1.0,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,-1.0,0


In [43]:
df_lable1 = df["quality"]

In [44]:
df_lable1

0      -1.0
1      -1.0
2      -1.0
3      -1.0
4      -1.0
       ... 
6492   -1.0
6493   -1.0
6494   -1.0
6495   -1.0
6496   -1.0
Name: quality, Length: 6497, dtype: float64

In [45]:
df.drop('quality',axis=1,inplace=True)

In [46]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,white
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,0


In [55]:
import time
from sklearn.model_selection import train_test_split 

In [56]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(df,df_lable1,test_size = 0.5)

In [57]:
model = SVC(kernel = 'linear')
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [58]:
start = time.time()

score = model.score(X_test,y_test)
print(score)
end = time.time()
print("Time taken to Evaluate: ",end - start)

0.997229916897507
Time taken to Evaluate:  0.0059697628021240234
