# Read Data

In [1]:
import pandas as pd
print("reading data...")
data=pd.read_csv('winequality-red.csv',delimiter=';')

reading data...


# Get an idea of data by examining it

In [2]:
# print data
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


###### Data has 1599 training examples, 11 features, quality is class label
##### Find the values of quality


In [3]:
import numpy as np
print("unique values of label quality:",np.unique(data['quality']))

unique values of label quality: [3 4 5 6 7 8]


# Label quality takes 6 different values: 3,4,5,6,7,8
## Convert this to three classes: 0(bad)  if quality is less than 5, 1(good) if value is 5 or 6 and to 2(great) otherwise**

#### Criteria: 
**quality<5: class 0(bad)**   
**quality=5 or 6: class 1(good)**  
**quality>6: class 2(great)**

In [4]:
#converting dataframe to numpy array
data_array=pd.DataFrame.to_numpy(data)

In [5]:
print(data_array)
rows,cols=np.shape(data_array)
print(rows,cols)

[[ 7.4    0.7    0.    ...  0.56   9.4    5.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    5.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    5.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     6.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    5.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     6.   ]]
1599 12


In [6]:
# separate features and labels

x=data_array[:,:-1]
y=data_array[:,-1]

for i in range(rows):
    if y[i]<5:
        value=0
    elif y[i]<=6:
        value=1
    else:
        value=2
    y[i]=value

## Normalize all the other attributes by Z-score normalization

In [7]:

# find mean of each feature
mue=np.mean(x,axis=0)

# find standard deviation of each feature
std=np.std(x,axis=0)

# Normalize features
for i in range(0,cols-1):
    x[:,i]=(np.subtract(x[:,i],mue[i]))/std[i]


In [9]:
x.shape

(1599, 11)

### Segrete the attribute values into 4 equal spaced bins each giving the values between 0 to 3  and replace the values for that attribute with the number of corresponding to the interval they belong.

In [10]:
# find minimum and maximum values of each attribute
min_x=np.min(x,axis=0)
max_x=np.max(x,axis=0)

# calculate step size to define range of bins
step=(max_x-min_x)/4

#segregate them into bins
row,col=np.shape(x)
for j in range(col):
    for i in range(row):
        if(x[i,j]<=min_x[j]+step[j]):
            x[i,j]=0
        elif(x[i,j]<=min_x[j]+2*step[j]):
            x[i,j]=1
        elif(x[i,j]<=min_x[j]+3*step[j]):
            x[i,j]=2
        else:
            x[i,j]=3
x=x.astype(int)     
y=y.astype(int)
print(x)
print(y)

[[0 1 0 ... 2 0 0]
 [1 2 0 ... 1 0 0]
 [1 1 0 ... 1 0 0]
 ...
 [0 1 0 ... 2 1 1]
 [0 1 0 ... 2 0 1]
 [0 0 1 ... 2 0 1]]
[1 1 1 ... 1 1 1]


# concatenate x and y

In [17]:
# concatenate x and y
y=np.reshape(y,(len(y),1))
data=np.concatenate((x,y),axis=1)
data.shape

(1599, 12)

# Split data into training and testing data

In [19]:
m=data.shape[0]
split=int(0.8*m)   # 80 percent for training
ind=np.random.permutation(m)  # randomly permute a sequence
train_ind,test_ind=ind[:split],ind[split:]
#x1,y1 input and output data for training
#x2,y2 input and output data for testing
train_data=data[train_ind]
test_data=data[test_ind]


In [20]:
print(train_data.shape,test_data.shape)

(1279, 12) (320, 12)


In [46]:
# Converting numpy array to dataframe

columns=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
          'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']
features=columns[:-1]
trainData_df=pd.DataFrame(data=train_data,columns=columns)
testData_df=pd.DataFrame(data=test_data,columns=columns)
trainData_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,2,1,1,0,0,0,0,2,0,0,0,1
1,1,0,1,0,0,0,0,2,1,0,0,1
2,1,0,1,0,0,0,0,1,1,1,3,1
3,0,0,1,0,0,1,0,1,3,0,2,2
4,1,1,0,0,0,0,0,2,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1274,0,1,0,0,0,1,0,1,2,0,1,1
1275,1,1,0,0,0,1,0,2,2,0,0,1
1276,1,1,0,0,0,1,0,1,1,0,0,1
1277,0,1,0,0,0,0,0,1,2,1,0,1


# Implement ID3 Decision tree algorithm  
### It uses information gain to choose which attribute to split at each point
### Stop splitting a node if it has less than 10 data points

# Informatin Gain

In [26]:
def entropy(target):
    
    #Find different classes and counts corresponding to each class
    values,counts = np.unique(target,return_counts = True)
    
    entropy1=0
    #calculate entropy
    for i in range(len(values)):
        p=counts[i]/np.sum(counts)
        entropy1+=(-p)*np.log2(p)
    return entropy1

#Function to find Information gain
#It takes data and split attribute name
def InfoGain(data,attribute):
    target='quality'
    
    #Entropy at parent node
    total_entropy=entropy(data[target])
    
    
    val,counts=np.unique(data[attribute],return_counts = True)
    
    y=np.zeros(len(val),dtype=object)
    for j in range(len(val)):
        #store data separately for each of the unique values of attribute
        y[j]=data.where(data[attribute]==val[j]).dropna()[target]

        
    weighted_entropy=np.sum([(counts[j]/np.sum(counts))*entropy(y[j]) for j in range(len(val))])
    information_gain=total_entropy-weighted_entropy

In [74]:
# Decision tree

In [27]:
def ID3(data,originaldata,features,target_class='quality',parent_node_class=None):
    #Values and corresponding counts of target class at parent node
    val_t,count_t=np.unique(data[target_class],return_counts=True)
    #Find the dominant class(The class which occurs maximum no. of times in original dataset)
    val_o,count_o=np.unique(originaldata[target_class],return_counts=True)
    dominant_class=val_o[np.argmax(count_o)]
    #If all target values have same value assign that value as class
    if len(val_t) <= 1:
        return val_t
    #If the dataset is empty, return default class
    
    elif len(data)==0:
        return dominant_class
    #If the splitting is done using all features (No more features are left to split), assign the class which occurs 
    #maximum no. of times at parent node i.e parent_node_class
    elif len(features) ==0:
        return parent_node_class
    #If data points present at the node are less than or equal to 10, assign parent_node_class
    elif np.sum(count_t)<10:
        return parent_node_class
    else:

        parent_node_class=val_t[np.argmax(count_t)]
        gain=np.zeros(np.shape(features)[0])
        #Split the node using all remaining features and select a feature which gives maximum information gain
        for i in range(np.size(features)):
            attribute_name=features[i]
    
            gain[i]=InfoGain(data,attribute_name)
        best_feature=features[np.argmax(gain)]
        #Build tree with best_feature
        tree = {best_feature:{}}
        
        #Remove best feature from features as its splitting is done
        features = [item for item in features if item != best_feature]
        
        #Grow a branch under the parent node for each possible value of the parent  node feature
        
            
        val=np.unique(data[best_feature])
        for i in range(np.size(val)):
            value = val[i]
            #Split the dataset along the value of the feature with the largest information gain and therwith create sub_datasets
            new_data = data.where(data[best_feature] == value).dropna()
            
            #Call the ID3 algorithm for each of those sub_datasets with the new parameters (Recursion)
            subtree = ID3(new_data,originaldata,features,target_class,parent_node_class)
            
            #Add the sub tree, grown from the sub_dataset to the tree under the root node
            tree[best_feature][value] = subtree
            
        return(tree)   

In [28]:
#Prediction

def predict(test_data,tree_d,dominant_class):   
    test_keys=list(test_data.keys())
    tree_keys=list(tree_d.keys())
    for key in (test_keys):
        if key in (tree_keys):
            try:
                result = tree_d[key][test_data[key]] 
           
            except:
                return dominant_class
            
            #check if output is leaf node or not
            #If result is dictionary it is not leaf node, otherwise it will be leaf node
            if isinstance(result,dict):
                #If not leaf node, again predict using resultant tree
                return predict(test_data,result,dominant_class)
            else:
                #If leaf it is leaf node, return result
                return result

In [30]:
tree=ID3(trainData_df,trainData_df,features,target_class='quality',parent_node_class=None)

# convert test dataframe to dictionary

In [59]:
dict_test=testData_df.to_dict('records')
true_y=testData_df['quality']
true_y=true_y.to_numpy()

In [61]:
# Find dominant class of training data
# Assign sample to this class when different conditions occur
val_o,count_o=np.unique(trainData_df['quality'],return_counts=True)
dominant_class=val_o[np.argmax(count_o)]
pred_y=np.zeros(len(dict_test))

# predict output for every sample in test data

In [62]:
i=0
for sample in dict_test:
    pred_y[i]=predict(sample,tree,dominant_class)
    i=i+1
    

# Evaluation

In [67]:
def evaluate(h,y):
    accuracy=np.sum(h==y)/len(h)
    precision=np.sum(np.logical_and(h==1,y==1))/np.sum(h==1)
    recall=np.sum(np.logical_and(h==1,y==1))/np.sum(y==1)
    return accuracy,precision,recall

In [70]:
accuracy,precision,recall=evaluate(pred_y,true_y)
print('accuracy:{:.2%}\nprecision:{:.2%}\nrecall:{:.2%}'.format(accuracy,precision,recall))

accuracy:85.94%
precision:88.00%
recall:96.70%
