In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

%matplotlib inline 
plt.rcParams["figure.figsize"] = (20,10)

In [2]:
colnames = ['variance','skewness','curtosis','entropy','class'] ##List of column names of dataset
dataset = pd.read_csv('banknote.txt',names = colnames) ##Reads the dataset into Datafra,e
dataset.head(100)
dataset.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,class
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,0.444606
std,2.842763,5.869047,4.31003,2.101013,0.497103
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.773,-1.7082,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [3]:
dataset['class'].replace(0,-1,inplace =True)##Converts Class labels from 0 to -1 as required for algorithm
dataset.head(25)
dataset.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,class
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,-0.110787
std,2.842763,5.869047,4.31003,2.101013,0.994207
min,-7.0421,-13.7731,-5.2861,-8.5482,-1.0
25%,-1.773,-1.7082,-1.574975,-2.41345,-1.0
50%,0.49618,2.31965,0.61663,-0.58665,-1.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [4]:
duplicate = dataset[dataset.duplicated()] ##Checks if Duplicate Rows present
print('Original Dataset Shape :',dataset.shape)
if(len(duplicate)!=0):
    dataset.drop_duplicates(inplace = True) ## If duplicate rows are found then they are dropped
    print('Dataset Shape after removing duplicates :',dataset.shape)

Original Dataset Shape : (1372, 5)
Dataset Shape after removing duplicates : (1348, 5)


In [5]:
##Creates a held out Validation Set
validation = dataset.sample(frac=0.2582)
print('Validation Set Shape :',validation.shape)

Validation Set Shape : (348, 5)


In [6]:
##Forms Training Set by removing rows which are present in Validation Set

##We perform outer join on dataset and Validation Set and keep only those rows which only came from dataset
train = dataset.merge(validation, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']

train.drop('_merge',axis='columns',inplace = True) ## We drop the extra column '_merge' added during merge operation
print('Training Set Shape :',train.shape)
dataset.head(50)

Training Set Shape : (1000, 5)


Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,-1
1,4.5459,8.1674,-2.4586,-1.4621,-1
2,3.866,-2.6383,1.9242,0.10645,-1
3,3.4566,9.5228,-4.0112,-3.5944,-1
4,0.32924,-4.4552,4.5718,-0.9888,-1
5,4.3684,9.6718,-3.9606,-3.1625,-1
6,3.5912,3.0129,0.72888,0.56421,-1
7,2.0922,-6.81,8.4636,-0.60216,-1
8,3.2032,5.7588,-0.75345,-0.61251,-1
9,1.5356,9.1772,-2.2718,-0.73535,-1


In [7]:
##Perceptron Algorithm as per Lecture Slides

def percepalgo(X,Y,learn_rate,threshold):
    w = np.array([1,1,1,1]) ##Initialize Weight Vector
    nt = learn_rate ##Set Learning Rate
    count = 0 ##Used to keep Track for how many iterations our model has not made any prediction mistake
    k = threshold
    while(True):
        
        randint = np.random.randint(0,len(Y)) ##Generates a random integer to be used select a random (xn,yn) pair
        xn = np.array(X.iloc[randint])
        yn = Y.iloc[randint]
    
        if(yn * np.sum(w*xn) < 0): ##Checks whether model has made a prediction mistake. if yes, then model weights are updated
                                   ##and count is reset to 0. If No, then count is increased by 1
        
            w = w + nt*yn*xn
            count = 0
            
        else:
            count+=1
        
        if(count >k): ##Checks whether model has not made any mistakes in last k (threshold value passed) iterations, if yes then weights
                        ##have converged and if we stop the loop. If No, then we continue checking predictions
            break
      
       
    return w ##Returns the converged weight vector
        

In [8]:
##Function to make Predictions

def predictions(X,w): ##Takes parameters as input vector X and converged weight vector w
    
    y_pred = np.round(np.sum(X*w,axis = 1)) ##Performs row wise element wise product of weight vector and input vector and
                                            ##takes it sum and rounds to nearest integer.
    
    y_pred[y_pred >= 0]  = 1 ##If predicted y value is gretaer than or equal to 0 we store class label as 1
    y_pred[y_pred < 0]  = -1 ##If predicted y value is less than 0 we store class label as -1
   
    
    return y_pred ##Returns array of predicted class labels

In [9]:
##Function To calculate Error made by model. We have taken error function has percentage of misclassifications by model.

def error(yn,y_pred):
   
    count = 0
    for i in range(len(yn)):
        if(yn[i]!=y_pred[i]):
            count+=1
    return((count/len(yn))*100)

In [10]:
## We perform 10-Fold Cross validation to determine value of Hyperparameter Learning Rate

f = ['variance','skewness','curtosis','entropy'] 

error_min = 100 ## We initialize minimum error to 100%
best_weight = np.array([]) ##Empty array to store Best Model weights
best_learnrate = 0 ##Stores the learning rate for which Average weight is minimum

for j in range(0,10): ##Loops over different values of Learing rate from 0.1 to 1 in steps of 0.1
   
    e = 0 ##Initialize error for particular current learning rate to 0
    
    w1 = np.array([0.0,0.0,0.0,0.0]) ##Array to store sum of weights for each fold in cross validation
    
    for i in range(0,10): ##loops over cross validation sets
       
        test = train[i*100:(i+1)*100] ##Splitted the training data into 9:1 ratio for train:test set
        train_cross = pd.concat([train[0:i*100],train[(i+1)*100:]])
        
        w = percepalgo(train_cross[f],train_cross['class'],0.1*(j+1)) ##Finds Converged weight vector over a training set and learning rate
        
        y_pred = predictions(np.array(test[f]),w) ##Calculates the predicted class labels for the test set using converged weigth vector from above
        
        e += error(np.array(test['class']),np.array(y_pred)) ##Calculates model error and adds it to overall error for a learning rate
        
        w1 += w ##Adds the current weight vector to overall weight vector for a learning rate
        
    if(e/10 < error_min):  ##Checks if average error for this learning rate is lesser than current minimum error.
                           ## IF yes, then we store minimum error as current average error for this learning rate , store the
                           ## learning rate and also the average weight vector
        error_min = e/10
        best_weight = w1/10
        best_learnrate = 0.1 *(j+1)
   

##Outputs the Best Model Parameters and corresponding learning rate
print('Best Model weights : ',best_weight,'with average error of',error_min,'%')
print('Best Learning Rate with Minimum Average Error is',best_learnrate)

Best Model weights :  [-11.08176065  -7.6350983   -7.33905558  -5.28575598] with average error of 4.4 %
Best Learning Rate with Minimum Average Error is 0.30000000000000004


In [11]:
##Function to calculate F- Score for a target class

def fscorecalc(t,y_pred,yn):
    tp = fp = fn = tn = 0 ##Initialixe True Positive (tp) , True Negative (tn) , False Positive (fp) and False Negative (fn) to 0
    
    for i in range(len(y_pred)): ##For each prediction checks if predicted value and given value is same, if same then
                                 ##checks if predicted value is equal to target, if yes then True Positive else True Negative
                                 ##If predicted value and actual value are not same, then check if predicted value same as target 
                                 ##then False Positive else False Negative
        if(y_pred[i] == yn[i] ):
            if(y_pred[i] == t):
                tp+=1
            else:
                tn+=1
        else:
            if(y_pred[i]== t):
                fp+=1
            else:
                fn+=1
    
    prec = tp / (tp + fp)    ##Precision is calculated as True Positive over sum of True Positive and False Positive
    
    recall = tp / (tp + fn)   ##Recall is claculated as True Positive over sum of True Positive and False Negative
    
    return (2*prec*recall) / (prec + recall) ##Returns F score which is calculated as harmonic mean of precision and recall
                

In [12]:
y_pred = predictions(np.array(validation[f]),best_weight) ##Calculates predictions for Validation Set data

print('Error of Best Model on Validation Set is ',error(np.array(validation['class']),np.array(y_pred)),'%') ##Model Error

print('F Score for class 1 is',fscorecalc(1,y_pred,np.array(validation['class']))) ##F-Score for Class label '1'

print('F Score for class -1 is',fscorecalc(-1,y_pred,np.array(validation['class']))) ##F-Score for class label '-1'

Error of Best Model on Validation Set is  5.459770114942529 %
F Score for class 1 is 0.9396825396825398
F Score for class -1 is 0.9501312335958005
