In [47]:
# Данные с банкнотами

**Banknote authentication Data Set was taken from UCI repository.** 

**Data Set Information:**

Data were extracted from images that were taken from genuine and forged banknote-like specimens. For digitization, an industrial camera usually used for print inspection was used. The final images have 400x 400 pixels. Due to the object lens and distance to the investigated object gray-scale pictures with a resolution of about 660 dpi were gained. Wavelet Transform tool were used to extract features from images.


**Attribute Information:**

1. variance of Wavelet Transformed image (continuous)
2. skewness of Wavelet Transformed image (continuous)
3. curtosis of Wavelet Transformed image (continuous)
4. entropy of image (continuous)
5. class (integer)

In [None]:
import pandas as pd, numpy as np

In [None]:
# header=None: the first line of the file is read as data and the columns get integer labels 0..4.
# NOTE(review): the absolute '/content/...' path presumably points at a Colab upload — confirm before running elsewhere.
data = pd.read_csv('/content/data_banknote_authentication.txt', header=None)

In [None]:
data.head()

In [None]:
# negative instances

data[data[4]==0]

Unnamed: 0,0,1,2,3,4
0,3.62160,8.66610,-2.807300,-0.44699,0
1,4.54590,8.16740,-2.458600,-1.46210,0
2,3.86600,-2.63830,1.924200,0.10645,0
3,3.45660,9.52280,-4.011200,-3.59440,0
4,0.32924,-4.45520,4.571800,-0.98880,0
...,...,...,...,...,...
757,2.66060,3.16810,1.961900,0.18662,0
758,3.93100,1.85410,-0.023425,1.23140,0
759,0.01727,8.69300,1.398900,-3.96680,0
760,3.24140,0.40971,1.401500,1.19520,0


In [None]:
# positive instances

data[data[4]==1]

Unnamed: 0,0,1,2,3,4
762,-1.39710,3.31910,-1.392700,-1.99480,1
763,0.39012,-0.14279,-0.031994,0.35084,1
764,-1.66770,-7.15350,7.892900,0.96765,1
765,-3.84830,-12.80470,15.682400,-1.28100,1
766,-3.56810,-8.21300,10.083000,0.96765,1
...,...,...,...,...,...
1367,0.40614,1.34920,-1.450100,-0.55949,1
1368,-1.38870,-4.87730,6.477400,0.34179,1
1369,-3.75030,-13.45860,17.593200,-2.77710,1
1370,-3.56370,-8.38270,12.393000,-1.28230,1


In [None]:
# Dataset size and the share of each class (last column), computed from the
# frame itself instead of hard-coded row counts.
data.shape[0], float((data.iloc[:, -1] == 0).mean()), float((data.iloc[:, -1] == 1).mean())

(1372, 0.5553935860058309, 0.4446064139941691)

In [None]:
data.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'class']

In [None]:
data.head(3)

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0


In [None]:
from sklearn.preprocessing import Normalizer

In [None]:
# Normalizer rescales each row (sample) to unit norm; it does not standardise the columns.
scaler = Normalizer()

# Work on a copy so that the raw `data` frame is left untouched.
data_scaled = data.copy()

In [None]:
data_scaled.iloc[:, :-1]

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.62160,8.66610,-2.8073,-0.44699
1,4.54590,8.16740,-2.4586,-1.46210
2,3.86600,-2.63830,1.9242,0.10645
3,3.45660,9.52280,-4.0112,-3.59440
4,0.32924,-4.45520,4.5718,-0.98880
...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949
1368,-1.38870,-4.87730,6.4774,0.34179
1369,-3.75030,-13.45860,17.5932,-2.77710
1370,-3.56370,-8.38270,12.3930,-1.28230


In [None]:
scl = scaler.fit_transform(data_scaled.iloc[:, :-1])

In [None]:
scl.shape, data_scaled['class'].shape

((1372, 4), (1372,))

In [None]:
# Indexing a Series with [:, np.newaxis] is deprecated multi-dimensional indexing
# (removed in recent pandas); convert to a NumPy array first, then add the column axis
# so the labels can be stacked next to the (n_rows, 4) feature matrix.
data_new = np.hstack((scl, data_scaled['class'].to_numpy()[:, np.newaxis]))

  """Entry point for launching an IPython kernel.


In [None]:
data_new

array([[ 0.36905562,  0.88311049, -0.28607517, -0.04555008,  0.        ],
       [ 0.4650453 ,  0.83552454, -0.25151463, -0.14957274,  0.        ],
       [ 0.76378018, -0.52123157,  0.38015153,  0.02103063,  0.        ],
       ...,
       [-0.16567159, -0.59454115,  0.77718941, -0.12267994,  1.        ],
       [-0.23090319, -0.54314116,  0.80298095, -0.0830842 ,  1.        ],
       [-0.645067  , -0.16699315,  0.68117898,  0.30331015,  1.        ]])

In [None]:
data_sc_new = pd.DataFrame(data_new, columns=data.columns)

In [None]:
data_sc_new

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,0.369056,0.883110,-0.286075,-0.045550,0.0
1,0.465045,0.835525,-0.251515,-0.149573,0.0
2,0.763780,-0.521232,0.380152,0.021031,0.0
3,0.301268,0.829982,-0.349606,-0.313278,0.0
4,0.050902,-0.688796,0.706823,-0.152873,0.0
...,...,...,...,...,...
1367,0.193595,0.643125,-0.691221,-0.266693,1.0
1368,-0.168665,-0.592375,0.786715,0.041512,1.0
1369,-0.165672,-0.594541,0.777189,-0.122680,1.0
1370,-0.230903,-0.543141,0.802981,-0.083084,1.0


In [None]:
data_sc_new.iloc[:, 1].shape

(1372,)

Let us binarize the features, which are already normalised: a strictly positive value becomes 1, and any other value becomes 0.

In [None]:
# Binarise every feature column by sign: strictly positive -> 1, everything else -> 0
# (exact zeros stay 0, as in the element-by-element version). The last column (class)
# is not touched. One vectorised comparison per column replaces the per-cell .iloc writes.
for i in range(data_sc_new.shape[1]-1):
  column = data_sc_new.columns[i]
  data_sc_new[column] = (data_sc_new[column] > 0).astype(float)


All of the data are labelled, so we treat about 30% of the instances as unlabelled. This lets us apply the lazy classification algorithm to these supposedly unlabelled instances and then compare its predictions with the true labels.

In [None]:
0.3 * data_sc_new.shape[0]  # determine the size of the unlabelled part as 411

411.59999999999997

In [None]:
data_sc_new.iloc[553:964, 4].value_counts()

0.0    209
1.0    202
Name: class, dtype: int64

In [None]:
data_copy = data_sc_new.copy()

data_copy['class_2'] = data_copy['class']

In [None]:
# Hide the labels of rows 553..963 (411 rows) by writing -1 into column 5 ('class_2');
# the 'class' column keeps the true labels, which are used later to score the predictions.
data_copy.iloc[553:964, 5] = -1 # we change this part of dataset setting it to undetermined instances

In [None]:
data_copy['class'].value_counts()

0.0    762
1.0    610
Name: class, dtype: int64

In [None]:
data_copy['class_2'].value_counts()

 0.0    553
-1.0    411
 1.0    408
Name: class_2, dtype: int64

**So, we obtained three classes: positive, negative and undetermined:**

In [None]:
# Split by the working label. Explicit copies are taken because these frames are
# modified later (a 'pred' column is added to `undetermined`); assigning into a
# filtered slice of data_copy triggers pandas' SettingWithCopyWarning.
positive = data_copy[data_copy['class_2'] == 1].copy()
negative = data_copy[data_copy['class_2'] == 0].copy()
undetermined = data_copy[data_copy['class_2'] == -1].copy()

In [None]:
negative.head(4)

Unnamed: 0,variance,skewness,curtosis,entropy,class,class_2
0,1.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,1.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0


In [None]:
def intersect(a, b):
    """
    Position-wise intersection of two example descriptions.

    At every position the result holds the shared value when both examples
    agree there, and -1 when they differ.

    Parameters
    ----------
    a, b : sequences of equal length (list, tuple, NumPy array or pandas Series)

    Returns
    -------
    list
        One entry per position: the common value, or -1 on disagreement.
    """
    # zip pairs the values by position, so label-indexed Series work without
    # relying on integer keys being interpreted as positions.
    return [left if left == right else -1 for left, right in zip(a, b)]

In [None]:
intersect(positive.iloc[1, :4],  undetermined.iloc[1, :4])

[-1, 0.0, 1.0, 1.0]

In [None]:
# Number of feature positions in one example; row 0 is used explicitly so the
# cell does not depend on a loop variable left over from an earlier cell.
positive.iloc[0, :4].shape[0]

4

In [None]:
def formIntersections(context):
    """
    Build the pairwise intersections of all examples in a context.

    Parameters
    ----------
    context : pandas DataFrame whose first four columns hold the feature values.

    Returns
    -------
    list
        One intersection (as produced by `intersect`) for every pair of rows
        (i, j) with i < j, in row order.
    """
    # Pull the feature rows out once as plain lists; this avoids two pandas
    # .iloc lookups for every one of the n*(n-1)/2 pairs.
    rows = context.iloc[:, :4].values.tolist()
    intersections = []
    for i in range(len(rows)):
        for j in range(i + 1, len(rows)):
            intersections.append(intersect(rows[i], rows[j]))
    return intersections

In [None]:
positiveIntersections = formIntersections(positive)
negativeIntersections = formIntersections(negative)

In [None]:
len(positiveIntersections), len(negativeIntersections)

(83028, 152628)

In [None]:
positiveIntersections[0]

[0.0, 0.0, 1.0, 1.0]

In [None]:
def intersectionCoversExample(intersection, example):
    """
    Tell whether `example` is compatible with `intersection`.

    A position that holds 0 or 1 in the intersection is violated when the
    example holds the opposite binary value there; any other value in the
    intersection (such as -1) matches whatever the example holds.

    [0,-1,1,-1] covers [0,1,1,0], while [1,-1,1,0] does not cover [1,0,0,0].
    """
    for position, required in enumerate(intersection):
        if required == 1:
            if example[position] == 0:
                return False
        elif required == 0:
            if example[position] == 1:
                return False
    return True

In [None]:
def intersectionNotCoversSet(intersection, objectSet):
    """
    Report whether the intersection fails to cover at least one member of objectSet.

    Returns True as soon as one example that is not covered is found, and
    False when every example is covered (which includes an empty objectSet).

    [0,-1,1,-1] covers both [0,1,1,0] and [0,0,1,0], so the result is False;
    [1,-1,1,0] does not cover [1,0,0,0], so for [[1,0,0,0],[1,0,1,0]] the result is True.
    """
    return any(
        not intersectionCoversExample(intersection, member)
        for member in objectSet
    )

In [46]:
# Form hypotheses: an intersection is kept when it fails to cover at least one
# intersection of the opposite class (the test performed by intersectionNotCoversSet).
# NOTE(review): a stricter rule would require a hypothesis to cover no example of the
# opposite class at all — confirm that the weaker "misses at least one" test is intended.
#
# The verdict depends only on the pattern itself, so it is computed once per distinct
# pattern and against the distinct opposite patterns, instead of rescanning the whole
# opposite list for every duplicate. The resulting lists keep the same members, order
# and multiplicity as the exhaustive loops would produce.
distinctPositive = {tuple(pattern) for pattern in positiveIntersections}
distinctNegative = {tuple(pattern) for pattern in negativeIntersections}

keepPositive = {pattern: intersectionNotCoversSet(pattern, distinctNegative)
                for pattern in distinctPositive}
keepNegative = {pattern: intersectionNotCoversSet(pattern, distinctPositive)
                for pattern in distinctNegative}

positiveHypotheses = [pattern for pattern in positiveIntersections
                      if keepPositive[tuple(pattern)]]
negativeHypotheses = [pattern for pattern in negativeIntersections
                      if keepNegative[tuple(pattern)]]

In [48]:
len(positiveHypotheses), len(negativeHypotheses)

(76697, 150904)

In [49]:
positiveHypotheses

[[0.0, 0.0, 1.0, 1.0],
 [0.0, -1, -1, -1],
 [0.0, -1, 1.0, -1],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, -1],
 [-1, 0.0, 1.0, 1.0],
 [-1, -1, -1, 1.0],
 [0.0, -1, -1, -1],
 [0.0, -1, 1.0, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, -1],
 [-1, -1, -1, 1.0],
 [0.0, -1, -1, -1],
 [0.0, -1, 1.0, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, 1.0, -1],
 [0.0, -1, 1.0, 1.0],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, -1],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, -1, -1, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, -1, 1.0, 1.0],
 [0.0, -1, -1, -1],
 [0.0, -1, -1, -1],
 [-1, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, 1.0],
 [0.0, 0.0, 1.0, -1],
 [0.0, 0.0, 1.0, 1.

In [50]:
undetermined.iloc[:, :4].head()

Unnamed: 0,variance,skewness,curtosis,entropy
553,1.0,0.0,1.0,0.0
554,1.0,0.0,1.0,1.0
555,1.0,1.0,1.0,0.0
556,1.0,1.0,0.0,0.0
557,1.0,1.0,1.0,1.0


In [51]:
def classify(examples=None):
    """
    Classify examples by hypothesis voting.

    Every positive hypothesis that covers an example casts one positive vote,
    and every negative hypothesis that covers it casts one negative vote.
    When the two counts are equal the example is left undefined (-1);
    otherwise its label is the side with more votes.

    Parameters
    ----------
    examples : DataFrame or 2-D array-like of feature rows, optional
        The examples to classify. When omitted, the first four columns of the
        module-level `undetermined` frame are used.

    Returns
    -------
    list of int
        One label per example: 1, 0, or -1 for a tie.
    """
    if examples is None:
        examples = undetermined.iloc[:, :4]

    labels = []
    for testObject in np.asarray(examples):
        # Count the votes directly instead of materialising the full
        # examples-by-hypotheses coverage matrices.
        votingPositiveNum = sum(intersectionCoversExample(posHyp, testObject)
                                for posHyp in positiveHypotheses)
        votingNegativeNum = sum(intersectionCoversExample(negHyp, testObject)
                                for negHyp in negativeHypotheses)
        if votingPositiveNum == votingNegativeNum:
            labels.append(-1)
        else:
            labels.append(int(votingPositiveNum > votingNegativeNum))
    return labels

In [52]:
len(classify(undetermined.iloc[:, :4]))

411

In [53]:
classify(undetermined.iloc[:, :4])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,


In [54]:
undetermined.shape[0]

411

In [55]:
undetermined['pred'] = classify(undetermined.iloc[:, :4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
undetermined[undetermined['pred'] == -1] # we have no undetermined predicted class at all

Unnamed: 0,variance,skewness,curtosis,entropy,class,class_2,pred


In [58]:
# Number of hidden rows whose predicted label matches the true class.
int((undetermined['class'] == undetermined['pred']).sum())

349

In [59]:
# Accuracy on the hidden rows, computed from the frame rather than from
# hard-coded counts, so it stays correct if the split or the predictions change.
float((undetermined['class'] == undetermined['pred']).mean())

0.8491484184914841

**So, the accuracy of the predictions on the instances treated as unlabelled is 84.9%.**