<h1> Multiple Instance Learning - A Lazy Learning Approach </h1>

<p> This python notebook has been created to understand the working of CKNN (lazy learner) algorithm for multiple instance learning problem.  Refer <a href="http://cogprints.org/2124/3/wang_ICML2000.pdf"> paper</a>.  In this implementation, stock selection problem has been solved using CKNN.  In the stock selection problem, for each day, the top 80% of stocks have been segregated as positive bags and bottom 20% of stocks have been segregated as negative bags </p>

<h3> Import libraries </h3>

In [0]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
import math
from sklearn.model_selection import train_test_split

<h3> Define the CitationKNN Model </h3>

In [0]:
class CitationKNN(object):

    def __init__(self):
        self._bags = None
        self._bag_predictions = None
        self._labels = None
        self._full_bags = None
        self._DM = None

    def fit(self, train_bags, train_labels, **kwargs):
        self._bags = train_bags
        self._labels = train_labels
        self._no_of_references = kwargs['references']
        self._no_of_citers = kwargs['citers']

    def predict(self, Testbags):
        train_bags = self._bags
        pred_labels = np.array([])
        self._DM = self.DistanceMatrixCKNN(train_bags)

        for i in range(0, len(Testbags)):

            citers = []
            references = []
            distances = []

            for j in range(0, len(train_bags)):
                distance = _min_hau_bag(Testbags[i], train_bags[j])
                distances.append(distance)
                self._DM[j].append(distance)

            self._DM.append(distances)
            last = len(self._DM) - 1
            self._DM[last].append(0)
            arr = np.array( self._DM[last] )
            references = arr.argsort()[:self._no_of_references + 1]

            index = np.argwhere(references==last)
            references = np.delete(references, index)

            for j in range(0, len(self._DM) - 1):
                arr = np.array( self._DM[j] )
                neighbors = arr.argsort()[:self._no_of_citers + 1]
                if last in neighbors:
                    citers.append(j)

            relevant_test_labels = []
            for j in range(0, len(references)):
                relevant_test_labels.append(self._labels[references[j]])
            for j in range(0, len(citers)):
                relevant_test_labels.append(self._labels[citers[j]])

            if (relevant_test_labels.count(1.0)):
                label_out = 1.0
            else:
                label_out = -1.0
            
            pred_labels = np.append(pred_labels,label_out)

            self._DM.pop()
            for j in range(0, len(self._DM)):
                self._DM[j].pop()

        return pred_labels

    def DistanceMatrixCKNN (self, full_bag):
        w, h = len(full_bag), len(full_bag)
        Matrix = [[0 for x in range(w)] for y in range(h)] 
        count=0        
        for i in range(0, len(full_bag)):
                for j in range(0, len(full_bag)):
                    Matrix[i][j] = _min_hau_bag(full_bag[i], full_bag[j])
                    
        return Matrix

<h3> Define the distance metric used by CKNN </h3>

In [0]:
def _min_hau_bag(X,Y):
    Hausdorff_distance = max( min((min([list(dist.euclidean(x, y) for y in Y) for x in X]))),
                               min((min([list(dist.euclidean(x, y) for x in X) for y in Y])))
                              )
    return Hausdorff_distance

<h3> Read the data and create positive and negative bags </h3>

In [0]:
def create_bags():
    file_path = 'Israel_Stocks.csv'
    df = pd.read_csv(file_path,nrows=1500)
    df = df.drop(['Ex Coefficient','Index Adjusted Free Float Rate','* Ex Type', 'Date', 'Symbol', 'Label'],axis=1)
    df = df.dropna(subset = df.columns)
    day = 1
    bag = []
    positive_bags = []
    negative_bags = []
    for index,row in df.iterrows():
        if(row['Day No.'] != day):
            positive_bags.append(bag[0:int(0.8*len(bag))])
            negative_bags.append(bag[int(0.8*len(bag)):])
            day += 1
            bag = []
        bag.append(row.values)
    if (len(bag) > 0):
        positive_bags.append(bag[0:int(0.8*len(bag))])
        negative_bags.append(bag[int(0.8*len(bag)):])
        
    positive_labels = [1]*len(positive_bags)
    negative_labels = [-1]*len(negative_bags)
    return positive_bags + negative_bags, positive_labels + negative_labels

<h3> Split the bags into train and test bags </h3>

In [26]:
bags,labels, = create_bags()
train_bags, test_bags, train_labels, test_labels = train_test_split(bags,labels,random_state=10)
print (len(train_bags))
print (len(test_bags))

73
25


<h3> Test the model for different k values </h3>

In [27]:
for k in range(2,4):

    print ("------MIL_CKNN------")
    cknn_clf = CitationKNN() 
    parameters_cknn = {'references': k, 'citers': k+2}
    cknn_clf.fit(train_bags, train_labels, **parameters_cknn)
    pred_labels = cknn_clf.predict(test_bags)
    total_count = len(test_labels)
    corr_count= 0
    for i in range(len(test_labels)):
        if (test_labels[i] == pred_labels[i]):
            corr_count += 1
    acc = float(corr_count) / total_count
    print("k=" + str(k)+", references="+str(k)+", citers="+str(k+2))
    print("acc=" + str(acc))

------MIL_CKNN------
k=2, references=2, citers=4
acc=0.36
------MIL_CKNN------
k=3, references=3, citers=5
acc=0.4
