In [2]:
def collect_images(dataPath):
    """
    Dataset 1: load every 192 x 168 image under ``dataPath`` and resize it to 48 x 42.

    Only leaf directories (directories that contain no sub-directories) are
    read.  Each kept image is resized, flattened, and followed by the name of
    the directory it was found in, which serves as its label.

    Args:
        dataPath: root directory that is walked recursively.

    Returns:
        np.ndarray built from one row per kept image (flattened pixel values
        followed by the label).  Because each row mixes numbers with a string
        label, NumPy stores the entries as strings.  The array is empty when
        no image qualifies.
    """
    data = []

    for dirPath, dirName, fileNames in os.walk(dataPath):
        if dirName:
            # Images are only collected from leaf directories.
            continue

        # os.path handles the platform separator; splitting on '/' produced a
        # wrong label wherever os.walk yields paths with another separator.
        label = os.path.basename(dirPath)

        for fileName in fileNames:
            img = io.imread(os.path.join(dirPath, fileName))

            # Some images have other dimensions; those are dropped.
            if img.shape[0] == 192 and img.shape[1] == 168:
                img_re = resize(img, (48, 42), anti_aliasing=True)
                row = img_re.flatten().tolist()
                row.append(label)
                data.append(row)

    return np.array(data)

In [2]:
def collect_data(dataPath):
    """
    Dataset 2: load the pickled batches stored under ``dataPath``.

    Only leaf directories named ``train``, ``test`` or ``meta`` are read.
    Every file inside them is unpickled (with ``encoding='bytes'``) and passed
    through ``decoding``.  Files in any other folder are skipped without being
    opened, so unrelated files can no longer abort the load.

    Args:
        dataPath: root directory that is walked recursively.

    Returns:
        Tuple ``(train_data, test_data, meta)`` of lists of decoded objects.
    """
    buckets = {'train': [], 'test': [], 'meta': []}

    for dirPath, dirName, fileNames in os.walk(dataPath):
        if dirName:
            # Batches are only collected from leaf directories.
            continue

        # os.path handles the platform separator instead of a hard-coded '/'.
        target = buckets.get(os.path.basename(dirPath))
        if target is None:
            continue

        for fileName in fileNames:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # this function at data files from a trusted source.
            with open(os.path.join(dirPath, fileName), 'rb') as fo:
                dt = pickle.load(fo, encoding='bytes')
            target.append(decoding(dt))

    return buckets['train'], buckets['test'], buckets['meta']

In [3]:
def decoding(d):
    """
    Return a copy of ``d`` with byte strings turned into text.

    Keys and values that are ``bytes`` are decoded as UTF-8.  Everything else
    (including keys that are already ``str``, which previously raised an
    AttributeError) is kept unchanged.

    Args:
        d: mapping, e.g. a dict loaded with ``pickle.load(..., encoding='bytes')``.

    Returns:
        A new dict; ``d`` itself is not modified.
    """
    def _as_text(item):
        return item.decode("utf-8") if isinstance(item, bytes) else item

    return {_as_text(k): _as_text(v) for k, v in d.items()}
            
    

In [4]:
def dataMatrix(data, n):
    """
    Dataset 1: stack the first images of ``data`` into a uint8 matrix.

    Args:
        data: indexable collection whose items provide ``flatten()``.
        n: mapping from class to its image count; the counts are summed to
           decide how many leading items of ``data`` are used.

    Returns:
        np.ndarray of dtype uint8 with one flattened item per row.
    """
    # Handle the case with unbalanced data - to be done.

    # Balanced data set: the number of rows is the sum of the per-class counts.
    total_images = sum(n[k] for k in n.keys())

    print('total valid images:', total_images)

    rows = [data[i].flatten().tolist() for i in range(total_images)]
    return np.array(rows, dtype='uint8')

    

In [5]:
def createDataMatrix(imgs):
    """
    Dataset 2: build one matrix of gray-scale images with labels appended.

    Args:
        imgs: iterable of batches; each batch is indexed with ``'data'``
              (a 2-D array, one image per row) and ``'labels'``.

    Returns:
        np.ndarray whose rows are the output of ``rgbToGray`` for each image,
        followed by that image's label in the last column.
    """
    gray_rows = []
    label_list = []

    for batch in imgs:
        n_rows = batch['data'].shape[0]
        for idx in range(n_rows):
            gray_rows.append(rgbToGray(batch['data'][idx, :]).tolist())
            label_list.append(batch['labels'][idx])

    # Turn the labels into a column so they can be attached along axis 1.
    label_column = np.array(label_list)[:, np.newaxis]
    return np.append(gray_rows, label_column, axis=1)
    
    

In [1]:
def pca(data, req_energy_per):
    """
    Project ``data`` onto its leading principal components.

    The covariance matrix of the columns is decomposed once with
    ``np.linalg.eigh`` (it is symmetric, so eigenvalues are real and each
    eigenvector stays paired with its eigenvalue).  Components are taken in
    order of decreasing eigenvalue until their cumulative sum reaches
    ``req_energy_per`` times the sum of all eigenvalues.

    The data is projected as given, without subtracting the mean, exactly as
    before, so callers can project further data with the returned vectors
    alone.

    Args:
        data: 2-D array, one observation per row and one feature per column.
        req_energy_per: fraction (e.g. 0.95) of the eigenvalue sum to retain.

    Returns:
        Tuple ``(lower_dim_pts, vecs)``: the projected points, one row per
        observation, and the selected eigenvectors, one per row.
    """
    data = np.asarray(data)

    mat_cv = np.atleast_2d(np.cov(data, rowvar=False))
    eg_val, eg_vec = np.linalg.eigh(mat_cv)

    # eigh returns ascending eigenvalues; reorder to largest first.
    order = np.argsort(eg_val)[::-1]
    sorted_eg_val = eg_val[order]

    energy_req = req_energy_per * sorted_eg_val.sum()

    cnt = 0
    current_energy = 0
    for k in sorted_eg_val:
        if current_energy >= energy_req:
            break
        current_energy += k
        cnt += 1

    # One eigenvector per row, in the same order as the sorted eigenvalues.
    vecs = eg_vec[:, order[:cnt]].T

    lower_dim_pts = np.dot(vecs, data.T).T

    return lower_dim_pts, vecs

In [7]:
def viewImg(imgArr, gray = False):
    """
    Show ``imgArr`` with ``io.imshow``.

    Args:
        imgArr: image array handed to ``io.imshow``.
        gray: when truthy, the image is shown with the "gray" colour map.
    """
    if gray:
        io.imshow(imgArr, cmap="gray")
        return
    io.imshow(imgArr)
    

In [8]:
def rgbToGray(img):
    """
    Convert a flattened RGB image to gray scale.

    The input is read as three consecutive colour planes: the first third of
    the values is weighted as red, the second as green, the last as blue.
    Using: Y = 0.2125 R + 0.7154 G + 0.0721 B, rounded to the nearest integer.

    The planes may have any length (a 32 x 32 image gives 1024 values per
    plane); previously only exactly 3 * 1024 values were accepted.

    Args:
        img: array-like whose size is a multiple of 3.

    Returns:
        1-D integer np.ndarray with one gray value per pixel.
    """
    planes = np.reshape(img, (3, -1))

    # Whole-array arithmetic replaces the per-pixel Python loop.
    luminance = 0.2125 * planes[0] + 0.7154 * planes[1] + 0.0721 * planes[2]

    # np.rint rounds halves to the even neighbour, like the built-in round().
    return np.rint(luminance).astype(int)
    
    
    

In [9]:
def pickling(data, file):
    """
    Serialize ``data`` with pickle into ``file + '.pickle'``.

    Args:
        data: object to store.
        file: path of the target file without the ``.pickle`` suffix.

    Returns:
        None.
    """
    # The context manager closes the file even when pickle.dump raises.
    with open(file + '.pickle', 'wb') as pk:
        pickle.dump(data, pk)

In [10]:
def unpickle(file):
    """
    Load the object stored in ``file + '.pickle'``.

    Args:
        file: path of the source file without the ``.pickle`` suffix.

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # come from a trusted source.
    # The context manager closes the handle, which was previously left open.
    with open(file + '.pickle', 'rb') as pk:
        return pickle.load(pk)

In [11]:
def startKFold(train_data, K, b = False, bag = False):
    """
    Run K-fold cross validation on ``train_data``.

    Args:
        train_data: 2-D array with the class label in the last column; it is
                    cut into K equal parts with ``np.split``.
        K: number of folds.
        b: when truthy, every fold is evaluated with ``Adaboost``.
        bag: when truthy (and ``b`` is not), every fold is evaluated with
             ``bagging``.

    Returns:
        With neither ``b`` nor ``bag``: the GaussianNB model with the highest
        validation score.  Otherwise None; the statistics are only printed.
    """
    global no_of_iter  # number of rounds handed to Adaboost / bagging

    folds = np.split(train_data, K, axis=0)

    accuracy, errorate, model = [], [], []

    for i in range(K):
        held_out = folds[i]
        v_label = held_out[:, -1]   # validation labels
        v_set = held_out[:, :-1]    # validation features

        # Every fold except the held-out one forms the training part.
        remaining = [folds[j] for j in range(K) if j != i]
        train_set = remaining[0]
        if len(remaining) > 1:
            train_set = np.concatenate(remaining, axis=0)

        if b or bag:
            ensemble = Adaboost if b else bagging
            # The ensemble routines work on pandas frames, not on np arrays.
            train_Acc, test_Acc = ensemble(no_of_iter, pd.DataFrame(train_set), pd.DataFrame(held_out))
            print('Cross Validation Fold ',i,' test set accuracy is :', test_Acc)
            accuracy.append(train_Acc)
            errorate.append(100 - train_Acc)
            continue

        classifier = GaussianNB()
        classifier.fit(train_set[:, :-1], train_set[:, -1])
        model.append(classifier)
        accuracy.append(classifier.score(v_set, v_label))

    if not (b or bag):
        print('Mean of accuracy: ', stat.mean(accuracy), 'Standard deviation of accuracy: ',stat.pstdev(accuracy))
        return model[accuracy.index(max(accuracy))]

    if b:
        print('Mean accuracy (boosting): ', stat.mean(accuracy),'%', 'Standard deviation of accuracy: ',stat.pstdev(accuracy))
    else:
        print('Mean accuracy(bagging): ', stat.mean(accuracy),'%', 'Standard deviation of accuracy: ',stat.pstdev(accuracy))
    print('Mean of error rate: ', stat.mean(errorate),'%', ' Standard deviation of error rate: ',stat.pstdev(errorate))
    return
    

In [12]:
def LDA(train_set):
    """
    Project a labelled data set onto its linear discriminant directions.

    Args:
        train_set: 2-D np.ndarray in the x feature space; the last column
                   holds the class label.  Labels may be any values accepted
                   by ``np.unique``; they no longer have to be the contiguous
                   integers 0..C-1 or 1..C.

    Returns:
        Tuple ``(lda_space, vecs)``:
        - lda_space: projected points (Y = W^T . X), one row per sample, with
          the original label appended as the last column.
        - vecs: the projection vectors, one per row; at most C - 1 of them
          for C classes.

    Raises:
        numpy.linalg.LinAlgError: when the within-class scatter matrix cannot
            be inverted.
    """
    features = np.asarray(train_set[:, :-1], dtype=float)
    labels = train_set[:, -1]

    # Class labels and the number of points in each class.
    classes, counts = np.unique(labels, return_counts=True)

    n_features = features.shape[1]
    overall_mean = features.mean(axis=0)

    # Sw: sum of the per-class covariance matrices (not weighted by the class
    # size, as in the original implementation).
    # Sb: sum over classes of N_i (u_i - u)(u_i - u)^T.
    Sw = np.zeros((n_features, n_features))
    Sb = np.zeros((n_features, n_features))

    # Iterating labels and counts together keeps every count with its class,
    # instead of indexing the count list with the label value.
    for cls, n_i in zip(classes, counts):
        members = features[labels == cls]
        Sw += np.atleast_2d(np.cov(members, rowvar=False))
        diff = (members.mean(axis=0) - overall_mean)[:, np.newaxis]
        Sb += n_i * np.dot(diff, diff.T)

    # Eigenvectors of Sw^-1 . Sb are the projection vectors.
    W = np.dot(np.linalg.inv(Sw), Sb)
    vals, vecs = np.linalg.eig(W)

    # Rank only the real eigenvalues, but keep their positions in the full
    # result so every index still points at the matching column of vecs.
    real_pos = np.flatnonzero(vals.imag == 0)
    ranked = real_pos[np.argsort(vals.real[real_pos])[::-1]]
    keep = ranked[:len(classes) - 1]

    # Eigenvectors of real eigenvalues carry no imaginary part.
    vecs = vecs[:, keep].real.T

    lda_space_pt = np.dot(vecs, features.T).T
    lda_space = np.append(lda_space_pt, labels[:, np.newaxis], axis=1)

    return lda_space, vecs
    

In [None]:
def classify(train_data, test_data):
    """
    Fit a GaussianNB model on ``train_data`` and print its score on ``test_data``.

    Both arguments are 2-D arrays whose last column is the label and whose
    remaining columns are the features.  Nothing is returned.
    """
    train_features, train_labels = train_data[:, :-1], train_data[:, -1]

    classifier = GaussianNB()
    classifier.fit(train_features, train_labels)

    test_score = classifier.score(test_data[:, :-1], test_data[:, -1])
    print('Accuracy: ', test_score)
    return
    

In [13]:
def crossValidation(data, K, b = False, bag = False):
    """
    Shuffle ``data`` and hand it to ``startKFold``.

    Note that ``np.random.shuffle`` reorders ``data`` in place, so the
    caller's array is modified.

    Returns:
        Whatever ``startKFold(data, K, b, bag)`` returns.
    """
    np.random.shuffle(data)
    outcome = startKFold(data, K, b, bag)
    return outcome
    
    

In [14]:
def classifyTestset(model, test_d):
    """
    Print the model's score on the test set and count its predictions.

    Args:
        model: trained model providing ``score`` and ``predict``.
        test_d: 2-D array, features first and the true label in the last column.

    Returns:
        Matrix of shape (no_of_classes, no_of_classes) where entry
        [predicted, true] counts the samples with that combination.
    """
    features = test_d[:, :-1]
    true_labels = test_d[:, -1]

    print('Accuracy using best model on Test set: ', model.score(features, true_labels))
    cm = np.zeros((no_of_classes, no_of_classes))

    predicted_cls = model.predict(features)

    # Rows are indexed by the prediction, columns by the true label.
    for i, guess in enumerate(predicted_cls):
        cm[int(guess), int(true_labels[i])] += 1

    return cm
    

In [15]:
def randomSplitDataset(nr, a):
    """
    Return how many of ``nr`` rows make up ``a`` percent, rounded.

    Args:
        nr: total number of rows.
        a: percentage of data points for the training set.

    Returns:
        ``round(a * nr / 100)``; no data is shuffled or split here.
    """
    return round((a * nr) / 100)
    

In [16]:
def makeSplit(nr, K):
    """
    Return the largest row count not above ``nr`` that is divisible by ``K``.

    Args:
        nr: number of available rows.
        K: number of equal parts wanted.
    """
    leftover = nr % K
    return nr - leftover
    

In [None]:
def collect_letters(path):
    """
    Read ``letter-recognition.data`` from the directory ``path``.

    The file is parsed as CSV without a header row.

    Returns:
        pandas.DataFrame with the file's content.
    """
    csv_location = path + '/letter-recognition.data'
    return pd.read_csv(csv_location, header=None)
    

In [None]:
def superClassifier(h_alpha, data):
    """
    Accuracy of the alpha-weighted vote of all weak classifiers.

    Args:
        h_alpha: DataFrame with one row per weak classifier; column 0 holds
                 its alpha and column i + 1 its prediction for sample i.
        data: DataFrame whose last column holds the true labels.

    Returns:
        Percentage of samples whose winning label equals the true label,
        rounded to two decimals.
    """
    n_samples = data.shape[0]
    hits = 0

    for sample in range(n_samples):
        # Sum the alphas per predicted label; the label with the largest sum wins.
        alpha_per_label = h_alpha.groupby([sample + 1])[[0]].sum()[0]
        winner = int(alpha_per_label.idxmax())
        if winner == data.iloc[sample, -1]:
            hits += 1

    return np.round((hits / n_samples) * 100, 2)

In [None]:
def Adaboost(N, train_data, test_data):
    """
    Boost shallow decision trees and report the ensemble accuracy.

    Args:
        N: number of rounds of boosting.
        train_data: training DataFrame, features first, label in the last column.
        test_data: testing DataFrame with the same layout.

    Returns:
        Tuple ``(train_acc, test_acc)`` as computed by ``superClassifier``.

    Every weak learner is a DecisionTreeClassifier with ``max_depth = 2`` and
    ``max_leaf_nodes = 3``.
    """
    nd_train = train_data.shape[0]
    nd_test = test_data.shape[0]

    nc = len(alphabet.keys()) - 1

    # Each row: alpha of the k-th classifier followed by its predictions.
    H_alpha_train = pd.DataFrame(data=np.zeros((N, nd_train + 1)))
    H_alpha_test = pd.DataFrame(data=np.zeros((N, nd_test + 1)))

    # The feature slices do not change between rounds, so take them once.
    train_features = train_data.iloc[:, :-1]
    test_features = test_data.iloc[:, :-1]

    df = pd.DataFrame(data=np.zeros((nd_train, 3)), columns=['weight', 'tlabel', 'plabel'])
    df['weight'] = 1/nd_train
    # Assign by position: assigning the Series itself aligns on the index and
    # leaves NaN / misplaced labels when train_data has no default RangeIndex.
    df['tlabel'] = train_data.iloc[:, -1].to_numpy()

    # AdaBoost

    for k in range(0, N):

        # Weak learner fitted on the training data with the current weights.
        dt = DecisionTreeClassifier(max_depth = 2, max_leaf_nodes = 3)

        dt.fit(train_features, df['tlabel'], sample_weight = df['weight'].values)

        predictions_train = dt.predict(train_features)
        predictions_test = dt.predict(test_features)

        df['plabel'] = predictions_train

        # Weighted training error.
        eps = df.loc[df['plabel'] != df['tlabel'], 'weight'].sum()/df['weight'].sum()

        # NOTE(review): eps == 0 makes this ratio a division by zero — confirm
        # that a perfect weak learner is meant to receive an unbounded alpha.
        alpha = np.log((1-eps)/eps) + np.log(nc)  # Learning Rate 1, SAMME

        # NOTE(review): the bound is derived from the number of training rows;
        # SAMME is usually stated with the number of classes — confirm intent.
        if(eps <= (1 - 1/nd_train)):

            # Raise the weight of the misclassified samples, then renormalise.
            df.loc[df['tlabel'] != df['plabel'], 'weight']*= np.exp(alpha)
            df['weight']/=df['weight'].sum()
            H_alpha_train.loc[k] = [alpha] + predictions_train.tolist()
            H_alpha_test.loc[k] = [alpha] + predictions_test.tolist()

        else:
            break

    train_acc = superClassifier(H_alpha_train, train_data)
    test_acc = superClassifier(H_alpha_test, test_data)

    return train_acc, test_acc
    

In [None]:
def baggingClassifier(h, data):
    """
    Accuracy (in percent) of the class with the highest summed score.

    Args:
        h: 2-D score table, one row per sample and one column per class; the
           index of the largest entry in a row is used as the predicted label.
        data: DataFrame whose last column holds the true labels.
    """
    true_labels = data.iloc[:, -1]
    predicted_labels = np.argmax(h, axis=1)
    return accuracy_score(predicted_labels, true_labels) * 100

In [3]:
def bagging(N, train_data, test_data, norm = False):
    """
    Bag shallow decision trees and report the ensemble accuracy.

    Args:
        N: number of bagging rounds (one bootstrap sample and one tree each).
        train_data: training DataFrame, features first, label in the last column.
        test_data: testing DataFrame with the same layout.
        norm: ``'tanh'``, ``'minmax'`` or ``'zscore'`` sums the normalised
              class scores of all trees; any other value (default False)
              uses a majority vote over the predicted labels.

    Returns:
        Tuple ``(train_acc, test_acc)`` in percent.

    Every tree is a DecisionTreeClassifier with ``max_depth = 2`` and
    ``max_leaf_nodes = 3``.
    """
    nd_train = train_data.shape[0]
    nd_test = test_data.shape[0]

    nc = len(alphabet.keys())

    # Decide once which aggregation is used.  Previously an unrecognised
    # truthy ``norm`` filled the vote table in the loop but was then scored
    # from the untouched, all-zero score table.
    use_scores = norm in ('tanh', 'minmax', 'zscore')

    train_features = train_data.iloc[:, :-1]
    test_features = test_data.iloc[:, :-1]

    # Summed class scores (score mode) and per-round predictions (vote mode).
    H_train = np.zeros((nd_train, nc))
    H_test = np.zeros((nd_test, nc))

    h_train = np.zeros((N, nd_train), dtype='int64')
    h_test = np.zeros((N, nd_test), dtype ='int64')

    for k in range(0, N):

        # Bootstrap sample of the same size as the training set.
        tdata = train_data.sample(n=train_data.shape[0], replace = True).reset_index(drop = True)

        dt = DecisionTreeClassifier(max_depth = 2, max_leaf_nodes = 3)

        dt.fit(tdata.iloc[:,:-1], tdata.iloc[:, -1])

        if not use_scores:
            h_train[k,:] = dt.predict(train_features).tolist()
            h_test[k,:] = dt.predict(test_features).tolist()
            continue

        # NOTE(review): the sums below need one score column per class in
        # ``alphabet``; presumably every bootstrap sample contains all
        # classes — confirm.
        score_train = dt.predict_proba(train_features)
        score_test = dt.predict_proba(test_features)

        if(norm == 'tanh'):
            score_train = np.tanh(score_train)
            score_test = np.tanh(score_test)

        elif(norm == 'minmax'):
            # Train and test scores are scaled independently, as before.
            sc = MinMaxScaler(feature_range=(0, 1), copy = False)
            score_train = sc.fit(score_train).transform(score_train)

            sc = MinMaxScaler(feature_range=(0, 1), copy = False)
            score_test = sc.fit(score_test).transform(score_test)

        else:
            # NOTE(review): if zscore is scipy.stats.zscore, a constant score
            # column becomes NaN — confirm how that case should be handled.
            score_train = zscore(score_train)
            score_test = zscore(score_test)

        # Sums of scores
        H_train+=score_train
        H_test+=score_test

    if use_scores:
        train_acc = baggingClassifier(H_train, train_data)
        test_acc = baggingClassifier(H_test, test_data)
    else:
        train_acc = baggingClassifier2(h_train, train_data)
        test_acc = baggingClassifier2(h_test, test_data)

    return train_acc, test_acc
    

In [4]:
def baggingClassifier2(h ,data):
    """
    Accuracy (in percent) of the majority vote over all rounds.

    Args:
        h: 2-D integer array, one row per round and one column per sample,
           holding the predicted labels.
        data: DataFrame whose last column holds the true labels.
    """
    # The most frequent label in a column is that sample's prediction.
    predictions = [np.bincount(h[:, col]).argmax() for col in range(0, data.shape[0])]

    return accuracy_score(predictions, data.iloc[:, -1]) * 100