In [39]:
from abc import ABCMeta, abstractmethod
import numpy as np
from numpy.random import choice
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomTreesEmbedding
from sythetic_ver2 import Sythetic
from sythetic_ver2 import txn_class
from sythetic_ver2 import cust_class
from scipy.sparse import issparse
from sklearn.ensemble.bagging import _generate_indices
from sklearn.ensemble.bagging import _generate_bagging_indices

In [2]:
# Synthetic data setup.
# Each txn_class below is built from the integer 5 and a 5x10 matrix whose rows
# each sum to 1.0. Presumably every row is a probability distribution over 10
# items for one of 5 slots in a transaction -- TODO confirm against sythetic_ver2.
t1=txn_class(5,[[.9,.1,.0,.0,.0,.0,.0,.0,.0,.0], # single pattern: only the first row is concentrated (0.9 on item 0)
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1]])

t2=txn_class(5,[[.1,.9,.0,.0,.0,.0,.0,.0,.0,.0], # two-order interactive items: rows 0 and 1 are concentrated
               [.9,.1,.0,.0,.0,.0,.0,.0,.0,.0],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1]])

t3=txn_class(5,[[.1,.1,.1,.1,.1,.1,.1,.1,.1,.1], # two-order interactive items: rows 1 and 2 are concentrated
               [.1,.0,.0,.0,.1,.0,.1,.0,.7,.0],
               [.1,.0,.0,.0,.1,.1,.1,.6,.0,.0],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
               [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1]])

t4=txn_class(5,[[.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],# three-order interactive items: rows 2, 3 and 4 are concentrated
                [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
                [.0,.0,.0,.0,.1,.8,.1,.0,.0,.0],
                [.0,.0,.0,.0,.0,.1,.8,.1,.0,.0],
                [.0,.0,.0,.0,.0,.0,.1,.8,.1,.0]])

t5=txn_class(5,[[.0,.0,.9,.0,.0,.0,.0,.0,.0,.1],# full-order interactive items: every row is concentrated
                [.1,.0,.9,.0,.0,.0,.0,.0,.0,.0],
                [.0,.0,.9,.0,.0,.1,.0,.0,.0,.0],
                [.0,.0,.0,.0,.9,.0,.0,.1,.0,.0],
                [.0,.0,.0,.0,.9,.0,.1,.0,.0,.0]])

t6=txn_class(5,[[.0,.0,.8,.0,.0,.1,.0,.0,.0,.1],# full-order interactive items: like t5 but with more mass spread out
                [.1,.0,.9,.0,.0,.0,.0,.0,.0,.0],
                [.0,.0,.7,.1,.0,.1,.1,.0,.0,.0],
                [.1,.0,.1,.0,.7,.0,.0,.1,.0,.0],
                [.1,.0,.1,.0,.1,.0,.1,.0,.6,.0]])

t7=txn_class(5,[[.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],# random sample: every row is uniform
                [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
                [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
                [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1],
                [.1,.1,.1,.1,.1,.1,.1,.1,.1,.1]])

# Three customer classes (ids 0, 1, 2); each combines two patterned transaction
# classes with the uniform t7.
c1=cust_class(0,t1,t2,t7)
c2=cust_class(1,t3,t4,t7)
c3=cust_class(2,t5,t6,t7)

s=Sythetic()
# NOTE(review): the meaning of each tuple (customer class, [50,50,50], 50) and of
# 'full' is defined in sythetic_ver2 and is not visible here. Presumably the
# trailing 50 is the number of customers per class (the printed matrix below has
# 150 rows) -- confirm.
s.doc_final([(c1,[50,50,50],50),(c2,[50,50,50],50),(c3,[50,50,50],50)],'full')
# Vectorise the generated documents; the notebook output shows a sparse CSR count matrix.
s.count2vect()

<150x14237 sparse matrix of type '<class 'numpy.int64'>'
	with 112837 stored elements in Compressed Sparse Row format>

In [67]:
class Sampleing(metaclass=ABCMeta):
    """Abstract base for the row/column sub-samplers (first pipeline layer).

    ``fit`` draws one set of feature indices and one set of sample indices with
    ``_generate_bagging_indices``; ``fit_transform`` returns the corresponding
    sub-matrix of the training data and ``transform`` applies only the feature
    selection to new data.

    ``max_samples`` / ``max_features`` values ``<= 1`` are read as a fraction of
    the fitted data's rows / columns; larger values are absolute counts.

    Attributes set by ``fit``:
        X: the training data (densified if it was sparse).
        n_samples, n_features: shape of ``X``.
        max_samples_, max_features_: the resolved absolute counts.
        feat_indices, sample_indices: the drawn column / row indices.
    """

    @abstractmethod
    def __init__(self,
                 random_state=None,
                 bootstrap_features=False,
                 bootstrap_samples=False,
                 n_features=10,
                 n_samples=10,
                 max_features=5,
                 max_samples=5):
        self.random_state = random_state
        self.bootstrap_features = bootstrap_features
        self.bootstrap_samples = bootstrap_samples
        self.n_features = n_features
        self.n_samples = n_samples
        self.max_features = max_features
        self.max_samples = max_samples

    @staticmethod
    def _resolve_count(limit, total):
        """Turn a ``max_*`` setting into an absolute count for ``total`` items."""
        if limit <= 1:
            # Fractional setting: truncate toward zero.
            return int(total * limit)
        return limit

    def fit(self, X):
        """Draw feature and sample indices for ``X``.

        Args:
            X: 2-D data; a scipy sparse matrix is converted to a dense array.

        Returns:
            Tuple ``(feat_indices, sample_indices)``.
        """
        self.X = X.toarray() if issparse(X) else X
        self.n_samples = len(self.X)
        self.n_features = self.X.shape[1]
        # Resolve into separate fitted attributes instead of overwriting the
        # hyperparameters: otherwise a second fit would re-interpret the
        # already-resolved integer (e.g. a count of 1 would be read as the
        # fraction 1.0) or reuse a stale count on differently sized data.
        self.max_samples_ = self._resolve_count(self.max_samples, self.n_samples)
        self.max_features_ = self._resolve_count(self.max_features, self.n_features)
        self.feat_indices, self.sample_indices = _generate_bagging_indices(
            self.random_state,
            self.bootstrap_features,
            self.bootstrap_samples,
            self.n_features,
            self.n_samples,
            self.max_features_,
            self.max_samples_)

        return self.feat_indices, self.sample_indices

    def fit_transform(self, X):
        """Fit on ``X`` and return the sampled rows restricted to the sampled columns."""
        feat_indices, sample_indices = self.fit(X)
        return (self.X[sample_indices])[:, feat_indices]

    def transform(self, new_X):
        """Keep only the fitted feature columns of ``new_X`` (a 2-D matrix); rows are untouched."""
        return new_X[:, self.feat_indices]

In [72]:
class Pasting(Sampleing):
    """Pasting: draw a subset of the samples without replacement.

    By default both bootstrap flags are off, 80% of the rows are drawn and
    every feature is kept (``max_features=1`` is read by ``Sampleing.fit`` as
    the fraction 1.0).
    """

    def __init__(self, random_state=None, bootstrap_features=False,
                 bootstrap_samples=False, max_features=1, max_samples=0.8):
        # n_features / n_samples keep the base-class defaults; fit() overwrites them.
        super().__init__(random_state,
                         bootstrap_features,
                         bootstrap_samples,
                         max_features=max_features,
                         max_samples=max_samples)

        
class Bagging(Sampleing):
    """Bagging: draw samples with replacement (bootstrap).

    By default ``bootstrap_samples`` is on, 80% of the row count is drawn and
    every feature is kept (``max_features=1`` is read by ``Sampleing.fit`` as
    the fraction 1.0).
    """

    def __init__(self, random_state=None, bootstrap_features=False,
                 bootstrap_samples=True, max_features=1, max_samples=0.8):
        # n_features / n_samples keep the base-class defaults; fit() overwrites them.
        super().__init__(random_state,
                         bootstrap_features,
                         bootstrap_samples,
                         max_features=max_features,
                         max_samples=max_samples)

        
class Random_Subspaces(Sampleing):
    """Random subspaces: draw features with replacement, keep the samples.

    By default ``bootstrap_features`` is on, 80% of the column count is drawn
    and all rows are used (``max_samples=1`` is read by ``Sampleing.fit`` as
    the fraction 1.0, drawn without replacement).
    """

    def __init__(self, random_state=None, bootstrap_features=True,
                 bootstrap_samples=False, max_features=0.8, max_samples=1):
        # n_features / n_samples keep the base-class defaults; fit() overwrites them.
        super().__init__(random_state,
                         bootstrap_features,
                         bootstrap_samples,
                         max_features=max_features,
                         max_samples=max_samples)

        
class Random_Patches(Sampleing):
    """Random patches: draw both samples and features with replacement.

    By default both bootstrap flags are on and 80% of the row count and of the
    column count are drawn.
    """

    def __init__(self, random_state=None, bootstrap_features=True,
                 bootstrap_samples=True, max_features=0.8, max_samples=0.8):
        # n_features / n_samples keep the base-class defaults; fit() overwrites them.
        super().__init__(random_state,
                         bootstrap_features,
                         bootstrap_samples,
                         max_features=max_features,
                         max_samples=max_samples)

In [78]:
# Candidate classes for each pipeline stage; every estimator picks one per stage
# at random. Add more alternatives under 'dim_dedu' and 'cluster' as needed.
model_dic = {
    'sampling': [Pasting, Bagging, Random_Subspaces, Random_Patches],
    'dim_dedu': [SVD, PCA],
    'cluster': [KMeans],
}
class estimator():
    """One randomly assembled pipeline: sampling -> dimension reduction -> clustering.

    The class used for each stage is drawn at construction time from
    ``model_dic``; the stages are instantiated and fitted in ``fit``.
    """

    def __init__(self,model_dic):
        """Pick one class per stage from ``model_dic`` (keys 'sampling', 'dim_dedu', 'cluster')."""
        stage_keys = ('sampling', 'dim_dedu', 'cluster')
        # One random draw per stage, in pipeline order.
        self.layer0, self.layer1, self.layer2 = [
            choice(model_dic[key], 1)[0] for key in stage_keys
        ]

    def fit(self,X,K):
        """Fit the three stages in sequence on ``X``.

        ``K`` is used both as the number of components of the reduction stage
        and as the number of clusters. Returns None.
        """
        sampler = self.layer0(max_samples=0.9)
        self.layer0_ = sampler
        self.layer0_output = sampler.fit_transform(X)

        reducer = self.layer1(n_components=K)
        self.layer1_ = reducer
        self.layer1_output = reducer.fit_transform(self.layer0_output)

        clusterer = self.layer2(n_clusters=K)
        self.layer2_ = clusterer
        clusterer.fit(self.layer1_output)

    def _labels_for(self, rows):
        """Push ``rows`` through the fitted stages and return the predicted labels."""
        sampled = self.layer0_.transform(rows)
        reduced = self.layer1_.transform(sampled)
        return self.layer2_.predict(reduced)

    def pairwise_predict(self,c1,c2):
        """Return True when the first rows of ``c1`` and ``c2`` get different cluster labels."""
        first_labels = self._labels_for(c1)
        second_labels = self._labels_for(c2)
        return first_labels[0] != second_labels[0]

class ensemble():
    """A set of independently assembled ``estimator`` pipelines that vote on
    whether two customers fall into different clusters.
    """

    def __init__(self,n_estimators,model_dic):
        """Create ``n_estimators`` pipelines, each drawn from ``model_dic``."""
        self.n_estimators = n_estimators
        # Historical misspelling kept as an alias so existing readers keep working.
        self.n_setimators = n_estimators
        self.estimators_ = [estimator(model_dic) for _ in range(n_estimators)]

    def fit(self,X,K):
        """Remember ``X`` as ``training_X`` and fit every pipeline on it with ``K``."""
        self.training_X = X
        for est in self.estimators_:
            est.fit(X,K)

    def pairwise_predict(self,c1,c2):
        """Return the fraction of pipelines that put ``c1`` and ``c2`` in different clusters.

        The individual votes of this call are kept in ``self.est_pred``.
        """
        self.est_pred = [est.pairwise_predict(c1,c2) for est in self.estimators_]
        return sum(self.est_pred) / len(self.est_pred)

    def cust_diff_mat(self,X_new):
        """Build the symmetric pairwise "different cluster" vote matrix for ``X_new``.

        Args:
            X_new: indexable collection of rows supporting ``len`` (e.g. a dense
                2-D array); row ``i`` is passed to the pipelines as a 1-row array.

        Returns:
            ``(n, n)`` float array whose cell ``[i, j]`` is the vote fraction
            from ``pairwise_predict`` for rows ``i`` and ``j``; the diagonal is 0.
        """
        new_cust_num = len(X_new)
        diff = np.zeros((new_cust_num, new_cust_num))
        for idx_cust in range(new_cust_num - 1):
            # Loop-invariant: build the 1-row array once per outer row.
            c1 = np.array([X_new[idx_cust]])
            for idx_latter in range(idx_cust + 1, new_cust_num):
                c2 = np.array([X_new[idx_latter]])
                vote = self.pairwise_predict(c1, c2)
                # Fill both triangles directly rather than `m += m.T`, which
                # operates on overlapping memory (undefined on NumPy < 1.13).
                diff[idx_cust, idx_latter] = vote
                diff[idx_latter, idx_cust] = vote
        return diff
                

In [79]:
# Build an ensemble of 5 pipelines, each drawn at random from model_dic.
e=ensemble(5,model_dic)

In [80]:
# Fit every pipeline on the synthetic count matrix; K=3 is used for both the
# number of components and the number of clusters.
e.fit(X=s.count_mat,K=3)

In [81]:
new_cust=s.count_mat.toarray()[45:55]# rows 45-49 are presumably class-1 customers and rows 50-54 class-2 (50 customers per class) -- confirm ordering

In [82]:
e.cust_diff_mat(new_cust)
# This is the customers' mutual-difference matrix.
# The cell at row i, column j is the fraction of estimators that put customer i and customer j in different clusters.
# 0.0 means every estimator grouped them together; 1.0 means every estimator separated them.
# Each value is the result of a vote across all estimators in the ensemble.

array([[0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.]])

In [88]:
# Sampling class chosen by each estimator (iterates all of them, not a fixed 5).
[est.layer0 for est in e.estimators_]

[__main__.Pasting,
 __main__.Pasting,
 __main__.Bagging,
 __main__.Random_Patches,
 __main__.Random_Patches]

In [87]:
# Dimension-reduction class chosen by each estimator (iterates all of them, not a fixed 5).
[est.layer1 for est in e.estimators_]

[sklearn.decomposition.pca.PCA,
 sklearn.decomposition.truncated_svd.TruncatedSVD,
 sklearn.decomposition.pca.PCA,
 sklearn.decomposition.pca.PCA,
 sklearn.decomposition.pca.PCA]

In [89]:
# Clustering class chosen by each estimator (iterates all of them, not a fixed 5).
[est.layer2 for est in e.estimators_]

[sklearn.cluster.k_means_.KMeans,
 sklearn.cluster.k_means_.KMeans,
 sklearn.cluster.k_means_.KMeans,
 sklearn.cluster.k_means_.KMeans,
 sklearn.cluster.k_means_.KMeans]