In [0]:
import numpy as np
import random
from collections import defaultdict
import sys
import time


class Point:
    def __init__(self, label, doc_id, tfidf):
        def compute_l2_square():
            # compute norm2 square
            ans = 0.0
            for x in tfidf.values():
                ans += x**2
            ans=ans**0.5
            for i in tfidf:
                tfidf[i]=tfidf[i]/ans
                
            self.tfidf=tfidf
            self.l2_square=ans

        self.label = label
        self.doc_id = doc_id
        compute_l2_square()

class Cluster:
    def __init__(self,fea_dim):
        self.centroid=np.zeros(fea_dim)
        self.l2_square=0.0

    def set_centroid(self,centroid):
        l2=np.linalg.norm(centroid)
        self.centroid=centroid/l2
        self.l2_square=l2**2
    
def Load_data(pathin):
    def tfidf(doc):
        tf_idf = defaultdict(int)
        fea = doc.split()
        for i in fea:
            tmp = i.split(':')
            tf_idf[int(tmp[0])] = float(tmp[1])
        return tf_idf

    with open(pathin, 'r') as f:
        d_lines = f.read().splitlines()
    data = []
    for d in d_lines:
        fea = d.split('<<>>')
        label = fea[0]
        doc_id = fea[1]
        data.append(Point(label, doc_id,tfidf(fea[2])))
    return data

X_train = Load_data("/content/drive/My Drive/Project2/Datasets/train_tfidf_df=3")
X_test = Load_data("/content/drive/My Drive/Project2/Datasets/test_tfidf")



In [0]:
class FCM:
    def __init__(self,c_cluster,n_doc,fea_dim):
        def init_memberships_value():
            matrix_mbs=np.random.randint(1,c_cluster*n_doc,(c_cluster,1))
            matrix_mbs=matrix_mbs/matrix_mbs.sum()
            for i in range(n_doc-1):
                tmp=np.random.randint(1,c_cluster*n_doc,(c_cluster,1))
                tmp=tmp/tmp.sum()
                matrix_mbs=np.hstack((tmp,matrix_mbs))

            return matrix_mbs
        
        def init_c_cluster():
            clusters=[]
            for i in range(c_cluster):
                tmp=Cluster(fea_dim)
                clusters.append(tmp)
            return clusters
        
        self.c_cluster=c_cluster
        self.fea_dim=fea_dim
        self.n_doc=n_doc
        self.it=0
        self.mbs_matrix=init_memberships_value()
        self.list_cluster=init_c_cluster()
    
    def dist_square_point_i_cluster_j(self,x,c):
        '''
        x is a point 
        c is a cluster
        '''
        # compute dot product
        ans=0.0
        for i in x.tfidf:
            ans+=x.tfidf[i]*c.centroid[i]
        # return norm 2 square
        return 2-2*ans
    
    def update_centroid(self,X):
        for i in range(self.c_cluster):
            mbs_i_square=self.mbs_matrix[i]**2
            centroid_i=np.zeros(self.fea_dim)
            for j in range(self.n_doc):
                for k in X[j].tfidf:
                    centroid_i[k]+=X[j].tfidf[k]*mbs_i_square[j]
            centroid_i=centroid_i/sum(mbs_i_square)
            self.list_cluster[i].set_centroid(centroid_i)
    
    
    def compute_memberships_marix(self,X):
        n_doc=len(X)
        new_mbs_matrix=np.zeros((self.c_cluster,n_doc))
        for j in range(n_doc):
            dist_square_j= [ self.dist_square_point_i_cluster_j(X[j],c) for c in self.list_cluster ]
            for i in range(self.c_cluster):
                new_mbs_matrix[i][j]= 1/sum([ dist_square_j[i]/dist for dist in dist_square_j])
        
        return new_mbs_matrix
    
    def check_stop(self,new_mbs_matrix,tol):
        if(np.linalg.norm(new_mbs_matrix-self.mbs_matrix)<tol):
            return True
        else:
            return False
    
    def fit(self,X_train,tol,maxiter):
        for i in range(maxiter):
            self.update_centroid(X_train)
            new_mbs_matrix=self.compute_memberships_marix(X_train)
            if(self.check_stop(new_mbs_matrix,tol)):
                self.mbs_matrix=new_mbs_matrix
                self.it=i+1
                break
            self.mbs_matrix=new_mbs_matrix
    
    def predict(self,X_test):
        mbs_matrix=self.compute_memberships_marix(X_test)
        pre_label=mbs_matrix.argmax(axis=0)
        return pre_label
    
    def purity(self,X_test):
        pre_label=self.predict(X_test)
        ans=np.zeros((20,20))
        for i in range(len(pre_label)):
            ans[int(X_test[i].label)][int(pre_label[i])]+=1
        kq=sum(np.max(ans,axis=1))
        return str(kq)+'/'+str(len(pre_label))


In [15]:
for i in range(5):
    t=time.time()
    model=FCM(20,11314,20167)
    model.fit(X_train,1e-10,100)
    print("time train =",time.time()-t,end='; ')
    pre_train=model.purity(X_train)
    print('pre_train= ',pre_train,end='; ')
    pre_test=model.purity(X_test)
    print('pre_test= ',pre_test)

time train = 267.3720464706421; pre_train=  9393.0/11314; pre_test=  6251.0/7532
time train = 268.242146730423; pre_train=  9138.0/11314; pre_test=  6142.0/7532
time train = 275.6221568584442; pre_train=  7692.0/11314; pre_test=  5217.0/7532
time train = 273.6336648464203; pre_train=  8960.0/11314; pre_test=  6071.0/7532
time train = 276.53499698638916; pre_train=  9331.0/11314; pre_test=  6246.0/7532


In [14]:
for i in range(5):
    t=time.time()
    model=FCM(20,11314,20167)
    model.fit(X_train,1e-8,100)
    print("time train =",time.time()-t,end='; ')
    pre_train=model.purity(X_train)
    print('pre_train= ',pre_train,end='; ')
    pre_test=model.purity(X_test)
    print('pre_test= ',pre_test)

time train = 215.77995920181274; pre_train=  8568.0/11314; pre_test=  5710.0/7532
time train = 216.8885715007782; pre_train=  8462.0/11314; pre_test=  5616.0/7532
time train = 218.52702140808105; pre_train=  8651.0/11314; pre_test=  5674.0/7532
time train = 217.3688952922821; pre_train=  8466.0/11314; pre_test=  5695.0/7532
time train = 218.3750126361847; pre_train=  8935.0/11314; pre_test=  6000.0/7532


In [13]:
for i in range(5):
    t=time.time()
    model=FCM(20,11314,20167)
    model.fit(X_train,1e-7,100)
    print("time train =",time.time()-t,end='; ')
    pre_train=model.purity(X_train)
    print('pre_train= ',pre_train,end='; ')
    pre_test=model.purity(X_test)
    print('pre_test= ',pre_test)

time train = 188.74726748466492; pre_train=  8852.0/11314; pre_test=  5963.0/7532
time train = 186.0112545490265; pre_train=  8741.0/11314; pre_test=  5832.0/7532
time train = 186.34207963943481; pre_train=  8394.0/11314; pre_test=  5680.0/7532
time train = 188.7613170146942; pre_train=  7524.0/11314; pre_test=  5052.0/7532
time train = 188.73846554756165; pre_train=  8189.0/11314; pre_test=  5467.0/7532


In [12]:
for i in range(5):
    t=time.time()
    model=FCM(20,11314,20167)
    model.fit(X_train,1e-6,100)
    print("time train =",time.time()-t,end='; ')
    pre_train=model.purity(X_train)
    print('pre_train= ',pre_train,end='; ')
    pre_test=model.purity(X_test)
    print('pre_test= ',pre_test)

time train = 163.98655652999878; pre_train=  8211.0/11314; pre_test=  5573.0/7532
time train = 163.29537677764893; pre_train=  8508.0/11314; pre_test=  5625.0/7532
time train = 163.86400413513184; pre_train=  8125.0/11314; pre_test=  5491.0/7532
time train = 162.65938067436218; pre_train=  8304.0/11314; pre_test=  5609.0/7532
time train = 161.15688610076904; pre_train=  8717.0/11314; pre_test=  5898.0/7532


In [10]:
for i in range(5):
    t=time.time()
    model=FCM(20,11314,20167)
    model.fit(X_train,1e-6,100)
    print(time.time()-t)
    ans=model.purity(X_test)
    print(ans)

167.26055884361267
6082.0/7532
164.42284059524536
5838.0/7532
164.4973225593567
4895.0/7532
164.92693638801575
6204.0/7532
164.412992477417
4875.0/7532
