# **Install Raw Data**

In [1]:
import pandas as pd

In [2]:
train_data=pd.read_csv("agNews_train.csv",header=0,names=['id','categories','text'])
test_data=pd.read_csv("agNews_test.csv",header=0,names=['id','categories','text'])

In [3]:
train_data['categories'] = train_data['categories'].map({1:0,2:1,3:2,4:3})
test_data['categories'] = test_data['categories'].map({1:0,2:1,3:2,4:3})

train_data.head()

Unnamed: 0,id,categories,text
0,0,0,"Iran Says Its Missiles Can Now Reach 1,250 Mil..."
1,1,0,Italian women kidnapped in raid on relief agen...
2,2,0,Insurgent Alliance Is Fraying in Fallujah Re...
3,3,0,Rahul the #39;darling #39; at AICC conclave N...
4,4,0,Judges Postpone Milosevic Trial for Month (AP)...


In [4]:
train_data['categories'].value_counts()

0    1000
1    1000
2    1000
3    1000
Name: categories, dtype: int64

In [5]:
test_data['categories'].value_counts()

0    400
1    400
2    400
3    400
Name: categories, dtype: int64

# **Preprocessing**

In [6]:
import nltk
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fengwenxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
stopwords = nltk.corpus.stopwords.words("english")
stopwords_1 = stopwords + ["dont"]

keep_words = ['of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'no', 'nor', 'not', 'be', 'do', 'can', 'have']

stopwords_2 = []
for stop in stopwords:
    if stop not in keep_words:
        stopwords_2.append(stop)

raw_train_text_list = train_data['text'].values.tolist()
raw_test_text_list = test_data['text'].values.tolist()

In [8]:
def is_num(num):
    pattern = re.compile(r'[-+]?[0-9\.,]*[0-9][a-zA-Z]*$')
    result = pattern.match(num)
    return result

def is_be(word):
	pattern = ['be', 'is', 'are', 'am', 'was', 'were', 'being', 'been']
	return word in pattern

def is_do(word):
	pattern = ['do', 'does', 'did', 'doing', 'done']
	return word in pattern

def is_dont(word):
    pattern = ["don't", "doesn't"]
    return word in pattern

def is_cant(word):
    pattern = ["cannot", "can't"]
    return word in pattern

def is_have(word):
	pattern = ['have', 'has', 'had', 'having']
	return word in pattern

def is_will(word):
	pattern = ['shall', 'should', 'will', 'would', 'd', 'll']
	return word in pattern

def is_get(word):
	pattern = ['get', 'gets', 'got', 'gotten', 'getting']
	return word in pattern

def is_seem(word):
	pattern = ['seem', 'seeming', 'seems', 'seemed']
	return word in pattern

def is_a(word):
	pattern = ['a', 'the', 'an', 'this', 'that']
	return word in pattern

def is_good(word):
    pattern = ['good', 'better', 'best']
    return word in pattern

def is_bad(word):
    pattern = ['bad', 'worse', 'worst']
    return word in pattern

def trans_num(title):
    title_list = title.split(" ")
    new_title = []
    for word in title_list:
        if (is_num(word)):
            word = "isnum"
        new_title.append(word)
    new_title = " ".join(new_title)
    return new_title

In [9]:
def preprocess(raw_title):
    raw_lists = raw_title.split(" ")
    title = []
  
    for raw in raw_lists:
        #1. Case Folding
        raw = str(raw.lower())
		#2. Lab Numbers
        new_raw = trans_num(raw)
		#3. Remove Punctuations
        letters_only = re.sub("[^a-zA-Z0-9]", " ", new_raw)
        word_1 = " ".join(letters_only.split())
        words = word_1.split(" ")
	 	#4. Stemming & Lemmatization
        for word in words:
            if (word == ""):
                word = ""
            elif (is_be(word)):
                word = "be"
            elif (is_do(word)):
                word = "do"
            elif (is_dont(word)):
                word = "do not"
            elif (is_cant(word)):
                word = "can not"
            elif (is_have(word)):
                word = "have"
            elif (is_will(word)):
                word = "will"
            elif (is_get(word)):
                word = "get"
            elif (is_seem(word)):
                word = "seem"
            elif (is_a(word)):
                word = "a"
            elif (is_num(word)):
                word = "is_num"
            elif (is_good(word)):
                word = "good"
            elif (is_bad(word)):
                word = "bad"
            elif (word[-3:] == "ing") & (len(word) > 5):
                if word[-4] == word[-5]:
                    word = word[:-4]
                elif word[-5] in ['a', 'e', 'i', 'o', 'u']:
                    word = word[:-3] + 'e'
                else:
                    word = word[:-3]
            elif(word[-3:] == "ies"):
                word = word[:-3] + "y"
            elif (word[-2:] == "ed") & (len(word) > 5):
                word = word[:-2]
            elif (word[-1] == "s") & (len(word) > 3):
                word = word[:-1]
            if word != '':
                title.append(word)
    title = " ".join(title)
    return title

In [10]:
train_list = []
test_list = []

for text in raw_train_text_list:
    train_list.append(preprocess(text))
    
for text in raw_test_text_list:
    test_list.append(preprocess(text))

In [11]:
train_list[0]

'iran say its missile can now reach isnum mile tehran reuter iran have increas a range of its missile to isnum mile a senior official be quot as saye on tuesday'

# **Word Preprocessing**

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
vectorizer = TfidfVectorizer(max_features = 5000)
X_train = vectorizer.fit_transform(train_list).toarray()
X_test = vectorizer.fit_transform(test_list).toarray()

In [14]:
X_train.shape

(4000, 5000)

# **Models**

In [15]:
def three_metrics(true_label,preds):  
    from sklearn.metrics import accuracy_score, normalized_mutual_info_score, adjusted_rand_score, mutual_info_score, adjusted_mutual_info_score
    ACC = round(accuracy_score(true_label,preds),3)
    if ACC<=1/len(true_label.unique()): #说明：聚类的label，和真实的label没对上（改好了，不需要调整）
        keys = list(pd.value_counts(preds).index)
        values = list(pd.value_counts(true_label).index)
        dic = dict(zip(keys, values))
        preds = pd.Series(preds).map(dic)
    NMI = round(normalized_mutual_info_score(true_label.to_numpy(),preds),3)
    ARI = round(adjusted_rand_score(true_label.to_numpy(),preds),3)
    ACC = round(accuracy_score(true_label,preds),3)
    # MI = mutual_info_score(true_label,preds)
    return {'ACC':ACC,'NMI':NMI,'ARI':ARI}

In [16]:
true_label = train_data['categories']

### K-means

In [17]:
#K-Means
from sklearn.cluster import KMeans
clustering_model = KMeans(n_clusters = 4, 
                          init = 'k-means++',
                          max_iter = 300, n_init = 10,random_state=123)
clustering_model.fit(X_train)
KMeans_label = clustering_model.predict(X_train)
print('Baseline K-means',three_metrics(true_label,KMeans_label))

Baseline K-means {'ACC': 0.412, 'NMI': 0.123, 'ARI': 0.062}


### Fuzzy C Means

In [19]:
#FCM
from fcmeans import FCM
fcm = FCM(n_clusters=4)
fcm.fit(X_train)
FCM_label = fcm.predict(X_train)
print('Baseline FuzzyC',three_metrics(true_label,FCM_label))

Baseline FuzzyC {'ACC': 0.286, 'NMI': 0.13, 'ARI': 0.097}


### LDA

In [20]:
import numpy as np

In [21]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.fit_transform(X_test)

In [22]:
lda = LatentDirichletAllocation(n_components=4, random_state=456)
lda.fit(X_test2)
doc_topic_dist_unnormalized = np.matrix(lda.transform(X_train2))
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)
LDA_label = list(np.array(doc_topic_dist.argmax(axis=1)).T[0])
print('Baseline LDA',three_metrics(true_label,LDA_label))

Baseline LDA {'ACC': 0.208, 'NMI': 0.009, 'ARI': 0.009}


### Deep Embedded Clustering

In [56]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler


maxlen = 4096 #only use this number of most frequent words
training_samples = 8000
validation_samples = 4500
max_words = 100000

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=max_words)

numeric_columns = train_data.columns.values.tolist()

train = vectorizer.fit_transform(train_list)

train

<4527x13617 sparse matrix of type '<class 'numpy.float64'>'
	with 214152 stored elements in Compressed Sparse Row format>

In [59]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_list) # generates word index
sequences = tokenizer.texts_to_sequences(train_list) # transforms strings in list of intergers
word_index = tokenizer.word_index # calculated word index
print(f"{len(word_index)} unique tokens found")

data = pad_sequences(sequences, maxlen=maxlen) #transforms integer lists into 2D tensor

13642 unique tokens found


In [60]:
scaler = MinMaxScaler() 
data_1 = scaler.fit_transform(data)

In [61]:
x = data_1
x

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.43920413e-01, 5.97550045e-04, 1.15660107e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.47383935e-04, 1.26979385e-03, 2.78971575e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.90567428e-03, 1.44980580e-01, 4.52386338e-04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.49521002e-03, 4.18285031e-03, 1.65121013e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.68459838e-04, 3.65999402e-03, 3.01590892e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.68459838e-04, 0.00000000e+00, 3.01590892e-04]])

In [62]:
from time import time
import keras.backend as K
from tensorflow.keras.layers import Layer, InputSpec
from keras.layers import Dense, Input, Embedding
from keras.models import Model
from tensorflow.keras.optimizers import SGD
from keras import callbacks
from keras.initializers import VarianceScaling
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

%matplotlib inline

In [63]:
def autoencoder(dims, act='relu', init='glorot_uniform'):
    """
    Fully connected symmetric auto-encoder model.
  
    dims: list of the sizes of layers of encoder like [500, 500, 2000, 10]. 
          dims[0] is input dim, dims[-1] is size of the latent hidden layer.

    act: activation function
    
    return:
        (autoencoder_model, encoder_model): Model of autoencoder and model of encoder
    """
    n_stacks = len(dims) - 1
    
    input_data = Input(shape=(dims[0],), name='input')
    x = input_data
    
    # internal layers of encoder
    for i in range(n_stacks-1):
        x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name='encoder_%d' % i)(x)

    # latent hidden layer
    encoded = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (n_stacks - 1))(x)

    x = encoded
    # internal layers of decoder
    for i in range(n_stacks-1, 0, -1):
        x = Dense(dims[i], activation=act, kernel_initializer=init, name='decoder_%d' % i)(x)

    # decoder output
    x = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)
    
    decoded = x
    
    autoencoder_model = Model(inputs=input_data, outputs=decoded, name='autoencoder')
    encoder_model     = Model(inputs=input_data, outputs=encoded, name='encoder')
    
    return autoencoder_model, encoder_model

In [64]:
n_clusters = 2 
n_epochs   = 15
batch_size = 128

In [65]:
dims = [x.shape[-1], 500, 500, 2000, 10] 
init = VarianceScaling(scale=1. / 3., mode='fan_in',
                           distribution='uniform')
pretrain_optimizer = SGD(lr=1, momentum=0.9)
pretrain_epochs = n_epochs
batch_size = batch_size
# save_dir = 'kaggle/working'

  super(SGD, self).__init__(name, **kwargs)


In [66]:
dims

[4096, 500, 500, 2000, 10]

In [67]:
class ClusteringLayer(Layer):
    '''
    Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
    sample belonging to each cluster. The probability is calculated with student's t-distribution.
    '''

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(name='clusters', shape=(self.n_clusters, input_dim), initializer='glorot_uniform') 
        
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        ''' 
        student t-distribution, as used in t-SNE algorithm.
        It measures the similarity between embedded point z_i and centroid µ_j.
                 q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it.
                 q_ij can be interpreted as the probability of assigning sample i to cluster j.
                 (i.e., a soft assignment)
       
        inputs: the variable containing data, shape=(n_samples, n_features)
        
        Return: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        '''
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) # Make sure all of the values of each sample sum up to 1.
        
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        config = {'n_clusters': self.n_clusters}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [68]:
autoencoder, encoder = autoencoder(dims, init=init)

2022-05-12 05:15:40.636050: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [81]:
autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x153465040>

In [82]:
clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
model = Model(inputs=encoder.input, outputs=clustering_layer)

In [83]:
model.compile(optimizer=SGD(0.01, 0.9), loss='kld')

In [84]:
encoder.predict(x)

array([[ 2.09156133e-05, -1.03542360e-03,  1.39652647e-03, ...,
        -3.72194394e-04,  3.69547168e-04,  2.21411872e-04],
       [-3.96784744e-04, -8.15429550e-04,  4.08606662e-04, ...,
        -7.89205893e-04,  2.22029033e-04,  6.54699397e-05],
       [-5.63650974e-05, -5.58513857e-04,  3.28945578e-04, ...,
        -1.09319808e-03,  3.49186361e-04, -3.48822563e-04],
       ...,
       [-1.16920943e-04, -7.42095697e-04,  4.24565398e-04, ...,
        -5.47854812e-04,  5.52729180e-04,  1.64238387e-04],
       [-2.70342513e-04, -2.69885117e-04,  5.70217147e-04, ...,
        -6.09003531e-04, -1.20140685e-04, -4.43551282e-04],
       [-4.12418682e-04,  3.83455190e-05,  2.08073296e-04, ...,
        -5.29231329e-04,  1.94052060e-04, -6.35411008e-04]], dtype=float32)

In [85]:
kmeans = KMeans(n_clusters=n_clusters, n_init=20)
y_pred = kmeans.fit_predict(encoder.predict(x))

In [92]:
y_pred_last = np.copy(y_pred)
y_pred_last

array([1, 0, 0, ..., 0, 0, 0], dtype=int32)

In [93]:
y_pred_last.shape

(4527,)

In [94]:
t = np.array(train_data['categories'].map({'earn':0,'acq':1}).values.tolist())

In [99]:
t = train_data['categories'].map({'earn':0,'acq':1})

In [100]:
print('Baseline DEC',three_metrics(t,list(y_pred_last)))

Baseline DEC {'ACC': 0.662, 'NMI': 0.034, 'ARI': 0.073}
