# Bert+K-means

In [183]:
import pandas as pd
import numpy as np
import torch
import codecs
from transformers import BertModel, BertTokenizer, BertConfig

model_path = '/Users/liuxiaosu/PycharmProjects/basic_model/chinese_L-12_H-768_A-12/'
config_path = model_path + 'bert_config.json'
checkpoint_path = model_path + '/bert_model.ckpt'
dict_path = model_path + 'vocab.txt'

In [184]:
train_data = pd.read_csv("data/train.csv")
col = train_data.columns.tolist()

train_data.drop(['category', 'query2', 'label'], axis=1, inplace=True)
train_data = (train_data.values)
# 3-category
data1 = train_data[:100].tolist()
data2 = train_data[500:600].tolist()
data3 = train_data[1200:1300].tolist()
train_data = data1 + data2 + data3
# print((train_data[1]))
# print((train_data[1139])) 

In [201]:
# Original BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
myconfig = BertConfig(
    output_hidden_states = True, 
    vocab_size = 21128,
    )
model = BertModel.from_pretrained(model_path, config = myconfig)

### Customize tokenizer for Chinese encoding

In [134]:
token_dict = {}
with codecs.open(dict_path) as dictionary:
    for word in dictionary:
        token = word.strip()
        token_dict[token] = len(token_dict)

class OurTokenizer4Chinese(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_whitespace(c):
                R.append('[unused1]')
            else:
                R.append('[UNK]')
        return R

tokenizer = OurTokenizer(token_dict)
tokenizer.tokenize('今天天气不错')

### Extract all the vectors of the texts and save them

In [185]:
text2vec = []
cnt = 0
for text in train_data:
    input_ids = torch.tensor(tokenizer.encode(text[0], add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_state = outputs[0]
    cls = last_hidden_state[0][0]
    text2vec.append(cls)
    cnt += 1
print(type(cls))

<class 'torch.Tensor'>


In [186]:
for i, vec in enumerate(text2vec):
    text2vec[i] = vec.tolist()

In [194]:
def saveFile(filename):
    with open(filename, 'a') as f:
        for vec in text2vec:
            f.write(str(vec))
            f.write('\n')


In [189]:
saveFile('textvec.txt')

## Use kmeans to classify the texts

In [211]:
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt

In [225]:
print("clustering documents ...")
train_x = text2vec
km = KMeans(n_clusters = 90,
               max_iter = 1000,
               tol = 0.001,
               verbose = 1,
               n_init = 3)
km.fit(train_x)
print("kmean: k = {}".format(3, int(km.inertia_)))

clustering documents ...
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 142.55057691211306
start iteration
done sorting
end inner loop
Iteration 1, inertia 142.55057691211306
center shift 0.000000e+00 within tolerance 1.264244e-04
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 144.9035300554161
start iteration
done sorting
end inner loop
Iteration 1, inertia 144.9035300554161
center shift 0.000000e+00 within tolerance 1.264244e-04
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 174.11934129246112
start iteration
done sorting
end inner loop
Iteration 1, inertia 174.11934129246112
center shift 0.000000e+00 within tolerance 1.264244e-04
kmean: k = 3


In [226]:
clusters = km.labels_.tolist()

In [227]:
score = metrics.silhouette_score(X=train_x, labels=clusters)
print(score)