In [1]:
import sys
from spgbn import *
from scipy.sparse import coo_matrix
import numpy as np
import os
sys.path.append('./data')
sys.path.append('./data/20news-bydate')
%load_ext autoreload
%autoreload 2

### Load data

+ 1st col: the index of the document (1~11269 for train and 1~7505 for test);

+ 2nd col: the index of the word;

+ 3rd col: the number of times the word appears in the document.

In [2]:
# Each row of the .data files is a (document index, word index, count) triple.
train_data = np.loadtxt('./data/20news-bydate/train.data').astype(int)
test_data = np.loadtxt('./data/20news-bydate/test.data').astype(int)
# Show the number of triples in each split, one shape per line.
print(train_data.shape, test_data.shape, sep='\n')

(1467345, 3)
(967874, 3)


In [3]:
# Document indices restart at 1 in the test split, so shift them past the last
# training document index to give every document a unique index in the
# combined table.
# The shift is applied to a copy: mutating `test_data` in place made this cell
# non-idempotent (every re-run added the offset once more).
doc_offset = np.max(train_data[:, 0])
test_data_shifted = test_data.copy()
test_data_shifted[:, 0] += doc_offset
train_test = np.vstack((train_data, test_data_shifted))
print(train_test.shape)

(2435219, 3)


V = 61188 is the vocabulary size, i.e. the dimension of the bag-of-words representation.

N = 18774 is the total number of documents (11269 train + 7505 test).

In [4]:
# Columns of train_test are (document index, word index, count), all 1-based.
doc_ids, word_ids, counts = train_test.T
# Build a words-by-documents count matrix (0-based indices) and densify it.
X_all = coo_matrix((counts, (word_ids - 1, doc_ids - 1))).toarray()
print(X_all.shape)

(61188, 18774)


In [8]:
# Total number of occurrences of each word across all documents.
print(np.sum(X_all, axis=1))
# Number of words (one row of X_all per word).
print(X_all.shape[0])

[ 497 2084  490 ...    2    2    2]
61188


### Load label

In [5]:
train_label = np.loadtxt('./data/20news-bydate/train.label').astype(int)
test_label = np.loadtxt('./data/20news-bydate/test.label').astype(int)
# Class of every document, ordered by document index: train labels first,
# then test labels (18774 documents in total).
Y_all = np.concatenate(np.atleast_1d(train_label, test_label))
print(Y_all.shape)

(18774,)


In [11]:
# Positions of the train and test documents inside the concatenated label
# array (marked as unused by the original author).
train_idx = np.arange(len(train_label))
test_idx = np.arange(len(train_label), len(train_label) + len(test_label))
print(train_idx, test_idx, sep='\n')

[    0     1     2 ... 11266 11267 11268]
[11269 11270 11271 ... 18771 18772 18773]


### Filter according to document class

In [12]:
# Select the labels of the documents that belong to class 13 or class 14,
# separately for each split, so they can be counted.
train_label_1314 = train_label[np.isin(train_label, (13, 14))]
test_label_1314 = test_label[np.isin(test_label, (13, 14))]

# Contiguous index ranges for the selected documents: train first, then test
# offset by the number of selected train documents.
# 1971 documents in total belong to class 13 or 14.
train_label_1314_idx = np.arange(len(train_label_1314))
test_label_1314_idx = np.arange(
    len(train_label_1314),
    len(train_label_1314) + len(test_label_1314),
)
print(train_label_1314_idx)
print(test_label_1314_idx)

[   0    1    2 ... 1182 1183 1184]
[1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198
 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212
 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226
 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240
 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254
 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268
 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282
 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296
 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310
 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324
 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338
 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352
 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366
 1367 1368 1369 1370 1371 1372 1373 1374 

In [13]:
# dex: boolean mask over all 18774 documents, True where the document's class
# is 13 or 14. Only those document columns of X_all are kept.
dex = np.isin(Y_all, (13, 14))
X_all = X_all[:, dex]
print(X_all.shape)

(61188, 1971)


In [14]:
# Per-word totals over the remaining (class 13/14) documents.
print(np.sum(X_all, axis=1))

[ 18 118   0 ...   0   0   0]


### Filter according to words

In [11]:
# Stop-word list: one entry per line, surrounding whitespace removed.
with open('./data/stop-word-list.txt', 'r') as stop_file:
    stopwords = list(map(str.strip, stop_file))
print('The num of stopwords:', len(stopwords))
# Vocabulary: one word per line, in the same order as the rows of X_all.
with open('./data/20news-bydate/vocabulary.txt', 'r') as vocab_file:
    WO = list(map(str.strip, vocab_file))
print('The num of vocabularies:', len(WO))

The num of stopwords: 319
The num of vocabularies: 61188


In [12]:
# Keep-mask over the vocabulary: 1 for words to keep, 0 for stop words.
# Membership is tested against a set so each lookup is O(1) instead of a
# linear scan of the stop-word list for every vocabulary word.
stopword_set = set(stopwords)
dex = [0 if word in stopword_set else 1 for word in WO]
print('The sum of dex:', np.sum(dex))

# WO and the rows of X_all are filtered with the same mask, so they must have
# the same length. Check this before mutating either one, so a mismatch cannot
# leave WO already filtered while X_all is not.
assert len(dex) == X_all.shape[0], (
    'vocabulary length does not match the number of rows of X_all'
)
keep_rows = np.array(dex, dtype=bool)

WO = [word for word, keep in zip(WO, dex) if keep]
print('The length of remaining WO:', len(WO))
X_all = X_all[keep_rows, :]
print('The shape of remaining X_all:', X_all.shape)

The sum of dex: 60883
The length of remaining WO: 60883
The shape of remaining X_all: (60883, 1971)


In [13]:
# Drop rare words: keep only the vocabulary entries whose total count over the
# remaining documents is at least MIN_WORD_COUNT.
MIN_WORD_COUNT = 5
word_counts = np.sum(X_all, axis=1)
is_frequent = word_counts >= MIN_WORD_COUNT

# WO and the rows of X_all are filtered with the same mask; verify they line
# up before either one is overwritten.
assert len(WO) == len(is_frequent), (
    'vocabulary length does not match the number of rows of X_all'
)

WO = [word for word, keep in zip(WO, is_frequent) if keep]
print('The length of remaining WO:', len(WO))
X_all = X_all[is_frequent, :]
print('The shape of remaining X_all:', X_all.shape)

The length of remaining WO: 7729
The shape of remaining X_all: (7729, 1971)


## Set model parameters

In [9]:
# Number of nodes in each layer; len(K) is the number of layers
# (a single layer of 800 nodes here).
# NOTE(review): the meaning of L is not visible in this notebook - confirm
# against the SPGBN definition in the spgbn module.
K = [800]
L = [100]

# Construct the model with device='cpu', then call `initial` and `test` on the
# filtered word-by-document matrix; `test` is run with num_epochs=100.
# NOTE(review): no random seed is set anywhere in the visible notebook; if
# SPGBN is stochastic the results will differ between runs - confirm and seed.
spgbn = SPGBN(K, L, device = 'cpu')
spgbn.initial(X_all)
spgbn.test(X_all, num_epochs=100)

: 