-
Notifications
You must be signed in to change notification settings - Fork 0
/
SKN.py
164 lines (132 loc) · 5.24 KB
/
SKN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# -*- coding: utf-8 -*-
import os
import io
import sys
import gc
import string
import sqlite3
# import scipy
import numpy as np
from collections import OrderedDict
# Common English function words (plus 'article') that are left out of the
# vocabulary and of every per-document feature count.
CommonWords = (
    "all just being over both through yourselves its before herself had should "
    "to only under ours has do them his very they not during now him nor did "
    "this she each further where few because doing some are our ourselves out what "
    "for while does above between t be we who were here hers by on about of "
    "against s or own into yourself down your from her their there been whom too "
    "themselves was until more himself that but don with than those he me myself "
    "these up will below can theirs my and then is am it an as itself at have "
    "in any if again no when same how other which you after most such why a "
    "off i yours so the having once article"
).split()

# Input corpus root and output directory used by the script below.
path = "./20news/Train/"
outputPath = "./JiaJia/"

# Module-level state filled in while the script runs.
TopicDict = {}  # topic directory path -> integer topic number
Vocab = {}      # unigram -> feature index
TopicList = []  # topic directory paths
indexV = 1      # next feature index to hand out (numbering starts at 1)
def getDirList(path, DirList):
    """Recursively collect every non-hidden sub-directory below ``path``.

    Each directory found is appended to ``DirList`` as a string ending in
    '/'. Nested directories are appended before the directory that contains
    them. Entries whose name starts with '.' are skipped.

    Args:
        path: Directory to scan. A trailing '/' is optional.
        DirList: List that receives the directory paths; it is mutated in
            place.

    Returns:
        The same ``DirList`` object that was passed in.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the result
    # reproducible for callers that number directories by position.
    for item in sorted(os.listdir(path)):
        if item.startswith('.'):
            continue
        # Build the path with os.path.join so a ``path`` without a trailing
        # slash works too; when ``path`` already ends in '/', the result is
        # exactly path + item + '/'.
        subdir = os.path.join(path, item)
        if os.path.isdir(subdir):
            subdir += '/'
            getDirList(subdir, DirList)
            DirList.append(subdir)
    return DirList
def perDirFileList(path):
    """Return the names of the non-hidden regular files directly in ``path``.

    Only bare file names are returned (not full paths). Sub-directories and
    entries whose name starts with '.' are left out.

    Args:
        path: Directory to list.

    Returns:
        A new list of file names in sorted (lexicographic) order.
    """
    # os.listdir() order is arbitrary; sorting keeps the order of the rows
    # derived from this list stable from run to run.
    return [
        item
        for item in sorted(os.listdir(path))
        if not item.startswith('.') and os.path.isfile(os.path.join(path, item))
    ]
def getUnigrams(OneLine):
    """Split one line of text into lower-cased, whitespace-separated tokens.

    Before splitting, '@', '.' and ',' are each replaced by a space (so
    e-mail addresses and dotted names break into their parts) and every
    other ASCII punctuation character is removed outright.

    Args:
        OneLine: The text line to tokenize.

    Returns:
        A list of lower-cased tokens; empty when the line has no tokens.
    """
    separators = '@.,'
    # Backslash is escaped explicitly; the set is string.punctuation minus
    # the three separator characters above.
    deleted = "!\"#$%&'()*+-/:;<=>?[\\]^_`{|}~"
    # maketrans() needs source and target of equal length: one space per
    # separator character.
    spaces = ' ' * len(separators)
    if hasattr(str, 'maketrans'):
        # Python 3: a single table both maps and deletes characters.
        cleaned = OneLine.translate(str.maketrans(separators, spaces, deleted))
    else:
        # Python 2 byte strings: the deletions are a second argument.
        cleaned = OneLine.translate(string.maketrans(separators, spaces), deleted)
    return cleaned.lower().strip().split()
def parseVocab(path):
    """Add every new content word of the file at ``path`` to ``Vocab``.

    Each line is tokenized with ``getUnigrams``. A token is kept when it is
    purely alphabetic and is not listed in ``CommonWords``. Every kept token
    not yet in ``Vocab`` is assigned the current value of ``indexV``, which
    is then incremented.

    Args:
        path: Path of the text file to scan.

    Returns:
        None. The module-level ``Vocab`` and ``indexV`` are updated.
    """
    # Only indexV is rebound here; Vocab is mutated in place and CommonWords
    # is read-only, so neither needs a ``global`` declaration.
    global indexV

    # Constant-time membership tests instead of scanning the list per token.
    stop_words = frozenset(CommonWords)

    # ``with`` closes the file even when reading or tokenizing raises.
    with open(path) as handle:
        for line in handle:
            for unigram in getUnigrams(line):
                if not unigram.isalpha() or unigram in stop_words:
                    continue
                if unigram not in Vocab:
                    Vocab[unigram] = indexV
                    indexV += 1
def parseToSklnFmt(path):
    """Count how often each vocabulary feature occurs in one file.

    Each line is tokenized with ``getUnigrams``. Tokens that are purely
    alphabetic and not listed in ``CommonWords`` are looked up in ``Vocab``
    and counted under their feature index.

    Args:
        path: Path of the text file to convert.

    Returns:
        An ``OrderedDict`` mapping feature index to occurrence count, with
        the keys in ascending order.

    Raises:
        KeyError: If a kept token is missing from ``Vocab`` (that is, the
            vocabulary was not built from a corpus containing this file).
    """
    # Constant-time membership tests instead of scanning the list per token.
    stop_words = frozenset(CommonWords)
    counts = {}

    # ``with`` closes the file even when reading or a lookup raises.
    with open(path) as handle:
        for line in handle:
            for unigram in getUnigrams(line):
                if unigram.isalpha() and unigram not in stop_words:
                    feature = Vocab[unigram]  # single lookup per token
                    counts[feature] = counts.get(feature, 0) + 1

    # Sort by feature index so callers can emit the features in order.
    return OrderedDict(sorted(counts.items()))
##################################################################################################################
# Build the vocabulary from every training topic directory plus the test
# directory, so that each word met later by parseToSklnFmt() already has a
# feature index.
TopicList = getDirList(path, [])
TopicList += ["./20news/Test/"]
for topic in TopicList:
    # ``fileName`` rather than ``file``, which shadows a Python 2 builtin.
    for fileName in perDirFileList(topic):
        parseVocab(topic + fileName)
# Report the vocabulary size. The parenthesised form is valid under both
# Python 2 and Python 3.
print(len(Vocab))
##################################################################################################################
# Training data
# Number the topic directories in listing order, remember the numbering in
# TopicDict, and write one line per training file to train.txt:
#   "<topic number>  <feature>:<count> <feature>:<count> ...\n"
TopicList = getDirList("./20news/Train/", [])
# ``with`` closes the output file even if a document fails to convert.
with open(outputPath + "/train.txt", 'wb') as fo:
    for index, topic in enumerate(TopicList):  # topic is a directory path
        TopicDict[topic] = index
        for fileName in perDirFileList(topic):
            perFileDic = parseToSklnFmt(topic + fileName)
            features = ''.join(
                ' ' + str(key) + ":" + str(perFileDic[key]) for key in perFileDic
            )
            # Assemble the whole line first so a failure never leaves a
            # half-written row. The file is binary (no newline translation),
            # so encode explicitly; the text is digits, spaces and colons only.
            fo.write((str(index) + ' ' + features + '\n').encode('ascii'))
##################################################################################################################
# Load the expected topic of every test document from ans.test.txt.
# The second whitespace-separated field of each line is used as the topic
# directory name and translated to its topic number through TopicDict.
# NOTE(review): ``answers`` is later indexed by (test file number - 1), so
# line i presumably describes test file i -- confirm against the data file.
answers = []
# ``with`` closes the file even if a topic lookup raises.
with open('ans.test.txt') as f:
    for ans in f:
        # split() without an argument tolerates tabs and repeated spaces.
        qq = ans.split()
        if not qq:
            continue  # skip blank lines, e.g. a trailing newline at end of file
        answers.append(TopicDict['./20news/Train/' + qq[1] + '/'])
##################################################################################################################
# Testing data
# Write one line per test file to test.txt, in the same layout as train.txt:
#   "<topic number>  <feature>:<count> <feature>:<count> ...\n"
FileList = perDirFileList("./20news/Test/")
# ``with`` closes the output file even if a document fails to convert.
with open(outputPath + "/test.txt", 'wb') as fo:
    for fileName in FileList:
        # The file name is parsed as a 1-based number that selects the
        # matching entry of ``answers``.
        label = answers[int(fileName) - 1]
        DicPerFile = parseToSklnFmt("./20news/Test/" + fileName)
        features = ''.join(
            ' ' + str(key) + ":" + str(DicPerFile[key]) for key in DicPerFile
        )
        # The file is binary (no newline translation), so encode explicitly;
        # the text is digits, spaces and colons only.
        fo.write((str(label) + " " + features + '\n').encode('ascii'))
# Debug leftover: print the name of the 43rd test file and read its lines.
# NOTE(review): raises IndexError when there are fewer than 43 test files.
print(FileList[42])
# The original ended with ``f.close`` (no call parentheses), which never
# closed the file; ``with`` closes it reliably.
with open("./20news/Test/" + FileList[42]) as f:
    read = f.readlines()