In [1]:
import os
import sys

# Raw character-level corpus files (inputs).
char_level_category_file = 'cnews_data/char-level/cnews.category.txt'
char_level_train_file = 'cnews_data/char-level/cnews.train.txt'
char_level_val_file = 'cnews_data/char-level/cnews.val.txt'
char_level_test_file = 'cnews_data/char-level/cnews.test.txt'
char_level_vocab_file = 'cnews_data/char-level/cnews.vocab.txt'

# Generated feature files (outputs). They are derived from the folder so the
# location only has to be changed in one place.
char_level_feature_folder = 'cnews_data/char-level-feature/'
char_level_train_feature_file = os.path.join(char_level_feature_folder, 'cnews.train.txt')
char_level_val_feature_file = os.path.join(char_level_feature_folder, 'cnews.val.txt')
char_level_test_feature_file = os.path.join(char_level_feature_folder, 'cnews.test.txt')

# makedirs also creates missing parent directories, where os.mkdir would raise.
# The isdir guard keeps this working on Python 2, which has no exist_ok flag.
if not os.path.isdir(char_level_feature_folder):
    os.makedirs(char_level_feature_folder)

In [2]:
class Category:
    """Mapping from category name to integer id, loaded from a file.

    Every non-blank line of the file must hold ``<category><TAB><id>`` and
    is decoded as UTF-8; category names are stored as decoded text.
    """

    def __init__(self, category_file):
        """Read the category table from ``category_file``.

        Args:
            category_file: path to the UTF-8 encoded, tab-separated file.

        Raises:
            ValueError: if a non-blank line does not contain exactly two
                tab-separated fields, or the id field is not an integer.
        """
        self._category_to_id = {}
        # Reading bytes and decoding explicitly gives text on both Python 2
        # and Python 3 (a text-mode read returns str on Python 3, which has
        # no .decode method).
        with open(category_file, 'rb') as f:
            for raw_line in f:
                line = raw_line.decode('utf-8').strip('\r\n')
                if not line.strip():
                    # Tolerate blank lines, e.g. a trailing newline at EOF.
                    continue
                category, idx = line.split('\t')
                self._category_to_id[category] = int(idx)

    def category_to_id(self, category):
        """Return the integer id of ``category``; raises KeyError if unknown."""
        return self._category_to_id[category]

    def size(self):
        """Return the number of distinct categories loaded."""
        return len(self._category_to_id)
        
# Load the label vocabulary and report how many categories it contains.
category_vocab = Category(char_level_category_file)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(category_vocab.size())

10


In [9]:
def generate_feature_dict(train_file, feature_threshold=10):
    """Build a character -> feature-id mapping from a labelled corpus file.

    Each non-blank line of ``train_file`` must be ``<label><TAB><content>``
    encoded as UTF-8. Every character of the content is counted, and the
    characters whose total count is at least ``feature_threshold`` receive
    consecutive ids starting at 1, assigned in dictionary iteration order.

    Args:
        train_file: path to the UTF-8 encoded training file.
        feature_threshold: minimum total occurrence count for a character
            to be kept as a feature.

    Returns:
        dict mapping each kept character to its 1-based feature id.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    char_counts = {}
    # Binary read + explicit decode works on both Python 2 and Python 3, and
    # iterating the handle avoids holding the whole file in memory.
    with open(train_file, 'rb') as f:
        for raw_line in f:
            line = raw_line.decode('utf-8').strip('\r\n')
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            # maxsplit=1: a tab inside the content must not break unpacking.
            _label, content = line.split('\t', 1)
            for char in content:
                char_counts[char] = char_counts.get(char, 0) + 1

    feature_ids = {}
    for char in char_counts:
        if char_counts[char] >= feature_threshold:
            # Ids start at 1; id 0 is never assigned.
            feature_ids[char] = len(feature_ids) + 1
    return feature_ids
        

def generate_feature_line(line, feature_dict, category_vocab):
    """Convert one ``<label><TAB><content>`` line to a sparse feature line.

    The result has the form ``<label_id> <feature_id>:<count> ...`` with
    feature ids in ascending order. Characters that are not keys of
    ``feature_dict`` are ignored.

    Args:
        line: one raw corpus line, either UTF-8 encoded bytes or already
            decoded text; a trailing line ending is stripped.
        feature_dict: mapping from character to integer feature id.
        category_vocab: object whose ``category_to_id(label)`` returns the
            integer id of the label.

    Returns:
        The feature line as a string, without a trailing newline.

    Raises:
        ValueError: if the line contains no tab separator.
    """
    # Accept both bytes (what the file readers supply) and decoded text.
    if isinstance(line, bytes):
        line = line.decode('utf-8')
    # maxsplit=1: a tab inside the content must not break unpacking.
    label, content = line.strip('\r\n').split('\t', 1)
    label_id = category_vocab.category_to_id(label)

    feature_counts = {}
    for char in content:
        if char not in feature_dict:
            continue
        feature_id = feature_dict[char]
        feature_counts[feature_id] = feature_counts.get(feature_id, 0) + 1

    # Build the fields once and join them instead of concatenating in a loop.
    fields = ['%d' % label_id]
    for feature_id in sorted(feature_counts):
        fields.append('%d:%d' % (feature_id, feature_counts[feature_id]))
    return ' '.join(fields)

def convert_raw_to_feature(raw_file, feature_file, feature_dict, category_vocab):
    """Convert a raw labelled corpus file into a sparse feature file.

    Every non-blank line of ``raw_file`` is passed to
    ``generate_feature_line`` and the results are written to
    ``feature_file``, one per line.

    Args:
        raw_file: path of the ``<label><TAB><content>`` input file.
        feature_file: path of the output file; it is overwritten.
        feature_dict: mapping from character to integer feature id.
        category_vocab: label vocabulary handed to ``generate_feature_line``.
    """
    # Read bytes so generate_feature_line can decode them on both Python 2
    # and Python 3. All lines are converted before the output is opened, so
    # a malformed input line cannot leave a truncated output file behind.
    with open(raw_file, 'rb') as f:
        feature_lines = [
            generate_feature_line(raw_line, feature_dict, category_vocab)
            for raw_line in f
            if raw_line.strip()  # skip blank lines such as a trailing newline
        ]
    with open(feature_file, 'w') as f:
        for feature_line in feature_lines:
            f.write('%s\n' % feature_line)

# Keep only characters that occur at least 200 times in the training set.
feature_dict = generate_feature_dict(char_level_train_file, feature_threshold=200)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(len(feature_dict))

3110


In [10]:
# Convert every split with the same feature dictionary and label vocabulary.
for raw_path, feature_path in (
    (char_level_train_file, char_level_train_feature_file),
    (char_level_val_file, char_level_val_feature_file),
    (char_level_test_file, char_level_test_feature_file),
):
    convert_raw_to_feature(raw_path, feature_path, feature_dict, category_vocab)

In [None]:
'''
Results of char-level features using an SVM model
(columns: feature-frequency threshold)
       >0      >50     >100    >200
Train  100%    100%    100%    100%
Valid  91.26%  91.3%   91.34%  91.32%
Test   93.09%  93.07%  93.07%  93.06%
'''